In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [7]:
# Load the advertising dataset; 'Timestamp' is parsed to datetime at read time
# so no later conversion step is needed.
df = pd.read_csv(
    'advertising.csv',
    parse_dates=['Timestamp'],
)
# Bare last expression -> rich (row-truncated) display of the loaded frame.
df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.90,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.50,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,Fundamental modular algorithm,Duffystad,1,Lebanon,2016-02-11 21:49:00,1
996,51.30,45,67782.17,134.42,Grass-roots cohesive monitoring,New Darlene,1,Bosnia and Herzegovina,2016-04-22 02:07:01,1
997,51.63,51,42415.72,120.37,Expanded intangible solution,South Jessica,1,Mongolia,2016-02-01 17:24:57,1
998,55.55,19,41920.79,187.95,Proactive bandwidth-monitored policy,West Steven,0,Guatemala,2016-03-24 02:35:54,0


In [8]:
# Check data types and non-null counts per column
# (also confirms that 'Timestamp' was parsed as a datetime column).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Daily Time Spent on Site  1000 non-null   float64       
 1   Age                       1000 non-null   int64         
 2   Area Income               1000 non-null   float64       
 3   Daily Internet Usage      1000 non-null   float64       
 4   Ad Topic Line             1000 non-null   object        
 5   City                      1000 non-null   object        
 6   Male                      1000 non-null   int64         
 7   Country                   1000 non-null   object        
 8   Timestamp                 1000 non-null   datetime64[ns]
 9   Clicked on Ad             1000 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(3)
memory usage: 78.2+ KB


In [10]:
# Summary statistics for the numeric (and datetime) columns,
# rounded to two decimals for readability.
df.describe().round(decimals=2)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Timestamp,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000,1000.0
mean,65.0,36.01,55000.0,180.0,0.48,2016-04-10 10:34:06.636000256,0.5
min,32.6,19.0,13996.5,104.78,0.0,2016-01-01 02:52:10,0.0
25%,51.36,29.0,47031.8,138.83,0.0,2016-02-18 02:55:42,0.0
50%,68.22,35.0,57012.3,183.13,0.0,2016-04-07 17:27:29.500000,0.5
75%,78.55,42.0,65470.63,218.79,1.0,2016-05-31 03:18:14,1.0
max,91.43,61.0,79484.8,269.96,1.0,2016-07-24 00:22:16,1.0
std,15.85,8.79,13414.63,43.9,0.5,,0.5


In [12]:
# Summary statistics for the string (object-dtype) columns:
# count, number of unique values, most frequent value and its frequency.
df.describe(include=['object'])

Unnamed: 0,Ad Topic Line,City,Country
count,1000,1000,1000
unique,1000,969,237
top,Cloned 5thgeneration orchestration,Lisamouth,France
freq,1,3,9


In [13]:
# Number of distinct timestamps (compare against the row count to spot a per-row unique column).
df['Timestamp'].nunique()

1000

In [14]:
# Drop the (near-)unique columns: 'Timestamp' and 'Ad Topic Line' have 1000
# distinct values in 1000 rows, and 'City' has 969.
# Rebinding `df` instead of using `inplace=True`, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed columns.
df = df.drop(columns=['Timestamp', 'Ad Topic Line', 'City'], errors='ignore')

In [16]:
# Count fully duplicated rows in the current frame.
df.duplicated().sum()

0

In [15]:
# Preview the first five rows of the current frame.
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Country,Clicked on Ad
0,68.95,35,61833.9,256.09,0,Tunisia,0
1,80.23,31,68441.85,193.77,1,Nauru,0
2,69.47,26,59785.94,236.5,0,San Marino,0
3,74.15,29,54806.18,245.89,1,Italy,0
4,68.37,35,73889.99,225.58,0,Iceland,0


In [17]:
# Missing-value count per column.
df.isnull().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Male                        0
Country                     0
Clicked on Ad               0
dtype: int64

# Data Preprocessing

### Split Data into Input Features and Target Variable

In [18]:
# Target: the 'Clicked on Ad' column; features: every remaining column.
y = df['Clicked on Ad']
x = df.drop(columns=['Clicked on Ad'])

### Split Data into Train & Test

In [19]:
# Check the class balance of the target before splitting.
y.value_counts()

Clicked on Ad
0    500
1    500
Name: count, dtype: int64

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

### Handle Numerical Columns

In [48]:
# Columns to scale: everything except the 0/1 'Male' flag and the string 'Country' column.
num_cols = x_train.drop(columns=['Male', 'Country']).columns
num_cols

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage'],
      dtype='object')

In [49]:
from sklearn.preprocessing import RobustScaler

rc = RobustScaler()

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the scaling statistics.
# NOTE: this overwrites the numeric columns of x_train / x_test, so the cell
# must not be re-run without re-running the train/test split first.
rc.fit(x_train[num_cols])
x_train[num_cols] = rc.transform(x_train[num_cols])
x_test[num_cols] = rc.transform(x_test[num_cols])

In [50]:
# Preview the training features after scaling the numeric columns.
x_train.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Country
600,0.59484,1.083333,-0.569248,-0.580319,1,Kyrgyz Republic
737,0.120824,1.416667,-0.863072,-0.750515,0,Sweden
33,-0.459354,-1.0,-1.483623,0.374008,0,Senegal
519,-1.224016,0.416667,-0.615119,-0.392377,1,Mongolia
341,0.157944,1.083333,-0.246821,-0.840987,0,Mexico


### Handle Categorical Columns

In [51]:
from category_encoders import BinaryEncoder

be = BinaryEncoder()

# Encode 'Country' only. The encoding is learned from the training split and
# then reused unchanged on the test split.
country_train = x_train[['Country']]
country_test = x_test[['Country']]

x_train_be = be.fit_transform(country_train)
x_test_be = be.transform(country_test)

In [52]:
# Preview the binary-encoded 'Country' columns for the training split.
x_train_be.head()

Unnamed: 0,Country_0,Country_1,Country_2,Country_3,Country_4,Country_5,Country_6,Country_7
600,0,0,0,0,0,0,0,1
737,0,0,0,0,0,0,1,0
33,0,0,0,0,0,0,1,1
519,0,0,0,0,0,1,0,0
341,0,0,0,0,0,1,0,1


In [53]:
# Replace the raw 'Country' column with its encoded columns (appended on the right).
# NOTE: re-running this cell fails because 'Country' no longer exists afterwards.
x_train = pd.concat([x_train.drop(columns=['Country']), x_train_be], axis=1)
x_test = pd.concat([x_test.drop(columns=['Country']), x_test_be], axis=1)

In [54]:
# Preview the final training matrix (scaled numeric columns plus encoded 'Country').
x_train.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Country_0,Country_1,Country_2,Country_3,Country_4,Country_5,Country_6,Country_7
600,0.59484,1.083333,-0.569248,-0.580319,1,0,0,0,0,0,0,0,1
737,0.120824,1.416667,-0.863072,-0.750515,0,0,0,0,0,0,0,1,0
33,-0.459354,-1.0,-1.483623,0.374008,0,0,0,0,0,0,0,1,1
519,-1.224016,0.416667,-0.615119,-0.392377,1,0,0,0,0,0,1,0,0
341,0.157944,1.083333,-0.246821,-0.840987,0,0,0,0,0,0,1,0,1


# Machine Learning

In [55]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression without a regularisation penalty.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LogisticRegression(random_state=42, penalty=None).fit(x_train, y_train)

# Report accuracy as a percentage on both splits.
train_accuracy = lr.score(x_train, y_train) * 100
test_accuracy = lr.score(x_test, y_test) * 100

print('Training Accuracy :', train_accuracy)
print('Test Accuracy :', test_accuracy)

Training Accuracy : 97.375
Test Accuracy : 95.0


In [59]:
from sklearn.linear_model import LogisticRegression

# Same model with an L2 penalty; C=0.5 is the inverse regularisation strength.
# This rebinds `lr`, replacing the unpenalised model fitted earlier.
lr = LogisticRegression(C=0.5, penalty='l2', random_state=42)
lr.fit(x_train, y_train)

# Report accuracy as a percentage on both splits.
for split_label, features, target in (
    ('Training Accuracy :', x_train, y_train),
    ('Test Accuracy :', x_test, y_test),
):
    print(split_label, lr.score(features, target) * 100)

Training Accuracy : 97.375
Test Accuracy : 94.5
