# Preprocessing

In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif
from sklearn import preprocessing
import warnings
# Notebook-wide settings.
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices emitted by pandas / scikit-learn.
warnings.filterwarnings(action="ignore")
# Disable pandas' display truncation so all columns and all rows are rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [21]:
# Display one randomly drawn row of `df` as a quick look at its columns.
df.sample(n=1)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Timestamp,Clicked on Ad,city,province,category,Year,Month,Week,Day,Day_of_Week
551,81.95,31,452890060.0,208.76,Laki-Laki,2016-05-01 21:46:00,No,Surabaya,Jawa Timur,Food,2016,5,17,1,6


## Missing Values Handling

In [18]:
# Percentage of missing entries per column, rounded to one decimal place:
# (null count * 100) / number of rows.
df.isna().sum().mul(100).div(len(df)).round(1)

Daily Time Spent on Site    1.3
Age                         0.0
Area Income                 1.3
Daily Internet Usage        1.1
Male                        0.3
Timestamp                   0.0
Clicked on Ad               0.0
city                        0.0
province                    0.0
category                    0.0
Year                        0.0
Month                       0.0
Week                        0.0
Day                         0.0
Day_of_Week                 0.0
dtype: float64

In [23]:
# Show the mean next to the median ('50%') for every column summarised by
# describe(), one row per column.
df.describe().T.loc[:, ['mean', '50%']]

Unnamed: 0,mean,50%
Daily Time Spent on Site,64.92952,68.11
Age,36.009,35.0
Area Income,384864700.0,399068300.0
Daily Internet Usage,179.8636,182.65
Month,3.817,4.0


In [5]:
# Fill gaps in numeric columns with each column's median.
# `numeric_only=True` is required: the frame still contains text columns
# (e.g. 'Male', 'Timestamp', 'city'), and on pandas >= 2.0 a bare df.median()
# raises TypeError when it meets them.
df = df.fillna(df.median(numeric_only=True))

# Fill gaps in the categorical 'Male' column with its most frequent value.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the `df['Male']` selection: that form operates on an
# intermediate object, is deprecated in pandas 2.x and leaves `df` unchanged
# once Copy-on-Write is active.
df['Male'] = df['Male'].fillna(df['Male'].mode()[0])

# NOTE(review): the median and mode above are computed on the full dataset,
# i.e. before the train/validation split performed later in this notebook, so
# validation rows influence the imputed values. Consider fitting the imputation
# on the training portion only.

## Duplicated Data

In [29]:
# Number of rows that exactly repeat an earlier row (all columns compared;
# the first occurrence of each row is not counted as a duplicate).
df.duplicated(subset=None, keep='first').sum()

0

## Drop Features

* Features with too many unique values
* Features with only 1 unique value
* Features with little variation (values mostly concentrated in only 2 values)
* Unnecessary features

In [6]:
# Remove the listed columns from `df` in place.
# NOTE(review): presumably these are the columns matching the criteria listed
# in the "Drop Features" section — confirm against the EDA.
df.drop(
    columns=[
        'Timestamp',
        'city',
        'province',
        'Year',
        'Week',
        'Day',
    ],
    inplace=True,
)

## Feature Encoding

In [7]:
# Lookup tables: raw value -> encoded value.
male_map = {'Perempuan': 0, 'Laki-Laki': 1}
click_ads_map = {'No': 0, 'Yes': 1}
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul'}
day_week_map = {0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}

# Which lookup table applies to which column.
encoders = {
    'Male': male_map,
    'Clicked on Ad': click_ads_map,
    'Month': month_map,
    'Day_of_Week': day_week_map,
}

# Encode every column into a temporary Series and validate it BEFORE touching
# `df`, so a failure never leaves the frame half-encoded.
encoded_cols = {}
for col_name, mapping in encoders.items():
    encoded = df[col_name].map(mapping)
    # Series.map converts any value that is missing from the dict into NaN
    # (e.g. a month outside 1-7, or already-encoded values when this cell is
    # re-run). Such NaNs would silently become all-zero dummy rows below, so
    # fail loudly instead. Values that were already NaN are left alone.
    unmapped = df.loc[encoded.isna() & df[col_name].notna(), col_name].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {col_name!r} contains values with no entry in its mapping: "
            f"{list(unmapped)}"
        )
    encoded_cols[col_name] = encoded

df = df.assign(**encoded_cols)

# One-hot encode the nominal columns; new columns are named '<column>:<value>'
# and appended next to the original ones (the originals are still present).
cols = ['category', 'Month', 'Day_of_Week']
df = pd.concat([df, pd.get_dummies(df[cols], prefix_sep = ':')], axis = 1)

In [8]:
# Remove the three source columns in place; the preceding cell one-hot encoded
# exactly these columns.
df.drop(columns=['category', 'Month', 'Day_of_Week'], inplace=True)

## Data Splitting

In [39]:
# Display one randomly drawn row of `df` to inspect its current columns.
df.sample(n=1)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,category:Bank,category:Electronic,category:Fashion,category:Finance,category:Food,category:Furniture,category:Health,category:House,category:Otomotif,category:Travel,Month:Apr,Month:Feb,Month:Jan,Month:Jul,Month:Jun,Month:Mar,Month:May,Day_of_Week:Fri,Day_of_Week:Mon,Day_of_Week:Sat,Day_of_Week:Sun,Day_of_Week:Thu,Day_of_Week:Tue,Day_of_Week:Wed
477,44.96,50,369614000.0,132.71,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [9]:
# Separate the predictors from the target column 'Clicked on Ad'.
X = df.drop(columns=['Clicked on Ad'])
y = df['Clicked on Ad']

# Hold out 20% of the rows for validation. The split is stratified on the
# target so both parts keep the same class proportions, and the random state
# is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Feature Selection

### Numerical Features Significance Test

In [41]:
# ANOVA F-test of each numerical feature against the target, computed on the
# training split only.
num_cols = ['Age','Daily Time Spent on Site','Daily Internet Usage','Area Income']
F_statistic, p_value = f_classif(X_train[num_cols], y_train)

# One entry per output column; p-values are rounded to 6 decimals.
dict_ = {
    'Feature': num_cols,
    'F-score': F_statistic,
    'p-value': p_value.round(decimals=6),
}

# Building the frame through orient='index' + transpose yields object columns,
# so the two score columns are converted back to numeric dtypes.
anova_table = pd.DataFrame.from_dict(dict_, orient='index').transpose().assign(
    **{
        'F-score': lambda table: pd.to_numeric(table['F-score']),
        'p-value': lambda table: pd.to_numeric(table['p-value']),
    }
)

# Label each feature using the 0.05 threshold on its p-value.
anova_table['significance'] = [
    'Not Significant' if p >= 0.05 else 'Significant'
    for p in anova_table['p-value']
]

# Attach the describe() statistics of the training features (matched on the
# feature name) and list the rows by descending F-score, then count.
anova_table = (
    anova_table
    .merge(
        X_train.describe().transpose().reset_index(),
        left_on='Feature',
        right_on='index',
    )
    .sort_values(by=['F-score', 'count'], ascending=False)
)
anova_table

Unnamed: 0,Feature,F-score,p-value,significance,index,count,mean,std,min,25%,50%,75%,max
2,Daily Internet Usage,1166.118942,0.0,Significant,Daily Internet Usage,800.0,180.356,43.48803,104.78,140.815,182.65,218.8425,267.01
1,Daily Time Spent on Site,1020.595048,0.0,Significant,Daily Time Spent on Site,800.0,64.94147,15.73389,32.6,51.655,68.11,78.585,91.43
0,Age,237.346766,0.0,Significant,Age,800.0,36.15125,8.727892,19.0,29.0,35.0,42.0,61.0
3,Area Income,214.428709,0.0,Significant,Area Income,800.0,388501600.0,93279470.0,97975500.0,334764900.0,399742900.0,460800500.0,556393600.0
