# Pipeline A-MinMax

In [1]:
# for preprocessing/eda models
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import power_transform
from scipy import stats
from scipy.stats import boxcox
from scipy.stats import kurtosis, skew
import math
from scipy.stats import norm

# feature selection
from sklearn.feature_selection import RFE

# balancing
from imblearn.over_sampling import SMOTE

# accuracy metrics and data split models
from sklearn.model_selection import train_test_split
from sklearn import metrics, model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns', 500) # display max 500 rows
pd.set_option('display.max_rows', 140)

Using TensorFlow backend.


In [2]:
# read in data to skip innitial steps from pipeline B
data = pd.read_csv('online_shoppers_intention-2.csv')
data1 = data.copy() # copy of original df
data1.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
data1.Weekend.value_counts()

False    9462
True     2868
Name: Weekend, dtype: int64

## Imputation 
- assumption that exit rates cannot be zero

In [4]:
# impute our assumption (beccomes a new feature)
data1['ExitRatesImpute'] = data1['ExitRates'].replace(0,np.NaN)
data1['ExitRatesImpute'] = data1['ExitRatesImpute'].fillna(data1['ExitRatesImpute'].median())

## Feature Engineering (Ratios, aggregation)
- calculate ratios for count of page visits 
- combine bounce and exit rates (average and weighted average - new features)
    - they have a strong linear relationship
- divide features that have strong non-linear relationship 
    - solve zero-division by replacing it with 0 (e.g. 1/0 = n/a)

In [5]:
# ratios for counts
data1['totalFracAdmin'] = data1['Administrative']/(data1['Administrative'] + data1['Informational'] + data1['ProductRelated'])
data1['totalFracInfo'] = data1['Informational']/(data1['Administrative'] + data1['Informational'] + data1['ProductRelated'])
data1['totalFracProd'] = data1['ProductRelated']/(data1['Administrative'] + data1['Informational'] + data1['ProductRelated'])

# average combining
data1['BounceExitAvg'] = (data1['BounceRates'] + data1['ExitRates'])/2

# weighted-average feature joining
data1['BounceExitW1'] = data1['BounceRates'] * 0.6 + data1['ExitRates'] * 0.4
data1['BounceExitW2'] = data1['BounceRates'] * 0.7 + data1['ExitRates'] * 0.3
data1['BounceExitW3'] = data1['BounceRates'] * 0.4 + data1['ExitRates'] * 0.6
data1['BounceExitW4'] = data1['BounceRates'] * 0.3 + data1['ExitRates'] * 0.7

# bounce and exit rates vs page values ratio 
data1['BouncePageRatio'] = data1['BounceRates']/data1['PageValues']
data1['ExitPageRatio'] = data1['ExitRates']/data1['PageValues']

# durations vs page values, bounce and exit rates
data1['InfoPageRatio'] = data1['Informational_Duration']/data1['PageValues']
data1['ProdRelPageRatio'] = data1['ProductRelated_Duration']/data1['PageValues']
data1['InfoBounceRatio'] = data1['Informational_Duration']/data1['BounceRates']
data1['AdminBounceRatio'] = data1['Administrative_Duration']/data1['BounceRates']
data1['ProdRelBounceRatio'] = data1['ProductRelated_Duration']/data1['BounceRates']
data1['InfoExitRatio'] = data1['Informational_Duration']/data1['ExitRates']
data1['AdminBounceRatio'] = data1['Administrative_Duration']/data1['ExitRates']
data1['ProdRelExitRatio'] = data1['ProductRelated_Duration']/data1['ExitRates']

# page values, bounce and exit rates vs durations
#data1['PageInfoRatio'] = data1['PageValues']/data1['Informational_Duration']
#data1['PageProdRelRatio'] = data1['PageValues']/data1['ProductRelated_Duration']
#data1['BounceInfoRatio'] = data1['BounceRates']/data1['Informational_Duration']
#data1['BounceAdminRatio'] = data1['BounceRates']/data1['Administrative_Duration']
#data1['BounceProdRelRatio'] = data1['BounceRates']/data1['ProductRelated_Duration']
#data1['ExitInfoRatio'] = data1['ExitRates']/data1['Informational_Duration']
#data1['BounceAdminRatio'] = data1['ExitRates']/data1['Administrative_Duration']
#data1['ExitProdRelRatio'] = data1['ExitRates']/data1['ProductRelated_Duration']

# as there are many zero values --> e.g. x/0 (zero-division) = N/A or inf can occur 
# if there is zero-division treat that as a zero
data1 = data1.fillna(0) # fill N/A with 0 
data1 = data1.replace(np.inf, 0) # replace inf wit 0
data1 = data1.replace(-0, 0) # for some reason we also get -0 just fix it to 0 

In [6]:
print('Original number of features: ', len(data.columns))
print('Number of features added: ', len(data1.columns) - len(data.columns))
print('Total number of features after feature engineering: ', len(data1.columns) + len(data.columns))

Original number of features:  18
Number of features added:  18
Total number of features after feature engineering:  54


## Standardization

In [7]:
# make a copy
standardize = data1.copy()  
# select continous features (exclude Special Day)
standardize = standardize.select_dtypes(include='float64').drop('SpecialDay', axis = 1)

In [8]:
# import MinMaxScaler module
from sklearn.preprocessing import MinMaxScaler

# use MinMaxScaler function 
scaler = MinMaxScaler()

# min-max standerdize all continous columns
standardize[standardize.columns] = scaler.fit_transform(standardize[standardize.columns]) # fit the scaler to the model 
standardize_done = standardize.add_suffix('_Scaled') # add suffix (new features)

# add new standerdized features to data1
data1 = pd.concat([data1, standardize_done], axis = 1 )

#use .describe() to prove standardization worked
standardize_done.describe()

Unnamed: 0,Administrative_Duration_Scaled,Informational_Duration_Scaled,ProductRelated_Duration_Scaled,BounceRates_Scaled,ExitRates_Scaled,PageValues_Scaled,ExitRatesImpute_Scaled,totalFracAdmin_Scaled,totalFracInfo_Scaled,totalFracProd_Scaled,BounceExitAvg_Scaled,BounceExitW1_Scaled,BounceExitW2_Scaled,BounceExitW3_Scaled,BounceExitW4_Scaled,BouncePageRatio_Scaled,ExitPageRatio_Scaled,InfoPageRatio_Scaled,ProdRelPageRatio_Scaled,InfoBounceRatio_Scaled,AdminBounceRatio_Scaled,ProdRelBounceRatio_Scaled,InfoExitRatio_Scaled,ProdRelExitRatio_Scaled
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,0.023779,0.013522,0.018676,0.110957,0.215364,0.016279,0.215459,0.079955,0.01446,0.905098,0.16316,0.15272,0.142279,0.173601,0.184042,0.001512,0.001537,0.001563,0.000596,0.00119,0.001255,0.001096,0.005869,0.006603
std,0.052013,0.055209,0.029913,0.242442,0.242983,0.051328,0.242704,0.126751,0.05037,0.141883,0.237375,0.237535,0.238127,0.237646,0.238347,0.015647,0.01321,0.0192,0.012077,0.016869,0.011059,0.015312,0.03003,0.019537
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.002878,0.0,0.071429,0.0,0.070613,0.0,0.0,0.860465,0.039532,0.033333,0.025805,0.046154,0.052593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000338
50%,0.002207,0.0,0.009362,0.015562,0.125782,0.0,0.126436,0.021661,0.0,0.963359,0.079255,0.066667,0.056848,0.088884,0.1,0.0,0.0,0.0,0.0,0.0,4e-05,0.0,0.0,0.001863
75%,0.027438,0.0,0.022887,0.084063,0.25,0.0,0.249341,0.115385,0.0,1.0,0.162709,0.140852,0.129167,0.175638,0.189772,0.0,0.0,0.0,0.0,0.0,0.000868,0.000268,0.0,0.006279
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
print('Number of features before standardization: ', len(data1.columns) - len(standardize_done.columns))
print('Number of features added: ', len(standardize_done.columns))
print('Total number of features after standardization: ', len(data1.columns))

Number of features before standardization:  36
Number of features added:  24
Total number of features after standardization:  60


In [None]:
data1.to_csv('PipelineA-MinMax.csv')