In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/sample.csv


# Importing the Dependencies & Data Preprocessing

## Basic & Data PreProcessing Packages

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

## Loading the Dataset

In [3]:
train = pd.read_csv('/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv')


In [4]:
train.head()

Unnamed: 0,HomePage,HomePage_Duration,LandingPage,LandingPage_Duration,ProductDescriptionPage,ProductDescriptionPage_Duration,GoogleMetric:Bounce Rates,GoogleMetric:Exit Rates,GoogleMetric:Page Values,SeasonalPurchase,...,SearchEngine,Zone,Type of Traffic,CustomerType,Gender,Cookies Setting,Education,Marital Status,WeekendPurchase,Made_Purchase
0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.0,...,1.0,9.0,3.0,Returning_Visitor,Not Specified,Deny,Not Specified,Other,0.0,False
1,0.0,0.0,0.0,0.0,2.0,2.666667,0.05,0.14,0.0,0.0,...,2.0,2.0,4.0,Returning_Visitor,Female,Deny,Others,Married,0.0,False
2,0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,0.0,0.0,...,3.0,1.0,4.0,Returning_Visitor,Female,ALL,Others,Married,1.0,False
3,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.4,...,4.0,3.0,3.0,Returning_Visitor,Male,ALL,Diploma,Single,0.0,False
4,1.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,...,2.0,1.0,5.0,Returning_Visitor,Male,Deny,Diploma,Other,1.0,False


## Data Preprocessing

Here, we will first check that how many missing values are present in our training dataset.

In [5]:
train.isnull().sum()

HomePage                           153
HomePage_Duration                  150
LandingPage                        153
LandingPage_Duration               135
ProductDescriptionPage             123
ProductDescriptionPage_Duration    167
GoogleMetric:Bounce Rates          151
GoogleMetric:Exit Rates            129
GoogleMetric:Page Values           132
SeasonalPurchase                   150
Month_SeasonalPurchase             144
OS                                 134
SearchEngine                       122
Zone                               117
Type of Traffic                    143
CustomerType                       144
Gender                             145
Cookies Setting                    144
Education                          136
Marital Status                     130
WeekendPurchase                    121
Made_Purchase                        0
dtype: int64

## Setting The Imputers
Since not all the variables will have the same strategy for imputers. In this project, I am of the belief that the ideal strategy to deal with variables in terms of imputing missing values is as follows:

**Imputing the constant value of Zero**

The variables given below in this section are:
* HomePage
* HomePage_Duration
* LandingPage
* LandingPage_Duration
* ProductDesriptionPage
* ProductDescriptionPage_Duration
* GoogleMetric-Bounce Rate
* GoogleMetric-Exit Rate
* GoogleMetric-Page Value
* SeasonalPurchase

The reason for the same is that it would be best to assume no values/zero values for these numerical variables.

**Imputing the Median Value**

The variables given below in this section are:
* OS
* SearchEngine
* Zone
* Type of Traffic
* WeekendPurchase

I am of the belief that imputing zero would not be ideal for the same for these numerical variables, we could try it with median values

**Imputing the Constant Value 'Not Specified'**

The variables given below in this section are:
* Month_SeasonalPurchase
* Gender
* Cookie Setting
* Education
* Marital Status
* Customer Type

I am of the belief that imputing the value "Not Specified" would be ideal for these categorical variables

## Putting the Variables in List

In [6]:
imputer_constant_ns = SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value='Not Specified')
imputer_constant_zero = SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value= 0.0 )
imputer_median = SimpleImputer(missing_values=np.NaN, strategy='median')
imputer_constant_other = SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value='Other')

In [7]:

train['HomePage'] = imputer_constant_zero.fit_transform(train['HomePage'].values.reshape(-1,1))[:,0]
train['LandingPage'] = imputer_constant_zero.fit_transform(train['LandingPage'].values.reshape(-1,1))[:,0]
train['ProductDescriptionPage'] = imputer_constant_zero.fit_transform(train['ProductDescriptionPage'].values.reshape(-1,1))[:,0]
train['SeasonalPurchase'] = imputer_constant_zero.fit_transform(train['SeasonalPurchase'].values.reshape(-1,1))[:,0]

In [8]:
train['HomePage_Duration'] = imputer_constant_zero.fit_transform(train['HomePage_Duration'].values.reshape(-1,1))[:,0]
train['LandingPage_Duration'] = imputer_constant_zero.fit_transform(train['LandingPage_Duration'].values.reshape(-1,1))[:,0]
train['ProductDescriptionPage_Duration'] = imputer_constant_zero.fit_transform(train['ProductDescriptionPage_Duration'].values.reshape(-1,1))[:,0]
train['GoogleMetric:Bounce Rates'] = imputer_constant_zero.fit_transform(train['GoogleMetric:Bounce Rates'].values.reshape(-1,1))[:,0]
train['GoogleMetric:Exit Rates'] = imputer_constant_zero.fit_transform(train['GoogleMetric:Exit Rates'].values.reshape(-1,1))[:,0]
train['GoogleMetric:Page Values'] = imputer_constant_zero.fit_transform(train['GoogleMetric:Page Values'].values.reshape(-1,1))[:,0]

In [9]:
train['OS'] = imputer_median.fit_transform(train['OS'].values.reshape(-1,1))[:,0]
train['SearchEngine'] = imputer_median.fit_transform(train['SearchEngine'].values.reshape(-1,1))[:,0]
train['Zone'] = imputer_median.fit_transform(train['Zone'].values.reshape(-1,1))[:,0]
train['Type of Traffic'] = imputer_median.fit_transform(train['Type of Traffic'].values.reshape(-1,1))[:,0]
train['WeekendPurchase'] = imputer_median.fit_transform(train['WeekendPurchase'].values.reshape(-1,1))[:,0]


In [10]:
train['Month_SeasonalPurchase'] = imputer_constant_ns.fit_transform(train['Month_SeasonalPurchase'].values.reshape(-1,1))[:,0]
train['Gender'] = imputer_constant_ns.fit_transform(train['Gender'].values.reshape(-1,1))[:,0]
train['Cookies Setting'] = imputer_constant_ns.fit_transform(train['Cookies Setting'].values.reshape(-1,1))[:,0]
train['Education'] = imputer_constant_ns.fit_transform(train['Education'].values.reshape(-1,1))[:,0]
train['Marital Status'] = imputer_constant_ns.fit_transform(train['Marital Status'].values.reshape(-1,1))[:,0]
train['CustomerType'] = imputer_constant_other.fit_transform(train['CustomerType'].values.reshape(-1,1))[:,0]

In [11]:
train.isnull().sum()

HomePage                           0
HomePage_Duration                  0
LandingPage                        0
LandingPage_Duration               0
ProductDescriptionPage             0
ProductDescriptionPage_Duration    0
GoogleMetric:Bounce Rates          0
GoogleMetric:Exit Rates            0
GoogleMetric:Page Values           0
SeasonalPurchase                   0
Month_SeasonalPurchase             0
OS                                 0
SearchEngine                       0
Zone                               0
Type of Traffic                    0
CustomerType                       0
Gender                             0
Cookies Setting                    0
Education                          0
Marital Status                     0
WeekendPurchase                    0
Made_Purchase                      0
dtype: int64

## Encoding Categorical Data as Numerical Data
Since there are also categorical variables, it is best to convert them via Label Encoder so that they too can be taken in the model evaluation

In [12]:
le = LabelEncoder()
train['Month_SeasonalPurchase'] = le.fit_transform(train['Month_SeasonalPurchase'])
train['Gender'] = le.fit_transform(train['Gender'])
train['Cookies Setting'] = le.fit_transform(train['Cookies Setting'])
train['Education'] = le.fit_transform(train['Education'])
train['Marital Status'] = le.fit_transform(train['Marital Status'])
train['CustomerType'] = le.fit_transform(train['CustomerType'])

## Standard Scaler
It is important that all the variables are scaled and follow a Gaussian Distribution, so as to get the best results possible

In [13]:
scaler = StandardScaler()

#Numerical Variables
train['HomePage'] = scaler.fit_transform(train['HomePage'].values.reshape(-1,1))[:,0]
train['HomePage_Duration'] = scaler.fit_transform(train['HomePage_Duration'].values.reshape(-1,1))[:,0]
train['LandingPage'] = scaler.fit_transform(train['LandingPage'].values.reshape(-1,1))[:,0]
train['LandingPage_Duration'] = scaler.fit_transform(train['LandingPage_Duration'].values.reshape(-1,1))[:,0]
train['ProductDescriptionPage'] = scaler.fit_transform(train['ProductDescriptionPage'].values.reshape(-1,1))[:,0]
train['ProductDescriptionPage_Duration'] = scaler.fit_transform(train['ProductDescriptionPage_Duration'].values.reshape(-1,1))[:,0]
train['GoogleMetric:Bounce Rates'] = scaler.fit_transform(train['GoogleMetric:Bounce Rates'].values.reshape(-1,1))[:,0]
train['GoogleMetric:Exit Rates'] = scaler.fit_transform(train['GoogleMetric:Exit Rates'].values.reshape(-1,1))[:,0]
train['GoogleMetric:Page Values'] = scaler.fit_transform(train['GoogleMetric:Page Values'].values.reshape(-1,1))[:,0]
train['SeasonalPurchase'] = scaler.fit_transform(train['SeasonalPurchase'].values.reshape(-1,1))[:,0]
train['OS'] = scaler.fit_transform(train['OS'].values.reshape(-1,1))[:,0]
train['SearchEngine'] = scaler.fit_transform(train['SearchEngine'].values.reshape(-1,1))[:,0]
train['Zone'] = scaler.fit_transform(train['Zone'].values.reshape(-1,1))[:,0]
train['Type of Traffic'] = scaler.fit_transform(train['Type of Traffic'].values.reshape(-1,1))[:,0]
train['WeekendPurchase'] = scaler.fit_transform(train['WeekendPurchase'].values.reshape(-1,1))[:,0]




In [14]:
#Categorical Variables
train['Month_SeasonalPurchase'] = scaler.fit_transform(train['Month_SeasonalPurchase'].values.reshape(-1,1))[:,0]
train['CustomerType'] = scaler.fit_transform(train['CustomerType'].values.reshape(-1,1))[:,0]
train['Gender'] = scaler.fit_transform(train['Gender'].values.reshape(-1,1))[:,0]
train['Cookies Setting'] = scaler.fit_transform(train['Cookies Setting'].values.reshape(-1,1))[:,0]
train['Education'] = scaler.fit_transform(train['Education'].values.reshape(-1,1))[:,0]
train['Marital Status'] = scaler.fit_transform(train['Marital Status'].values.reshape(-1,1))[:,0]

In [15]:
train.head()

Unnamed: 0,HomePage,HomePage_Duration,LandingPage,LandingPage_Duration,ProductDescriptionPage,ProductDescriptionPage_Duration,GoogleMetric:Bounce Rates,GoogleMetric:Exit Rates,GoogleMetric:Page Values,SeasonalPurchase,...,SearchEngine,Zone,Type of Traffic,CustomerType,Gender,Cookies Setting,Education,Marital Status,WeekendPurchase,Made_Purchase
0,-0.679181,-0.439417,-0.389509,-0.237571,-0.676251,-0.584894,3.551143,3.122939,-0.28361,-0.314568,...,-0.78935,2.440184,-0.265764,0.409903,1.193689,-0.280798,0.44116,0.256637,-0.549978,False
1,-0.679181,-0.439417,-0.389509,-0.237571,-0.65393,-0.583562,0.539548,1.919703,-0.28361,-0.314568,...,-0.206234,-0.48191,-0.017352,0.409903,-1.241126,-0.280798,1.334947,-1.362083,-0.549978,False
2,-0.679181,-0.439417,-0.389509,-0.237571,-0.475358,-0.271447,-0.062771,0.114849,-0.28361,-0.314568,...,0.376882,-0.899352,-0.017352,0.409903,-1.241126,-1.088379,1.334947,-1.362083,1.818255,False
3,-0.679181,-0.439417,-0.389509,-0.237571,-0.676251,-0.584894,3.551143,3.122939,-0.28361,1.669121,...,0.959998,-0.064468,-0.265764,0.409903,-0.023718,-1.088379,-1.346414,1.065997,-0.549978,False
4,-0.374187,-0.439417,-0.389509,-0.237571,-0.698573,-0.584894,3.551143,3.122939,-0.28361,-0.314568,...,-0.206234,-0.899352,0.23106,0.409903,-0.023718,-0.280798,-1.346414,0.256637,1.818255,False


In [16]:
x = train.drop("Made_Purchase",axis = 1)
y = train["Made_Purchase"]

# Feature Removal

In [17]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
sfs = SequentialFeatureSelector(dt, n_features_to_select=12,direction = 'backward')
sfs.fit(x, y)



SequentialFeatureSelector(direction='backward',
                          estimator=DecisionTreeClassifier(),
                          n_features_to_select=12)

In [18]:
sfs.get_support()


array([ True,  True,  True, False, False, False, False, False,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
       False,  True, False])

In [19]:
train.columns

Index(['HomePage', 'HomePage_Duration', 'LandingPage', 'LandingPage_Duration',
       'ProductDescriptionPage', 'ProductDescriptionPage_Duration',
       'GoogleMetric:Bounce Rates', 'GoogleMetric:Exit Rates',
       'GoogleMetric:Page Values', 'SeasonalPurchase',
       'Month_SeasonalPurchase', 'OS', 'SearchEngine', 'Zone',
       'Type of Traffic', 'CustomerType', 'Gender', 'Cookies Setting',
       'Education', 'Marital Status', 'WeekendPurchase', 'Made_Purchase'],
      dtype='object')

So, we will drop all the features which are redundant as per our classifiers i.e. we will only go forward with the top 14 features

In [20]:
L = ['LandingPage_Duration',
       'ProductDescriptionPage', 'ProductDescriptionPage_Duration','SeasonalPurchase',
       'Month_SeasonalPurchase', 'OS','Type of Traffic','Gender','Education','Marital Status']

In [21]:
len(L)

10

In [22]:
x = x.drop(L,axis = 1)


## Finding the distribution of target values

In [23]:
a = train["Made_Purchase"].value_counts()
print('People who did not make the purchase are:', a[0])
print('People who made purchase are:', a[1])

People who did not make the purchase are: 9065
People who made purchase are: 5666


As we can see above, the data is imbalanced i.e the samples for True (Class 1) are less. Hence, we will sythetically generate more samples using SMOTE

## SMOTE Samples

We will be creating SMOTE Samples, hence we will import the dependencies accordingly

In [24]:
from imblearn.over_sampling import SMOTE
from collections import Counter


In [25]:
oversample = SMOTE()
x_smt, y_smt = oversample.fit_resample(x, y)

In [26]:
counter = Counter(y_smt)
print(counter)

Counter({False: 9065, True: 9065})


# Splitting the Data Between Train and Test

In [27]:
from sklearn.model_selection import train_test_split
x_train_smt, x_test_smt, y_train_smt, y_test_smt = train_test_split(x_smt,y_smt,test_size = 0.1, stratify = y_smt, random_state = 1, shuffle = True)

# Evaluating Models

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import GradientBoostingClassifier

In [29]:
model0 = LogisticRegression()

model0.fit(x_train_smt,y_train_smt)
test_pred0 = model0.predict(x_test_smt)
print("X_test prediction f1 score is:",f1_score(y_test_smt,test_pred0))

X_test prediction f1 score is: 0.4429223744292237


In [30]:
from sklearn.linear_model import Perceptron
model1 = Perceptron(max_iter = 2000, penalty = 'l1', alpha = 0.00012, fit_intercept = True, shuffle = True, verbose = 3, eta0 = 10)
model1.fit(x_train_smt,y_train_smt)
test_pred1 = model1.predict(x_test_smt)
print("X_test prediction f1 score is:",f1_score(y_test_smt,test_pred1))

-- Epoch 1
Norm: 1519.30, NNZs: 7, Bias: 30.000000, T: 16317, Avg. loss: 20.576850
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 2640.90, NNZs: 4, Bias: 20.000000, T: 32634, Avg. loss: 13.252241
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 3573.28, NNZs: 1, Bias: 10.000000, T: 48951, Avg. loss: 8.972000
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 4369.72, NNZs: 4, Bias: 0.000000, T: 65268, Avg. loss: 7.707577
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 5065.91, NNZs: 1, Bias: 0.000000, T: 81585, Avg. loss: 6.878409
Total training time: 0.02 seconds.
-- Epoch 6
Norm: 5708.24, NNZs: 2, Bias: 20.000000, T: 97902, Avg. loss: 6.364665
Total training time: 0.02 seconds.
-- Epoch 7
Norm: 6303.70, NNZs: 1, Bias: -10.000000, T: 114219, Avg. loss: 5.877801
Total training time: 0.02 seconds.
-- Epoch 8
Norm: 6858.57, NNZs: 6, Bias: 20.000000, T: 130536, Avg. loss: 5.828761
Total training time: 0.02 seconds.
-- Epoch 9
Norm: 7373.56, NNZs: 1, Bias: 10.000000, T: 146853

In [31]:

model2 = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, max_depth=3, verbose = 1)

model2.fit(x_train_smt,y_train_smt)
test_pred2 = model2.predict(x_test_smt)
print("X_test prediction f1 score is:",f1_score(y_test_smt,test_pred2))

      Iter       Train Loss   Remaining Time 
         1           1.3682            2.33s
         2           1.3536            2.19s
         3           1.3412            2.11s
         4           1.3311            2.06s
         5           1.3227            2.02s
         6           1.3154            1.99s
         7           1.3086            1.96s
         8           1.3032            1.93s
         9           1.2980            1.91s
        10           1.2939            1.90s
        20           1.2720            1.69s
        30           1.2570            1.48s
        40           1.2433            1.27s
        50           1.2296            1.06s
        60           1.2161            0.85s
        70           1.2067            0.63s
        80           1.1988            0.42s
        90           1.1922            0.21s
       100           1.1870            0.00s
X_test prediction f1 score is: 0.5804195804195804


Here, I am of the belief that a combination of Perceptron and Gradient Boosting can work well.

# Test Data

In [32]:
test = pd.read_csv("/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv")

## Imputing the test dataset columns

In [33]:
test['HomePage'] = imputer_constant_zero.fit_transform(test['HomePage'].values.reshape(-1,1))[:,0]
test['LandingPage'] = imputer_constant_zero.fit_transform(test['LandingPage'].values.reshape(-1,1))[:,0]
test['ProductDescriptionPage'] = imputer_constant_zero.fit_transform(test['ProductDescriptionPage'].values.reshape(-1,1))[:,0]
test['SeasonalPurchase'] = imputer_constant_zero.fit_transform(test['SeasonalPurchase'].values.reshape(-1,1))[:,0]

In [34]:
test['HomePage_Duration'] = imputer_constant_zero.fit_transform(test['HomePage_Duration'].values.reshape(-1,1))[:,0]
test['LandingPage_Duration'] = imputer_constant_zero.fit_transform(test['LandingPage_Duration'].values.reshape(-1,1))[:,0]
test['ProductDescriptionPage_Duration'] = imputer_constant_zero.fit_transform(test['ProductDescriptionPage_Duration'].values.reshape(-1,1))[:,0]
test['GoogleMetric:Bounce Rates'] = imputer_constant_zero.fit_transform(test['GoogleMetric:Bounce Rates'].values.reshape(-1,1))[:,0]
test['GoogleMetric:Exit Rates'] = imputer_constant_zero.fit_transform(test['GoogleMetric:Exit Rates'].values.reshape(-1,1))[:,0]
test['GoogleMetric:Page Values'] = imputer_constant_zero.fit_transform(test['GoogleMetric:Page Values'].values.reshape(-1,1))[:,0]

In [35]:
test['OS'] = imputer_median.fit_transform(test['OS'].values.reshape(-1,1))[:,0]
test['SearchEngine'] = imputer_median.fit_transform(test['SearchEngine'].values.reshape(-1,1))[:,0]
test['Zone'] = imputer_median.fit_transform(test['Zone'].values.reshape(-1,1))[:,0]
test['Type of Traffic'] = imputer_median.fit_transform(test['Type of Traffic'].values.reshape(-1,1))[:,0]
test['WeekendPurchase'] = imputer_median.fit_transform(test['WeekendPurchase'].values.reshape(-1,1))[:,0]


In [36]:
test['Month_SeasonalPurchase'] = imputer_constant_ns.fit_transform(test['Month_SeasonalPurchase'].values.reshape(-1,1))[:,0]
test['Gender'] = imputer_constant_ns.fit_transform(test['Gender'].values.reshape(-1,1))[:,0]
test['Cookies Setting'] = imputer_constant_ns.fit_transform(test['Cookies Setting'].values.reshape(-1,1))[:,0]
test['Education'] = imputer_constant_ns.fit_transform(test['Education'].values.reshape(-1,1))[:,0]
test['Marital Status'] = imputer_constant_ns.fit_transform(test['Marital Status'].values.reshape(-1,1))[:,0]
test['CustomerType'] = imputer_constant_other.fit_transform(test['CustomerType'].values.reshape(-1,1))[:,0]

In [37]:
test.isnull().sum()

HomePage                           0
HomePage_Duration                  0
LandingPage                        0
LandingPage_Duration               0
ProductDescriptionPage             0
ProductDescriptionPage_Duration    0
GoogleMetric:Bounce Rates          0
GoogleMetric:Exit Rates            0
GoogleMetric:Page Values           0
SeasonalPurchase                   0
Month_SeasonalPurchase             0
OS                                 0
SearchEngine                       0
Zone                               0
Type of Traffic                    0
CustomerType                       0
Gender                             0
Cookies Setting                    0
Education                          0
Marital Status                     0
WeekendPurchase                    0
dtype: int64

## Label Encoding the test data set

In [38]:
le = LabelEncoder()
test['Month_SeasonalPurchase'] = le.fit_transform(test['Month_SeasonalPurchase'])
test['Gender'] = le.fit_transform(test['Gender'])
test['Cookies Setting'] = le.fit_transform(test['Cookies Setting'])
test['Education'] = le.fit_transform(test['Education'])
test['Marital Status'] = le.fit_transform(test['Marital Status'])
test['CustomerType'] = le.fit_transform(test['CustomerType'])

## Scaling the test data set

In [39]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

test['HomePage'] = scaler.fit_transform(test['HomePage'].values.reshape(-1,1))[:,0]
test['HomePage_Duration'] = scaler.fit_transform(test['HomePage_Duration'].values.reshape(-1,1))[:,0]
test['LandingPage'] = scaler.fit_transform(test['LandingPage'].values.reshape(-1,1))[:,0]
test['LandingPage_Duration'] = scaler.fit_transform(test['LandingPage_Duration'].values.reshape(-1,1))[:,0]
test['ProductDescriptionPage'] = scaler.fit_transform(test['ProductDescriptionPage'].values.reshape(-1,1))[:,0]
test['ProductDescriptionPage_Duration'] = scaler.fit_transform(test['ProductDescriptionPage_Duration'].values.reshape(-1,1))[:,0]
test['GoogleMetric:Bounce Rates'] = scaler.fit_transform(test['GoogleMetric:Bounce Rates'].values.reshape(-1,1))[:,0]
test['GoogleMetric:Exit Rates'] = scaler.fit_transform(test['GoogleMetric:Exit Rates'].values.reshape(-1,1))[:,0]
test['GoogleMetric:Page Values'] = scaler.fit_transform(test['GoogleMetric:Page Values'].values.reshape(-1,1))[:,0]
test['SeasonalPurchase'] = scaler.fit_transform(test['SeasonalPurchase'].values.reshape(-1,1))[:,0]
test['OS'] = scaler.fit_transform(test['OS'].values.reshape(-1,1))[:,0]
test['SearchEngine'] = scaler.fit_transform(test['SearchEngine'].values.reshape(-1,1))[:,0]
test['Zone'] = scaler.fit_transform(test['Zone'].values.reshape(-1,1))[:,0]
test['Type of Traffic'] = scaler.fit_transform(test['Type of Traffic'].values.reshape(-1,1))[:,0]
test['WeekendPurchase'] = scaler.fit_transform(test['WeekendPurchase'].values.reshape(-1,1))[:,0]


In [40]:
test['Month_SeasonalPurchase'] = scaler.fit_transform(test['Month_SeasonalPurchase'].values.reshape(-1,1))[:,0]
test['CustomerType'] = scaler.fit_transform(test['CustomerType'].values.reshape(-1,1))[:,0]
test['Gender'] = scaler.fit_transform(test['Gender'].values.reshape(-1,1))[:,0]
test['Cookies Setting'] = scaler.fit_transform(test['Cookies Setting'].values.reshape(-1,1))[:,0]
test['Education'] = scaler.fit_transform(test['Education'].values.reshape(-1,1))[:,0]
test['Marital Status'] = scaler.fit_transform(test['Marital Status'].values.reshape(-1,1))[:,0]

## Droping the Features from Test Data Set

In [41]:
test = test.drop(L,axis = 1)

In [42]:
test.shape

(6599, 11)

# Making the Final Prediction

In [43]:
from sklearn.linear_model import Perceptron
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier



per = Perceptron(max_iter = 2000, penalty = 'l1', alpha = 0.00012, fit_intercept = True, shuffle = True, verbose =2, eta0 = 10)
gbc = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, max_depth=3, verbose = 1)

vc = VotingClassifier(estimators = [('per',per),('gbc',gbc)],voting = 'hard')

vc.fit(x_smt,y_smt)
y_final = vc.predict(test)

-- Epoch 1
Norm: 1638.56, NNZs: 6, Bias: 20.000000, T: 18130, Avg. loss: 19.863513
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 2868.12, NNZs: 5, Bias: 10.000000, T: 36260, Avg. loss: 12.047756
Total training time: 0.01 seconds.
-- Epoch 3
Norm: 3838.86, NNZs: 2, Bias: 0.000000, T: 54390, Avg. loss: 9.106429
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 4701.48, NNZs: 4, Bias: 20.000000, T: 72520, Avg. loss: 7.305218
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 5452.09, NNZs: 1, Bias: 0.000000, T: 90650, Avg. loss: 6.599299
Total training time: 0.02 seconds.
-- Epoch 6
Norm: 6119.80, NNZs: 3, Bias: 0.000000, T: 108780, Avg. loss: 6.126892
Total training time: 0.02 seconds.
-- Epoch 7
Norm: 6752.12, NNZs: 2, Bias: 10.000000, T: 126910, Avg. loss: 5.548178
Total training time: 0.02 seconds.
-- Epoch 8
Norm: 7321.27, NNZs: 3, Bias: -10.000000, T: 145040, Avg. loss: 5.691353
Total training time: 0.03 seconds.
-- Epoch 9
Norm: 7873.33, NNZs: 0, Bias: 10.000000, T: 163170

In [44]:
y_final


array([False, False, False, ..., False, False, False])

# Final submission

In [45]:
submission = pd.DataFrame(columns = ['id','Made_Purchase'])
submission['id'] = [i for i in range(len(y_final))]
submission['Made_Purchase'] = y_final


In [46]:
submission['Made_Purchase'].value_counts()

False    5587
True     1012
Name: Made_Purchase, dtype: int64

In [47]:
submission.to_csv('submission.csv',index = False)