In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv
/kaggle/input/e-commerce-shoppers-behaviour-understanding/sample.csv


In [2]:
# important imports
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
test_data = pd.read_csv('/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv')
train_data = pd.read_csv('/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv')

In [4]:
train_features = train_data.drop('Made_Purchase', axis=1)
train_labels = train_data['Made_Purchase']

In [5]:
from tabulate import tabulate
col = train_features.columns # saves column names in an array

def check_nan(df1, df2, columns):
  data = []
  for i in columns:
    data.append([i, df1[i].isnull().sum(), df2[i].isnull().sum(), df1[i].dtype])
  print(tabulate(data, headers=["Columns", "NullRows_train", "NullRows_test", "DataType"]))

In [6]:
# computing null rows and datatype
check_nan(train_data, test_data, col)

Columns                            NullRows_train    NullRows_test  DataType
-------------------------------  ----------------  ---------------  ----------
HomePage                                      153               51  float64
HomePage_Duration                             150               55  float64
LandingPage                                   153               56  float64
LandingPage_Duration                          135               75  float64
ProductDescriptionPage                        123               58  float64
ProductDescriptionPage_Duration               167               63  float64
GoogleMetric:Bounce Rates                     151               66  float64
GoogleMetric:Exit Rates                       129               51  float64
GoogleMetric:Page Values                      132               45  float64
SeasonalPurchase                              150               45  float64
Month_SeasonalPurchase                        144               58  object
OS       

# Data Preprocessing

In [7]:
# gives the column numbers for numerical and categorical data
num_idx = train_features.columns.get_indexer(train_features.select_dtypes(['int64','float64']).columns)
cat_idx = train_features.columns.get_indexer(train_features.select_dtypes('object').columns)

In [8]:
num_cols = train_features.select_dtypes(['int64','float64']).columns
num_cols

Index(['HomePage', 'HomePage_Duration', 'LandingPage', 'LandingPage_Duration',
       'ProductDescriptionPage', 'ProductDescriptionPage_Duration',
       'GoogleMetric:Bounce Rates', 'GoogleMetric:Exit Rates',
       'GoogleMetric:Page Values', 'SeasonalPurchase', 'OS', 'SearchEngine',
       'Zone', 'Type of Traffic', 'WeekendPurchase'],
      dtype='object')

In [9]:
# returns an pandas dataframe of columns with numerical data
num_train = train_features[col[num_idx]]
num_test = test_data[col[num_idx]]
num_train.shape

(14731, 15)

In [10]:
# returns an pandas dataframe of columns with categorical data
cat_train = train_features[col[cat_idx]]
cat_test = test_data[col[cat_idx]]
cat_train.shape

(14731, 6)

In [11]:
# num_train['TotalPageVisits'] = num_train['HomePage'] + num_train['LandingPage'] + num_train['ProductDescriptionPage']
# num_train['TotalTimeOnWebsite'] = num_train['HomePage_Duration'] + num_train['LandingPage_Duration'] + num_train['ProductDescriptionPage_Duration']
# num_train['BounceAndExitRate'] = num_train['GoogleMetric:Bounce Rates'] + num_train['GoogleMetric:Exit Rates']
# num_train['PurchaseFrequency'] = num_train['TotalPageVisits'] / num_train['SeasonalPurchase']
# num_train['SeasonalPurchaseBinary'] = num_train['SeasonalPurchase'].apply(lambda x: 1 if x > 0 else 0)

In [12]:
# sklearn imports

from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, Normalizer, MinMaxScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [13]:
# a pipeline to convert categorical data into numerical data
cat_pipe = Pipeline([('impute', SimpleImputer(strategy = 'most_frequent')),
                     ('encode', OneHotEncoder(handle_unknown='ignore'))])

In [14]:
# applying pipeline to the train and test data
cat_trans_train = cat_pipe.fit_transform(cat_train)
cat_trans_test = cat_pipe.transform(cat_test)

In [15]:
num_pipe = Pipeline([('impute', KNNImputer(n_neighbors=10, missing_values=np.nan)),
                     ('scaler', PowerTransformer())])

In [16]:
# applying pipeline to the train and test data
num_trans_train = num_pipe.fit_transform(num_train)
num_trans_test = num_pipe.transform(num_test)

In [17]:
pca = PCA()
pca.fit(num_trans_train)

# determine the number of components to cover up to 99% of the variance
n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.99) + 1
pca = PCA(n_components=n_components)

num_trans_train = pca.fit_transform(num_trans_train)
num_trans_test = pca.transform(num_trans_test)

In [18]:
cat_trans_train = cat_trans_train.toarray() # converts sparse matrix to numpy array
cat_trans_test = cat_trans_test.toarray()
type(cat_trans_test), type(num_trans_train)

(numpy.ndarray, numpy.ndarray)

In [19]:
num_trans_train.shape, cat_trans_train.shape

((14731, 13), (14731, 26))

In [20]:
# contains the preprocessed data
traindata_trans = np.concatenate((num_trans_train, cat_trans_train), axis=1)
testdata_trans = np.concatenate((num_trans_test, cat_trans_test), axis=1)

# The Dummy Classifier

In [21]:
"""
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_features, train_labels)
"""

'\nfrom sklearn.dummy import DummyClassifier\ndummy_clf = DummyClassifier(strategy="most_frequent")\ndummy_clf.fit(train_features, train_labels)\n'

In [22]:
# y_pred = dummy_clf.predict(test_data)

# Other Classifiers

In [23]:
# sklearn imports

from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

In [24]:
est=[('lr', LogisticRegression(solver='liblinear',penalty='l2', random_state=0)),
    ('nn', MLPClassifier(learning_rate='adaptive', activation='logistic', solver='sgd')),
    ('ab', AdaBoostClassifier(base_estimator=LogisticRegression())),
    ('svc', SVC(kernel='sigmoid', gamma='auto', degree=5, probability=True))]

pipe = VotingClassifier(estimators = est, voting = 'soft')
pipe.fit(traindata_trans, train_labels)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(random_state=0,
                                                 solver='liblinear')),
                             ('nn',
                              MLPClassifier(activation='logistic',
                                            learning_rate='adaptive',
                                            solver='sgd')),
                             ('ab',
                              AdaBoostClassifier(base_estimator=LogisticRegression())),
                             ('svc',
                              SVC(degree=5, gamma='auto', kernel='sigmoid',
                                  probability=True))],
                 voting='soft')

In [25]:
y_pred = pipe.predict(testdata_trans)

In [26]:
submission = pd.DataFrame(columns = ['id','Made_Purchase'])
    
submission['id']=[i for i in range(len(y_pred))]
submission['Made_Purchase']=y_pred

In [27]:
submission['Made_Purchase'].value_counts()

False    5492
True     1107
Name: Made_Purchase, dtype: int64

In [28]:
submission.to_csv('submission.csv', index = False)