In [None]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### E-commerce Shopper's Behaviour Understanding

#### Understand the online shopper purchasing pattern through Machine learning

Assume that you are working at a consultancy and one of your clients runs an e-commerce company. They want to understand their customers' shopping behaviour and have already collected one year of user session data. Each row belongs to a different user. The `Made_Purchase` column indicates whether the user made a purchase during that year. The client is also interested in predicting that column from the users' other attributes. The client also informs you that the data was collected by non-experts, so some columns may contain a certain percentage of errors.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (14,7)
import seaborn as sns

%matplotlib inline

In [None]:
# Suppress warning
import warnings
warnings.filterwarnings("ignore")

# Turn off Jedi-based completion so IPython falls back to its default tab completer
%config Completer.use_jedi = False

In [None]:
# Initialization of random state parameter in estimators and CV splitters
#rng= np.random.RandomState(0)
rng = 42

#### Loading Data

In [None]:
df_train= pd.read_csv('/kaggle/input/e-commerce-shoppers-behaviour-understanding/train_data_v2.csv')
df_test= pd.read_csv('/kaggle/input/e-commerce-shoppers-behaviour-understanding/test_data_v2.csv')
print(df_train.shape)
print(df_test.shape)

#### Exploratory Data Analysis

In [None]:
df_train.head()

In [None]:
df_train.info()

- 14731 entries, 22 columns including label
- Missing values in most columns
- 15 numerical and 6 categorical columns

In [None]:
df_train.dtypes.value_counts()

In [None]:
df_train.isnull().sum()

In [None]:
df_train.columns = df_train.columns.str.replace(' ', "_")
df_test.columns = df_test.columns.str.replace(' ', '_')
df_train.columns

Drop duplicate values

In [None]:
df_train.duplicated().sum()

In [None]:
df_train = df_train.drop_duplicates()

In [None]:
df_train.shape

#### Labels

In [None]:
# Bar chart with the number of rows in each Made_Purchase class.
fig, ax = plt.subplots(figsize=(6, 8))
sns.countplot(data=df_train, x='Made_Purchase', palette='viridis', ax=ax)

ax.set_title('Distribution of Labels')
ax.set_xlabel('Made Purchase')
ax.set_ylabel('Number of Entries')
plt.show()



In [None]:
# value_counts() orders classes by frequency: position 0 is the most frequent class,
# position 1 the second most frequent.
# NOTE(review): the variable names assume "purchase made" is the minority class — confirm.
label_counts = df_train["Made_Purchase"].value_counts()
n_rows = df_train.shape[0]

made_purchase_no = label_counts.iloc[0] / n_rows
made_purchase_yes = label_counts.iloc[1] / n_rows

print('Percentage of made purchase: ', round(made_purchase_yes*100,2))
print('Percentage of not made purchase: ', round(made_purchase_no*100,2))


There is class imbalance in the problem

#### Feature Matrix and Label Vector

In [None]:
X = df_train.drop(['Made_Purchase'], axis= 1)
y = df_train['Made_Purchase']


In [None]:
print(X.shape, y.shape)

Separating categorical and numerical columns

In [None]:
cat_cols = X.select_dtypes(include= 'object').columns.to_list()
num_cols = X.select_dtypes(exclude= 'object').columns.to_list()

Categorical Features

In [None]:
cat_cols

In [None]:
X[cat_cols].nunique().sort_values()

In [None]:
# Count plot for every categorical feature, arranged on a 3x2 grid
# (the grid has room for six features).

fig = plt.figure(figsize=(14,10))

for position, feature in enumerate(cat_cols, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.countplot(data=X, x=feature, palette="viridis", ax=ax)
    ax.set_xlabel("values")
    ax.set_title(feature)
fig.subplots_adjust(wspace=0.5, hspace=0.5)

- `Month_SeasonalPurchase` has many distinct values (high cardinality).
- Reduce it by grouping the months into seasons (Winter, Summer, Monsoon).

In [None]:
# Collapse the calendar months of Month_SeasonalPurchase into three seasons.
# Values that are not keys of the mapping become NaN, exactly as before.
season_by_month = {
    "Oct": "Winter",
    "Nov": "Winter",
    "Dec": "Winter",
    "Feb": "Winter",
    "Mar": "Summer",
    "May": "Summer",
    "Jun": "Summer",
    "Jul": "Monsoon",
    "Aug": "Monsoon",
    "Sep": "Monsoon",
}
# Let every season label map to itself so that re-running this cell is a no-op.
# With the month-only mapping a second run turned the whole column into NaN.
season_by_month.update({season: season for season in set(season_by_month.values())})

X["Month_SeasonalPurchase"] = X["Month_SeasonalPurchase"].map(season_by_month)

In [None]:
# Apply the same month -> season grouping to the test set.
# Values that are not keys of the mapping become NaN, exactly as before.
test_season_by_month = {
    "Oct": "Winter",
    "Nov": "Winter",
    "Dec": "Winter",
    "Feb": "Winter",
    "Mar": "Summer",
    "May": "Summer",
    "Jun": "Summer",
    "Jul": "Monsoon",
    "Aug": "Monsoon",
    "Sep": "Monsoon",
}
# Let every season label map to itself so that re-running this cell is a no-op.
# With the month-only mapping a second run turned the whole column into NaN.
test_season_by_month.update(
    {season: season for season in set(test_season_by_month.values())}
)

df_test["Month_SeasonalPurchase"] = df_test["Month_SeasonalPurchase"].map(test_season_by_month)

In [None]:
# Count plot for every categorical feature, arranged on a 3x2 grid
# (the grid has room for six features).

fig = plt.figure(figsize=(14,10))

for position, feature in enumerate(cat_cols, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.countplot(data=X, x=feature, palette="viridis", ax=ax)
    ax.set_xlabel("values")
    ax.set_title(feature)
fig.subplots_adjust(wspace=0.5, hspace=0.5)

Numerical Features

In [None]:
num_cols

In [None]:
# Cardinality of Numerical Features
X[num_cols].nunique().sort_values()

In [None]:
# Missing values in numerical features
X[num_cols].isnull().sum()

In [None]:
# Descriptive Statistics
X[num_cols].describe().T

In [None]:
# One histogram per numerical feature.
# NOTE(review): the slice leaves out the last entry of num_cols — confirm whether that
# column is meant to be skipped.
for name in num_cols[:-1]:
    ax = sns.histplot(X[name].values)  # histogram plot
    ax.set_title(name, fontsize=16)
    ax.set_xlabel('Range', fontsize=16)
    ax.set_ylabel('Frequency', fontsize=16)
    plt.show()

In [None]:
# Correlation heatmap for numerical features

plt.figure(figsize=(14,7))

sns.heatmap(X[num_cols].corr(), cmap='coolwarm', annot=True)
plt.title("Correlation heatmap for numerical features")
plt.show()


Drop highly correlated features (> 0.6)

In [None]:
# Remove the four features listed below from the training feature matrix, in place.
X.drop(
    columns=[
        "HomePage_Duration",
        "LandingPage_Duration",
        "ProductDescriptionPage_Duration",
        "GoogleMetric:Exit_Rates",
    ],
    inplace=True,
)

In [None]:
# Remove the same four features from the test set, in place.
df_test.drop(
    columns=[
        "HomePage_Duration",
        "LandingPage_Duration",
        "ProductDescriptionPage_Duration",
        "GoogleMetric:Exit_Rates",
    ],
    inplace=True,
)

In [None]:
# Take the dropped feature names out of num_cols, preserving the order of the rest.
to_remove = [
    "HomePage_Duration",
    "LandingPage_Duration",
    "ProductDescriptionPage_Duration",
    "GoogleMetric:Exit_Rates",
]
num_cols = [col for col in num_cols if col not in to_remove]


In [None]:
# Correlation heatmap for numerical features

plt.figure(figsize=(14,7))

sns.heatmap(X[num_cols].corr(), cmap='coolwarm', annot=True)
plt.title("Correlation heatmap for numerical features")
plt.show()


In [None]:
sns.pairplot(X[num_cols])
plt.show()

In [None]:
# Kernel Density Estimate (KDE) plot
plt.figure(figsize=(14, 17))
plt.title("KDEplot for numerical features")
plt.xlim(-25, 25)
sns.kdeplot(data= X[num_cols])
plt.show()

In [None]:
## Boxplots
plt.figure(figsize=(14, 7))
sns.boxplot(data= X[num_cols], orient='horizontal', palette= 'viridis')
plt.title("Boxplot for numerical features")
plt.xlabel('Values')
plt.ylabel('Features')
plt.show()

In [None]:
plt.figure(figsize=(14, 7))
sns.violinplot(data= X[num_cols], orient='horizontal', palette= 'viridis', scale= 'width')
plt.title("Violinplot for numerical features")
plt.xlabel('Values')
plt.ylabel('Features')
plt.show()

#### Data Preprocessing

In [None]:
 # Encode the target as integers.
 # The encoder is fitted on the label column of df_train instead of on `y` itself, so
 # re-running this cell cannot re-encode already-encoded values (which would make
 # label_encoder.inverse_transform return 0/1 rather than the original labels).
 from sklearn.preprocessing import LabelEncoder
 label_encoder = LabelEncoder()
 y = label_encoder.fit_transform(df_train['Made_Purchase'])

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest, RFE, SequentialFeatureSelector
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate, GridSearchCV,RandomizedSearchCV, train_test_split

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier, VotingClassifier, StackingClassifier

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, recall_score, accuracy_score, f1_score

from sklearn import set_config

from xgboost import XGBClassifier, XGBRFClassifier

from imblearn.pipeline import Pipeline as imb_pipeline
from imblearn.over_sampling import SMOTENC


import optuna
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# Categorical branch: select the categorical columns, fill missing values with the
# most frequent category, then one-hot encode (unknown categories are ignored).
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in newer
# scikit-learn releases — confirm against the installed version.
categorical_steps = [
    ('selector', ColumnTransformer([('select', 'passthrough', cat_cols)])),
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
]
categorical_transformers = Pipeline(categorical_steps)

# Numerical branch: select the numerical columns, impute with the 10 nearest
# neighbours, scale robustly, then project onto 11 principal components.
numerical_steps = [
    ('selector', ColumnTransformer([('select', 'passthrough', num_cols)])),
    ('num_imputer', KNNImputer(n_neighbors=10, missing_values=np.nan)),
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=11)),
]
numerical_transformers = Pipeline(numerical_steps)

# Full preprocessing: both branches run on the same input and their outputs are
# concatenated column-wise (categorical block first).
preprocess_pipe = FeatureUnion(transformer_list=[
    ('categorical', categorical_transformers),
    ('numerical', numerical_transformers),
])

In [None]:
from sklearn import set_config
set_config(display= 'diagram')

# displays HTML representation 
preprocess_pipe



#### Model Building

In [None]:
def construct_pipeline(classifier, preprocessor):
    '''Chains a preprocessing transformer and an estimator into a single pipeline.

    The preprocessing step is registered as 'preprocess' and the estimator as 'clf',
    so estimator hyperparameters are addressed as 'clf__<name>'.

    Args:
        classifier: Estimator object
        preprocessor: Preprocessing transformer object

    Returns:
        pipeline object (not fitted by this function)
    '''
    steps = [('preprocess', preprocessor), ('clf', classifier)]
    return Pipeline(steps)
    
def train_model(classifier, preprocessor, X_train, y_train):
    '''Trains a model from the given preprocessing and classification stages.

    The data passed in is split 70/30 (stratified on the labels, seeded with the
    notebook-level `rng`). The pipeline is fitted on the 70% part and evaluated on
    both parts; metrics and a confusion matrix for the held-out part are printed/shown.

    Args:
        classifier: Estimator object
        preprocessor: Preprocessing transformer object
        X_train: Feature matrix to split into fit and hold-out parts
        y_train: Label vector matching X_train

    Returns:
        pipeline fitted on the 70% part of the data
    '''
    # Split the data handed in by the caller. Previously the notebook globals X and y
    # were split here and the X_train / y_train arguments were silently ignored.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X_train, y_train,
                                                          test_size=0.3,
                                                          stratify=y_train,
                                                          random_state=rng)

    full_pipe = construct_pipeline(classifier, preprocessor)
    full_pipe.fit(X_fit, y_fit)

    print("-------------------Metrics on training set --------------------------")
    y_fit_pred = full_pipe.predict(X_fit)
    print("F1 score on training set: ", f1_score(y_fit, y_fit_pred, average='micro'))
    print()
    print(classification_report(y_fit, y_fit_pred))

    print("-------------------Metrics on test set --------------------------")
    y_holdout_pred = full_pipe.predict(X_holdout)
    print("F1 score on test set: ", f1_score(y_holdout, y_holdout_pred, average='micro'))
    print()
    print(classification_report(y_holdout, y_holdout_pred))
    # Draws the confusion matrix for the held-out part on a new figure.
    ConfusionMatrixDisplay.from_predictions(y_holdout, y_holdout_pred)

    return full_pipe

    
def tune_model(model,
               param_grid,
               X_train, y_train,
               cv= StratifiedShuffleSplit(n_splits = 10, 
                                          test_size= 0.3,
                                          random_state= rng)):
    '''Tunes hyperparameters with RandomizedSearchCV, scored by micro-averaged F1.

    Prints the best parameter combination, its mean/std train and validation scores
    across the CV splits, and a classification report plus confusion matrix of the
    refitted best estimator on the data it was given.

    Args:
        model: Model to be tuned
        param_grid: dict mapping parameter names to the candidate values to sample from
        X_train: Feature matrix
        y_train: Label vector
        cv: cross-validation splitter handed to RandomizedSearchCV

    Returns:
        best estimator found by the search
    '''
    search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        cv=cv,
        scoring='f1_micro',
        n_jobs=-1,
        return_train_score=True,
        random_state=rng,
    )
    search.fit(X_train, y_train)

    print("Best parameters: ", search.best_params_)
    print("-------------------Best model performance --------------------------")

    # Pull the summary statistics of the winning candidate out of cv_results_.
    best = search.best_index_
    results = search.cv_results_
    train_mean, train_std = results['mean_train_score'][best], results['std_train_score'][best]
    test_mean, test_std = results['mean_test_score'][best], results['std_test_score'][best]

    print(f"Score of the model on the train set:\n{train_mean:.3f} +/- {train_std:.6f}")
    print(f"Score of the model on the test set:\n{test_mean:.3f} +/- {test_std:.6f}")

    best_model = search.best_estimator_
    # These predictions are made on the same data the search was run on.
    y_pred = best_model.predict(X_train)
    print(classification_report(y_train, y_pred))
    ConfusionMatrixDisplay.from_predictions(y_train, y_pred)

    print("------------------------------------------------------------------")

    return best_model

In [None]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.3, stratify= y, random_state= rng)
# data preprocessing
X_train = preprocess_pipe.fit_transform(X_train)
X_test = preprocess_pipe.transform(X_test)
print(X_train.shape, X_test.shape)

In [None]:
# Fit a PCA with all components on the preprocessed training matrix
# (fit only; nothing is transformed here).
pca = PCA()
pca.fit(X_train)

# Plot the cumulative explained variance ratio against the number of components.
import matplotlib.pyplot as plt
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(cumulative_variance)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
plt.show()

In [None]:
# determine the smallest number of components that explains at least 99% of the variance
n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.99) + 1
n_components

#### Spot-Check model performance (without hyperparameter tuning)

In [None]:
# model params
lgbm_params = {
    'device_type' : 'gpu'
}

catb_params = {
    'task_type' : 'GPU',
    'devices' : '0',
    'verbose' : 0
}

xgb_params = {
    'predictor': 'gpu_predictor',
    'tree_method': 'gpu_hist',
    'gpu_id' : 0,
    'verbosity': 0
}

In [None]:
# Candidate classifiers, all with default hyperparameters (GPU settings aside).
models = [
    ('KNN', KNeighborsClassifier()),
    ('LogReg', LogisticRegression(random_state=rng)),
    ('SVM', SVC(random_state=rng)),
    ('DT', DecisionTreeClassifier(random_state=rng)),
    ('RF', RandomForestClassifier(random_state=rng)),
    ('ET', ExtraTreesClassifier(random_state=rng)),
    ('Ada', AdaBoostClassifier(random_state=rng)),
    ('GradBoost', GradientBoostingClassifier(random_state=rng)),
    ('LGBM', LGBMClassifier(**lgbm_params)),
    ('CATB', CatBoostClassifier(**catb_params)),
    ('XGB', XGBClassifier(**xgb_params)),
]

# Fit every candidate on the preprocessed training split and record its
# micro-averaged F1 on the held-out split, keyed by model name.
scores = {
    name: f1_score(y_test, model.fit(X_train, y_train).predict(X_test), average="micro")
    for name, model in models
}

In [None]:
scores_df = pd.DataFrame([scores]).T.rename(columns={0:"F1-Score"})
scores_df.sort_values("F1-Score", ascending=False)

In [None]:
# Horizontal bar chart of the spot-check F1 scores, one bar per model.
fig, ax = plt.subplots(figsize=(12,6))

sns.barplot(
    data=scores_df,
    x="F1-Score",
    y=scores_df.index,
    orient="h",
    ax=ax
)

# Write the score inside each bar, just left of its end.
# The values are walked by position with enumerate: scores_df is indexed by model name,
# and looking up integer keys with [] on such a Series relies on a deprecated pandas
# positional fallback.
for idx, score in enumerate(scores_df["F1-Score"]):
    ax.annotate(
        text=f"F1: {np.round(score,3)}",
        xy=(score-0.01, idx),
        va='center', ha='right'
    )

plt.show()

We select GradBoost, AdaBoost, CatBoost, LogReg and SVM and tune their hyperparameters.

Logistic Regression

In [None]:
logreg_clf= LogisticRegression(random_state= rng, max_iter= 4000)
logreg_clf_model= train_model(logreg_clf, preprocess_pipe, X, y)

In [None]:
logreg_clf_pipe= construct_pipeline(LogisticRegression(
    penalty= 'elasticnet',
    solver='saga', 
    max_iter=10000,
    random_state= rng), 
    preprocess_pipe)

param_grid = {'clf__C': ( 0.001, 0.01, 0.1, 1, 10, 100),
             'clf__l1_ratio': (0, 0.1, 0.3, 0.5, 0.7, 0.9, 1)}
             
logreg_clf_model_tuned= tune_model(logreg_clf_pipe, param_grid, X, y)

Gradient Boosting 

In [None]:
gradboost_clf= GradientBoostingClassifier(random_state= rng)
gradboost_clf_model= train_model(gradboost_clf, preprocess_pipe, X, y)

In [None]:
gradboost_clf_pipe = construct_pipeline(gradboost_clf, preprocess_pipe)
param_grid = {
             'clf__n_estimators': (50, 100, 150, 200),
             'clf__learning_rate': (1e-3, 1e-2, 1e-1, 1)
             }
gradboost_clf_model_tuned= tune_model(gradboost_clf_pipe, param_grid, X, y)

AdaBoost 

In [None]:
ada_clf= AdaBoostClassifier(random_state= rng)
ada_clf_model= train_model(ada_clf, preprocess_pipe, X, y)

In [None]:
ada_clf_pipe= construct_pipeline(ada_clf, preprocess_pipe)
param_grid = {
            'clf__n_estimators': (50, 75, 100, 125, 150),
            'clf__learning_rate': (1e-3, 1e-2, 1e-1, 1)
            }
ada_clf_model_tuned= tune_model(ada_clf_pipe, param_grid, X, y)

CatBoost

In [None]:
from catboost import CatBoostClassifier
catboost_clf = CatBoostClassifier(**catb_params)
catboost_clf_model= train_model(catboost_clf, preprocess_pipe, X, y)

In [None]:
# NOTE: this cell used to repeat the previous cell verbatim (re-importing
# CatBoostClassifier and training the same CatBoost model a second time).
# `catboost_clf` and `catboost_clf_model` from the previous cell are reused as-is,
# so the redundant training run has been removed.

In [None]:
catb_params

In [None]:
catboost_clf_pipe = construct_pipeline(catboost_clf, preprocess_pipe)
param_grid = {
            'clf__iterations': np.arange(10, 500, 10),
            'clf__depth': np.arange(2, 16, 2),
            'clf__learning_rate': [0.01, 0.05, 0.1]
            }
catboost_clf_model_tuned = tune_model(catboost_clf_pipe, param_grid, X, y)

SVM

In [None]:
svm_clf= SVC(random_state= rng)
svm_clf_model= train_model(svm_clf, preprocess_pipe, X, y)


In [None]:
svm_clf_pipe= construct_pipeline(SVC(max_iter=10000, random_state= rng), preprocess_pipe)
param_grid= {
             'clf__C':[0.001, 0.01, 1, 10, 100, 200, 400, 600, 800, 1000],
             'clf__kernel':['linear','rbf']
            }
svm_clf_model_tuned= tune_model(svm_clf_pipe, param_grid, X, y)


In [None]:
# Hyperparameters carried over from the tuning cells and reused by the ensembles below.
gradboost_best_params = {'n_estimators': 50,
                         'learning_rate': 0.1
                   }

adaboost_best_params = {'n_estimators': 75,
                        'learning_rate': 0.1
                       }

# l1_ratio was tuned with penalty='elasticnet' and solver='saga' (see the logistic
# regression tuning cell) and only takes effect with that combination. Without these
# entries, LogisticRegression(**logreg_best_params) fell back to its default penalty
# and solver, so the tuned l1_ratio was not applied.
logreg_best_params = {'penalty': 'elasticnet',
                      'solver': 'saga',
                      'max_iter': 10000,
                      'l1_ratio': 0.7,
                      'C': 100
                     }

catboost_best_params = {'learning_rate': 0.01,
                        'iterations': 300, 
                        'depth': 2}

svm_best_params = {'C': 1,
                   'kernel': 'rbf'} 
                   

##### Stacking Classifier

In [None]:
estimators = [
    ('svm', SVC(**svm_best_params,
                max_iter=10000,
                random_state=rng
               ))   , 
    ('adaboost', AdaBoostClassifier(**adaboost_best_params,
                                    random_state= rng,
                                   )),
    ('catboost', CatBoostClassifier(**catboost_best_params,
                                   **catb_params,
                                   random_state=rng
                                   ))]
                 

In [None]:
stack_clf= train_model(StackingClassifier(estimators=estimators, 
                                          final_estimator=LogisticRegression(**logreg_best_params,
                                                                            random_state=rng)),
                       preprocess_pipe,
                       X, y)

Voting Classifier

In [None]:
voting_estimators = [
    ('svm', SVC(**svm_best_params,
                max_iter=10000,
                random_state=rng
               ))   , 
    ('adaboost', AdaBoostClassifier(**adaboost_best_params,
                                    random_state= rng,
                                   )),
    ('catboost', CatBoostClassifier(**catboost_best_params,
                                   **catb_params,
                                   random_state=rng
                                   )),
    ('logreg', LogisticRegression(**logreg_best_params,
                                 random_state=rng
                                 ))]
                 

In [None]:
voting_clf = train_model(VotingClassifier(estimators=voting_estimators), preprocess_pipe, X, y)
                        

In [None]:
prediction = stack_clf.predict(df_test)

In [None]:
prediction.shape

In [None]:
prediction = label_encoder.inverse_transform(prediction)

In [None]:
prediction

In [None]:
# Submission frame: a running 0-based id per test row next to its decoded prediction.
submission = pd.DataFrame({
    'id': list(range(len(prediction))),
    'Made_Purchase': prediction,
})

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv', index = False)