In [10]:
from time import process_time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [4]:
#https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("online_shoppers_intention.csv")

In [8]:
#Identify categorical attributes
categorical_features = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]
df_cat = df[categorical_features]

df_onehot = pd.get_dummies(df, columns = categorical_features, prefix = categorical_features)

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_onehot['Revenue'] = label_encoder.fit_transform(df['Revenue'])

Split the Data and apply SMOTE Oversampling

In [11]:
#Specify independent/ dependent values
X = df_onehot.drop(columns = "Revenue")
y = df_onehot["Revenue"]

#Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

#SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [28]:
#Define columns that need data normalization/ standardization
numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

cat_features = [col for col in X_train if col not in numeric_features]

In [13]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [47]:
X_train_smote_copy = X_train_smote.copy()
X_train_smote_num = X_train_smote_copy[numeric_features]

X_test_copy = X_test.copy()
X_test_num = X_test_copy[numeric_features]

Data Normalization

In [48]:
nscaler = MinMaxScaler()

X_train_smote_normalized_num = nscaler.fit_transform(X_train_smote_num)
X_train_smote_normalized_combined = np.concatenate((X_train_smote_normalized_num, X_train_smote_copy[cat_features]), axis = 1)

X_test_normalized_num = nscaler.transform(X_test_num)
X_test_normalized_combined = np.concatenate((X_test_normalized_num, X_test_copy[cat_features]), axis = 1)


In [49]:
#Initialize the DecisionTreeClassifier
tree_raw_imbalanced = DecisionTreeClassifier(criterion = "entropy")

#Time Measurement
start_time = process_time()

#Fit the Classifier to the data
tree_raw_imbalanced.fit(X_train_smote_normalized_combined, y_train_smote)

#Predict new Data
y_pred = tree_raw_imbalanced.predict(X_test_normalized_combined)

#Time Measurement
end_time = process_time()

#Results
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
time = end_time - start_time
print(cr)
print(cm)
print(time)

              precision    recall  f1-score   support

           0       0.93      0.92      0.92      3145
           1       0.56      0.60      0.58       554

    accuracy                           0.87      3699
   macro avg       0.74      0.76      0.75      3699
weighted avg       0.87      0.87      0.87      3699

[[2880  265]
 [ 221  333]]
0.125


Data Standardization

In [50]:
sscaler = StandardScaler()

X_train_smote_standardized_num = sscaler.fit_transform(X_train_smote_num)
X_train_smote_standardized_combined = np.concatenate((X_train_smote_standardized_num, X_train_smote_copy[cat_features]), axis = 1)

X_test_standardized_num = sscaler.transform(X_test_num)
X_test_standardized_combined = np.concatenate((X_test_standardized_num, X_test_copy[cat_features]), axis = 1)

In [51]:
#Initialize the DecisionTreeClassifier
tree_raw_imbalanced = DecisionTreeClassifier(criterion = "entropy")

#Time Measurement
start_time = process_time()

#Fit the Classifier to the data
tree_raw_imbalanced.fit(X_train_smote_standardized_combined, y_train_smote)

#Predict new Data
y_pred = tree_raw_imbalanced.predict(X_test_standardized_combined)

#Time Measurement
end_time = process_time()

#Results
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
time = end_time - start_time
print(cr)
print(cm)
print(time)

              precision    recall  f1-score   support

           0       0.93      0.92      0.92      3145
           1       0.57      0.61      0.59       554

    accuracy                           0.87      3699
   macro avg       0.75      0.76      0.76      3699
weighted avg       0.88      0.87      0.87      3699

[[2887  258]
 [ 215  339]]
0.109375


Conclusion > Data Standardization leads to increased precision, recall, f1-score when predicting successfull purchases