In [1]:
#Data processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

#Model performance metrics
from time import process_time
from memory_profiler import profile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Feature selection and models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

#Data scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
%load_ext memory_profiler

Models

In [3]:
#Random Forest Model
def RFClassifier(X_train, y_train, X_test, y_test):
  #Initialize the Random Forest Classifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Fit the classifier to the data
  forest_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

# Modified Models to include SMOTE and standardization

In [14]:
#Random Forest function
def RFClassifierMOD(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_estimators):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = num_estimators)

  #Time Measurement
  start_time = process_time()
  
  #Data preprocessing
  smote = SMOTE()
  X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
  # Subset the numeric attributes
  X_train_smote_numeric = X_train_smote[numeric_attributes]
  X_test_numeric = X_test[numeric_attributes]

  scaler = StandardScaler()
  X_train_smote_S = scaler.fit_transform(X_train_smote_numeric)
  X_test_numeric_S = scaler.transform(X_test_numeric)
        
  # Concatenate the standardized numeric attributes with the categorical attributes
  X_train_combined = np.concatenate((X_train_smote_S, X_train_smote[cat_attributes]), axis=1)
  X_test_combined = np.concatenate((X_test_numeric_S, X_test[cat_attributes]), axis=1)

  #Fit the Classifier to the data
  forest_raw_imbalanced.fit(X_train_combined, y_train_smote)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test_combined)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

# Crossvalidation Models

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit

K-Fold Validation Models

In [16]:
#Random Forest Function
def RFClassifierKFold(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_estimators, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = num_estimators)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  kfold = KFold(n_splits = num_folds)
    
  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = kfold)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in kfold.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

# Time-Series crossvalidation

In [17]:
#Random Forest Function
def RFClassifierTS(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_estimators, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = num_estimators)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  tscv = TimeSeriesSplit(n_splits = num_folds)
    
  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = tscv)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in tscv.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

# Data Import and Processing

In [8]:
#https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("online_shoppers_intention.csv")

In [9]:
#Identify categorical attributes
categorical_features = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]
df_cat = df[categorical_features]

df_onehot = pd.get_dummies(df, columns = categorical_features, prefix = categorical_features)

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_onehot['Revenue'] = label_encoder.fit_transform(df['Revenue'])

Control SMOTE data

In [10]:
#Specify independent/ dependent values
X = df_onehot.drop(columns = "Revenue")
y = df_onehot["Revenue"]

#Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Filtered Data

Pearson Correlation Filters

In [11]:
#Correlation of Onehot encoded dataset

corr = df_onehot.corr()

revenue_correlation = corr["Revenue"]
sorted_pearson_correlation = revenue_correlation.abs().sort_values(ascending = False)

sorted_pearson_correlation_df = pd.DataFrame(sorted_pearson_correlation)
SPC_topquantile = sorted_pearson_correlation_df.quantile(0.75)
filtered_df = sorted_pearson_correlation_df[sorted_pearson_correlation_df >= SPC_topquantile]
filtered_df.dropna(inplace = True)
#18 attributes were kept, were in the top quantile
     
df_pearson = df_onehot[filtered_df.index.tolist()]

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])

#Specify independent/ dependent values
X_p = df_pearson.drop(columns = "Revenue")
y_p = df_pearson["Revenue"]

#Split the Data
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size = 0.3)

smote = SMOTE()
X_p_train_smote, y_p_train_smote = smote.fit_resample(X_p_train, y_p_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])


Random Forest Features

In [12]:
#Filters of RF Classifier

rf_classifier = RandomForestClassifier()

#Fit random forest classifier
rf_classifier.fit(X_train_smote, y_train_smote)

feature_importances = rf_classifier.feature_importances_

rf_df = pd.DataFrame({"Feature": X_train_smote.columns, "Importance": feature_importances})

sorted_features = np.argsort(feature_importances)[::-1]

#Sorting features
rf_df_sorted = rf_df.sort_values("Importance", ascending = False)
rf_df_sorted = rf_df_sorted.reset_index(drop = True)
rf_df_sorted

rf_df_sorted.describe()
RF_topquantile = rf_df_sorted['Importance'].quantile(0.75)
RFfiltered_df = rf_df_sorted.loc[rf_df_sorted['Importance'] >= RF_topquantile]
#19 features were kept after keeping the top quartile of results
filtered_attributes_rf = RFfiltered_df.index.tolist()
df_rf = df_onehot[RFfiltered_df["Feature"]]

df_rf["Revenue"] = df_onehot["Revenue"]

#Specify independent/ dependent values
X_rf = df_rf.drop(columns = "Revenue")
y_rf = df_rf["Revenue"]

#Split the Data
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size = 0.3)

smote = SMOTE()
X_rf_train_smote, y_rf_train_smote = smote.fit_resample(X_rf_train, y_rf_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rf["Revenue"] = df_onehot["Revenue"]


In [13]:
#Define columns that need data normalization/ standardization
numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

p_numeric_features = ['Administrative', 'Administrative_Duration', 'Informational', 
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration','BounceRates',
       'ExitRates', 'PageValues', 'SpecialDay']

rf_numeric_features = ['Administrative', 'Administrative_Duration',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues']

cat_features = [col for col in X_train if col not in numeric_features]
p_cat_features = [col for col in X_p_train if col not in numeric_features]
rf_cat_features = [col for col in X_rf_train if col not in numeric_features]

# Changing num_estimators

# Modified Functions to include oversampling/scaling within the function

Pearson Correlation Filtered Data

In [18]:
%memit RFClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 100)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3125
           1       0.61      0.74      0.67       574

    accuracy                           0.89      3699
   macro avg       0.78      0.82      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2851  274]
 [ 151  423]]
2.453125
peak memory: 235.53 MiB, increment: 25.13 MiB


In [21]:
%memit RFClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 500)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3125
           1       0.61      0.75      0.68       574

    accuracy                           0.89      3699
   macro avg       0.78      0.83      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2854  271]
 [ 142  432]]
10.265625
peak memory: 1494.98 MiB, increment: 99.66 MiB


In [19]:
%memit RFClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 1000)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3125
           1       0.61      0.75      0.67       574

    accuracy                           0.89      3699
   macro avg       0.78      0.83      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2856  269]
 [ 146  428]]
16.625
peak memory: 396.10 MiB, increment: 187.78 MiB


# K-Fold Crossvalidation 

Random forest features

In [25]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7322
           1       0.73      0.54      0.62      1309

    accuracy                           0.90      8631
   macro avg       0.83      0.75      0.78      8631
weighted avg       0.89      0.90      0.89      8631

10.671875
peak memory: 1293.09 MiB, increment: 9.95 MiB


In [23]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 500, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7322
           1       0.73      0.54      0.62      1309

    accuracy                           0.90      8631
   macro avg       0.83      0.75      0.78      8631
weighted avg       0.89      0.90      0.89      8631

36.625
peak memory: 1332.65 MiB, increment: 52.67 MiB


In [24]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 1000, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7322
           1       0.73      0.54      0.62      1309

    accuracy                           0.90      8631
   macro avg       0.83      0.75      0.78      8631
weighted avg       0.89      0.90      0.89      8631

69.03125
peak memory: 1380.50 MiB, increment: 86.07 MiB


In [33]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 10)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7322
           1       0.73      0.53      0.61      1309

    accuracy                           0.90      8631
   macro avg       0.82      0.75      0.78      8631
weighted avg       0.89      0.90      0.89      8631

23.953125
peak memory: 1328.65 MiB, increment: 12.66 MiB


In [34]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 50)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7322
           1       0.71      0.54      0.61      1309

    accuracy                           0.90      8631
   macro avg       0.82      0.75      0.78      8631
weighted avg       0.89      0.90      0.89      8631

136.09375
peak memory: 1333.60 MiB, increment: 12.47 MiB


# Time-Series Crossvalidation

Random Forest features

In [26]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 5)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      6091
           1       0.73      0.52      0.60      1099

    accuracy                           0.90      7190
   macro avg       0.82      0.74      0.77      7190
weighted avg       0.89      0.90      0.89      7190

7.359375
peak memory: 1302.00 MiB, increment: 13.80 MiB


In [30]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 500, 5)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      6091
           1       0.73      0.52      0.60      1099

    accuracy                           0.90      7190
   macro avg       0.82      0.74      0.77      7190
weighted avg       0.89      0.90      0.89      7190

23.09375
peak memory: 1378.89 MiB, increment: 57.43 MiB


In [31]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 1000, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6091
           1       0.72      0.52      0.60      1099

    accuracy                           0.90      7190
   macro avg       0.82      0.74      0.77      7190
weighted avg       0.89      0.90      0.89      7190

43.8125
peak memory: 1443.12 MiB, increment: 97.33 MiB


In [29]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 10)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      6649
           1       0.72      0.51      0.60      1191

    accuracy                           0.90      7840
   macro avg       0.82      0.74      0.77      7840
weighted avg       0.89      0.90      0.89      7840

13.703125
peak memory: 1325.43 MiB, increment: 19.48 MiB


In [32]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 50)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7171
           1       0.72      0.51      0.60      1279

    accuracy                           0.90      8450
   macro avg       0.82      0.74      0.77      8450
weighted avg       0.89      0.90      0.89      8450

69.0625
peak memory: 1323.23 MiB, increment: 51.21 MiB
