In [10]:
#Data processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

#Model performance metrics
from time import process_time
from memory_profiler import profile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Feature selection and models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

#Data scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [36]:
%load_ext memory_profiler

Models

In [33]:
#Decision Tree Function
def DTClassifier(X_train, y_train, X_test, y_test):
  #Initialize the DecisionTreeClassifier
  tree_raw_imbalanced = DecisionTreeClassifier(criterion = "entropy")

  #Time Measurement
  start_time = process_time()

  #Fit the Classifier to the data
  tree_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = tree_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

In [34]:
#Random Forest Model
def RFClassifier(X_train, y_train, X_test, y_test):
  #Initialize the Random Forest Classifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Fit the classifier to the data
  forest_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

In [70]:
#Logistic Regression Model
def LRClassifier(X_train, y_train, X_test, y_test):
  #Initialize the Logistic Regression Classifier
  lr_raw_imbalanced = LogisticRegression(max_iter= 1000)

  #Time Measurement
  start_time = process_time()

  #Fit the classifier to the data
  lr_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = lr_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

In [4]:
#https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("online_shoppers_intention.csv")

In [5]:
#Identify categorical attributes
categorical_features = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]
df_cat = df[categorical_features]

df_onehot = pd.get_dummies(df, columns = categorical_features, prefix = categorical_features)

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_onehot['Revenue'] = label_encoder.fit_transform(df['Revenue'])

Control SMOTE data

In [6]:
#Specify independent/ dependent values
X = df_onehot.drop(columns = "Revenue")
y = df_onehot["Revenue"]

#Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Filtered Data

Pearson Correlation Filters

In [7]:
#Correlation of Onehot encoded dataset

corr = df_onehot.corr()

revenue_correlation = corr["Revenue"]
sorted_pearson_correlation = revenue_correlation.abs().sort_values(ascending = False)

#Filter out for attributes with correlation > 0.09
filtered_correlation = sorted_pearson_correlation[sorted_pearson_correlation > 0.09]
filtered_attributes = filtered_correlation.index.tolist()
df_pearson = df_onehot[filtered_attributes]

#12 attributes (Onehot encoded) are kept

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])

#Specify independent/ dependent values
X_p = df_pearson.drop(columns = "Revenue")
y_p = df_pearson["Revenue"]

#Split the Data
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size = 0.3)

smote = SMOTE()
X_p_train_smote, y_p_train_smote = smote.fit_resample(X_p_train, y_p_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])


Random Forest Features

In [8]:
#Filters of RF Classifier

rf_classifier = RandomForestClassifier()

#Fit random forest classifier
rf_classifier.fit(X_train_smote, y_train_smote)

feature_importances = rf_classifier.feature_importances_

rf_df = pd.DataFrame({"Feature": X_train_smote.columns, "Importance": feature_importances})

sorted_features = np.argsort(feature_importances)[::-1]

#Sorting features
rf_df_sorted = rf_df.sort_values("Importance", ascending = False)
rf_df_sorted = rf_df_sorted.reset_index(drop = True)
rf_df_sorted

#Filter out for attributes with random forest score > 0.009
filtered_rf = rf_df_sorted[rf_df_sorted['Importance'] > 0.009]
#filtered_attributes_rf = filtered_rf.index.tolist()
df_rf = df_onehot[filtered_rf["Feature"]]

#24 features are kept after random forest feature selection
df_rf["Revenue"] = df_onehot["Revenue"]

#Specify independent/ dependent values
X_rf = df_rf.drop(columns = "Revenue")
y_rf = df_rf["Revenue"]

#Split the Data
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size = 0.3)

smote = SMOTE()
X_rf_train_smote, y_rf_train_smote = smote.fit_resample(X_rf_train, y_rf_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rf["Revenue"] = df_onehot["Revenue"]


RFE Feature Selection

In [9]:
#Filter for features with RFE

df_X_rfe = df_onehot.drop(columns = "Revenue")
df_y_rfe = df_onehot["Revenue"]

# Instantiate the model and RFE selector
model = LogisticRegression(solver = "liblinear")
rfe_selector = RFE(model, n_features_to_select = 20)

# Perform RFE feature selection
selected_features = rfe_selector.fit_transform(df_X_rfe, df_y_rfe)

# Get the mask of selected features
feature_mask = rfe_selector.support_

# Get the ranking of features (optional)
feature_ranking = rfe_selector.ranking_

selected_indices = [i for i, mask in enumerate(feature_mask) if mask]
print("Selected feature indices:", selected_indices)

df_rfe = df_onehot.iloc[:, selected_indices]

#20 features are kept after random forest feature selection
df_rfe["Revenue"] = df_onehot["Revenue"]

#Specify independent/ dependent values
X_rfe = df_rfe.drop(columns = "Revenue")
y_rfe = df_rfe["Revenue"]

#Split the Data
X_rfe_train, X_rfe_test, y_rfe_train, y_rfe_test = train_test_split(X_rfe, y_rfe, test_size = 0.3)

smote = SMOTE()
X_rfe_train_smote, y_rfe_train_smote = smote.fit_resample(X_rfe_train, y_rfe_train)

Selected feature indices: [6, 7, 9, 10, 12, 13, 17, 18, 19, 22, 30, 39, 50, 52, 56, 62, 64, 67, 73, 74]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rfe["Revenue"] = df_onehot["Revenue"]


In [24]:
#Define columns that need data normalization/ standardization
numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

p_numeric_features = ['Administrative', 'Administrative_Duration', 'Informational', 
       'ProductRelated', 'ProductRelated_Duration','BounceRates', 'ExitRates', 'PageValues']

rf_numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues']

rfe_numeric_features = ['BounceRates', 'ExitRates', 'SpecialDay']

cat_features = [col for col in X_train if col not in numeric_features]
p_cat_features = [col for col in X_p_train if col not in numeric_features]
rf_cat_features = [col for col in X_rf_train if col not in numeric_features]
rfe_cat_features = [col for col in X_rfe_train if col not in numeric_features]


# Control Model Performance

Unfiltered Data

In [42]:
%memit DTClassifier(X_train_smote, y_train_smote, X_test, y_test)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3124
           1       0.55      0.59      0.57       575

    accuracy                           0.86      3699
   macro avg       0.73      0.75      0.74      3699
weighted avg       0.86      0.86      0.86      3699

[[2844  280]
 [ 238  337]]
0.109375
peak memory: 288.85 MiB, increment: 4.43 MiB


In [43]:
%memit RFClassifier(X_train_smote, y_train_smote, X_test, y_test)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3124
           1       0.68      0.62      0.65       575

    accuracy                           0.89      3699
   macro avg       0.80      0.78      0.79      3699
weighted avg       0.89      0.89      0.89      3699

[[2954  170]
 [ 220  355]]
1.34375
peak memory: 306.59 MiB, increment: 21.88 MiB


In [44]:
%memit LRClassifier(X_train_smote, y_train_smote, X_test, y_test)

              precision    recall  f1-score   support

           0       0.93      0.92      0.92      3124
           1       0.58      0.62      0.60       575

    accuracy                           0.87      3699
   macro avg       0.75      0.77      0.76      3699
weighted avg       0.87      0.87      0.87      3699

[[2862  262]
 [ 218  357]]
0.09375
peak memory: 294.27 MiB, increment: 9.27 MiB


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pearson correlation features

In [45]:
%memit DTClassifier(X_p_train_smote, y_p_train_smote, X_p_test, y_p_test)

              precision    recall  f1-score   support

           0       0.93      0.88      0.91      3102
           1       0.52      0.65      0.58       597

    accuracy                           0.85      3699
   macro avg       0.72      0.77      0.74      3699
weighted avg       0.86      0.85      0.85      3699

[[2737  365]
 [ 208  389]]
0.09375
peak memory: 286.80 MiB, increment: 0.88 MiB


In [47]:
%memit RFClassifier(X_p_train_smote, y_p_train_smote, X_p_test, y_p_test)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3102
           1       0.60      0.74      0.66       597

    accuracy                           0.88      3699
   macro avg       0.77      0.82      0.79      3699
weighted avg       0.89      0.88      0.88      3699

[[2810  292]
 [ 157  440]]
1.5
peak memory: 301.36 MiB, increment: 17.08 MiB


In [48]:
%memit LRClassifier(X_p_train_smote, y_p_train_smote, X_p_test, y_p_test)

              precision    recall  f1-score   support

           0       0.94      0.91      0.93      3102
           1       0.60      0.69      0.64       597

    accuracy                           0.88      3699
   macro avg       0.77      0.80      0.78      3699
weighted avg       0.88      0.88      0.88      3699

[[2832  270]
 [ 186  411]]
0.046875
peak memory: 287.21 MiB, increment: 1.91 MiB


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest features

In [49]:
%memit DTClassifier(X_rf_train_smote, y_rf_train_smote, X_rf_test, y_rf_test)

              precision    recall  f1-score   support

           0       0.93      0.90      0.92      3129
           1       0.54      0.61      0.57       570

    accuracy                           0.86      3699
   macro avg       0.73      0.76      0.75      3699
weighted avg       0.87      0.86      0.86      3699

[[2830  299]
 [ 220  350]]
0.09375
peak memory: 286.21 MiB, increment: 0.35 MiB


In [50]:
%memit RFClassifier(X_rf_train_smote, y_rf_train_smote, X_rf_test, y_rf_test)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3129
           1       0.65      0.68      0.67       570

    accuracy                           0.89      3699
   macro avg       0.80      0.81      0.80      3699
weighted avg       0.90      0.89      0.90      3699

[[2922  207]
 [ 182  388]]
1.453125
peak memory: 300.47 MiB, increment: 14.25 MiB


In [51]:
%memit LRClassifier(X_rf_train_smote, y_rf_train_smote, X_rf_test, y_rf_test)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3129
           1       0.59      0.65      0.62       570

    accuracy                           0.88      3699
   macro avg       0.76      0.78      0.77      3699
weighted avg       0.88      0.88      0.88      3699

[[2866  263]
 [ 198  372]]
0.0625
peak memory: 287.60 MiB, increment: 4.20 MiB


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Recursive Feature Elimination Features

In [52]:
%memit DTClassifier(X_rfe_train_smote, y_rfe_train_smote, X_rfe_test, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.74      0.80      3126
           1       0.23      0.43      0.30       573

    accuracy                           0.69      3699
   macro avg       0.55      0.58      0.55      3699
weighted avg       0.78      0.69      0.72      3699

[[2299  827]
 [ 325  248]]
0.046875
peak memory: 285.92 MiB, increment: 1.21 MiB


In [54]:
%memit RFClassifier(X_rfe_train_smote, y_rfe_train_smote, X_rfe_test, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.78      0.83      3126
           1       0.26      0.40      0.31       573

    accuracy                           0.73      3699
   macro avg       0.57      0.59      0.57      3699
weighted avg       0.78      0.73      0.75      3699

[[2452  674]
 [ 342  231]]
1.046875
peak memory: 326.17 MiB, increment: 42.30 MiB


In [55]:
%memit LRClassifier(X_rfe_train_smote, y_rfe_train_smote, X_rfe_test, y_rfe_test)

              precision    recall  f1-score   support

           0       0.92      0.63      0.75      3126
           1       0.26      0.70      0.38       573

    accuracy                           0.64      3699
   macro avg       0.59      0.67      0.56      3699
weighted avg       0.82      0.64      0.69      3699

[[1980 1146]
 [ 173  400]]
0.046875
peak memory: 285.88 MiB, increment: 2.70 MiB


# Data Normalization

In [23]:
#Subsetting data to scale numeric features

#Control
X_train_smote_NUM = X_train_smote[numeric_features]
X_test_NUM = X_test[numeric_features]

#Pearson correlation features
X_p_train_smote_NUM = X_p_train_smote[p_numeric_features]
X_p_test_NUM = X_p_test[p_numeric_features]

#Random Forest features
X_rf_train_smote_NUM = X_rf_train_smote[rf_numeric_features]
X_rf_test_NUM = X_rf_test[rf_numeric_features]

#Recursive Feature Elimination features
X_rfe_train_smote_NUM = X_rfe_train_smote[rfe_numeric_features]
X_rfe_test_NUM = X_rfe_test[rfe_numeric_features]

In [32]:
nscaler = MinMaxScaler()

#Scaling control
X_train_smote_NUM_N = nscaler.fit_transform(X_train_smote_NUM)
X_train_smote_COMBINED_N = np.concatenate((X_train_smote_NUM_N, X_train_smote[cat_features]), axis = 1)
                                          
X_test_NUM_N = nscaler.transform(X_test_NUM)
X_test_COMBINED_N = np.concatenate((X_test_NUM_N, X_test[cat_features]), axis = 1)

#Scaling Pearson correlation features
X_p_train_smote_NUM_N = nscaler.fit_transform(X_p_train_smote_NUM)
X_p_train_smote_COMBINED_N = np.concatenate((X_p_train_smote_NUM_N, X_p_train_smote[p_cat_features]), axis = 1)
                                          
X_p_test_NUM_N = nscaler.transform(X_p_test_NUM)
X_p_test_COMBINED_N = np.concatenate((X_p_test_NUM_N, X_p_test[p_cat_features]), axis = 1)

#Scaling Random Forest features
X_rf_train_smote_NUM_N = nscaler.fit_transform(X_rf_train_smote_NUM)
X_rf_train_smote_COMBINED_N = np.concatenate((X_rf_train_smote_NUM_N, X_rf_train_smote[rf_cat_features]), axis = 1)
                                          
X_rf_test_NUM_N = nscaler.transform(X_rf_test_NUM)
X_rf_test_COMBINED_N = np.concatenate((X_rf_test_NUM_N, X_rf_test[rf_cat_features]), axis = 1)

#Scaling Recursive Feature Elimination features
X_rfe_train_smote_NUM_N = nscaler.fit_transform(X_rfe_train_smote_NUM)
X_rfe_train_smote_COMBINED_N = np.concatenate((X_rfe_train_smote_NUM_N, X_rfe_train_smote[rfe_cat_features]), axis = 1)
                                          
X_rfe_test_NUM_N = nscaler.transform(X_rfe_test_NUM)
X_rfe_test_COMBINED_N = np.concatenate((X_rfe_test_NUM_N, X_rfe_test[rfe_cat_features]), axis = 1)

Running Models on Normalized Data

Control features - Normalized Data

In [37]:
%memit DTClassifier(X_train_smote_COMBINED_N, y_train_smote, X_test_COMBINED_N, y_test)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3124
           1       0.55      0.59      0.57       575

    accuracy                           0.86      3699
   macro avg       0.74      0.75      0.74      3699
weighted avg       0.87      0.86      0.86      3699

[[2848  276]
 [ 235  340]]
0.109375
peak memory: 285.46 MiB, increment: 0.09 MiB


In [38]:
%memit RFClassifier(X_train_smote_COMBINED_N, y_train_smote, X_test_COMBINED_N, y_test)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3124
           1       0.67      0.61      0.64       575

    accuracy                           0.89      3699
   macro avg       0.80      0.78      0.79      3699
weighted avg       0.89      0.89      0.89      3699

[[2954  170]
 [ 223  352]]
1.28125
peak memory: 307.96 MiB, increment: 22.50 MiB


In [74]:
%memit LRClassifier(X_train_smote_COMBINED_N, y_train_smote, X_test_COMBINED_N, y_test)

              precision    recall  f1-score   support

           0       0.88      0.98      0.93      3124
           1       0.71      0.27      0.39       575

    accuracy                           0.87      3699
   macro avg       0.80      0.63      0.66      3699
weighted avg       0.85      0.87      0.84      3699

[[3060   64]
 [ 418  157]]
0.09375
peak memory: 320.32 MiB, increment: 8.36 MiB


Pearson correlation features - normalized data

In [57]:
%memit DTClassifier(X_p_train_smote_COMBINED_N, y_p_train_smote, X_p_test_COMBINED_N, y_p_test)

              precision    recall  f1-score   support

           0       0.93      0.88      0.90      3102
           1       0.51      0.64      0.57       597

    accuracy                           0.84      3699
   macro avg       0.72      0.76      0.74      3699
weighted avg       0.86      0.84      0.85      3699

[[2739  363]
 [ 215  382]]
0.078125
peak memory: 284.83 MiB, increment: 1.07 MiB


In [59]:
%memit RFClassifier(X_p_train_smote_COMBINED_N, y_p_train_smote, X_p_test_COMBINED_N, y_p_test)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3102
           1       0.60      0.75      0.67       597

    accuracy                           0.88      3699
   macro avg       0.78      0.83      0.80      3699
weighted avg       0.89      0.88      0.88      3699

[[2809  293]
 [ 152  445]]
1.5
peak memory: 301.49 MiB, increment: 16.65 MiB


In [58]:
%memit LRClassifier(X_p_train_smote_COMBINED_N, y_p_train_smote, X_p_test_COMBINED_N, y_p_test)

              precision    recall  f1-score   support

           0       0.93      0.90      0.91      3102
           1       0.55      0.64      0.59       597

    accuracy                           0.86      3699
   macro avg       0.74      0.77      0.75      3699
weighted avg       0.87      0.86      0.86      3699

[[2786  316]
 [ 216  381]]
0.046875
peak memory: 286.19 MiB, increment: 1.36 MiB


Random Forest features - normalized data

In [60]:
%memit DTClassifier(X_rf_train_smote_COMBINED_N, y_rf_train_smote, X_rf_test_COMBINED_N, y_rf_test)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      3129
           1       0.53      0.60      0.56       570

    accuracy                           0.86      3699
   macro avg       0.73      0.75      0.74      3699
weighted avg       0.86      0.86      0.86      3699

[[2831  298]
 [ 230  340]]
0.09375
peak memory: 285.58 MiB, increment: 0.88 MiB


In [61]:
%memit RFClassifier(X_rf_train_smote_COMBINED_N, y_rf_train_smote, X_rf_test_COMBINED_N, y_rf_test)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3129
           1       0.65      0.67      0.66       570

    accuracy                           0.89      3699
   macro avg       0.80      0.80      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2924  205]
 [ 188  382]]
1.484375
peak memory: 299.86 MiB, increment: 14.28 MiB


In [73]:
%memit LRClassifier(X_rf_train_smote_COMBINED_N, y_rf_train_smote, X_rf_test_COMBINED_N, y_rf_test)

              precision    recall  f1-score   support

           0       0.90      0.93      0.92      3129
           1       0.54      0.44      0.48       570

    accuracy                           0.86      3699
   macro avg       0.72      0.69      0.70      3699
weighted avg       0.85      0.86      0.85      3699

[[2921  208]
 [ 321  249]]
0.0625
peak memory: 314.86 MiB, increment: 2.89 MiB


Recursive Feature Elimination - normalized data

In [63]:
%memit DTClassifier(X_rfe_train_smote_COMBINED_N, y_rfe_train_smote, X_rfe_test_COMBINED_N, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.74      0.80      3126
           1       0.23      0.44      0.31       573

    accuracy                           0.69      3699
   macro avg       0.56      0.59      0.55      3699
weighted avg       0.78      0.69      0.72      3699

[[2302  824]
 [ 321  252]]
0.046875
peak memory: 285.40 MiB, increment: 0.59 MiB


In [65]:
%memit RFClassifier(X_rfe_train_smote_COMBINED_N, y_rfe_train_smote, X_rfe_test_COMBINED_N, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.78      0.83      3126
           1       0.25      0.40      0.31       573

    accuracy                           0.72      3699
   macro avg       0.57      0.59      0.57      3699
weighted avg       0.78      0.72      0.75      3699

[[2447  679]
 [ 343  230]]
1.0625
peak memory: 326.44 MiB, increment: 40.98 MiB


In [72]:
%memit LRClassifier(X_rfe_train_smote_COMBINED_N, y_rfe_train_smote, X_rfe_test_COMBINED_N, y_rfe_test)

              precision    recall  f1-score   support

           0       0.92      0.62      0.74      3126
           1       0.25      0.70      0.37       573

    accuracy                           0.63      3699
   macro avg       0.59      0.66      0.56      3699
weighted avg       0.82      0.63      0.68      3699

[[1940 1186]
 [ 170  403]]
0.046875
peak memory: 314.08 MiB, increment: 2.12 MiB


# Data Standardization

In [66]:
sscaler = StandardScaler()

#Scaling control
X_train_smote_NUM_S = sscaler.fit_transform(X_train_smote_NUM)
X_train_smote_COMBINED_S = np.concatenate((X_train_smote_NUM_S, X_train_smote[cat_features]), axis = 1)
                                          
X_test_NUM_S = sscaler.transform(X_test_NUM)
X_test_COMBINED_S = np.concatenate((X_test_NUM_S, X_test[cat_features]), axis = 1)

#Scaling Pearson correlation features
X_p_train_smote_NUM_S = sscaler.fit_transform(X_p_train_smote_NUM)
X_p_train_smote_COMBINED_S = np.concatenate((X_p_train_smote_NUM_S, X_p_train_smote[p_cat_features]), axis = 1)
                                          
X_p_test_NUM_S = sscaler.transform(X_p_test_NUM)
X_p_test_COMBINED_S = np.concatenate((X_p_test_NUM_S, X_p_test[p_cat_features]), axis = 1)

#Scaling Random Forest features
X_rf_train_smote_NUM_S = sscaler.fit_transform(X_rf_train_smote_NUM)
X_rf_train_smote_COMBINED_S = np.concatenate((X_rf_train_smote_NUM_S, X_rf_train_smote[rf_cat_features]), axis = 1)
                                          
X_rf_test_NUM_S = sscaler.transform(X_rf_test_NUM)
X_rf_test_COMBINED_S = np.concatenate((X_rf_test_NUM_S, X_rf_test[rf_cat_features]), axis = 1)

#Scaling Recursive Feature Elimination features
X_rfe_train_smote_NUM_S = sscaler.fit_transform(X_rfe_train_smote_NUM)
X_rfe_train_smote_COMBINED_S = np.concatenate((X_rfe_train_smote_NUM_S, X_rfe_train_smote[rfe_cat_features]), axis = 1)
                                          
X_rfe_test_NUM_S = sscaler.transform(X_rfe_test_NUM)
X_rfe_test_COMBINED_S = np.concatenate((X_rfe_test_NUM_S, X_rfe_test[rfe_cat_features]), axis = 1)

Control data - Standardized data

In [67]:
%memit DTClassifier(X_train_smote_COMBINED_S, y_train_smote, X_test_COMBINED_S, y_test)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3124
           1       0.54      0.60      0.57       575

    accuracy                           0.86      3699
   macro avg       0.73      0.75      0.74      3699
weighted avg       0.87      0.86      0.86      3699

[[2835  289]
 [ 232  343]]
0.109375
peak memory: 308.53 MiB, increment: 0.61 MiB


In [68]:
%memit RFClassifier(X_train_smote_COMBINED_S, y_train_smote, X_test_COMBINED_S, y_test)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3124
           1       0.68      0.62      0.65       575

    accuracy                           0.90      3699
   macro avg       0.81      0.78      0.79      3699
weighted avg       0.89      0.90      0.89      3699

[[2960  164]
 [ 219  356]]
1.296875
peak memory: 328.95 MiB, increment: 20.42 MiB


In [71]:
%memit LRClassifier(X_train_smote_COMBINED_S, y_train_smote, X_test_COMBINED_S, y_test)

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      3124
           1       0.69      0.38      0.49       575

    accuracy                           0.88      3699
   macro avg       0.79      0.67      0.71      3699
weighted avg       0.86      0.88      0.86      3699

[[3029   95]
 [ 359  216]]
0.125
peak memory: 320.32 MiB, increment: 8.38 MiB


Pearson Correlation 

In [76]:
%memit DTClassifier(X_p_train_smote_COMBINED_S, y_p_train_smote, X_p_test_COMBINED_S, y_p_test)

              precision    recall  f1-score   support

           0       0.93      0.88      0.90      3102
           1       0.51      0.65      0.57       597

    accuracy                           0.84      3699
   macro avg       0.72      0.76      0.74      3699
weighted avg       0.86      0.84      0.85      3699

[[2733  369]
 [ 210  387]]
0.078125
peak memory: 312.39 MiB, increment: 0.43 MiB


In [77]:
%memit RFClassifier(X_p_train_smote_COMBINED_S, y_p_train_smote, X_p_test_COMBINED_S, y_p_test)

              precision    recall  f1-score   support

           0       0.95      0.90      0.93      3102
           1       0.60      0.74      0.66       597

    accuracy                           0.88      3699
   macro avg       0.77      0.82      0.79      3699
weighted avg       0.89      0.88      0.88      3699

[[2805  297]
 [ 155  442]]
1.515625
peak memory: 325.98 MiB, increment: 13.58 MiB


In [78]:
%memit LRClassifier(X_p_train_smote_COMBINED_S, y_p_train_smote, X_p_test_COMBINED_S, y_p_test)

              precision    recall  f1-score   support

           0       0.94      0.90      0.92      3102
           1       0.58      0.70      0.64       597

    accuracy                           0.87      3699
   macro avg       0.76      0.80      0.78      3699
weighted avg       0.88      0.87      0.88      3699

[[2807  295]
 [ 182  415]]
0.015625
peak memory: 311.66 MiB, increment: 1.96 MiB


Random Forest features - standardized data

In [79]:
%memit DTClassifier(X_rf_train_smote_COMBINED_S, y_rf_train_smote, X_rf_test_COMBINED_S, y_rf_test)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3129
           1       0.54      0.59      0.57       570

    accuracy                           0.86      3699
   macro avg       0.73      0.75      0.74      3699
weighted avg       0.87      0.86      0.86      3699

[[2844  285]
 [ 231  339]]
0.078125
peak memory: 311.00 MiB, increment: 0.68 MiB


In [80]:
%memit RFClassifier(X_rf_train_smote_COMBINED_S, y_rf_train_smote, X_rf_test_COMBINED_S, y_rf_test)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3129
           1       0.65      0.68      0.66       570

    accuracy                           0.89      3699
   macro avg       0.80      0.80      0.80      3699
weighted avg       0.90      0.89      0.90      3699

[[2924  205]
 [ 185  385]]
1.421875
peak memory: 325.11 MiB, increment: 14.11 MiB


In [81]:
%memit LRClassifier(X_rf_train_smote_COMBINED_S, y_rf_train_smote, X_rf_test_COMBINED_S, y_rf_test)

              precision    recall  f1-score   support

           0       0.91      0.94      0.93      3129
           1       0.60      0.52      0.55       570

    accuracy                           0.87      3699
   macro avg       0.76      0.73      0.74      3699
weighted avg       0.87      0.87      0.87      3699

[[2932  197]
 [ 276  294]]
0.046875
peak memory: 311.34 MiB, increment: 3.64 MiB


Recursive Feature Elimination features - standardized data

In [82]:
%memit DTClassifier(X_rfe_train_smote_COMBINED_S, y_rfe_train_smote, X_rfe_test_COMBINED_S, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.73      0.80      3126
           1       0.23      0.43      0.30       573

    accuracy                           0.69      3699
   macro avg       0.55      0.58      0.55      3699
weighted avg       0.78      0.69      0.72      3699

[[2291  835]
 [ 325  248]]
0.046875
peak memory: 308.95 MiB, increment: 0.50 MiB


In [83]:
%memit RFClassifier(X_rfe_train_smote_COMBINED_S, y_rfe_train_smote, X_rfe_test_COMBINED_S, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.78      0.83      3126
           1       0.25      0.40      0.31       573

    accuracy                           0.72      3699
   macro avg       0.57      0.59      0.57      3699
weighted avg       0.78      0.72      0.75      3699

[[2445  681]
 [ 342  231]]
1.296875
peak memory: 350.70 MiB, increment: 41.75 MiB


In [84]:
%memit LRClassifier(X_rfe_train_smote_COMBINED_S, y_rfe_train_smote, X_rfe_test_COMBINED_S, y_rfe_test)

              precision    recall  f1-score   support

           0       0.92      0.62      0.74      3126
           1       0.25      0.70      0.37       573

    accuracy                           0.63      3699
   macro avg       0.59      0.66      0.56      3699
weighted avg       0.81      0.63      0.68      3699

[[1945 1181]
 [ 174  399]]
0.03125
peak memory: 312.72 MiB, increment: 2.69 MiB
