In [1]:
#Data processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

#Model performance metrics
from time import process_time
from memory_profiler import profile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Feature selection and models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

#Data scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
%load_ext memory_profiler

Models

In [3]:
#Decision Tree Function
def DTClassifier(X_train, y_train, X_test, y_test):
  #Initialize the DecisionTreeClassifier
  tree_raw_imbalanced = DecisionTreeClassifier(criterion = "entropy")

  #Time Measurement
  start_time = process_time()

  #Fit the Classifier to the data
  tree_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = tree_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

In [4]:
#Random Forest Model
def RFClassifier(X_train, y_train, X_test, y_test):
  #Initialize the Random Forest Classifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Fit the classifier to the data
  forest_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

In [104]:
#Logistic Regression Model
def LRClassifier(X_train, y_train, X_test, y_test):
  #Initialize the Logistic Regression Classifier
  lr_raw_imbalanced = LogisticRegression(max_iter= 100000)

  #Time Measurement
  start_time = process_time()

  #Fit the classifier to the data
  lr_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = lr_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

# Modified Models to include SMOTE and standardization

In [71]:
#Decision Tree Function
def DTClassifierMOD(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes):
  #Initialize the DecisionTreeClassifier
  tree_raw_imbalanced = DecisionTreeClassifier(criterion = "entropy")

  #Time Measurement
  start_time = process_time()
  
  #Data preprocessing
  smote = SMOTE()
  X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
  # Subset the numeric attributes
  X_train_smote_numeric = X_train_smote[numeric_attributes]
  X_test_numeric = X_test[numeric_attributes]

  scaler = StandardScaler()
  X_train_smote_S = scaler.fit_transform(X_train_smote_numeric)
  X_test_numeric_S = scaler.transform(X_test_numeric)
        
  # Concatenate the standardized numeric attributes with the categorical attributes
  X_train_combined = np.concatenate((X_train_smote_S, X_train_smote[cat_attributes]), axis=1)
  X_test_combined = np.concatenate((X_test_numeric_S, X_test[cat_attributes]), axis=1)

  #Fit the Classifier to the data
  tree_raw_imbalanced.fit(X_train_combined, y_train_smote)

  #Predict new Data
  y_pred = tree_raw_imbalanced.predict(X_test_combined)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

In [72]:
#Random Forest function
def RFClassifierMOD(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()
  
  #Data preprocessing
  smote = SMOTE()
  X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
  # Subset the numeric attributes
  X_train_smote_numeric = X_train_smote[numeric_attributes]
  X_test_numeric = X_test[numeric_attributes]

  scaler = StandardScaler()
  X_train_smote_S = scaler.fit_transform(X_train_smote_numeric)
  X_test_numeric_S = scaler.transform(X_test_numeric)
        
  # Concatenate the standardized numeric attributes with the categorical attributes
  X_train_combined = np.concatenate((X_train_smote_S, X_train_smote[cat_attributes]), axis=1)
  X_test_combined = np.concatenate((X_test_numeric_S, X_test[cat_attributes]), axis=1)

  #Fit the Classifier to the data
  forest_raw_imbalanced.fit(X_train_combined, y_train_smote)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test_combined)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

In [102]:
#Logistic Regression function
def LRClassifierMOD(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes):
  #Initialize the DecisionTreeClassifier
  lr_raw_imbalanced = LogisticRegression(max_iter= 10000)

  #Time Measurement
  start_time = process_time()
  
  #Data preprocessing
  smote = SMOTE()
  X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
  # Subset the numeric attributes
  X_train_smote_numeric = X_train_smote[numeric_attributes]
  X_test_numeric = X_test[numeric_attributes]

  scaler = StandardScaler()
  X_train_smote_S = scaler.fit_transform(X_train_smote_numeric)
  X_test_numeric_S = scaler.transform(X_test_numeric)
        
  # Concatenate the standardized numeric attributes with the categorical attributes
  X_train_combined = np.concatenate((X_train_smote_S, X_train_smote[cat_attributes]), axis=1)
  X_test_combined = np.concatenate((X_test_numeric_S, X_test[cat_attributes]), axis=1)

  #Fit the Classifier to the data
  lr_raw_imbalanced.fit(X_train_combined, y_train_smote)

  #Predict new Data
  y_pred = lr_raw_imbalanced.predict(X_test_combined)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

# Crossvalidation Models

In [135]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit

K-Fold Validation Models

In [81]:
#Decision Tree Function
def DTClassifierKFold(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  tree_raw_imbalanced = DecisionTreeClassifier(criterion = "entropy")

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  kfold = KFold(n_splits = num_folds)
    
  results = cross_val_score(tree_raw_imbalanced, X_train, y_train, cv = kfold)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in kfold.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    tree_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = tree_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

In [98]:
#Random Forest Function
def RFClassifierKFold(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  kfold = KFold(n_splits = num_folds)
    
  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = kfold)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in kfold.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

In [106]:
#Logistic Regression Function
def LRClassifierKFold(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  lr_raw_imbalanced = LogisticRegression(max_iter= 100000)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  kfold = KFold(n_splits = num_folds)
    
  results = cross_val_score(lr_raw_imbalanced, X_train, y_train, cv = kfold)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in kfold.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    lr_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = lr_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

# Time-Series crossvalidation

In [148]:
#Decision Tree Function
def DTClassifierTS(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  tree_raw_imbalanced = DecisionTreeClassifier(criterion = "entropy")

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  knn = KNeighborsClassifier(n_neighbors=1)
  tscv = TimeSeriesSplit(n_splits = num_folds)
    
  results = cross_val_score(tree_raw_imbalanced, X_train, y_train, cv = tscv)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in tscv.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    tree_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = tree_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

In [149]:
#Random Forest Function
def RFClassifierTS(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  knn = KNeighborsClassifier(n_neighbors=1)
  tscv = TimeSeriesSplit(n_splits = num_folds)
    
  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = tscv)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in tscv.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

In [150]:
#Logistic Regression Function
def LRClassifierTS(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_folds):
  #Initialize the DecisionTreeClassifier
  lr_raw_imbalanced = LogisticRegression(max_iter= 100000)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  knn = KNeighborsClassifier(n_neighbors=1)
  tscv = TimeSeriesSplit(n_splits = num_folds)
    
  results = cross_val_score(lr_raw_imbalanced, X_train, y_train, cv = tscv)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
    
  for train_index, val_index in tscv.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    lr_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = lr_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print(time)

# Data Import and Processing

In [9]:
#https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("online_shoppers_intention.csv")

In [10]:
#Identify categorical attributes
categorical_features = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]
df_cat = df[categorical_features]

df_onehot = pd.get_dummies(df, columns = categorical_features, prefix = categorical_features)

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_onehot['Revenue'] = label_encoder.fit_transform(df['Revenue'])

Control SMOTE data

In [11]:
#Specify independent/ dependent values
X = df_onehot.drop(columns = "Revenue")
y = df_onehot["Revenue"]

#Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Filtered Data

Pearson Correlation Filters

In [12]:
#Correlation of Onehot encoded dataset

corr = df_onehot.corr()

revenue_correlation = corr["Revenue"]
sorted_pearson_correlation = revenue_correlation.abs().sort_values(ascending = False)

#Filter out for attributes with correlation > 0.09
filtered_correlation = sorted_pearson_correlation[sorted_pearson_correlation > 0.09]
filtered_attributes = filtered_correlation.index.tolist()
df_pearson = df_onehot[filtered_attributes]

#12 attributes (Onehot encoded) are kept

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])

#Specify independent/ dependent values
X_p = df_pearson.drop(columns = "Revenue")
y_p = df_pearson["Revenue"]

#Split the Data
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size = 0.3)

smote = SMOTE()
X_p_train_smote, y_p_train_smote = smote.fit_resample(X_p_train, y_p_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])


Random Forest Features

In [13]:
#Filters of RF Classifier

rf_classifier = RandomForestClassifier()

#Fit random forest classifier
rf_classifier.fit(X_train_smote, y_train_smote)

feature_importances = rf_classifier.feature_importances_

rf_df = pd.DataFrame({"Feature": X_train_smote.columns, "Importance": feature_importances})

sorted_features = np.argsort(feature_importances)[::-1]

#Sorting features
rf_df_sorted = rf_df.sort_values("Importance", ascending = False)
rf_df_sorted = rf_df_sorted.reset_index(drop = True)
rf_df_sorted

#Filter out for attributes with random forest score > 0.009
filtered_rf = rf_df_sorted[rf_df_sorted['Importance'] > 0.009]
#filtered_attributes_rf = filtered_rf.index.tolist()
df_rf = df_onehot[filtered_rf["Feature"]]

#24 features are kept after random forest feature selection
df_rf["Revenue"] = df_onehot["Revenue"]

#Specify independent/ dependent values
X_rf = df_rf.drop(columns = "Revenue")
y_rf = df_rf["Revenue"]

#Split the Data
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size = 0.3)

smote = SMOTE()
X_rf_train_smote, y_rf_train_smote = smote.fit_resample(X_rf_train, y_rf_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rf["Revenue"] = df_onehot["Revenue"]


RFE Feature Selection

In [14]:
#Filter for features with RFE

df_X_rfe = df_onehot.drop(columns = "Revenue")
df_y_rfe = df_onehot["Revenue"]

# Instantiate the model and RFE selector
model = LogisticRegression(solver = "liblinear")
rfe_selector = RFE(model, n_features_to_select = 20)

# Perform RFE feature selection
selected_features = rfe_selector.fit_transform(df_X_rfe, df_y_rfe)

# Get the mask of selected features
feature_mask = rfe_selector.support_

# Get the ranking of features (optional)
feature_ranking = rfe_selector.ranking_

selected_indices = [i for i, mask in enumerate(feature_mask) if mask]
print("Selected feature indices:", selected_indices)

df_rfe = df_onehot.iloc[:, selected_indices]

#20 features are kept after random forest feature selection
df_rfe["Revenue"] = df_onehot["Revenue"]

#Specify independent/ dependent values
X_rfe = df_rfe.drop(columns = "Revenue")
y_rfe = df_rfe["Revenue"]

#Split the Data
X_rfe_train, X_rfe_test, y_rfe_train, y_rfe_test = train_test_split(X_rfe, y_rfe, test_size = 0.3)

smote = SMOTE()
X_rfe_train_smote, y_rfe_train_smote = smote.fit_resample(X_rfe_train, y_rfe_train)

Selected feature indices: [6, 7, 9, 10, 12, 13, 17, 18, 19, 22, 30, 39, 50, 52, 56, 62, 64, 67, 73, 74]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rfe["Revenue"] = df_onehot["Revenue"]


In [38]:
#Define columns that need data normalization/ standardization
numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

p_numeric_features = ['Administrative', 'Administrative_Duration', 'Informational', 
       'ProductRelated', 'ProductRelated_Duration','BounceRates', 'ExitRates', 'PageValues']

rf_numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues']

rfe_numeric_features = ['BounceRates', 'ExitRates', 'SpecialDay']

cat_features = [col for col in X_train if col not in numeric_features]
p_cat_features = [col for col in X_p_train if col not in numeric_features]
rf_cat_features = [col for col in X_rf_train if col not in numeric_features]
rfe_cat_features = [col for col in X_rfe_train if col not in numeric_features]


# Data Standardization

In [16]:
#Subsetting data to scale numeric features

#Control
X_train_smote_NUM = X_train_smote[numeric_features]
X_test_NUM = X_test[numeric_features]

#Pearson correlation features
X_p_train_smote_NUM = X_p_train_smote[p_numeric_features]
X_p_test_NUM = X_p_test[p_numeric_features]

#Random Forest features
X_rf_train_smote_NUM = X_rf_train_smote[rf_numeric_features]
X_rf_test_NUM = X_rf_test[rf_numeric_features]

#Recursive Feature Elimination features
X_rfe_train_smote_NUM = X_rfe_train_smote[rfe_numeric_features]
X_rfe_test_NUM = X_rfe_test[rfe_numeric_features]

In [17]:
sscaler = StandardScaler()

#Scaling control
X_train_smote_NUM_S = sscaler.fit_transform(X_train_smote_NUM)
X_train_smote_COMBINED_S = np.concatenate((X_train_smote_NUM_S, X_train_smote[cat_features]), axis = 1)
                                          
X_test_NUM_S = sscaler.transform(X_test_NUM)
X_test_COMBINED_S = np.concatenate((X_test_NUM_S, X_test[cat_features]), axis = 1)

#Scaling Pearson correlation features
X_p_train_smote_NUM_S = sscaler.fit_transform(X_p_train_smote_NUM)
X_p_train_smote_COMBINED_S = np.concatenate((X_p_train_smote_NUM_S, X_p_train_smote[p_cat_features]), axis = 1)
                                          
X_p_test_NUM_S = sscaler.transform(X_p_test_NUM)
X_p_test_COMBINED_S = np.concatenate((X_p_test_NUM_S, X_p_test[p_cat_features]), axis = 1)

#Scaling Random Forest features
X_rf_train_smote_NUM_S = sscaler.fit_transform(X_rf_train_smote_NUM)
X_rf_train_smote_COMBINED_S = np.concatenate((X_rf_train_smote_NUM_S, X_rf_train_smote[rf_cat_features]), axis = 1)
                                          
X_rf_test_NUM_S = sscaler.transform(X_rf_test_NUM)
X_rf_test_COMBINED_S = np.concatenate((X_rf_test_NUM_S, X_rf_test[rf_cat_features]), axis = 1)

#Scaling Recursive Feature Elimination features
X_rfe_train_smote_NUM_S = sscaler.fit_transform(X_rfe_train_smote_NUM)
X_rfe_train_smote_COMBINED_S = np.concatenate((X_rfe_train_smote_NUM_S, X_rfe_train_smote[rfe_cat_features]), axis = 1)
                                          
X_rfe_test_NUM_S = sscaler.transform(X_rfe_test_NUM)
X_rfe_test_COMBINED_S = np.concatenate((X_rfe_test_NUM_S, X_rfe_test[rfe_cat_features]), axis = 1)

Control data - Standardized data

In [19]:
%memit DTClassifier(X_train_smote_COMBINED_S, y_train_smote, X_test_COMBINED_S, y_test)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3156
           1       0.51      0.57      0.54       543

    accuracy                           0.86      3699
   macro avg       0.72      0.74      0.73      3699
weighted avg       0.86      0.86      0.86      3699

[[2861  295]
 [ 233  310]]
0.109375
peak memory: 241.14 MiB, increment: 0.92 MiB


In [20]:
%memit RFClassifier(X_train_smote_COMBINED_S, y_train_smote, X_test_COMBINED_S, y_test)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3156
           1       0.69      0.61      0.65       543

    accuracy                           0.90      3699
   macro avg       0.81      0.78      0.80      3699
weighted avg       0.90      0.90      0.90      3699

[[3009  147]
 [ 210  333]]
1.28125
peak memory: 266.11 MiB, increment: 24.98 MiB


In [21]:
%memit LRClassifier(X_train_smote_COMBINED_S, y_train_smote, X_test_COMBINED_S, y_test)

              precision    recall  f1-score   support

           0       0.90      0.97      0.94      3156
           1       0.71      0.40      0.51       543

    accuracy                           0.89      3699
   macro avg       0.81      0.69      0.72      3699
weighted avg       0.88      0.89      0.87      3699

[[3069   87]
 [ 326  217]]
0.109375
peak memory: 249.85 MiB, increment: 9.67 MiB


Pearson Correlation 

In [22]:
%memit DTClassifier(X_p_train_smote_COMBINED_S, y_p_train_smote, X_p_test_COMBINED_S, y_p_test)

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      3133
           1       0.51      0.64      0.57       566

    accuracy                           0.85      3699
   macro avg       0.72      0.77      0.74      3699
weighted avg       0.87      0.85      0.86      3699

[[2788  345]
 [ 201  365]]
0.078125
peak memory: 242.38 MiB, increment: 0.85 MiB


In [23]:
%memit RFClassifier(X_p_train_smote_COMBINED_S, y_p_train_smote, X_p_test_COMBINED_S, y_p_test)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3133
           1       0.60      0.76      0.67       566

    accuracy                           0.89      3699
   macro avg       0.78      0.83      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2847  286]
 [ 137  429]]
1.453125
peak memory: 260.68 MiB, increment: 18.30 MiB


In [24]:
%memit LRClassifier(X_p_train_smote_COMBINED_S, y_p_train_smote, X_p_test_COMBINED_S, y_p_test)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3133
           1       0.61      0.70      0.65       566

    accuracy                           0.89      3699
   macro avg       0.78      0.81      0.79      3699
weighted avg       0.89      0.89      0.89      3699

[[2879  254]
 [ 171  395]]
0.015625
peak memory: 243.70 MiB, increment: 1.95 MiB


Random Forest features - standardized data

In [25]:
%memit DTClassifier(X_rf_train_smote_COMBINED_S, y_rf_train_smote, X_rf_test_COMBINED_S, y_rf_test)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3132
           1       0.54      0.60      0.57       567

    accuracy                           0.86      3699
   macro avg       0.73      0.75      0.74      3699
weighted avg       0.87      0.86      0.86      3699

[[2845  287]
 [ 226  341]]
0.09375
peak memory: 242.57 MiB, increment: 0.21 MiB


In [26]:
%memit RFClassifier(X_rf_train_smote_COMBINED_S, y_rf_train_smote, X_rf_test_COMBINED_S, y_rf_test)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3132
           1       0.65      0.67      0.66       567

    accuracy                           0.89      3699
   macro avg       0.79      0.80      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2924  208]
 [ 186  381]]
1.265625
peak memory: 261.07 MiB, increment: 18.50 MiB


In [27]:
%memit LRClassifier(X_rf_train_smote_COMBINED_S, y_rf_train_smote, X_rf_test_COMBINED_S, y_rf_test)

              precision    recall  f1-score   support

           0       0.91      0.94      0.92      3132
           1       0.58      0.50      0.54       567

    accuracy                           0.87      3699
   macro avg       0.75      0.72      0.73      3699
weighted avg       0.86      0.87      0.86      3699

[[2930  202]
 [ 284  283]]
0.03125
peak memory: 246.77 MiB, increment: 3.40 MiB


Recursive Feature Elimination features - standardized data

In [28]:
%memit DTClassifier(X_rfe_train_smote_COMBINED_S, y_rfe_train_smote, X_rfe_test_COMBINED_S, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.74      0.80      3127
           1       0.23      0.43      0.30       572

    accuracy                           0.69      3699
   macro avg       0.55      0.58      0.55      3699
weighted avg       0.78      0.69      0.72      3699

[[2306  821]
 [ 325  247]]
0.046875
peak memory: 244.36 MiB, increment: 0.27 MiB


In [29]:
%memit RFClassifier(X_rfe_train_smote_COMBINED_S, y_rfe_train_smote, X_rfe_test_COMBINED_S, y_rfe_test)

              precision    recall  f1-score   support

           0       0.88      0.80      0.84      3127
           1       0.26      0.38      0.31       572

    accuracy                           0.74      3699
   macro avg       0.57      0.59      0.58      3699
weighted avg       0.78      0.74      0.76      3699

[[2515  612]
 [ 353  219]]
1.03125
peak memory: 284.61 MiB, increment: 40.24 MiB


In [30]:
%memit LRClassifier(X_rfe_train_smote_COMBINED_S, y_rfe_train_smote, X_rfe_test_COMBINED_S, y_rfe_test)

              precision    recall  f1-score   support

           0       0.93      0.61      0.74      3127
           1       0.26      0.73      0.38       572

    accuracy                           0.63      3699
   macro avg       0.59      0.67      0.56      3699
weighted avg       0.82      0.63      0.68      3699

[[1918 1209]
 [ 152  420]]
0.03125
peak memory: 245.09 MiB, increment: 2.71 MiB


# Modified Functions to include oversampling/scaling within the function

Unfiltered Data

In [74]:
%memit DTClassifierMOD(X_train, y_train, X_test, y_test, numeric_features, cat_features)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      3156
           1       0.49      0.54      0.51       543

    accuracy                           0.85      3699
   macro avg       0.70      0.72      0.71      3699
weighted avg       0.86      0.85      0.85      3699

[[2851  305]
 [ 250  293]]
0.953125
peak memory: 272.12 MiB, increment: 19.28 MiB


In [75]:
%memit RFClassifierMOD(X_train, y_train, X_test, y_test, numeric_features, cat_features)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      3156
           1       0.69      0.64      0.66       543

    accuracy                           0.90      3699
   macro avg       0.81      0.80      0.80      3699
weighted avg       0.90      0.90      0.90      3699

[[2998  158]
 [ 195  348]]
2.15625
peak memory: 290.22 MiB, increment: 42.00 MiB


In [76]:
%memit LRClassifierMOD(X_train, y_train, X_test, y_test, numeric_features, cat_features)

              precision    recall  f1-score   support

           0       0.90      0.97      0.94      3156
           1       0.71      0.40      0.51       543

    accuracy                           0.89      3699
   macro avg       0.81      0.68      0.72      3699
weighted avg       0.88      0.89      0.87      3699

[[3069   87]
 [ 328  215]]
0.796875
peak memory: 276.15 MiB, increment: 25.96 MiB


Pearson Correlation Filtered Data

In [78]:
%memit DTClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features)

              precision    recall  f1-score   support

           0       0.94      0.90      0.92      3133
           1       0.54      0.67      0.60       566

    accuracy                           0.86      3699
   macro avg       0.74      0.78      0.76      3699
weighted avg       0.88      0.86      0.87      3699

[[2809  324]
 [ 189  377]]
0.09375
peak memory: 255.44 MiB, increment: 7.17 MiB


In [79]:
%memit RFClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3133
           1       0.61      0.76      0.67       566

    accuracy                           0.89      3699
   macro avg       0.78      0.83      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2854  279]
 [ 138  428]]
1.515625
peak memory: 274.02 MiB, increment: 18.59 MiB


In [80]:
%memit LRClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3133
           1       0.61      0.70      0.65       566

    accuracy                           0.89      3699
   macro avg       0.78      0.81      0.79      3699
weighted avg       0.89      0.89      0.89      3699

[[2883  250]
 [ 169  397]]
0.03125
peak memory: 255.56 MiB, increment: 6.85 MiB


Random Forest Features

In [83]:
%memit DTClassifierMOD(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features)

              precision    recall  f1-score   support

           0       0.93      0.90      0.91      3132
           1       0.53      0.63      0.57       567

    accuracy                           0.86      3699
   macro avg       0.73      0.76      0.74      3699
weighted avg       0.87      0.86      0.86      3699

[[2816  316]
 [ 212  355]]
1.03125
peak memory: 261.37 MiB, increment: 10.89 MiB


In [86]:
%memit RFClassifierMOD(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features)

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3132
           1       0.65      0.67      0.66       567

    accuracy                           0.89      3699
   macro avg       0.79      0.80      0.80      3699
weighted avg       0.89      0.89      0.89      3699

[[2927  205]
 [ 189  378]]
2.125
peak memory: 277.26 MiB, increment: 21.01 MiB


In [85]:
%memit LRClassifierMOD(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features)

              precision    recall  f1-score   support

           0       0.91      0.94      0.92      3132
           1       0.58      0.50      0.54       567

    accuracy                           0.87      3699
   macro avg       0.75      0.72      0.73      3699
weighted avg       0.86      0.87      0.86      3699

[[2930  202]
 [ 284  283]]
0.390625
peak memory: 258.92 MiB, increment: 9.64 MiB


RFE Features

In [91]:
%memit DTClassifierMOD(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features)

              precision    recall  f1-score   support

           0       0.87      0.74      0.80      3127
           1       0.22      0.40      0.29       572

    accuracy                           0.69      3699
   macro avg       0.55      0.57      0.54      3699
weighted avg       0.77      0.69      0.72      3699

[[2309  818]
 [ 341  231]]
0.625
peak memory: 257.93 MiB, increment: 0.07 MiB


In [93]:
%memit RFClassifierMOD(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features)

              precision    recall  f1-score   support

           0       0.88      0.80      0.84      3127
           1       0.27      0.39      0.32       572

    accuracy                           0.74      3699
   macro avg       0.57      0.60      0.58      3699
weighted avg       0.78      0.74      0.76      3699

[[2513  614]
 [ 350  222]]
1.953125
peak memory: 297.46 MiB, increment: 39.48 MiB


In [94]:
%memit LRClassifierMOD(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features)

              precision    recall  f1-score   support

           0       0.93      0.62      0.74      3127
           1       0.26      0.73      0.38       572

    accuracy                           0.64      3699
   macro avg       0.59      0.68      0.56      3699
weighted avg       0.82      0.64      0.69      3699

[[1937 1190]
 [ 152  420]]
0.359375
peak memory: 259.96 MiB, increment: 7.57 MiB


# K-Fold Crossvalidation 

In [82]:
%memit DTClassifierKFold(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      7266
           1       0.57      0.57      0.57      1365

    accuracy                           0.86      8631
   macro avg       0.74      0.74      0.74      8631
weighted avg       0.86      0.86      0.86      8631

3.28125
peak memory: 278.20 MiB, increment: 23.98 MiB


In [100]:
%memit RFClassifierKFold(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7266
           1       0.76      0.51      0.61      1365

    accuracy                           0.90      8631
   macro avg       0.84      0.74      0.78      8631
weighted avg       0.89      0.90      0.89      8631

9.265625
peak memory: 284.53 MiB, increment: 36.75 MiB


In [107]:
%memit LRClassifierKFold(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      7266
           1       0.73      0.38      0.50      1365

    accuracy                           0.88      8631
   macro avg       0.81      0.68      0.71      8631
weighted avg       0.87      0.88      0.86      8631

4.859375
peak memory: 294.61 MiB, increment: 45.50 MiB


Pearson correlation features

In [108]:
%memit DTClassifierKFold(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7289
           1       0.74      0.56      0.64      1342

    accuracy                           0.90      8631
   macro avg       0.83      0.76      0.79      8631
weighted avg       0.89      0.90      0.89      8631

5.9375
peak memory: 262.94 MiB, increment: 13.77 MiB


In [109]:
%memit RFClassifierKFold(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7289
           1       0.73      0.56      0.63      1342

    accuracy                           0.90      8631
   macro avg       0.83      0.76      0.79      8631
weighted avg       0.89      0.90      0.89      8631

5.96875
peak memory: 262.59 MiB, increment: 10.62 MiB


In [110]:
%memit LRClassifierKFold(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      7289
           1       0.75      0.37      0.50      1342

    accuracy                           0.88      8631
   macro avg       0.82      0.67      0.72      8631
weighted avg       0.87      0.88      0.87      8631

0.53125
peak memory: 259.11 MiB, increment: 7.21 MiB


Random forest features

In [111]:
%memit DTClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7290
           1       0.74      0.53      0.62      1341

    accuracy                           0.90      8631
   macro avg       0.83      0.75      0.78      8631
weighted avg       0.89      0.90      0.89      8631

9.34375
peak memory: 268.55 MiB, increment: 19.80 MiB


In [112]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7290
           1       0.73      0.53      0.61      1341

    accuracy                           0.90      8631
   macro avg       0.82      0.74      0.77      8631
weighted avg       0.89      0.90      0.89      8631

9.28125
peak memory: 268.79 MiB, increment: 22.36 MiB


In [113]:
%memit LRClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      7290
           1       0.73      0.37      0.49      1341

    accuracy                           0.88      8631
   macro avg       0.81      0.67      0.71      8631
weighted avg       0.87      0.88      0.86      8631

3.515625
peak memory: 269.89 MiB, increment: 23.53 MiB


Recursive feature elimination features

In [114]:
%memit DTClassifierKFold(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features, 5)

              precision    recall  f1-score   support

           0       0.86      0.93      0.90      7295
           1       0.33      0.17      0.23      1336

    accuracy                           0.82      8631
   macro avg       0.59      0.55      0.56      8631
weighted avg       0.78      0.82      0.79      8631

8.109375
peak memory: 276.32 MiB, increment: 29.09 MiB


In [115]:
%memit RFClassifierKFold(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features, 5)

              precision    recall  f1-score   support

           0       0.86      0.93      0.90      7295
           1       0.33      0.18      0.23      1336

    accuracy                           0.82      8631
   macro avg       0.60      0.56      0.56      8631
weighted avg       0.78      0.82      0.79      8631

8.09375
peak memory: 272.44 MiB, increment: 25.52 MiB


In [116]:
%memit LRClassifierKFold(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features, 5)

              precision    recall  f1-score   support

           0       0.85      1.00      0.92      7295
           1       0.47      0.01      0.02      1336

    accuracy                           0.84      8631
   macro avg       0.66      0.50      0.47      8631
weighted avg       0.79      0.84      0.78      8631

1.6875
peak memory: 267.58 MiB, increment: 20.64 MiB


# Time-Series Crossvalidation

Unfiltered Data

In [151]:
%memit DTClassifierTS(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.91      0.92      0.92      6063
           1       0.55      0.54      0.55      1127

    accuracy                           0.86      7190
   macro avg       0.73      0.73      0.73      7190
weighted avg       0.86      0.86      0.86      7190

2.125
peak memory: 309.43 MiB, increment: 31.02 MiB


In [152]:
%memit RFClassifierTS(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      6063
           1       0.76      0.50      0.60      1127

    accuracy                           0.90      7190
   macro avg       0.84      0.74      0.77      7190
weighted avg       0.89      0.90      0.89      7190

6.421875
peak memory: 309.37 MiB, increment: 31.11 MiB


In [153]:
%memit LRClassifierTS(X_train, y_train, X_test, y_test, numeric_features, cat_features, 5)

              precision    recall  f1-score   support

           0       0.90      0.97      0.93      6063
           1       0.73      0.39      0.51      1127

    accuracy                           0.88      7190
   macro avg       0.81      0.68      0.72      7190
weighted avg       0.87      0.88      0.87      7190

4.359375
peak memory: 318.91 MiB, increment: 39.76 MiB


Pearson correlation data

In [155]:
%memit DTClassifierTS(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      6041
           1       0.58      0.55      0.57      1149

    accuracy                           0.87      7190
   macro avg       0.75      0.74      0.74      7190
weighted avg       0.86      0.87      0.86      7190

0.265625
peak memory: 290.96 MiB, increment: 11.77 MiB


In [156]:
%memit RFClassifierTS(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      6041
           1       0.77      0.53      0.63      1149

    accuracy                           0.90      7190
   macro avg       0.84      0.75      0.79      7190
weighted avg       0.89      0.90      0.89      7190

3.890625
peak memory: 290.21 MiB, increment: 13.20 MiB


In [157]:
%memit LRClassifierTS(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 5)

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      6041
           1       0.75      0.39      0.51      1149

    accuracy                           0.88      7190
   macro avg       0.82      0.68      0.72      7190
weighted avg       0.87      0.88      0.86      7190

0.3125
peak memory: 286.81 MiB, increment: 4.24 MiB


Random Forest features

In [158]:
%memit DTClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      6086
           1       0.55      0.57      0.56      1104

    accuracy                           0.86      7190
   macro avg       0.73      0.74      0.74      7190
weighted avg       0.86      0.86      0.86      7190

2.40625
peak memory: 300.51 MiB, increment: 21.34 MiB


In [159]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6086
           1       0.72      0.52      0.60      1104

    accuracy                           0.90      7190
   macro avg       0.82      0.74      0.77      7190
weighted avg       0.89      0.90      0.89      7190

6.0625
peak memory: 295.32 MiB, increment: 17.92 MiB


In [160]:
%memit LRClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 5)

              precision    recall  f1-score   support

           0       0.90      0.97      0.93      6086
           1       0.73      0.39      0.51      1104

    accuracy                           0.88      7190
   macro avg       0.82      0.68      0.72      7190
weighted avg       0.87      0.88      0.87      7190

2.859375
peak memory: 299.93 MiB, increment: 22.76 MiB


Recursive Feature Elimination features

In [162]:
%memit DTClassifierTS(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features, 5)

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      6074
           1       0.24      0.23      0.24      1116

    accuracy                           0.76      7190
   macro avg       0.55      0.55      0.55      7190
weighted avg       0.76      0.76      0.76      7190

1.671875
peak memory: 290.16 MiB, increment: 11.05 MiB


In [163]:
%memit RFClassifierTS(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features, 5)

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      6074
           1       0.33      0.16      0.22      1116

    accuracy                           0.82      7190
   macro avg       0.59      0.55      0.56      7190
weighted avg       0.78      0.82      0.79      7190

5.6875
peak memory: 303.44 MiB, increment: 25.98 MiB


In [165]:
%memit LRClassifierTS(X_rfe_train, y_rfe_train, X_rfe_test, y_rfe_test, rfe_numeric_features, rfe_cat_features, 5)

              precision    recall  f1-score   support

           0       0.85      1.00      0.92      6074
           1       0.42      0.00      0.01      1116

    accuracy                           0.84      7190
   macro avg       0.63      0.50      0.46      7190
weighted avg       0.78      0.84      0.77      7190

1.625
peak memory: 294.23 MiB, increment: 15.43 MiB
