In [1]:
#Data processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

#Model performance metrics
from time import process_time
from memory_profiler import profile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Feature selection and models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

#Data scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
%load_ext memory_profiler

In [17]:
#Additional Metrics
from sklearn.metrics import brier_score_loss
from sklearn.metrics import matthews_corrcoef

Models

In [3]:
#Random Forest Model
def RFClassifier(X_train, y_train, X_test, y_test):
  #Initialize the Random Forest Classifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = 100)

  #Time Measurement
  start_time = process_time()

  #Fit the classifier to the data
  forest_raw_imbalanced.fit(X_train, y_train)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  print(time)

# Modified Models to include SMOTE and standardization

In [18]:
#Random Forest function
def RFClassifierMOD(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_estimators):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = num_estimators)

  #Time Measurement
  start_time = process_time()
  
  #Data preprocessing
  smote = SMOTE()
  X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
  # Subset the numeric attributes
  X_train_smote_numeric = X_train_smote[numeric_attributes]
  X_test_numeric = X_test[numeric_attributes]

  scaler = StandardScaler()
  X_train_smote_S = scaler.fit_transform(X_train_smote_numeric)
  X_test_numeric_S = scaler.transform(X_test_numeric)
        
  # Concatenate the standardized numeric attributes with the categorical attributes
  X_train_combined = np.concatenate((X_train_smote_S, X_train_smote[cat_attributes]), axis=1)
  X_test_combined = np.concatenate((X_test_numeric_S, X_test[cat_attributes]), axis=1)

  #Fit the Classifier to the data
  forest_raw_imbalanced.fit(X_train_combined, y_train_smote)

  #Predict new Data
  y_pred = forest_raw_imbalanced.predict(X_test_combined)

  #Time Measurement
  end_time = process_time()

  #Results
  cr = classification_report(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  time = end_time - start_time
  print(cr)
  print(cm)
  mcc = matthews_corrcoef(y_test, y_pred)
  brier_score = brier_score_loss(y_test, y_pred)
  print("Matthew's Correlation", mcc)
  print("Brier's Score", brier_score)
  print(time)

# Crossvalidation Models

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit

K-Fold Validation Models

In [29]:
#Random Forest Function
def RFClassifierKFold(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_estimators, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = num_estimators)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  kfold = KFold(n_splits = num_folds)
    
  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = kfold)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
  mcc_scores = []
  brier_scores = []
    
  for train_index, val_index in kfold.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)
    
    mcc_scores.append(matthews_corrcoef(y_val_fold, y_pred))
    brier_scores.append(brier_score_loss(y_val_fold, y_pred))

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
  
  print(summary_report)
  
  print("Matthew's Correlation", sum(mcc_scores) / len(mcc_scores))
  print("Brier's Score", sum(brier_scores) / len(brier_scores))
  print(time)

# Time-Series crossvalidation

In [33]:
#Random Forest Function
def RFClassifierTS(X_train, y_train, X_test, y_test, numeric_attributes, cat_attributes, num_estimators, num_folds):
  #Initialize the DecisionTreeClassifier
  forest_raw_imbalanced = RandomForestClassifier(n_estimators = num_estimators)

  #Time Measurement
  start_time = process_time()
    
  #Specify number of folds (k) for cross validation
  tscv = TimeSeriesSplit(n_splits = num_folds)
    
  results = cross_val_score(forest_raw_imbalanced, X_train, y_train, cv = tscv)

  #Lists for metrics
  confusion_matrices = []
  classification_reports = []
    
  all_predictions = []
  all_true_labels = []
  mcc_scores = []
  brier_scores = []
    
  for train_index, val_index in tscv.split(X_train):
    X_train_fold_values, X_val_fold_values = X_train.values[train_index], X_train.values[val_index]
    y_train_fold_values, y_val_fold_values = y_train.values[train_index], y_train.values[val_index]
    
    X_train_fold = pd.DataFrame(X_train_fold_values, columns=X_train.columns)
    X_val_fold = pd.DataFrame(X_val_fold_values, columns=X_train.columns)
    y_train_fold = pd.Series(y_train_fold_values, index=X_train_fold.index)
    y_val_fold = pd.Series(y_val_fold_values, index=X_val_fold.index)
    
    X_train_numeric = X_train_fold.loc[:, numeric_attributes]
    X_val_numeric = X_val_fold.loc[:, numeric_attributes]
    
    scaler = StandardScaler()
    X_train_smote_S = scaler.fit_transform(X_train_numeric)
    X_test_numeric_S = scaler.transform(X_val_numeric)
    
    # Concatenate the standardized numeric attributes with the categorical attributes
    X_train_combined = np.concatenate((X_train_smote_S, X_train_fold[cat_attributes]), axis=1)
    X_test_combined = np.concatenate((X_test_numeric_S, X_val_fold[cat_attributes]), axis=1)
    
    #Data preprocessing
    smote = SMOTE()
    X_train_fold_SMOTE, y_train_fold_SMOTE = smote.fit_resample(X_train_combined, y_train_fold)

    #Fit the Classifier to the data
    forest_raw_imbalanced.fit(X_train_combined, y_train_fold)

    #Predict new Data
    y_pred = forest_raw_imbalanced.predict(X_test_combined)
    
    cr = classification_report(y_val_fold, y_pred)
    classification_reports.append(cr)
    
    cm = confusion_matrix(y_val_fold, y_pred)
    confusion_matrices.append(cm)
    
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)
    
    mcc_scores.append(matthews_corrcoef(y_val_fold, y_pred))
    brier_scores.append(brier_score_loss(y_val_fold, y_pred))

  #Time Measurement
  end_time = process_time()

  time = end_time - start_time
  summary_report = classification_report(all_true_labels, all_predictions)
  
  """
  for fold in range(num_folds):
    print("Confusion matrix for fold", fold+1, ":\n", confusion_matrices[fold])
    print("Classification report for fold", fold+1, ":\n", classification_reports[fold])
    print()"""
    
  print(summary_report)
  print("Matthew's Correlation", sum(mcc_scores) / len(mcc_scores))
  print("Brier's Score", sum(brier_scores) / len(brier_scores))
  print(time)

# Data Import and Processing

In [8]:
#https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("online_shoppers_intention.csv")

In [9]:
#Identify categorical attributes
categorical_features = ["Month", "OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend"]
df_cat = df[categorical_features]

df_onehot = pd.get_dummies(df, columns = categorical_features, prefix = categorical_features)

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_onehot['Revenue'] = label_encoder.fit_transform(df['Revenue'])

Control SMOTE data

In [10]:
#Specify independent/ dependent values
X = df_onehot.drop(columns = "Revenue")
y = df_onehot["Revenue"]

#Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Filtered Data

Pearson Correlation Filters

In [11]:
#Correlation of Onehot encoded dataset

corr = df_onehot.corr()

revenue_correlation = corr["Revenue"]
sorted_pearson_correlation = revenue_correlation.abs().sort_values(ascending = False)

sorted_pearson_correlation_df = pd.DataFrame(sorted_pearson_correlation)
SPC_topquantile = sorted_pearson_correlation_df.quantile(0.75)
filtered_df = sorted_pearson_correlation_df[sorted_pearson_correlation_df >= SPC_topquantile]
filtered_df.dropna(inplace = True)
#18 attributes were kept, were in the top quantile
     
df_pearson = df_onehot[filtered_df.index.tolist()]

#Tranform categorical attributes
label_encoder = LabelEncoder()
df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])

#Specify independent/ dependent values
X_p = df_pearson.drop(columns = "Revenue")
y_p = df_pearson["Revenue"]

#Split the Data
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size = 0.3)

smote = SMOTE()
X_p_train_smote, y_p_train_smote = smote.fit_resample(X_p_train, y_p_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pearson['Revenue'] = label_encoder.fit_transform(df_pearson['Revenue'])


Random Forest Features

In [12]:
#Filters of RF Classifier

rf_classifier = RandomForestClassifier()

#Fit random forest classifier
rf_classifier.fit(X_train_smote, y_train_smote)

feature_importances = rf_classifier.feature_importances_

rf_df = pd.DataFrame({"Feature": X_train_smote.columns, "Importance": feature_importances})

sorted_features = np.argsort(feature_importances)[::-1]

#Sorting features
rf_df_sorted = rf_df.sort_values("Importance", ascending = False)
rf_df_sorted = rf_df_sorted.reset_index(drop = True)
rf_df_sorted

rf_df_sorted.describe()
RF_topquantile = rf_df_sorted['Importance'].quantile(0.75)
RFfiltered_df = rf_df_sorted.loc[rf_df_sorted['Importance'] >= RF_topquantile]
#19 features were kept after keeping the top quartile of results
filtered_attributes_rf = RFfiltered_df.index.tolist()
df_rf = df_onehot[RFfiltered_df["Feature"]]

df_rf["Revenue"] = df_onehot["Revenue"]

#Specify independent/ dependent values
X_rf = df_rf.drop(columns = "Revenue")
y_rf = df_rf["Revenue"]

#Split the Data
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size = 0.3)

smote = SMOTE()
X_rf_train_smote, y_rf_train_smote = smote.fit_resample(X_rf_train, y_rf_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rf["Revenue"] = df_onehot["Revenue"]


In [13]:
#Define columns that need data normalization/ standardization
numeric_features = ['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

p_numeric_features = ['Administrative', 'Administrative_Duration', 'Informational', 
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration','BounceRates',
       'ExitRates', 'PageValues', 'SpecialDay']

rf_numeric_features = ['Administrative', 'Administrative_Duration',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues']

cat_features = [col for col in X_train if col not in numeric_features]
p_cat_features = [col for col in X_p_train if col not in numeric_features]
rf_cat_features = [col for col in X_rf_train if col not in numeric_features]

# Changing num_estimators

# Modified Functions to include oversampling/scaling within the function

Pearson Correlation Filtered Data

In [20]:
%memit RFClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 100)

              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3142
           1       0.62      0.72      0.67       557

    accuracy                           0.89      3699
   macro avg       0.78      0.82      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2896  246]
 [ 155  402]]
Matthew's Correlation 0.6053729457758509
Brier's Score 0.10840767775074345
2.515625
peak memory: 235.11 MiB, increment: 26.63 MiB


# K-Fold Crossvalidation 

Random forest features

In [31]:
%memit RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7322
           1       0.73      0.56      0.63      1309

    accuracy                           0.90      8631
   macro avg       0.83      0.76      0.79      8631
weighted avg       0.90      0.90      0.90      8631

Matthew's Correlation 0.5861738150298498
Brier's Score 0.09778750819410346
9.6875
peak memory: 244.31 MiB, increment: 13.37 MiB


# Time-Series Crossvalidation

Random Forest features

In [34]:
%memit RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 5)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6097
           1       0.73      0.56      0.64      1093

    accuracy                           0.90      7190
   macro avg       0.83      0.76      0.79      7190
weighted avg       0.90      0.90      0.90      7190

Matthew's Correlation 0.5876567535109676
Brier's Score 0.09791376912378304
6.40625
peak memory: 236.40 MiB, increment: 16.63 MiB


# Variance testing

In [38]:
for i in range(5):
    RFClassifierMOD(X_p_train, y_p_train, X_p_test, y_p_test, p_numeric_features, p_cat_features, 100)

              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3142
           1       0.62      0.72      0.67       557

    accuracy                           0.89      3699
   macro avg       0.78      0.82      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2897  245]
 [ 157  400]]
Matthew's Correlation 0.6033965770959435
Brier's Score 0.10867802108678021
2.4375
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3142
           1       0.62      0.72      0.67       557

    accuracy                           0.89      3699
   macro avg       0.79      0.82      0.80      3699
weighted avg       0.90      0.89      0.89      3699

[[2899  243]
 [ 157  400]]
Matthew's Correlation 0.6047371915042924
Brier's Score 0.10813733441470667
2.5625
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      3142
           1       

In [43]:
negative_precision = [0.95, 0.95, 0.95, 0.95, 0.95]
negative_recall = [0.92, 0.92, 0.92, 0.92, 0.92]
negative_f1_score = [0.94, 0.94, 0.94, 0.94, 0.93]

positive_precision = [0.62, 0.62, 0.62, 0.63, 0.62]
positive_recall = [0.72, 0.72, 0.72, 0.71, 0.71]
positive_f1_score = [0.67, 0.67, 0.67, 0.67, 0.66]

accuracy = [0.89, 0.89, 0.89, 0.89, 0.89]
matthews_correlation = [0.6033965770959435, 0.6047371915042924, 0.605388806151251, 0.6041407710751439, 0.5994121189853286]
briers_score = [0.10867802108678021, 0.10813733441470667, 0.10813733441470667, 0.10759664774263314, 0.10975939443092728]
time = [2.4375, 2.5625, 2.640625, 2.40625, 2.421875]

negative_precision_variance = np.var(negative_precision)
negative_recall_variance = np.var(negative_recall)
negative_f1_score_variance = np.var(negative_f1_score)

positive_precision_variance = np.var(positive_precision)
positive_recall_variance = np.var(positive_recall)
positive_f1_score_variance = np.var(positive_f1_score)

accuracy_variance = np.var(accuracy)
matthews_correlation_variance = np.var(matthews_correlation)
briers_score_variance = np.var(briers_score)
time_variance = np.var(time)

# Print the variances
print("Negative Precision Variance:", negative_precision_variance)
print("Negative Recall Variance:", negative_recall_variance)
print("Negative F1-score Variance:", negative_f1_score_variance)

print("Positive Precision Variance:", positive_precision_variance)
print("Positive Recall Variance:", positive_recall_variance)
print("Positive F1-score Variance:", positive_f1_score_variance)

print("Accuracy Variance:", accuracy_variance)
print("Matthew's Correlation Variance:", matthews_correlation_variance)
print("Brier's Score Variance:", briers_score_variance)
print("Time Variance:", time_variance)

Negative Precision Variance: 0.0
Negative Recall Variance: 1.232595164407831e-32
Negative F1-score Variance: 1.5999999999999674e-05
Positive Precision Variance: 1.600000000000003e-05
Positive Recall Variance: 2.400000000000004e-05
Positive F1-score Variance: 1.600000000000003e-05
Accuracy Variance: 0.0
Matthew's Correlation Variance: 4.4388481056110005e-06
Brier's Score Variance: 5.379094223386346e-07
Time Variance: 0.00845703125


In [42]:
for i in range(5):
    RFClassifierKFold(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 5)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7322
           1       0.74      0.56      0.64      1309

    accuracy                           0.90      8631
   macro avg       0.83      0.76      0.79      8631
weighted avg       0.90      0.90      0.90      8631

Matthew's Correlation 0.592160875692654
Brier's Score 0.09616526022191343
9.828125
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7322
           1       0.74      0.56      0.64      1309

    accuracy                           0.90      8631
   macro avg       0.83      0.76      0.79      8631
weighted avg       0.90      0.90      0.90      8631

Matthew's Correlation 0.5909500464579447
Brier's Score 0.09639761379655543
9.671875
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7322
           1       0.74      0.54      0.63      1309

    accuracy   

In [44]:
negative_precision = [0.92, 0.92, 0.92, 0.92, 0.92]
negative_recall = [0.97, 0.97, 0.97, 0.96, 0.96]
negative_f1_score = [0.94, 0.94, 0.94, 0.94, 0.94]

positive_precision = [0.74, 0.74, 0.74, 0.74, 0.74]
positive_recall = [0.56, 0.55, 0.55, 0.55, 0.56]
positive_f1_score = [0.64, 0.63, 0.63, 0.63, 0.64]

accuracy = [0.90, 0.90, 0.90, 0.90, 0.90]
matthews_correlation = [0.5897665120780788, 0.5833569514943414, 0.584062157761428, 0.5860958315439827, 0.589897709880703]
briers_score = [0.09674443320958588, 0.09801878823216033, 0.09778744109806689, 0.09743961524448788, 0.09674476868976872]
time = [9.453125, 9.578125, 9.546875, 9.96875, 9.9375]

negative_precision_variance = np.var(negative_precision)
negative_recall_variance = np.var(negative_recall)
negative_f1_score_variance = np.var(negative_f1_score)

positive_precision_variance = np.var(positive_precision)
positive_recall_variance = np.var(positive_recall)
positive_f1_score_variance = np.var(positive_f1_score)

accuracy_variance = np.var(accuracy)
matthews_correlation_variance = np.var(matthews_correlation)
briers_score_variance = np.var(briers_score)
time_variance = np.var(time)

# Print the variances
print("Negative Precision Variance:", negative_precision_variance)
print("Negative Recall Variance:", negative_recall_variance)
print("Negative F1-score Variance:", negative_f1_score_variance)

print("Positive Precision Variance:", positive_precision_variance)
print("Positive Recall Variance:", positive_recall_variance)
print("Positive F1-score Variance:", positive_f1_score_variance)

print("Accuracy Variance:", accuracy_variance)
print("Matthew's Correlation Variance:", matthews_correlation_variance)
print("Brier's Score Variance:", briers_score_variance)
print("Time Variance:", time_variance)

Negative Precision Variance: 1.232595164407831e-32
Negative Recall Variance: 2.400000000000004e-05
Negative F1-score Variance: 1.232595164407831e-32
Positive Precision Variance: 0.0
Positive Recall Variance: 2.400000000000004e-05
Positive F1-score Variance: 2.400000000000004e-05
Accuracy Variance: 0.0
Matthew's Correlation Variance: 7.621492401814735e-06
Brier's Score Variance: 2.7592693214958056e-07
Time Variance: 0.04556640625


In [40]:
for i in range(5):
    RFClassifierTS(X_rf_train, y_rf_train, X_rf_test, y_rf_test, rf_numeric_features, rf_cat_features, 100, 5)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94      6097
           1       0.74      0.56      0.64      1093

    accuracy                           0.90      7190
   macro avg       0.83      0.76      0.79      7190
weighted avg       0.90      0.90      0.90      7190

Matthew's Correlation 0.5897683347185898
Brier's Score 0.0968011126564673
6.71875
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      6097
           1       0.74      0.56      0.64      1093

    accuracy                           0.90      7190
   macro avg       0.83      0.76      0.79      7190
weighted avg       0.90      0.90      0.90      7190

Matthew's Correlation 0.5912042015863317
Brier's Score 0.09638386648122392
6.546875
              precision    recall  f1-score   support

           0       0.93      0.96      0.94      6097
           1       0.73      0.57      0.64      1093

    accuracy    

In [45]:
negative_precision = [0.92, 0.92, 0.93, 0.92, 0.92]
negative_recall = [0.97, 0.97, 0.96, 0.96, 0.96]
negative_f1_score = [0.94, 0.94, 0.94, 0.94, 0.94]

positive_precision = [0.74, 0.74, 0.73, 0.74, 0.73]
positive_recall = [0.56, 0.56, 0.55, 0.55, 0.55]
positive_f1_score = [0.64, 0.64, 0.63, 0.63, 0.63]

accuracy = [0.90, 0.90, 0.90, 0.90, 0.90]
matthews_correlation = [0.5897683347185898, 0.5912042015863317, 0.5893192697464371, 0.5847338869126286, 0.5794291285215211]
briers_score = [0.0968011126564673, 0.09638386648122392, 0.09735744089012517, 0.09791376912378304, 0.09916550764951321]
time = [6.71875, 6.546875, 6.390625, 6.53125, 6.515625]

negative_precision_variance = np.var(negative_precision)
negative_recall_variance = np.var(negative_recall)
negative_f1_score_variance = np.var(negative_f1_score)

positive_precision_variance = np.var(positive_precision)
positive_recall_variance = np.var(positive_recall)
positive_f1_score_variance = np.var(positive_f1_score)

accuracy_variance = np.var(accuracy)
matthews_correlation_variance = np.var(matthews_correlation)
briers_score_variance = np.var(briers_score)
time_variance = np.var(time)

# Print the variances
print("Negative Precision Variance:", negative_precision_variance)
print("Negative Recall Variance:", negative_recall_variance)
print("Negative F1-score Variance:", negative_f1_score_variance)

print("Positive Precision Variance:", positive_precision_variance)
print("Positive Recall Variance:", positive_recall_variance)
print("Positive F1-score Variance:", positive_f1_score_variance)

print("Accuracy Variance:", accuracy_variance)
print("Matthew's Correlation Variance:", matthews_correlation_variance)
print("Brier's Score Variance:", briers_score_variance)
print("Time Variance:", time_variance)

Negative Precision Variance: 1.600000000000003e-05
Negative Recall Variance: 2.4000000000000045e-05
Negative F1-score Variance: 1.232595164407831e-32
Positive Precision Variance: 2.400000000000004e-05
Positive Recall Variance: 2.4000000000000048e-05
Positive F1-score Variance: 2.4000000000000048e-05
Accuracy Variance: 0.0
Matthew's Correlation Variance: 1.8622383959054213e-05
Brier's Score Variance: 9.393358493193886e-07
Time Variance: 0.010996093749999996
