# 1. Import library
Note: Combined from various sources, much more comprehensive than the original code provided. Import 也很讲究 XDXD

Also note: For this "Ab_Virus_02_Model_Selection_v01.ipynb" Jupyter Notebook, you can switch back to your normal pip3 or conda environment.

### 1.1 Import OS and Path

In [1]:
import os
from pathlib import Path

### 1.2 Import data structures

In [2]:
import numpy as np
from numpy import arange, logspace
import pandas as pd
import multiprocessing
import logging
import csv
import json

### 1.3 Import visualisation tools

In [3]:
import seaborn as sb
sb.set()
from matplotlib import pyplot
import matplotlib.pyplot as plt
%matplotlib inline

### 1.4 Import Scikit Learn - data analytics

In [4]:
# The code for featurization was borrowed from deepchem. Please refer https://deepchem.io for more information
from scipy import stats
from scipy.stats import randint, uniform
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, auc, roc_curve, roc_auc_score, plot_roc_curve, average_precision_score, mean_squared_error, r2_score, precision_score,recall_score, f1_score
from sklearn.model_selection import cross_val_score

### 1.5 Import Scikit Learn - classifiers

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

### 1.5 Import XGBoost

In [6]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_importance

### 1.6 Import other classifiers

In [7]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import catboost as cb
from catboost import CatBoostClassifier

### 1.7 Import Pytorch Tabnet

In [8]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

# 2. Data structures loading and preparation

### 2.1 Load NumPy files from directory

In [9]:
CURRENT_PATH = os.getcwd()
print(CURRENT_PATH)

X_mean = np.load(os.path.join(os.getcwd(), "CoV_AbDab_mean_X.npy"))
print("Loaded file 'CoV_AbDab_mean_X.npy'")

X_max = np.load(os.path.join(os.getcwd(), "CoV_AbDab_max_X.npy"))
print("Loaded file 'CoV_AbDab_max_X.npy'")

y = np.load(os.path.join(os.getcwd(), "CoV_AbDab_class_Y.npy"))
print("Loaded file 'CoV_AbDab_class_Y.npy'")

print(X_mean.shape)
print(X_max.shape)
print(y.shape)

c:\Users\chanj\Dropbox\NTU_studies\2021_22_Year_2\URECA\Reading\PotentialAB
Loaded file 'CoV_AbDab_mean_X.npy'
Loaded file 'CoV_AbDab_max_X.npy'
Loaded file 'CoV_AbDab_class_Y.npy'
(307, 74)
(307, 74)
(307,)


### 2.2 CSV file preparation

In [10]:
csv_file_name = "CoV_AbDab_02_Model_Selection.csv"
print(csv_file_name)

CoV_AbDab_02_Model_Selection.csv


# 3. Model 1: Random Forest Classifier

### 3.1 Apply Random Forest Classifier with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [11]:
# Apply Random Forest Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Random Forest Classifier
    rf = RandomForestClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    rf.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = rf.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 69.35%
F1 score: 81.19%
ROC AUC score: 51.44%
RepeatedStratifiedKFold = 2
Accuracy: 77.42%
F1 score: 86.54%
ROC AUC score: 58.82%
RepeatedStratifiedKFold = 3
Accuracy: 75.41%
F1 score: 85.44%
ROC AUC score: 55.14%
RepeatedStratifiedKFold = 4
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 5
Accuracy: 72.13%
F1 score: 83.50%
ROC AUC score: 50.90%
RepeatedStratifiedKFold = 6
Accuracy: 72.58%
F1 score: 82.83%
ROC AUC score: 57.32%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 83.50%
ROC AUC score: 53.66%
RepeatedStratifiedKFold = 8
Accuracy: 75.41%
F1 score: 85.44%
ROC AUC score: 55.14%
RepeatedStratifiedKFold = 9
Accuracy: 73.77%
F1 score: 84.00%
ROC AUC score: 56.04%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 11
Accuracy: 74.19%
F1 score: 84.00%
ROC AUC score: 58.43%
RepeatedStratifiedKFold = 12
Accuracy: 75.81%
F1 score: 85.71%
ROC AUC sco

### 3.2 Printing Accuracy score and ROC AUC score

In [12]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([69.35, 77.42, 75.41, 72.13, 72.13, 72.58, 72.58, 75.41, 73.77,
       73.77, 74.19, 75.81, 73.77, 68.85, 68.85, 72.58, 69.35, 75.41,
       70.49, 72.13, 74.19, 75.81, 78.69, 72.13, 62.3 , 77.42, 75.81,
       75.41, 70.49, 77.05, 72.58, 75.81, 67.21, 65.57, 75.41, 75.81,
       72.58, 72.13, 70.49, 75.41, 70.97, 75.81, 70.49, 73.77, 70.49,
       70.97, 70.97, 78.69, 67.21, 77.05])
Mean   | Max    | Min
72.89% | 78.69% | 62.30%
F1 score array: array([81.19, 86.54, 85.44, 83.81, 83.5 , 82.83, 83.5 , 85.44, 84.  ,
       84.62, 84.  , 85.71, 84.  , 81.19, 81.55, 83.5 , 81.19, 85.15,
       81.63, 83.81, 84.62, 85.44, 86.6 , 83.5 , 76.29, 86.54, 85.71,
       85.44, 82.69, 86.27, 83.17, 85.71, 80.  , 78.79, 85.44, 85.15,
       83.81, 83.17, 82.  , 85.15, 82.35, 85.44, 82.  , 84.31, 82.35,
       82.  , 82.35, 87.38, 80.  , 86.  ])
Mean   | Max    | Min
83.64% | 87.38% | 76.29%
ROC AUC score array: array([51.44, 58.82, 55.14, 48.89, 50.9 , 57.32, 53.66, 55.14

### 3.3 Storing into csv

In [13]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['random forest', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['random forest', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['random forest', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 3.4 Apply Random Forest Classifier with Repeated Stratified 5-Fold 10 times, X input: X_max

In [14]:
# Apply Random Forest Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Random Forest Classifier
    rf = RandomForestClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    rf.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = rf.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2
Accuracy: 72.58%
F1 score: 83.81%
ROC AUC score: 51.83%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 4
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 5
Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 6
Accuracy: 72.58%
F1 score: 83.81%
ROC AUC score: 51.83%
RepeatedStratifiedKFold = 7
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 8
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 9
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 10
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 11
Accuracy: 74.19%
F1 score: 84.62%
ROC AUC score: 54.77%
RepeatedStratifiedKFold = 12
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC sco

In [15]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([72.58, 72.58, 73.77, 72.13, 70.49, 72.58, 70.97, 73.77, 72.13,
       72.13, 74.19, 70.97, 73.77, 73.77, 72.13, 69.35, 70.97, 75.41,
       68.85, 73.77, 72.58, 70.97, 73.77, 72.13, 75.41, 70.97, 74.19,
       75.41, 70.49, 72.13, 72.58, 74.19, 73.77, 70.49, 63.93, 70.97,
       67.74, 73.77, 73.77, 73.77, 70.97, 72.58, 73.77, 73.77, 73.77,
       70.97, 70.97, 72.13, 72.13, 73.77])
Mean   | Max    | Min
72.28% | 75.41% | 63.93%
F1 score array: array([84.11, 83.81, 84.91, 83.81, 82.69, 83.81, 83.02, 84.62, 83.81,
       83.81, 84.62, 83.02, 84.91, 84.91, 83.81, 81.55, 83.02, 85.71,
       81.19, 84.91, 84.11, 82.69, 84.91, 83.81, 85.71, 83.02, 84.62,
       85.71, 82.69, 83.17, 84.11, 84.91, 84.91, 82.69, 78.  , 83.02,
       80.39, 84.91, 84.91, 84.91, 83.02, 84.11, 84.91, 84.62, 84.62,
       83.02, 82.69, 83.5 , 83.81, 84.91])
Mean   | Max    | Min
83.77% | 85.71% | 78.00%
ROC AUC score array: array([50.  , 51.83, 50.  , 48.89, 47.78, 51.83, 48.89, 52.01

In [16]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['random forest', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['random forest', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['random forest', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

# 4. Model 2: Decision Tree Classifier

### 4.1 Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [17]:
# Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Decision Tree Classifier
    dectree = DecisionTreeClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    dectree.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = dectree.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 64.52%
F1 score: 75.56%
ROC AUC score: 55.42%
RepeatedStratifiedKFold = 2
Accuracy: 59.68%
F1 score: 72.53%
ROC AUC score: 48.43%
RepeatedStratifiedKFold = 3
Accuracy: 54.10%
F1 score: 69.57%
ROC AUC score: 38.68%
RepeatedStratifiedKFold = 4
Accuracy: 59.02%
F1 score: 72.53%
ROC AUC score: 46.04%
RepeatedStratifiedKFold = 5
Accuracy: 45.90%
F1 score: 60.24%
ROC AUC score: 37.15%
RepeatedStratifiedKFold = 6
Accuracy: 61.29%
F1 score: 73.33%
ROC AUC score: 51.37%
RepeatedStratifiedKFold = 7
Accuracy: 64.52%
F1 score: 76.09%
ROC AUC score: 53.59%
RepeatedStratifiedKFold = 8
Accuracy: 67.21%
F1 score: 77.78%
ROC AUC score: 57.64%
RepeatedStratifiedKFold = 9
Accuracy: 57.38%
F1 score: 69.05%
ROC AUC score: 50.97%
RepeatedStratifiedKFold = 10
Accuracy: 63.93%
F1 score: 76.09%
ROC AUC score: 51.39%
RepeatedStratifiedKFold = 11
Accuracy: 69.35%
F1 score: 78.65%
ROC AUC score: 62.42%
RepeatedStratifiedKFold = 12
Accuracy: 62.90%
F1 score: 75.79%
ROC AUC sco

In [18]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([64.52, 59.68, 54.1 , 59.02, 45.9 , 61.29, 64.52, 67.21, 57.38,
       63.93, 69.35, 62.9 , 49.18, 60.66, 59.02, 58.06, 58.06, 59.02,
       50.82, 73.77, 66.13, 64.52, 65.57, 68.85, 65.57, 56.45, 61.29,
       67.21, 62.3 , 59.02, 62.9 , 70.97, 52.46, 62.3 , 55.74, 61.29,
       59.68, 45.9 , 67.21, 63.93, 53.23, 59.68, 72.13, 57.38, 60.66,
       56.45, 58.06, 67.21, 57.38, 45.9 ])
Mean   | Max    | Min
60.52% | 73.77% | 45.90%
F1 score array: array([75.56, 72.53, 69.57, 72.53, 60.24, 73.33, 76.09, 77.78, 69.05,
       76.09, 78.65, 75.79, 62.65, 71.43, 70.59, 72.92, 69.77, 72.53,
       65.12, 82.98, 77.42, 76.6 , 76.92, 79.57, 77.89, 69.66, 72.73,
       77.27, 75.79, 71.91, 75.27, 80.  , 65.88, 74.16, 68.97, 71.43,
       73.12, 59.26, 78.26, 76.09, 67.42, 72.53, 81.72, 70.45, 73.91,
       70.97, 72.34, 77.27, 69.77, 58.23])
Mean   | Max    | Min
72.76% | 82.98% | 58.23%
ROC AUC score array: array([55.42, 48.43, 38.68, 46.04, 37.15, 51.37, 53.59, 57.64

In [19]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['decision tree', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['decision tree', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['decision tree', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 4.2 Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times, X input: X_max

In [20]:
# Apply Decision Tree Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Decision Tree Classifier
    dectree = DecisionTreeClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    dectree.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = dectree.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2
Accuracy: 74.19%
F1 score: 84.62%
ROC AUC score: 54.77%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 4
Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 5
Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 6
Accuracy: 70.97%
F1 score: 82.69%
ROC AUC score: 50.72%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 83.81%
ROC AUC score: 51.83%
RepeatedStratifiedKFold = 8
Accuracy: 72.13%
F1 score: 83.17%
ROC AUC score: 52.92%
RepeatedStratifiedKFold = 9
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 10
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 11
Accuracy: 75.81%
F1 score: 85.44%
ROC AUC score: 57.71%
RepeatedStratifiedKFold = 12
Accuracy: 70.97%
F1 score: 82.69%
ROC AUC sco

In [21]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([72.58, 74.19, 73.77, 70.49, 70.49, 70.97, 72.58, 72.13, 73.77,
       72.13, 75.81, 70.97, 73.77, 72.13, 72.13, 67.74, 69.35, 72.13,
       68.85, 73.77, 74.19, 67.74, 75.41, 70.49, 75.41, 72.58, 72.58,
       73.77, 68.85, 70.49, 74.19, 72.58, 72.13, 70.49, 68.85, 70.97,
       67.74, 73.77, 73.77, 73.77, 70.97, 69.35, 77.05, 73.77, 70.49,
       70.97, 69.35, 73.77, 72.13, 73.77])
Mean   | Max    | Min
72.02% | 77.05% | 67.74%
F1 score array: array([84.11, 84.62, 84.62, 82.69, 82.69, 82.69, 83.81, 83.17, 84.62,
       83.81, 85.44, 82.69, 84.62, 83.81, 83.81, 80.39, 81.9 , 83.81,
       81.19, 84.91, 84.91, 80.39, 85.44, 82.69, 85.44, 83.81, 83.81,
       84.62, 81.55, 82.  , 84.62, 84.11, 83.5 , 82.69, 80.81, 83.02,
       80.39, 84.62, 84.91, 84.91, 83.02, 81.9 , 86.27, 84.62, 82.69,
       83.02, 81.9 , 84.31, 83.81, 84.62])
Mean   | Max    | Min
83.48% | 86.27% | 80.39%
ROC AUC score array: array([50.  , 54.77, 52.01, 47.78, 47.78, 50.72, 51.83, 52.92

In [22]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['decision tree', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['decision tree', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['decision tree', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

# 5. Model 3: Logistic Regression

### 5.1 Apply Logistic Regression with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [23]:
# Apply Logistic Regression with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Logistic Regression
    LR = LogisticRegression(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    LR.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = LR.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))

    # Evaluate predictions: Plotting the ROC curve
    y_pred_roc = LR.decision_function(X_validate)
    false_positive_rate, true_positive_rate, threshold = roc_curve(y_validate, y_pred_roc)
    # rf_auc = auc(false_positive_rate, true_positive_rate)
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

false_positive_rate = np.asarray(false_positive_rate)
true_positive_rate = np.asarray(true_positive_rate)
threshold = np.asarray(threshold)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 3
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 4
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 5
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 6
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 8
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 9
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 11
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 12
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC sco

In [24]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

print(false_positive_rate)
# print(len(false_positive_rate))
print(true_positive_rate)
# print(len(true_positive_rate))
print(threshold)
# print(len(threshold))
# print(rf_auc)

Accuracy score array: array([72.58, 74.19, 75.41, 73.77, 73.77, 74.19, 72.58, 75.41, 75.41,
       73.77, 74.19, 74.19, 73.77, 73.77, 73.77, 72.58, 72.58, 75.41,
       77.05, 73.77, 72.58, 74.19, 73.77, 75.41, 75.41, 75.81, 74.19,
       75.41, 73.77, 73.77, 74.19, 74.19, 73.77, 73.77, 73.77, 74.19,
       72.58, 73.77, 75.41, 75.41, 72.58, 72.58, 75.41, 73.77, 75.41,
       72.58, 72.58, 77.05, 73.77, 75.41])
Mean   | Max    | Min
74.17% | 77.05% | 72.58%
F1 score array: array([84.11, 84.91, 85.71, 84.91, 84.91, 84.91, 84.11, 85.71, 85.71,
       84.91, 84.91, 84.91, 84.91, 84.91, 84.91, 84.11, 84.11, 85.71,
       86.54, 84.91, 84.11, 84.91, 84.91, 85.71, 85.71, 85.71, 84.91,
       85.71, 84.91, 84.91, 84.91, 84.91, 84.62, 84.91, 84.91, 84.91,
       84.11, 84.91, 85.71, 85.71, 84.11, 83.81, 85.71, 84.91, 85.71,
       84.11, 84.11, 86.54, 84.91, 85.71])
Mean   | Max    | Min
85.01% | 86.54% | 83.81%
ROC AUC score array: array([50.  , 52.94, 53.12, 50.  , 50.  , 52.94, 50.  , 53.12

In [25]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['logistic regression', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['logistic regression', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['logistic regression', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

with open("CoV_AbDab_02_ROC_Curve.csv", 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['logistic regression', 'mean pool', 'false_positive_rate'] + false_positive_rate.tolist())
    writer_object.writerow(['logistic regression', 'mean pool', 'true_positive_rate'] + true_positive_rate.tolist())
    writer_object.writerow(['logistic regression', 'mean pool', 'threshold'] + threshold.tolist())

    #Close the file object
    f_object.close()

### 5.2 Apply Logistic Regression with 5 K-Fold, X input: X_max

In [26]:
# Apply Logistic Regression with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Logistic Regression
    LR = LogisticRegression(random_state = 1001, solver='liblinear') # change the classifier here
    
    # Model fitting and training
    LR.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = LR.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))

    # Evaluate predictions: Plotting the ROC curve
    y_pred_roc = LR.decision_function(X_validate)
    false_positive_rate, true_positive_rate, threshold = roc_curve(y_validate, y_pred_roc)
    # rf_auc = auc(false_positive_rate, true_positive_rate)
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

false_positive_rate = np.asarray(false_positive_rate)
true_positive_rate = np.asarray(true_positive_rate)
threshold = np.asarray(threshold)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 4
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 5
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 6
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 8
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 9
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 11
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 12
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC sco

In [27]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

print(false_positive_rate)
# print(len(false_positive_rate))
print(true_positive_rate)
# print(len(true_positive_rate))
print(threshold)
# print(len(threshold))
# print(rf_auc)

Accuracy score array: array([72.58, 72.58, 73.77, 73.77, 72.13, 72.58, 72.58, 75.41, 73.77,
       73.77, 72.58, 74.19, 73.77, 73.77, 73.77, 72.58, 70.97, 73.77,
       72.13, 73.77, 72.58, 72.58, 75.41, 73.77, 73.77, 74.19, 70.97,
       73.77, 73.77, 73.77, 72.58, 72.58, 72.13, 72.13, 70.49, 72.58,
       70.97, 73.77, 73.77, 73.77, 70.97, 72.58, 73.77, 73.77, 75.41,
       70.97, 72.58, 75.41, 73.77, 75.41])
Mean   | Max    | Min
73.16% | 75.41% | 70.49%
F1 score array: array([84.11, 84.11, 84.91, 84.91, 83.81, 84.11, 84.11, 85.71, 84.91,
       84.91, 84.11, 84.91, 84.91, 84.91, 84.91, 84.11, 83.02, 84.91,
       83.81, 84.91, 84.11, 83.81, 85.71, 84.91, 84.91, 84.91, 83.02,
       84.62, 84.91, 84.91, 84.11, 84.11, 83.5 , 83.81, 82.69, 84.11,
       83.02, 84.91, 84.91, 84.91, 83.02, 84.11, 84.91, 84.91, 85.71,
       83.02, 84.11, 85.71, 84.91, 85.71])
Mean   | Max    | Min
84.44% | 85.71% | 82.69%
ROC AUC score array: array([50.  , 50.  , 50.  , 50.  , 48.89, 50.  , 50.  , 53.12

In [28]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['logistic regression', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['logistic regression', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['logistic regression', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

with open("CoV_AbDab_02_ROC_Curve.csv", 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['logistic regression', 'max pool', 'false_positive_rate'] + false_positive_rate.tolist())
    writer_object.writerow(['logistic regression', 'max pool', 'true_positive_rate'] + true_positive_rate.tolist())
    writer_object.writerow(['logistic regression', 'max pool', 'threshold'] + threshold.tolist())

    #Close the file object
    f_object.close()

# 6. Model 4: Support Vector Machine

### 6.1 Apply Support Vector Machine with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [29]:
# Apply Support Vector Machine with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Support Vector Machine
    SVM = LinearSVC(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    SVM.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = SVM.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))

    # Evaluate predictions: Plotting the ROC curve
    y_pred_roc = SVM.decision_function(X_validate)
    false_positive_rate, true_positive_rate, threshold = roc_curve(y_validate, y_pred_roc)
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

false_positive_rate = np.asarray(false_positive_rate)
true_positive_rate = np.asarray(true_positive_rate)
threshold = np.asarray(threshold)

RepeatedStratifiedKFold = 1
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 2
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 4
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 5
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 6
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 8
Accuracy: 75.41%
F1 score: 85.44%




ROC AUC score: 55.14%
RepeatedStratifiedKFold = 9
Accuracy: 72.13%
F1 score: 83.50%
ROC AUC score: 50.90%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 11
Accuracy: 72.58%
F1 score: 83.50%
ROC AUC score: 53.66%
RepeatedStratifiedKFold = 12
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 13
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 14
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 15
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 16
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 17
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 18
Accuracy: 77.05%
F1 score: 86.54%
ROC AUC score: 56.25%
RepeatedStratifiedKFold = 19
Accuracy: 73.77%
F1 score: 84.31%




ROC AUC score: 54.03%
RepeatedStratifiedKFold = 20
Accuracy: 72.13%
F1 score: 83.50%
ROC AUC score: 50.90%
RepeatedStratifiedKFold = 21
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 22
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 23
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 24
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 25
Accuracy: 72.13%
F1 score: 83.50%
ROC AUC score: 50.90%
RepeatedStratifiedKFold = 26
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 27
Accuracy: 74.19%
F1 score: 84.91%




ROC AUC score: 52.94%
RepeatedStratifiedKFold = 28
Accuracy: 78.69%
F1 score: 87.38%
ROC AUC score: 59.38%
RepeatedStratifiedKFold = 29
Accuracy: 70.49%
F1 score: 82.35%
ROC AUC score: 49.79%
RepeatedStratifiedKFold = 30
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 31
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 32
Accuracy: 74.19%
F1 score: 84.62%
ROC AUC score: 54.77%
RepeatedStratifiedKFold = 33
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 34
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 35
Accuracy: 73.77%




F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 36
Accuracy: 72.58%
F1 score: 83.81%
ROC AUC score: 51.83%
RepeatedStratifiedKFold = 37
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 38
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 39
Accuracy: 77.05%
F1 score: 86.54%
ROC AUC score: 56.25%
RepeatedStratifiedKFold = 40
Accuracy: 75.41%
F1 score: 85.15%
ROC AUC score: 57.15%
RepeatedStratifiedKFold = 41
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 42
Accuracy: 72.58%
F1 score: 83.50%
ROC AUC score: 53.66%
RepeatedStratifiedKFold = 43
Accuracy: 75.41%
F1 score: 85.71%




ROC AUC score: 53.12%
RepeatedStratifiedKFold = 44
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 45
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 46
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 47
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 48
Accuracy: 78.69%
F1 score: 87.38%
ROC AUC score: 59.38%
RepeatedStratifiedKFold = 49
Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 50
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%




In [30]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

print(false_positive_rate)
# print(len(false_positive_rate))
print(true_positive_rate)
# print(len(true_positive_rate))
print(threshold)
# print(len(threshold))
# print(rf_auc)

Accuracy score array: array([74.19, 74.19, 73.77, 73.77, 73.77, 74.19, 72.58, 75.41, 72.13,
       73.77, 72.58, 74.19, 73.77, 72.13, 73.77, 74.19, 70.97, 77.05,
       73.77, 72.13, 72.58, 74.19, 75.41, 73.77, 72.13, 72.58, 74.19,
       78.69, 70.49, 72.13, 74.19, 74.19, 73.77, 73.77, 73.77, 72.58,
       72.58, 72.13, 77.05, 75.41, 70.97, 72.58, 75.41, 72.13, 75.41,
       72.58, 70.97, 78.69, 70.49, 75.41])
Mean   | Max    | Min
73.65% | 78.69% | 70.49%
F1 score array: array([84.91, 84.91, 84.62, 84.91, 84.91, 84.91, 84.11, 85.44, 83.5 ,
       84.91, 83.5 , 84.91, 84.91, 83.81, 84.91, 84.91, 83.02, 86.54,
       84.31, 83.5 , 84.11, 84.91, 85.71, 84.91, 83.5 , 84.11, 84.91,
       87.38, 82.35, 83.81, 84.91, 84.62, 84.62, 84.62, 84.91, 83.81,
       84.11, 83.81, 86.54, 85.15, 83.02, 83.5 , 85.71, 83.81, 85.71,
       84.11, 83.02, 87.38, 82.69, 85.71])
Mean   | Max    | Min
84.58% | 87.38% | 82.35%
ROC AUC score array: array([52.94, 52.94, 52.01, 50.  , 50.  , 52.94, 50.  , 55.14

In [31]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['support vector machine', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['support vector machine', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['support vector machine', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

with open("CoV_AbDab_02_ROC_Curve.csv", 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['support vector machine', 'mean pool', 'false_positive_rate'] + false_positive_rate.tolist())
    writer_object.writerow(['support vector machine', 'mean pool', 'true_positive_rate'] + true_positive_rate.tolist())
    writer_object.writerow(['support vector machine', 'mean pool', 'threshold'] + threshold.tolist())

    #Close the file object
    f_object.close()

### 6.2 Apply Support Vector Machine with Repeated Stratified 5-Fold 10 times, X input: X_max

In [32]:
# Apply Support Vector Machine with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # Support Vector Machine
    SVM = LinearSVC(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    SVM.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = SVM.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))

    # Evaluate predictions: Plotting the ROC curve
    y_pred_roc = SVM.decision_function(X_validate)
    false_positive_rate, true_positive_rate, threshold = roc_curve(y_validate, y_pred_roc)
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

false_positive_rate = np.asarray(false_positive_rate)
true_positive_rate = np.asarray(true_positive_rate)
threshold = np.asarray(threshold)

RepeatedStratifiedKFold = 1
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 2
Accuracy: 75.81%
F1 score: 85.71%
ROC AUC score: 55.88%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 4
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 5
Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 6
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 8




Accuracy: 77.05%
F1 score: 86.54%
ROC AUC score: 56.25%
RepeatedStratifiedKFold = 9
Accuracy: 73.77%
F1 score: 84.31%
ROC AUC score: 54.03%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 11
Accuracy: 74.19%
F1 score: 84.62%
ROC AUC score: 54.77%
RepeatedStratifiedKFold = 12
Accuracy: 74.19%
F1 score: 84.62%
ROC AUC score: 54.77%
RepeatedStratifiedKFold = 13
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 14
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 15
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 16
Accuracy: 72.58%
F1 score: 83.81%
ROC AUC score: 51.83%
RepeatedStratifiedKFold = 17
Accuracy: 70.97%




F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 18
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 19
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 20
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 21
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 22
Accuracy: 70.97%
F1 score: 82.69%
ROC AUC score: 50.72%
RepeatedStratifiedKFold = 23
Accuracy: 77.05%
F1 score: 86.54%
ROC AUC score: 56.25%
RepeatedStratifiedKFold = 24
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 25
Accuracy: 73.77%




F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 26
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 27
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 28
Accuracy: 77.05%
F1 score: 86.54%
ROC AUC score: 56.25%
RepeatedStratifiedKFold = 29
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 30
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 31
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 32
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 33
Accuracy: 75.41%
F1 score: 85.71%




ROC AUC score: 53.12%
RepeatedStratifiedKFold = 34
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 35
Accuracy: 72.13%
F1 score: 83.50%
ROC AUC score: 50.90%
RepeatedStratifiedKFold = 36
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 37
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 38
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 39
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 40
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 41
Accuracy: 70.97%




F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 42
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 43
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 44
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 45
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 46
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 47
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 48
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 49
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 50
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%




In [33]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

print(false_positive_rate)
# print(len(false_positive_rate))
print(true_positive_rate)
# print(len(true_positive_rate))
print(threshold)
# print(len(threshold))
# print(rf_auc)

Accuracy score array: array([74.19, 75.81, 73.77, 73.77, 70.49, 70.97, 72.58, 77.05, 73.77,
       73.77, 74.19, 74.19, 73.77, 73.77, 73.77, 72.58, 70.97, 73.77,
       72.13, 73.77, 72.58, 70.97, 77.05, 73.77, 73.77, 74.19, 70.97,
       77.05, 72.13, 73.77, 74.19, 72.58, 75.41, 73.77, 72.13, 72.58,
       70.97, 73.77, 73.77, 73.77, 70.97, 72.58, 75.41, 73.77, 75.41,
       70.97, 70.97, 75.41, 72.13, 75.41])
Mean   | Max    | Min
73.43% | 77.05% | 70.49%
F1 score array: array([84.91, 85.71, 84.62, 84.91, 82.69, 83.02, 84.11, 86.54, 84.31,
       84.91, 84.62, 84.62, 84.91, 84.91, 84.91, 83.81, 83.02, 84.91,
       83.81, 84.91, 84.11, 82.69, 86.54, 84.91, 84.91, 84.91, 83.02,
       86.54, 83.81, 84.62, 84.91, 84.11, 85.71, 84.62, 83.5 , 84.11,
       83.02, 84.91, 84.91, 84.91, 83.02, 84.11, 85.71, 84.91, 85.71,
       83.02, 83.02, 85.71, 83.81, 85.71])
Mean   | Max    | Min
84.51% | 86.54% | 82.69%
ROC AUC score array: array([52.94, 55.88, 52.01, 50.  , 47.78, 48.89, 50.  , 56.25

In [34]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['support vector machine', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['support vector machine', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['support vector machine', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

with open("CoV_AbDab_02_ROC_Curve.csv", 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['support vector machine', 'max pool', 'false_positive_rate'] + false_positive_rate.tolist())
    writer_object.writerow(['support vector machine', 'max pool', 'true_positive_rate'] + true_positive_rate.tolist())
    writer_object.writerow(['support vector machine', 'max pool', 'threshold'] + threshold.tolist())

    #Close the file object
    f_object.close()

# 7. Model 5: MLP Classifier

### 7.1 Apply MLP Classifier with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [35]:
# Apply MLP Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # MLP Classifier
    MLP = MLPClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    MLP.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = MLP.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 4
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 5




Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 6
Accuracy: 72.58%
F1 score: 83.81%
ROC AUC score: 51.83%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 8
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 9




Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 11
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 12




Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 13
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 14
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 15
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 16
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 17
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 18
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 19
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 20
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 21
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 22
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 23
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedS



Accuracy: 68.85%
F1 score: 81.55%
ROC AUC score: 46.67%
RepeatedStratifiedKFold = 39
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 40




Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 41
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 42
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 43
Accuracy: 77.05%
F1 score: 86.54%
ROC AUC score: 56.25%
RepeatedStratifiedKFold = 44
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 45
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 46
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 47
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 48
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 49
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 50
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%


In [36]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([72.58, 72.58, 73.77, 73.77, 73.77, 72.58, 72.58, 73.77, 75.41,
       73.77, 72.58, 72.58, 73.77, 73.77, 73.77, 72.58, 72.58, 73.77,
       73.77, 73.77, 72.58, 72.58, 73.77, 73.77, 75.41, 72.58, 72.58,
       73.77, 73.77, 73.77, 72.58, 72.58, 73.77, 73.77, 73.77, 72.58,
       72.58, 68.85, 73.77, 75.41, 72.58, 74.19, 77.05, 73.77, 73.77,
       72.58, 72.58, 73.77, 73.77, 73.77])
Mean   | Max    | Min
73.39% | 77.05% | 68.85%
F1 score array: array([84.11, 84.11, 84.91, 84.91, 84.91, 83.81, 84.11, 84.91, 85.71,
       84.91, 84.11, 84.11, 84.91, 84.91, 84.91, 84.11, 84.11, 84.91,
       84.91, 84.91, 84.11, 84.11, 84.91, 84.91, 85.71, 84.11, 84.11,
       84.91, 84.91, 84.91, 84.11, 84.11, 84.62, 84.91, 84.91, 84.11,
       84.11, 81.55, 84.62, 85.71, 84.11, 84.91, 86.54, 84.91, 84.91,
       84.11, 84.11, 84.91, 84.91, 84.91])
Mean   | Max    | Min
84.60% | 86.54% | 81.55%
ROC AUC score array: array([50.  , 50.  , 50.  , 50.  , 50.  , 51.83, 50.  , 50.  

In [37]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['mlp', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['mlp', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['mlp', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 7.2 Apply MLP Classifier with Repeated Stratified 5-Fold 10 times X input: X_max

In [38]:
# Apply MLP Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # MLP Classifier
    MLP = MLPClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    MLP.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = MLP.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 4
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 5
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 6
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 8
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 9
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 11
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 12
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC sco

In [39]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([72.58, 72.58, 73.77, 73.77, 73.77, 70.97, 72.58, 73.77, 73.77,
       73.77, 72.58, 72.58, 73.77, 73.77, 73.77, 72.58, 72.58, 73.77,
       73.77, 73.77, 72.58, 74.19, 73.77, 73.77, 73.77, 72.58, 72.58,
       73.77, 73.77, 73.77, 72.58, 72.58, 75.41, 73.77, 73.77, 72.58,
       72.58, 73.77, 73.77, 73.77, 72.58, 72.58, 73.77, 73.77, 73.77,
       72.58, 72.58, 73.77, 73.77, 73.77])
Mean   | Max    | Min
73.33% | 75.41% | 70.97%
F1 score array: array([84.11, 84.11, 84.91, 84.91, 84.91, 83.02, 84.11, 84.91, 84.91,
       84.91, 84.11, 84.11, 84.91, 84.91, 84.91, 84.11, 84.11, 84.91,
       84.91, 84.91, 84.11, 84.91, 84.91, 84.91, 84.91, 84.11, 84.11,
       84.91, 84.91, 84.91, 84.11, 84.11, 85.71, 84.91, 84.91, 84.11,
       84.11, 84.91, 84.91, 84.91, 84.11, 84.11, 84.91, 84.91, 84.91,
       84.11, 84.11, 84.91, 84.91, 84.91])
Mean   | Max    | Min
84.60% | 85.71% | 83.02%
ROC AUC score array: array([50.  , 50.  , 50.  , 50.  , 50.  , 48.89, 50.  , 50.  

In [40]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['mlp', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['mlp', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['mlp', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

# 8. Model 6: XGBoost Classifier

### 8.1 Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [41]:
# Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # XGBoost Classifier
    XGB = XGBClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    XGB.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = XGB.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))
    
    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))

    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1




Accuracy: 62.90%
F1 score: 76.29%
ROC AUC score: 46.99%
RepeatedStratifiedKFold = 2




Accuracy: 70.97%
F1 score: 82.35%
ROC AUC score: 52.55%
RepeatedStratifiedKFold = 3
Accuracy: 67.21%
F1 score: 79.59%
ROC AUC score: 49.58%
RepeatedStratifiedKFold = 4




Accuracy: 62.30%
F1 score: 76.77%
ROC AUC score: 42.22%
RepeatedStratifiedKFold = 5
Accuracy: 68.85%
F1 score: 81.19%
ROC AUC score: 48.68%
RepeatedStratifiedKFold = 6




Accuracy: 70.97%
F1 score: 82.00%
ROC AUC score: 54.38%
RepeatedStratifiedKFold = 7
Accuracy: 69.35%
F1 score: 80.41%
ROC AUC score: 55.10%
RepeatedStratifiedKFold = 8




Accuracy: 68.85%
F1 score: 80.00%
ROC AUC score: 54.72%
RepeatedStratifiedKFold = 9
Accuracy: 68.85%
F1 score: 80.41%
ROC AUC score: 52.71%
RepeatedStratifiedKFold = 10




Accuracy: 68.85%
F1 score: 81.19%
ROC AUC score: 48.68%
RepeatedStratifiedKFold = 11
Accuracy: 70.97%
F1 score: 81.25%
ROC AUC score: 58.04%
RepeatedStratifiedKFold = 12




Accuracy: 70.97%
F1 score: 82.69%
ROC AUC score: 50.72%
RepeatedStratifiedKFold = 13
Accuracy: 65.57%
F1 score: 77.42%
ROC AUC score: 52.50%
RepeatedStratifiedKFold = 14




Accuracy: 70.49%
F1 score: 81.63%
ROC AUC score: 53.82%
RepeatedStratifiedKFold = 15
Accuracy: 65.57%
F1 score: 78.79%
ROC AUC score: 46.46%
RepeatedStratifiedKFold = 16




Accuracy: 69.35%
F1 score: 80.81%
ROC AUC score: 53.27%
RepeatedStratifiedKFold = 17
Accuracy: 61.29%
F1 score: 74.47%
ROC AUC score: 47.71%
RepeatedStratifiedKFold = 18




Accuracy: 70.49%
F1 score: 81.25%
ROC AUC score: 55.83%
RepeatedStratifiedKFold = 19
Accuracy: 67.21%
F1 score: 78.72%
ROC AUC score: 53.61%
RepeatedStratifiedKFold = 20




Accuracy: 73.77%
F1 score: 84.31%
ROC AUC score: 54.03%
RepeatedStratifiedKFold = 21
Accuracy: 70.97%
F1 score: 82.00%
ROC AUC score: 54.38%
RepeatedStratifiedKFold = 22




Accuracy: 69.35%
F1 score: 80.00%
ROC AUC score: 56.93%
RepeatedStratifiedKFold = 23
Accuracy: 70.49%
F1 score: 80.85%
ROC AUC score: 57.85%
RepeatedStratifiedKFold = 24




Accuracy: 68.85%
F1 score: 81.19%
ROC AUC score: 48.68%
RepeatedStratifiedKFold = 25
Accuracy: 59.02%
F1 score: 73.68%
ROC AUC score: 42.01%
RepeatedStratifiedKFold = 26




Accuracy: 66.13%
F1 score: 78.79%
ROC AUC score: 49.22%
RepeatedStratifiedKFold = 27
Accuracy: 72.58%
F1 score: 83.50%
ROC AUC score: 53.66%
RepeatedStratifiedKFold = 28




Accuracy: 68.85%
F1 score: 80.41%
ROC AUC score: 52.71%
RepeatedStratifiedKFold = 29
Accuracy: 63.93%
F1 score: 77.08%
ROC AUC score: 47.36%
RepeatedStratifiedKFold = 30




Accuracy: 73.77%
F1 score: 83.67%
ROC AUC score: 58.06%
RepeatedStratifiedKFold = 31
Accuracy: 64.52%
F1 score: 77.55%
ROC AUC score: 48.10%
RepeatedStratifiedKFold = 32




Accuracy: 66.13%
F1 score: 77.89%
ROC AUC score: 52.88%
RepeatedStratifiedKFold = 33
Accuracy: 57.38%
F1 score: 72.34%
ROC AUC score: 40.90%
RepeatedStratifiedKFold = 34




Accuracy: 65.57%
F1 score: 78.35%
ROC AUC score: 48.47%
RepeatedStratifiedKFold = 35
Accuracy: 73.77%
F1 score: 84.00%
ROC AUC score: 56.04%
RepeatedStratifiedKFold = 36




Accuracy: 70.97%
F1 score: 80.85%
ROC AUC score: 59.87%
RepeatedStratifiedKFold = 37
Accuracy: 74.19%
F1 score: 84.31%
ROC AUC score: 56.60%
RepeatedStratifiedKFold = 38




Accuracy: 72.13%
F1 score: 83.50%
ROC AUC score: 50.90%
RepeatedStratifiedKFold = 39
Accuracy: 65.57%
F1 score: 78.35%
ROC AUC score: 48.47%
RepeatedStratifiedKFold = 40




Accuracy: 68.85%
F1 score: 80.81%
ROC AUC score: 50.69%
RepeatedStratifiedKFold = 41
Accuracy: 61.29%
F1 score: 75.51%
ROC AUC score: 44.05%
RepeatedStratifiedKFold = 42




Accuracy: 66.13%
F1 score: 78.35%
ROC AUC score: 51.05%
RepeatedStratifiedKFold = 43
Accuracy: 68.85%
F1 score: 80.81%
ROC AUC score: 50.69%
RepeatedStratifiedKFold = 44




Accuracy: 70.49%
F1 score: 81.25%
ROC AUC score: 55.83%
RepeatedStratifiedKFold = 45
Accuracy: 67.21%
F1 score: 79.59%
ROC AUC score: 49.58%
RepeatedStratifiedKFold = 46




Accuracy: 66.13%
F1 score: 77.89%
ROC AUC score: 52.88%
RepeatedStratifiedKFold = 47
Accuracy: 69.35%
F1 score: 80.81%
ROC AUC score: 53.27%
RepeatedStratifiedKFold = 48




Accuracy: 70.49%
F1 score: 81.63%
ROC AUC score: 53.82%
RepeatedStratifiedKFold = 49
Accuracy: 63.93%
F1 score: 77.08%
ROC AUC score: 47.36%
RepeatedStratifiedKFold = 50
Accuracy: 68.85%
F1 score: 80.00%
ROC AUC score: 54.72%




In [42]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([62.9 , 70.97, 67.21, 62.3 , 68.85, 70.97, 69.35, 68.85, 68.85,
       68.85, 70.97, 70.97, 65.57, 70.49, 65.57, 69.35, 61.29, 70.49,
       67.21, 73.77, 70.97, 69.35, 70.49, 68.85, 59.02, 66.13, 72.58,
       68.85, 63.93, 73.77, 64.52, 66.13, 57.38, 65.57, 73.77, 70.97,
       74.19, 72.13, 65.57, 68.85, 61.29, 66.13, 68.85, 70.49, 67.21,
       66.13, 69.35, 70.49, 63.93, 68.85])
Mean   | Max    | Min
68.01% | 74.19% | 57.38%
F1 score array: array([76.29, 82.35, 79.59, 76.77, 81.19, 82.  , 80.41, 80.  , 80.41,
       81.19, 81.25, 82.69, 77.42, 81.63, 78.79, 80.81, 74.47, 81.25,
       78.72, 84.31, 82.  , 80.  , 80.85, 81.19, 73.68, 78.79, 83.5 ,
       80.41, 77.08, 83.67, 77.55, 77.89, 72.34, 78.35, 84.  , 80.85,
       84.31, 83.5 , 78.35, 80.81, 75.51, 78.35, 80.81, 81.25, 79.59,
       77.89, 80.81, 81.63, 77.08, 80.  ])
Mean   | Max    | Min
79.87% | 84.31% | 72.34%
ROC AUC score array: array([46.99, 52.55, 49.58, 42.22, 48.68, 54.38, 55.1 , 54.72

In [43]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['xgboost', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['xgboost', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['xgboost', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 8.2 Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: X_max

In [44]:
# Apply XGBoost Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # XGBoost Classifier
    XGB = XGBClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    XGB.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = XGB.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2




Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 4




Accuracy: 68.85%
F1 score: 81.55%
ROC AUC score: 46.67%
RepeatedStratifiedKFold = 5
Accuracy: 68.85%
F1 score: 81.55%
ROC AUC score: 46.67%
RepeatedStratifiedKFold = 6




Accuracy: 72.58%
F1 score: 83.81%
ROC AUC score: 51.83%
RepeatedStratifiedKFold = 7
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 8




Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 9
Accuracy: 70.49%
F1 score: 82.35%
ROC AUC score: 49.79%
RepeatedStratifiedKFold = 10




Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 11
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 12




Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 13
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 14




Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 15
Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 16




Accuracy: 66.13%
F1 score: 79.61%
ROC AUC score: 45.56%
RepeatedStratifiedKFold = 17
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 18




Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 19
Accuracy: 68.85%
F1 score: 81.19%
ROC AUC score: 48.68%
RepeatedStratifiedKFold = 20




Accuracy: 72.13%
F1 score: 83.81%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 21
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 22




Accuracy: 70.97%
F1 score: 82.69%
ROC AUC score: 50.72%
RepeatedStratifiedKFold = 23
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 24




Accuracy: 72.13%
F1 score: 83.50%
ROC AUC score: 50.90%
RepeatedStratifiedKFold = 25
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 26




Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 27
Accuracy: 75.81%
F1 score: 85.71%
ROC AUC score: 55.88%
RepeatedStratifiedKFold = 28




Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 29
Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 30




Accuracy: 70.49%
F1 score: 81.63%
ROC AUC score: 53.82%
RepeatedStratifiedKFold = 31
Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 32




Accuracy: 74.19%
F1 score: 84.91%
ROC AUC score: 52.94%
RepeatedStratifiedKFold = 33
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 34




Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 35
Accuracy: 68.85%
F1 score: 80.41%
ROC AUC score: 52.71%
RepeatedStratifiedKFold = 36




Accuracy: 69.35%
F1 score: 81.90%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 37
Accuracy: 67.74%
F1 score: 80.39%
ROC AUC score: 48.50%
RepeatedStratifiedKFold = 38




Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 39
Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 40




Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 41
Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 42




Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 43
Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%
RepeatedStratifiedKFold = 44




Accuracy: 73.77%
F1 score: 84.62%
ROC AUC score: 52.01%
RepeatedStratifiedKFold = 45
Accuracy: 70.49%
F1 score: 82.69%
ROC AUC score: 47.78%
RepeatedStratifiedKFold = 46




Accuracy: 70.97%
F1 score: 83.02%
ROC AUC score: 48.89%
RepeatedStratifiedKFold = 47
Accuracy: 69.35%
F1 score: 81.55%
ROC AUC score: 49.61%
RepeatedStratifiedKFold = 48




Accuracy: 77.05%
F1 score: 86.54%
ROC AUC score: 56.25%
RepeatedStratifiedKFold = 49
Accuracy: 68.85%
F1 score: 81.55%
ROC AUC score: 46.67%
RepeatedStratifiedKFold = 50




Accuracy: 75.41%
F1 score: 85.71%
ROC AUC score: 53.12%


In [45]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([72.58, 70.97, 73.77, 68.85, 68.85, 72.58, 74.19, 72.13, 70.49,
       72.13, 74.19, 70.97, 75.41, 70.49, 72.13, 66.13, 70.97, 72.13,
       68.85, 72.13, 72.58, 70.97, 73.77, 72.13, 73.77, 70.97, 75.81,
       73.77, 70.49, 70.49, 74.19, 74.19, 73.77, 70.49, 68.85, 69.35,
       67.74, 70.49, 73.77, 75.41, 70.97, 70.97, 75.41, 73.77, 70.49,
       70.97, 69.35, 77.05, 68.85, 75.41])
Mean   | Max    | Min
71.92% | 77.05% | 66.13%
F1 score array: array([84.11, 83.02, 84.91, 81.55, 81.55, 83.81, 84.91, 83.81, 82.35,
       83.81, 84.91, 83.02, 85.71, 82.69, 83.81, 79.61, 83.02, 83.81,
       81.19, 83.81, 84.11, 82.69, 84.91, 83.5 , 84.91, 83.02, 85.71,
       84.62, 82.69, 81.63, 84.91, 84.91, 84.91, 82.69, 80.41, 81.9 ,
       80.39, 82.69, 84.62, 85.71, 83.02, 83.02, 85.71, 84.62, 82.69,
       83.02, 81.55, 86.54, 81.55, 85.71])
Mean   | Max    | Min
83.48% | 86.54% | 79.61%
ROC AUC score array: array([50.  , 48.89, 50.  , 46.67, 46.67, 51.83, 52.94, 48.89

In [46]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['xgboost', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['xgboost', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['xgboost', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

# 9. Model 7: LightGBM Classifier

### 9.1 Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [47]:
# Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # LightGBM Classifier
    lgbm = LGBMClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    lgbm.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = lgbm.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 69.35%
F1 score: 80.41%
ROC AUC score: 55.10%
RepeatedStratifiedKFold = 2
Accuracy: 75.81%
F1 score: 85.44%
ROC AUC score: 57.71%
RepeatedStratifiedKFold = 3
Accuracy: 70.49%
F1 score: 81.63%
ROC AUC score: 53.82%
RepeatedStratifiedKFold = 4
Accuracy: 55.74%
F1 score: 71.58%
ROC AUC score: 37.78%
RepeatedStratifiedKFold = 5
Accuracy: 68.85%
F1 score: 81.19%
ROC AUC score: 48.68%
RepeatedStratifiedKFold = 6
Accuracy: 69.35%
F1 score: 81.19%
ROC AUC score: 51.44%
RepeatedStratifiedKFold = 7
Accuracy: 67.74%
F1 score: 79.59%
ROC AUC score: 52.16%
RepeatedStratifiedKFold = 8
Accuracy: 68.85%
F1 score: 80.41%
ROC AUC score: 52.71%
RepeatedStratifiedKFold = 9
Accuracy: 63.93%
F1 score: 76.60%
ROC AUC score: 49.38%
RepeatedStratifiedKFold = 10
Accuracy: 65.57%
F1 score: 78.79%
ROC AUC score: 46.46%
RepeatedStratifiedKFold = 11
Accuracy: 67.74%
F1 score: 79.17%
ROC AUC score: 53.99%
RepeatedStratifiedKFold = 12
Accuracy: 75.81%
F1 score: 85.71%
ROC AUC sco

In [48]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([69.35, 75.81, 70.49, 55.74, 68.85, 69.35, 67.74, 68.85, 63.93,
       65.57, 67.74, 75.81, 68.85, 68.85, 68.85, 67.74, 61.29, 73.77,
       68.85, 75.41, 72.58, 67.74, 72.13, 65.57, 59.02, 70.97, 74.19,
       68.85, 62.3 , 73.77, 62.9 , 64.52, 62.3 , 65.57, 75.41, 70.97,
       70.97, 65.57, 63.93, 68.85, 64.52, 70.97, 65.57, 73.77, 62.3 ,
       66.13, 69.35, 75.41, 65.57, 67.21])
Mean   | Max    | Min
68.24% | 75.81% | 55.74%
F1 score array: array([80.41, 85.44, 81.63, 71.58, 81.19, 81.19, 79.59, 80.41, 76.6 ,
       78.79, 79.17, 85.71, 80.  , 80.81, 80.41, 80.39, 73.91, 84.  ,
       80.  , 85.44, 82.47, 78.72, 82.83, 79.21, 73.68, 81.63, 84.31,
       80.81, 75.79, 84.  , 75.79, 77.08, 75.79, 77.89, 85.15, 80.85,
       82.69, 79.21, 77.08, 80.81, 78.  , 82.35, 77.89, 84.31, 75.79,
       78.35, 80.81, 85.15, 78.79, 78.26])
Mean   | Max    | Min
80.04% | 85.71% | 71.58%
ROC AUC score array: array([55.1 , 57.71, 53.82, 37.78, 48.68, 51.44, 52.16, 52.71

In [49]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['lightgbm', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['lightgbm', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['lightgbm', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 9.2 Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times, X input: X_max

In [50]:
# Apply LightGBM Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # LightGBM Classifier
    lgbm = LGBMClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    lgbm.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = lgbm.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 2
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 3
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 4
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 5
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 6
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 7
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 8
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 9
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 10
Accuracy: 73.77%
F1 score: 84.91%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 11
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC score: 50.00%
RepeatedStratifiedKFold = 12
Accuracy: 72.58%
F1 score: 84.11%
ROC AUC sco

In [51]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([72.58, 72.58, 73.77, 73.77, 73.77, 72.58, 72.58, 73.77, 73.77,
       73.77, 72.58, 72.58, 73.77, 73.77, 73.77, 67.74, 72.58, 73.77,
       73.77, 73.77, 72.58, 70.97, 73.77, 73.77, 73.77, 72.58, 72.58,
       73.77, 73.77, 73.77, 72.58, 72.58, 73.77, 73.77, 73.77, 72.58,
       72.58, 73.77, 73.77, 73.77, 72.58, 72.58, 73.77, 73.77, 73.77,
       72.58, 72.58, 73.77, 73.77, 73.77])
Mean   | Max    | Min
73.17% | 73.77% | 67.74%
F1 score array: array([84.11, 84.11, 84.91, 84.91, 84.91, 84.11, 84.11, 84.91, 84.91,
       84.91, 84.11, 84.11, 84.91, 84.91, 84.91, 80.77, 84.11, 84.91,
       84.91, 84.91, 84.11, 83.02, 84.91, 84.91, 84.91, 84.11, 84.11,
       84.91, 84.91, 84.91, 84.11, 84.11, 84.91, 84.91, 84.91, 84.11,
       84.11, 84.91, 84.91, 84.91, 84.11, 84.11, 84.91, 84.91, 84.91,
       84.11, 84.11, 84.91, 84.91, 84.91])
Mean   | Max    | Min
84.50% | 84.91% | 80.77%
ROC AUC score array: array([50.  , 50.  , 50.  , 50.  , 50.  , 50.  , 50.  , 50.  

In [52]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['lightgbm', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['lightgbm', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['lightgbm', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

# 10. Model 8: CatBoost Classifier

### 10.1 Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [53]:
# Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # CatBoost Classifier
    cat = CatBoostClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    cat.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = cat.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Learning rate set to 0.005651
0:	learn: 0.6903556	total: 165ms	remaining: 2m 45s
1:	learn: 0.6877546	total: 169ms	remaining: 1m 24s
2:	learn: 0.6856339	total: 173ms	remaining: 57.4s
3:	learn: 0.6836177	total: 186ms	remaining: 46.3s
4:	learn: 0.6814060	total: 189ms	remaining: 37.6s
5:	learn: 0.6795583	total: 193ms	remaining: 31.9s
6:	learn: 0.6773170	total: 196ms	remaining: 27.8s
7:	learn: 0.6747527	total: 199ms	remaining: 24.7s
8:	learn: 0.6724347	total: 203ms	remaining: 22.4s
9:	learn: 0.6706360	total: 206ms	remaining: 20.4s
10:	learn: 0.6685043	total: 210ms	remaining: 18.9s
11:	learn: 0.6662005	total: 213ms	remaining: 17.5s
12:	learn: 0.6642459	total: 217ms	remaining: 16.4s
13:	learn: 0.6622396	total: 221ms	remaining: 15.5s
14:	learn: 0.6602585	total: 224ms	remaining: 14.7s
15:	learn: 0.6583025	total: 227ms	remaining: 14s
16:	learn: 0.6567891	total: 230ms	remaining: 13.3s
17:	learn: 0.6549174	total: 234ms	remaining: 12.8s
18:	learn: 0.6531331	total: 237ms	

CatBoostError: (The requested operation cannot be performed on a file with a user-mapped section open.) C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/util/system/file.cpp:856: can't open "catboost_info\\learn_error.tsv" with mode WrOnly|CreateAlways|Seq (0x00000034)

In [None]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([95.87, 95.87, 97.16, 93.52, 94.82, 96.38, 95.09, 95.09, 96.11,
       96.37, 95.87, 95.35, 95.61, 96.11, 95.85, 94.83, 95.09, 96.64,
       95.08, 95.34, 95.61, 96.38, 94.83, 94.56, 96.11, 94.57, 95.61,
       96.38, 95.6 , 96.11, 97.42, 96.12, 94.57, 95.6 , 95.6 , 93.28,
       96.64, 95.35, 96.89, 96.11, 95.61, 95.35, 96.64, 96.37, 94.56,
       95.61, 96.12, 96.12, 95.08, 96.37])
Mean   | Max    | Min
95.66% | 97.42% | 93.28%
ROC AUC score array: array([94.42, 94.06, 95.63, 91.1 , 94.05, 94.41, 92.49, 93.2 , 95.3 ,
       96.52, 94.77, 94.08, 93.89, 93.17, 94.37, 92.31, 93.55, 94.93,
       92.84, 93.67, 94.25, 95.47, 93.73, 92.85, 94.19, 92.5 , 94.6 ,
       94.41, 94.24, 94.55, 96.51, 94.95, 91.79, 93.18, 94.2 , 91.29,
       95.64, 93.01, 95.81, 94.55, 94.25, 93.37, 96.  , 95.82, 91.72,
       94.6 , 94.24, 94.59, 93.9 , 94.  ])
Mean   | Max    | Min
94.06% | 96.52% | 91.10%


In [None]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['catboost', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['catboost', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['catboost', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 10.2 Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times, X input: X_max

In [54]:
# Apply CatBoost Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # CatBoost Classifier
    cat = CatBoostClassifier(random_state = 1001) # change the classifier here
    
    # Model fitting and training
    cat.fit(X_train, y_train)
    
    # Make predictions for validation data
    y_pred = cat.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Learning rate set to 0.005651
0:	learn: 0.6912829	total: 1.87ms	remaining: 1.87s
1:	learn: 0.6897489	total: 3.76ms	remaining: 1.88s
2:	learn: 0.6882374	total: 5.32ms	remaining: 1.77s
3:	learn: 0.6865495	total: 7.63ms	remaining: 1.9s
4:	learn: 0.6850822	total: 9.35ms	remaining: 1.86s
5:	learn: 0.6832786	total: 11.6ms	remaining: 1.92s
6:	learn: 0.6817669	total: 13.2ms	remaining: 1.88s
7:	learn: 0.6805277	total: 14.1ms	remaining: 1.74s
8:	learn: 0.6793031	total: 14.9ms	remaining: 1.64s
9:	learn: 0.6778084	total: 16.9ms	remaining: 1.67s
10:	learn: 0.6763347	total: 18.3ms	remaining: 1.64s
11:	learn: 0.6748602	total: 19.3ms	remaining: 1.59s
12:	learn: 0.6733644	total: 21.2ms	remaining: 1.61s
13:	learn: 0.6721189	total: 22.4ms	remaining: 1.58s
14:	learn: 0.6708239	total: 23.5ms	remaining: 1.54s
15:	learn: 0.6694320	total: 25.2ms	remaining: 1.55s
16:	learn: 0.6680382	total: 26.6ms	remaining: 1.54s
17:	learn: 0.6664575	total: 28.6ms	remaining: 1.56s
18:	learn: 0.6650

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

54:	learn: 0.6270541	total: 146ms	remaining: 2.5s
55:	learn: 0.6260495	total: 148ms	remaining: 2.49s
56:	learn: 0.6251578	total: 152ms	remaining: 2.51s
57:	learn: 0.6242670	total: 154ms	remaining: 2.5s
58:	learn: 0.6235296	total: 157ms	remaining: 2.5s
59:	learn: 0.6226676	total: 159ms	remaining: 2.5s
60:	learn: 0.6217412	total: 164ms	remaining: 2.52s
61:	learn: 0.6210233	total: 165ms	remaining: 2.5s
62:	learn: 0.6203193	total: 166ms	remaining: 2.47s
63:	learn: 0.6196142	total: 167ms	remaining: 2.44s
64:	learn: 0.6188732	total: 169ms	remaining: 2.43s
65:	learn: 0.6181125	total: 171ms	remaining: 2.41s
66:	learn: 0.6174005	total: 172ms	remaining: 2.39s
67:	learn: 0.6165054	total: 178ms	remaining: 2.44s
68:	learn: 0.6158342	total: 179ms	remaining: 2.42s
69:	learn: 0.6151877	total: 180ms	remaining: 2.4s
70:	learn: 0.6144322	total: 182ms	remaining: 2.38s
71:	learn: 0.6137857	total: 186ms	remaining: 2.39s
72:	learn: 0.6131451	total: 187ms	remaining: 2.38s
73:	learn: 0.6124497	total: 189ms	rem

CatBoostError: (The requested operation cannot be performed on a file with a user-mapped section open.) C:/Program Files (x86)/Go Agent/pipelines/BuildMaster/catboost.git/util/system/file.cpp:856: can't open "catboost_info\\time_left.tsv" with mode WrOnly|CreateAlways|Seq (0x00000034)

In [None]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([95.61, 96.12, 97.42, 94.56, 95.85, 96.38, 95.87, 95.09, 96.11,
       96.89, 96.38, 95.61, 95.61, 96.11, 96.37, 95.61, 95.35, 96.9 ,
       96.11, 95.34, 96.38, 96.38, 94.83, 94.82, 96.37, 94.83, 95.61,
       97.16, 95.85, 95.85, 97.67, 95.87, 95.61, 95.85, 95.85, 93.8 ,
       97.16, 95.87, 96.89, 95.6 , 96.38, 95.61, 96.64, 96.37, 94.3 ,
       95.87, 96.38, 96.38, 95.6 , 96.63])
Mean   | Max    | Min
95.95% | 97.67% | 93.80%
ROC AUC score array: array([93.89, 94.59, 96.16, 93.2 , 95.1 , 94.41, 93.  , 93.55, 95.3 ,
       96.86, 95.83, 94.6 , 94.25, 93.52, 94.72, 93.54, 94.08, 95.46,
       94.59, 93.67, 95.12, 95.47, 94.09, 93.37, 94.36, 93.02, 94.6 ,
       95.28, 94.77, 94.37, 96.68, 94.77, 93.54, 93.71, 94.74, 92.34,
       96.34, 93.35, 95.81, 93.12, 95.47, 93.89, 96.  , 96.18, 91.18,
       94.42, 94.76, 95.12, 94.95, 94.53])
Mean   | Max    | Min
94.51% | 96.86% | 91.18%


In [None]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['catboost', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['catboost', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['catboost', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

# 11. Model 9: TabNet Classifier

### 11.1 Apply TabNet Classifier with Repeated Stratified 5-Fold 10 times, X input: X_mean

In [55]:
# Apply TabNet Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_mean, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_mean[train_index], X_mean[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # TabNet Classifier
    tn = TabNetClassifier(seed = 1001) # change the classifier here
    
    # Model fitting and training
    tn.fit(X_train, y_train, eval_set=[(X_validate, y_validate)])
    
    # Make predictions for validation data
    y_pred = tn.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Device used : cuda
epoch 0  | loss: 1.01182 | val_0_auc: 0.51111 |  0:00:02s
epoch 1  | loss: 0.84593 | val_0_auc: 0.40654 |  0:00:02s
epoch 2  | loss: 0.77603 | val_0_auc: 0.46928 |  0:00:03s
epoch 3  | loss: 0.64693 | val_0_auc: 0.44444 |  0:00:03s
epoch 4  | loss: 0.62649 | val_0_auc: 0.5085  |  0:00:03s
epoch 5  | loss: 0.59183 | val_0_auc: 0.41307 |  0:00:03s
epoch 6  | loss: 0.58319 | val_0_auc: 0.36863 |  0:00:03s
epoch 7  | loss: 0.57591 | val_0_auc: 0.33464 |  0:00:03s
epoch 8  | loss: 0.5812  | val_0_auc: 0.33595 |  0:00:03s
epoch 9  | loss: 0.59592 | val_0_auc: 0.39869 |  0:00:03s
epoch 10 | loss: 0.56438 | val_0_auc: 0.45098 |  0:00:03s

Early stopping occured at epoch 10 with best_epoch = 0 and best_val_0_auc = 0.51111
Best weights from best epoch are automatically used!
Accuracy: 66.13%
F1 score: 79.61%
ROC AUC score: 45.56%
RepeatedStratifiedKFold = 2
Device used : cuda
epoch 0  | loss: 0.96263 | val_0_auc: 0.57124 |  0:00:00s
epoch 1  | loss:

In [56]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([66.13, 53.23, 50.82, 73.77, 73.77, 72.58, 72.58, 73.77, 73.77,
       73.77, 72.58, 74.19, 73.77, 73.77, 54.1 , 72.58, 72.58, 73.77,
       73.77, 72.13, 72.58, 72.58, 73.77, 73.77, 73.77, 72.58, 72.58,
       73.77, 73.77, 73.77, 72.58, 64.52, 70.49, 73.77, 73.77, 72.58,
       72.58, 73.77, 73.77, 73.77, 72.58, 72.58, 73.77, 73.77, 73.77,
       74.19, 72.58, 73.77, 73.77, 77.05])
Mean   | Max    | Min
71.80% | 77.05% | 50.82%
F1 score array: array([79.61, 59.15, 54.55, 84.91, 84.91, 84.11, 84.11, 84.31, 84.91,
       84.91, 84.11, 84.91, 84.91, 84.91, 60.  , 83.17, 84.11, 84.91,
       84.91, 82.47, 84.11, 84.11, 84.91, 84.91, 84.91, 84.11, 84.11,
       84.91, 84.91, 84.91, 84.11, 76.6 , 81.63, 84.91, 84.91, 84.11,
       84.11, 84.91, 84.91, 84.91, 84.11, 84.11, 84.91, 84.91, 84.91,
       84.91, 83.81, 84.91, 84.91, 86.  ])
Mean   | Max    | Min
82.65% | 86.00% | 54.55%
ROC AUC score array: array([45.56, 58.63, 60.62, 50.  , 50.  , 50.  , 50.  , 54.03

In [57]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['tabnet', 'mean pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['tabnet', 'mean pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['tabnet', 'mean pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()

### 11.2 Apply TabNet Classifier with Repeated Stratified 5-Fold 10 timesd, X input: X_max

In [58]:
# Apply TabNet Classifier with Repeated Stratified 5-Fold 10 times
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state = 1001)
# kf = StratifiedKFold(n_splits=5, random_state = 1001)
# kf = KFold(n_splits=5, randon_state = 1001)

accuracy_arr = []
f1_arr = []
roc_arr = []

i = 0
for train_index, validate_index in kf.split(X_max, y):
    i += 1
    print("RepeatedStratifiedKFold = " + str(i))
    
    # Split the training and validation dataset
    X_train, X_validate = X_max[train_index], X_max[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]
    
    # TabNet Classifier
    tn = TabNetClassifier(seed = 1001) # change the classifier here
    
    # Model fitting and training
    tn.fit(X_train, y_train, eval_set=[(X_validate, y_validate)])
    
    # Make predictions for validation data
    y_pred = tn.predict(X_validate)
    predictions = [round(value) for value in y_pred]
    
    # Evaluate predictions: Accuracy score
    accuracy = accuracy_score(y_validate, predictions) * 100 # y_validate must match exactly with predictions in order to score accuracy points
    accuracy_arr.append(accuracy)
    print("Accuracy: %.2f%%" % (accuracy))

    # Evaluate predictions: F1 score
    F1_score = f1_score(y_validate, predictions) * 100
    f1_arr.append(F1_score)
    print("F1 score: %.2f%%" % (F1_score))
    
    # Evaluate predictions: ROC AUC score
    roc_score = roc_auc_score(y_validate, predictions) * 100
    roc_arr.append(roc_score)
    print("ROC AUC score: %.2f%%" % (roc_score))
    
    # # Evaluate predictions: Classification report
    # cr = classification_report(y_validate, predictions)
    # print("Classification report:")
    # print(cr)
    
    # # Evaluate predictions: Confusion Matrix
    # cm = confusion_matrix(y_validate, predictions)
    # print("Confusion matrix:")
    # print(cm)
    
    print("=" * 30)

accuracy_arr = np.asarray(accuracy_arr)
f1_arr = np.asarray(f1_arr)
roc_arr = np.asarray(roc_arr)

RepeatedStratifiedKFold = 1
Device used : cuda
epoch 0  | loss: 1.12952 | val_0_auc: 0.33725 |  0:00:00s
epoch 1  | loss: 1.02121 | val_0_auc: 0.42876 |  0:00:00s
epoch 2  | loss: 0.79744 | val_0_auc: 0.45359 |  0:00:00s
epoch 3  | loss: 0.75399 | val_0_auc: 0.5085  |  0:00:00s
epoch 4  | loss: 0.67992 | val_0_auc: 0.57647 |  0:00:00s
epoch 5  | loss: 0.65554 | val_0_auc: 0.56275 |  0:00:00s
epoch 6  | loss: 0.62092 | val_0_auc: 0.71373 |  0:00:00s
epoch 7  | loss: 0.65375 | val_0_auc: 0.66078 |  0:00:00s
epoch 8  | loss: 0.68783 | val_0_auc: 0.5719  |  0:00:00s
epoch 9  | loss: 0.65386 | val_0_auc: 0.66275 |  0:00:00s
epoch 10 | loss: 0.6661  | val_0_auc: 0.62222 |  0:00:00s
epoch 11 | loss: 0.6964  | val_0_auc: 0.53333 |  0:00:00s
epoch 12 | loss: 0.66261 | val_0_auc: 0.54771 |  0:00:00s
epoch 13 | loss: 0.66184 | val_0_auc: 0.46536 |  0:00:00s
epoch 14 | loss: 0.64229 | val_0_auc: 0.28889 |  0:00:00s
epoch 15 | loss: 0.59515 | val_0_auc: 0.44183 |  0:00:00s
epoch 16 | loss: 0.61174 

In [59]:
# Printing Accuracy score, F1 score and ROC AUC score
mean_acc = np.mean(accuracy_arr)
max_acc = np.max(accuracy_arr)
min_acc = np.min(accuracy_arr)

mean_f1 = np.mean(f1_arr)
max_f1 = np.max(f1_arr)
min_f1 = np.min(f1_arr)

mean_roc = np.mean(roc_arr)
max_roc = np.max(roc_arr)
min_roc = np.min(roc_arr)

print("Accuracy score array: " + repr(np.round(accuracy_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_acc, max_acc, min_acc))
print("F1 score array: " + repr(np.round(f1_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_f1, max_f1, min_f1))
print("ROC AUC score array: " + repr(np.round(roc_arr, 2)))
print("Mean   | Max    | Min")
print("%.2f%% | %.2f%% | %.2f%%" % (mean_roc, max_roc, min_roc))

Accuracy score array: array([72.58, 43.55, 73.77, 72.13, 72.13, 72.58, 72.58, 26.23, 26.23,
       73.77, 40.32, 46.77, 65.57, 73.77, 73.77, 30.65, 64.52, 50.82,
       75.41, 73.77, 29.03, 72.58, 73.77, 73.77, 73.77, 72.58, 30.65,
       73.77, 31.15, 73.77, 27.42, 72.58, 40.98, 73.77, 73.77, 72.58,
       72.58, 73.77, 73.77, 77.05, 72.58, 72.58, 75.41, 26.23, 31.15,
       72.58, 72.58, 42.62, 50.82, 73.77])
Mean   | Max    | Min
61.09% | 77.05% | 26.23%
F1 score array: array([84.11, 40.68, 84.91, 83.81, 83.81, 84.11, 84.11,  0.  ,  0.  ,
       84.91, 37.29, 45.9 , 78.35, 84.91, 84.91,  8.51, 76.09, 53.12,
       85.71, 84.91,  4.35, 84.11, 84.91, 84.91, 84.91, 84.11,  8.51,
       84.91, 16.  , 84.91,  0.  , 84.11, 37.93, 84.91, 84.91, 84.11,
       84.11, 84.91, 84.91, 86.54, 84.11, 83.81, 85.71,  0.  , 12.5 ,
       84.11, 84.11, 42.62, 54.55, 84.91])
Mean   | Max    | Min
64.49% | 86.54% | 0.00%
ROC AUC score array: array([50.  , 57.45, 50.  , 48.89, 48.89, 50.  , 50.  , 50.  ,

In [60]:
# Open your CSV file in append mode
# Create a file object for this file

with open(csv_file_name, 'a+', newline = '') as f_object:
    
    writer_object = csv.writer(f_object)
    writer_object.writerow(['tabnet', 'max pool', 'accuracy'] + [mean_acc, max_acc, min_acc] + accuracy_arr.tolist())
    writer_object.writerow(['tabnet', 'max pool', 'f1'] + [mean_f1, max_f1, min_f1] + f1_arr.tolist())
    writer_object.writerow(['tabnet', 'max pool', 'roc auc'] + [mean_roc, max_roc, min_roc] + roc_arr.tolist())

    #Close the file object
    f_object.close()