#### Connect to Google Drive if required

In [None]:
from google.colab import drive
drive.mount("content/drive")

cd "" # add in path when required to connect google drive

In [None]:
# install required package if using colab to run
!pip install xgboost

#### Import Required Library

In [50]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

#### Dataset Loading & Train, Test Split

In [51]:
# Dataset loading
df = pd.read_csv("data_preprocessed.csv")

In [52]:
# Dataset Split 
from sklearn.model_selection import train_test_split

y = df[df.columns[-1]]
X = df.drop(df.columns[-1], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

#### Model Building 

In [75]:
# model define
model = XGBClassifier(objective='binary:logistic', n_estimators=50, max_depth=3)
model.fit(X_train, y_train)

#### Evaluation

In [11]:
y_pred=model.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
print(acc)

0.9561688311688312


In [13]:
y_pred = model.predict(X_train)
acc = accuracy_score(y_train, y_pred)
print(acc)

0.9753787878787878


In [53]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
# from scikitplot.metrics import plot_roc, plot_confusion_matrix, plot_precision_recall

In [68]:
def eval_performance(y_pred, y_true, y_proba, plot=False):

  # confusion matrix
  if plot:
    plot_confusion_matrix(y_true, y_pred)

    # y_probas == (prob for class 0, prob for class 1)
    # y_proba == prob for class 1
    # hence, the arrange of y_probas == (1-y_proba, y_proba)
    y_probas = [[y, x] for x, y in zip(np.array(y_proba), 1-np.array(y_proba))]

    plot_roc(torch.tensor(y_true), torch.tensor(y_probas))

    plot_precision_recall(y_true, y_probas)
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.title('Precision-Recall Curve')

    plt.show()

  acc = accuracy_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  roc_score = roc_auc_score(y_true, y_proba)


  print(f"recall score: {recall:.4f}")
  print(f"Precision score: {precision:.4f}")
  print(f"F1 score: {f1:.4f}")
  print(f"ROC_AUC score: {roc_score:.4f}")
  print(f"Accuracy Score: {acc:.4f}\n")

  return acc, recall, precision, f1, roc_score

#### 5 fold cross validation

In [76]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

all_acc = []
all_recall = []
all_precision = []
all_f1 = []
all_roc_score = []

all_train_acc = []
all_train_recall = []
all_train_precision = []
all_train_f1 = []
all_train_roc_score = []

# Iterate through each fold in KFold
for train_index, val_index in kfold.split(X_train):
      
      # Split data based on the current fold indices
      X_train_val, X_val = X.iloc[train_index], X.iloc[val_index]
      y_train_val, y_val = y.iloc[train_index], y.iloc[val_index]
    
    
      # train the model using train set
      model.fit(X_train_val, y_train_val)
      
      # Train set Eval
      y_probas = model.predict_proba(X_train_val)
      
      y_proba = y_probas[:, 1]

      y_pred = model.predict(X_train_val)
      
      train_acc, train_recall, train_precision, train_f1, train_roc_score = eval_performance(y_pred, y_train_val, y_proba)
      
      all_train_acc.append(train_acc)
      all_train_recall.append(train_recall)
      all_train_precision.append(train_precision)
      all_train_f1.append(train_f1)
      all_train_roc_score.append(train_roc_score)
            
      # Val set eval
      y_probas = model.predict_proba(X_val)
      
      y_proba = y_probas[:, 1]

      y_pred = model.predict(X_val)
      
      acc, recall, precision, f1, roc_score = eval_performance(y_pred, y_val, y_proba)
      
      all_acc.append(acc)
      all_recall.append(recall)
      all_precision.append(precision)
      all_f1.append(f1)
      all_roc_score.append(roc_score)

# Train set avg metrics
print(f"\nAvg training accuracy: \t\t{sum(all_train_acc)/len(all_train_acc):.4f}")
print(f"Avg training recall: \t\t{sum(all_train_recall)/len(all_train_recall):.4f}")
print(f"Avg training precision: \t{sum(all_train_precision)/len(all_train_precision):.4f}")
print(f"Avg training f1: \t\t{sum(all_train_f1)/len(all_train_f1):.4f}")
print(f"Avg training ROC score: \t{sum(all_train_roc_score)/len(all_train_roc_score):.4f}")

# Val set avg metrics
print(f"\nAvg validation accuracy: \t{sum(all_acc)/len(all_acc):.4f}")
print(f"Avg validation recall: \t\t{sum(all_recall)/len(all_recall):.4f}")
print(f"Avg validation precision: \t{sum(all_precision)/len(all_precision):.4f}")
print(f"Avg validation f1: \t\t{sum(all_f1)/len(all_f1):.4f}")
print(f"Avg validation ROC score: \t{sum(all_roc_score)/len(all_roc_score):.4f}")

recall score: 0.9818
Precision score: 0.9814
F1 score: 0.9816
ROC_AUC score: 0.9991
Accuracy Score: 0.9853

recall score: 0.9253
Precision score: 0.9484
F1 score: 0.9367
ROC_AUC score: 0.9912
Accuracy Score: 0.9513

recall score: 0.9796
Precision score: 0.9796
F1 score: 0.9796
ROC_AUC score: 0.9988
Accuracy Score: 0.9838

recall score: 0.9502
Precision score: 0.9453
F1 score: 0.9477
ROC_AUC score: 0.9923
Accuracy Score: 0.9588

recall score: 0.9788
Precision score: 0.9779
F1 score: 0.9784
ROC_AUC score: 0.9988
Accuracy Score: 0.9828

recall score: 0.9241
Precision score: 0.9470
F1 score: 0.9354
ROC_AUC score: 0.9909
Accuracy Score: 0.9499

recall score: 0.9781
Precision score: 0.9781
F1 score: 0.9781
ROC_AUC score: 0.9988
Accuracy Score: 0.9828

recall score: 0.9570
Precision score: 0.9414
F1 score: 0.9491
ROC_AUC score: 0.9913
Accuracy Score: 0.9581

recall score: 0.9765
Precision score: 0.9774
F1 score: 0.9769
ROC_AUC score: 0.9985
Accuracy Score: 0.9817

recall score: 0.9545
Precisi