#### Connect to Google Drive if required

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd '/content/drive/MyDrive/FYP/Project/XGBoost/'

/content/drive/MyDrive/FYP/Project/XGBoost


In [None]:
!pwd

/content/drive/MyDrive/FYP/Project/XGBoost


In [None]:
# install required package if using colab to run
!pip install xgboost
!pip install scikit-plot
!pip install optuna
!pip install optuna-dashboard



#### Import Required Library

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

#### Dataset Loading & Train, Test Split

In [None]:
# Dataset loading
df = pd.read_csv("data_preprocessed.csv")

In [None]:
# Dataset Split
from sklearn.model_selection import train_test_split

y = df[df.columns[-1]]
X = df.drop(df.columns[-1], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

#### XGBClassifier Model

In [None]:
# model define
baseline_model = XGBClassifier(objective='binary:logistic', n_estimators=50, max_depth=3)
baseline_model.fit(X_train, y_train)

#### Helper function

In [None]:
# K-fold cross val
from sklearn.model_selection import KFold

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from scikitplot.metrics import plot_roc, plot_confusion_matrix, plot_precision_recall

In [None]:
def eval_performance(y_pred, y_true, y_proba, plot=False, verbose=False):

  # confusion matrix
  if plot:
    plot_confusion_matrix(y_true, y_pred)

    # y_probas == (prob for class 0, prob for class 1)
    # y_proba == prob for class 1
    # hence, the arrange of y_probas == (1-y_proba, y_proba)
    y_probas = [[y, x] for x, y in zip(np.array(y_proba), 1-np.array(y_proba))]

    plot_roc(y_true, y_probas)

    plot_precision_recall(y_true, y_probas)
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.title('Precision-Recall Curve')

    plt.show()

  acc = accuracy_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  roc_score = roc_auc_score(y_true, y_proba)

  if verbose:
    print(f"recall score: {recall:.4f}")
    print(f"Precision score: {precision:.4f}")
    print(f"F1 score: {f1:.4f}")
    print(f"ROC_AUC score: {roc_score:.4f}")
    print(f"Accuracy Score: {acc:.4f}\n")

  return acc, recall, precision, f1, roc_score

In [None]:
def cross_val(verbose=False, model=baseline_model):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    all_acc = []
    all_recall = []
    all_precision = []
    all_f1 = []
    all_roc_score = []

    all_train_acc = []
    all_train_recall = []
    all_train_precision = []
    all_train_f1 = []
    all_train_roc_score = []

    # Iterate through each fold in KFold
    for train_index, val_index in kfold.split(X_train):

        # Split data based on the current fold indices
        X_train_val, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_val, y_val = y_train.iloc[train_index], y_train.iloc[val_index]


        # train the model using train set
        model.fit(X_train_val, y_train_val)

        # Train set Eval
        y_probas = model.predict_proba(X_train_val)

        y_proba = y_probas[:, 1]

        y_pred = model.predict(X_train_val)

        train_acc, train_recall, train_precision, train_f1, train_roc_score = eval_performance(y_pred, y_train_val, y_proba)

        all_train_acc.append(train_acc)
        all_train_recall.append(train_recall)
        all_train_precision.append(train_precision)
        all_train_f1.append(train_f1)
        all_train_roc_score.append(train_roc_score)

        # Val set eval
        y_probas = model.predict_proba(X_val)

        y_proba = y_probas[:, 1]

        y_pred = model.predict(X_val)

        acc, recall, precision, f1, roc_score = eval_performance(y_pred, y_val, y_proba)

        all_acc.append(acc)
        all_recall.append(recall)
        all_precision.append(precision)
        all_f1.append(f1)
        all_roc_score.append(roc_score)

        if verbose:
          # Train set avg metrics
          print(f"\nTrain accuracy: \t{train_acc:.4f}")
          print(f"Train recall: \t\t{train_recall:.4f}")
          print(f"Train precision: \t{train_precision:.4f}")
          print(f"Train f1: \t\t{train_f1:.4f}")
          print(f"Train ROC score: \t{train_roc_score:.4f}")

          # Val set avg metrics
          print(f"\nVal accuracy: \t{acc:.4f}")
          print(f"Val recall: \t\t{recall:.4f}")
          print(f"Val precision: \t{precision:.4f}")
          print(f"Val f1: \t\t{f1:.4f}")
          print(f"Val ROC score: \t{roc_score:.4f}")

    avg_train_acc = sum(all_train_acc)/len(all_train_acc)
    avg_train_recall = sum(all_train_recall)/len(all_train_recall)
    avg_train_precision = sum(all_train_precision)/len(all_train_precision)
    avg_train_f1 = sum(all_train_f1)/len(all_train_f1)
    avg_train_roc = sum(all_train_roc_score)/len(all_train_roc_score)

    avg_val_acc = sum(all_acc)/len(all_acc)
    avg_val_recall = sum(all_recall)/len(all_recall)
    avg_val_precision = sum(all_precision)/len(all_precision)
    avg_val_f1 = sum(all_f1)/len(all_f1)
    avg_val_roc = sum(all_roc_score)/len(all_roc_score)

    if verbose:
        # Train set avg metrics
        print(f"\nAvg training accuracy: \t\t{avg_train_acc:.4f}")
        print(f"Avg training recall: \t\t{avg_train_recall:.4f}")
        print(f"Avg training precision: \t{avg_train_precision:.4f}")
        print(f"Avg training f1: \t\t{avg_train_f1:.4f}")
        print(f"Avg training ROC score: \t{avg_train_roc:.4f}")

        # Val set avg metrics
        print(f"\nAvg validation accuracy: \t{avg_val_acc:.4f}")
        print(f"Avg validation recall: \t\t{avg_val_recall:.4f}")
        print(f"Avg validation precision: \t{avg_val_precision:.4f}")
        print(f"Avg validation f1: \t\t{avg_val_f1:.4f}")
        print(f"Avg validation ROC score: \t{avg_val_roc:.4f}")

    # return training acc, val acc
    return avg_train_acc, avg_val_acc

#### Hyperparameter Tuning

In [None]:
import optuna
from optuna import samplers, pruners
import optuna_dashboard
import time
import joblib

import threading
# from google.colab import output
from optuna_dashboard import run_server

![image](076_xgboost_hyperparameters.jpg)

In [None]:
# Hyperparameter to tune
'''
1. learning_rate - float
2. n_estimators - int
3. max_depth - int

** optional if serious overfitting
1. reg_lambda (L2 reg) - float
2. reg_alpha (L1 reg) - float
'''

def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1)
    n_estimators = trial.suggest_int('n_estimators', 1, 20)
    max_depth = trial.suggest_int('max_depth', 1, 3)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-3, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-3, 1)

    tune_model = XGBClassifier(objective='binary:logistic', learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, reg_alpha=reg_alpha, reg_lambda=reg_lambda)

    # 5 fold cross val, get avg train acc and val acc
    _, val_acc = cross_val(verbose=False, model=tune_model)

    return val_acc

In [None]:
# Create optuna study
storage = optuna.storages.InMemoryStorage()
study = optuna.create_study(direction="maximize", storage=storage, sampler=samplers.GPSampler(), study_name="Bayesian Optimization") # using Random Sampler to perform Bayesian Optimization

  study = optuna.create_study(direction="maximize", storage=storage, sampler=samplers.GPSampler(), study_name="Bayesian Optimization") # using Random Sampler to perform Bayesian Optimization
[I 2024-07-01 13:25:49,167] A new study created in memory with name: Bayesian Optimization


In [None]:
# calculate the time require for randomize search
start_time = time.time()
study.optimize(objective, n_trials=150)  # Adjust number of trials as needed
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Optimization Time: {elapsed_time:.4f} seconds")

[I 2024-07-01 13:25:49,980] Trial 0 finished with value: 0.9412871769957574 and parameters: {'learning_rate': 0.4590006686265176, 'n_estimators': 8, 'max_depth': 1, 'reg_lambda': 0.4480309224025559, 'reg_alpha': 0.9213033396488629}. Best is trial 0 with value: 0.9412871769957574.
[I 2024-07-01 13:25:50,813] Trial 1 finished with value: 0.9456161635014698 and parameters: {'learning_rate': 0.7143484187957192, 'n_estimators': 13, 'max_depth': 2, 'reg_lambda': 0.6411757725435486, 'reg_alpha': 0.5334756980809638}. Best is trial 1 with value: 0.9456161635014698.
[I 2024-07-01 13:25:51,620] Trial 2 finished with value: 0.9211284551149562 and parameters: {'learning_rate': 0.04818112170692214, 'n_estimators': 14, 'max_depth': 1, 'reg_lambda': 0.49510257706889294, 'reg_alpha': 0.06603238457897424}. Best is trial 1 with value: 0.9456161635014698.
[I 2024-07-01 13:25:52,492] Trial 3 finished with value: 0.9472394305116009 and parameters: {'learning_rate': 0.29692952153431873, 'n_estimators': 18, '

Optimization Time: 235.5834 seconds


In [None]:
# Access best trial results
best_trial = study.best_trial
print(best_trial.params)

{'learning_rate': 0.15044362683413326, 'n_estimators': 20, 'max_depth': 3, 'reg_lambda': 0.9999999999999999, 'reg_alpha': 0.001}


In [None]:
# for analysis
joblib.dump(study, 'bayesopt_study.pkl')
joblib.dump(storage, "bayesopt_storage.pkl")

['bayesopt_storage.pkl']

In [None]:
# read study in case required
study = joblib.load('bayesopt_study.pkl')

best_trial = study.best_trial

### Evaluation

#### 5 fold cross validation

In [None]:
model = XGBClassifier(objective='binary:logistic', **best_trial.params)

In [None]:
cross_val(verbose=True, model=model)


Train accuracy: 	0.9589
Train recall: 		0.9467
Train precision: 	0.9526
Train f1: 		0.9496
Train ROC score: 	0.9934

Val accuracy: 	0.9561
Val recall: 		0.9410
Val precision: 	0.9459
Val f1: 		0.9434
Val ROC score: 	0.9906

Train accuracy: 	0.9576
Train recall: 		0.9463
Train precision: 	0.9491
Train f1: 		0.9477
Train ROC score: 	0.9931

Val accuracy: 	0.9540
Val recall: 		0.9390
Val precision: 	0.9454
Val f1: 		0.9422
Val ROC score: 	0.9925

Train accuracy: 	0.9589
Train recall: 		0.9456
Train precision: 	0.9516
Train f1: 		0.9486
Train ROC score: 	0.9936

Val accuracy: 	0.9493
Val recall: 		0.9437
Val precision: 	0.9362
Val f1: 		0.9400
Val ROC score: 	0.9898

Train accuracy: 	0.9613
Train recall: 		0.9479
Train precision: 	0.9555
Train f1: 		0.9517
Train ROC score: 	0.9937

Val accuracy: 	0.9438
Val recall: 		0.9205
Val precision: 	0.9434
Val f1: 		0.9318
Val ROC score: 	0.9889

Train accuracy: 	0.9606
Train recall: 		0.9480
Train precision: 	0.9548
Train f1: 		0.9514
Train ROC sc

(0.9594493221260756, 0.9508917355379463)

In [None]:
# further split train set to train and val set

X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

In [None]:
model.fit(X_train_val, y_train_val)

In [None]:
# y_train_val_pred = model.predict(X_train_val)
y_val_pred = model.predict(X_val)


val_accuracy_score = accuracy_score(y_val, y_val_pred)
# print(f"Train Accuracy: {train_val_accuracy_score:.4f}")
print(f"Val Accuracy: {val_accuracy_score:.4f}")

Val Accuracy: 0.9398


#### Test Set inference

In [None]:
y_pred=model.predict(X_test)

In [None]:
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9583333333333334


#### Save Model

In [106]:
model.save_model('bayesopt_xgb_model.xgb')



In [107]:
# Load model
'''
from xgboost import XGBClassifier

# Load the saved model object
loaded_model = XGBClassifier()
loaded_model.load_model('model.xgb')
'''

"\nfrom xgboost import XGBClassifier\n\n# Load the saved model object\nloaded_model = XGBClassifier()\nloaded_model.load_model('model.xgb')\n"