#### Connect to Google Drive if required

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd '/content/drive/MyDrive/FYP/Project/XGBoost/'

/content/drive/MyDrive/FYP/Project/XGBoost


In [3]:
!pwd

/content/drive/MyDrive/FYP/Project/XGBoost


In [4]:
# install required package if using colab to run
!pip install xgboost
!pip install scikit-plot
!pip install optuna
!pip install optuna-dashboard

Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13

#### Import Required Library

In [5]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

#### Dataset Loading & Train, Test Split

In [6]:
# Dataset loading
df = pd.read_csv("data_preprocessed.csv")

In [7]:
# Dataset Split
from sklearn.model_selection import train_test_split

y = df[df.columns[-1]]
X = df.drop(df.columns[-1], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

#### XGBClassifier Model

In [9]:
# model define
baseline_model = XGBClassifier(objective='binary:logistic', n_estimators=50, max_depth=3)
baseline_model.fit(X_train, y_train)

#### Helper function

In [10]:
# K-fold cross val
from sklearn.model_selection import KFold

In [11]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from scikitplot.metrics import plot_roc, plot_confusion_matrix, plot_precision_recall

In [12]:
def eval_performance(y_pred, y_true, y_proba, plot=False, verbose=False):

  # confusion matrix
  if plot:
    plot_confusion_matrix(y_true, y_pred)

    # y_probas == (prob for class 0, prob for class 1)
    # y_proba == prob for class 1
    # hence, the arrange of y_probas == (1-y_proba, y_proba)
    y_probas = [[y, x] for x, y in zip(np.array(y_proba), 1-np.array(y_proba))]

    plot_roc(y_true, y_probas)

    plot_precision_recall(y_true, y_probas)
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.title('Precision-Recall Curve')

    plt.show()

  acc = accuracy_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  roc_score = roc_auc_score(y_true, y_proba)

  if verbose:
    print(f"recall score: {recall:.4f}")
    print(f"Precision score: {precision:.4f}")
    print(f"F1 score: {f1:.4f}")
    print(f"ROC_AUC score: {roc_score:.4f}")
    print(f"Accuracy Score: {acc:.4f}\n")

  return acc, recall, precision, f1, roc_score

In [15]:
def cross_val(verbose=False, model=baseline_model):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    all_acc = []
    all_recall = []
    all_precision = []
    all_f1 = []
    all_roc_score = []

    all_train_acc = []
    all_train_recall = []
    all_train_precision = []
    all_train_f1 = []
    all_train_roc_score = []

    # Iterate through each fold in KFold
    for train_index, val_index in kfold.split(X_train):

        # Split data based on the current fold indices
        X_train_val, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_val, y_val = y_train.iloc[train_index], y_train.iloc[val_index]


        # train the model using train set
        model.fit(X_train_val, y_train_val)

        # Train set Eval
        y_probas = model.predict_proba(X_train_val)

        y_proba = y_probas[:, 1]

        y_pred = model.predict(X_train_val)

        train_acc, train_recall, train_precision, train_f1, train_roc_score = eval_performance(y_pred, y_train_val, y_proba)

        all_train_acc.append(train_acc)
        all_train_recall.append(train_recall)
        all_train_precision.append(train_precision)
        all_train_f1.append(train_f1)
        all_train_roc_score.append(train_roc_score)

        # Val set eval
        y_probas = model.predict_proba(X_val)

        y_proba = y_probas[:, 1]

        y_pred = model.predict(X_val)

        acc, recall, precision, f1, roc_score = eval_performance(y_pred, y_val, y_proba)

        all_acc.append(acc)
        all_recall.append(recall)
        all_precision.append(precision)
        all_f1.append(f1)
        all_roc_score.append(roc_score)

        if verbose:
          # Train set avg metrics
          print(f"\nTrain accuracy: \t{train_acc:.4f}")
          print(f"Train recall: \t\t{train_recall:.4f}")
          print(f"Train precision: \t{train_precision:.4f}")
          print(f"Train f1: \t\t{train_f1:.4f}")
          print(f"Train ROC score: \t{train_roc_score:.4f}")

          # Val set avg metrics
          print(f"\nVal accuracy: \t{acc:.4f}")
          print(f"Val recall: \t\t{recall:.4f}")
          print(f"Val precision: \t{precision:.4f}")
          print(f"Val f1: \t\t{f1:.4f}")
          print(f"Val ROC score: \t{roc_score:.4f}")

    avg_train_acc = sum(all_train_acc)/len(all_train_acc)
    avg_train_recall = sum(all_train_recall)/len(all_train_recall)
    avg_train_precision = sum(all_train_precision)/len(all_train_precision)
    avg_train_f1 = sum(all_train_f1)/len(all_train_f1)
    avg_train_roc = sum(all_train_roc_score)/len(all_train_roc_score)

    avg_val_acc = sum(all_acc)/len(all_acc)
    avg_val_recall = sum(all_recall)/len(all_recall)
    avg_val_precision = sum(all_precision)/len(all_precision)
    avg_val_f1 = sum(all_f1)/len(all_f1)
    avg_val_roc = sum(all_roc_score)/len(all_roc_score)

    if verbose:
        # Train set avg metrics
        print(f"\nAvg training accuracy: \t\t{avg_train_acc:.4f}")
        print(f"Avg training recall: \t\t{avg_train_recall:.4f}")
        print(f"Avg training precision: \t{avg_train_precision:.4f}")
        print(f"Avg training f1: \t\t{avg_train_f1:.4f}")
        print(f"Avg training ROC score: \t{avg_train_roc:.4f}")

        # Val set avg metrics
        print(f"\nAvg validation accuracy: \t{avg_val_acc:.4f}")
        print(f"Avg validation recall: \t\t{avg_val_recall:.4f}")
        print(f"Avg validation precision: \t{avg_val_precision:.4f}")
        print(f"Avg validation f1: \t\t{avg_val_f1:.4f}")
        print(f"Avg validation ROC score: \t{avg_val_roc:.4f}")

    # return training acc, val acc
    return avg_train_acc, avg_val_acc

#### Hyperparameter Tuning

In [16]:
import optuna
from optuna import samplers, pruners
import optuna_dashboard
import time
import joblib

import threading
# from google.colab import output
from optuna_dashboard import run_server

![image](076_xgboost_hyperparameters.jpg)

In [17]:
# Hyperparameter to tune
'''
1. learning_rate - float
2. n_estimators - int
3. max_depth - int

** optional if serious overfitting
1. reg_lambda (L2 reg) - float
2. reg_alpha (L1 reg) - float
'''

def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1)
    n_estimators = trial.suggest_int('n_estimators', 1, 20)
    max_depth = trial.suggest_int('max_depth', 1, 3)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-3, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-3, 1)

    tune_model = XGBClassifier(objective='binary:logistic', learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, reg_alpha=reg_alpha, reg_lambda=reg_lambda)

    # 5 fold cross val, get avg train acc and val acc
    _, val_acc = cross_val(verbose=False, model=tune_model)

    return val_acc

In [18]:
# Create optuna study
storage = optuna.storages.InMemoryStorage()
study = optuna.create_study(direction='maximize', storage=storage, sampler=samplers.RandomSampler(), study_name="Random Search Optimization") # using Random Sampler to perform Random Optimization

[I 2024-07-01 13:39:16,158] A new study created in memory with name: Random Search Optimization


In [19]:
# calculate the time require for randomize search
start_time = time.time()
study.optimize(objective, n_trials=150)  # Adjust number of trials as needed
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Optimization Time: {elapsed_time:.4f} seconds")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-07-01 13:39:17,174] Trial 0 finished with value: 0.5949674331026797 and parameters: {'learning_rate': 0.04098086006442312, 'n_estimators': 1, 'max_depth': 1, 'reg_lambda': 0.8911386203092233, 'reg_alpha': 0.6886322947611996}. Best is trial 0 with value: 0.5949674331026797.
[I 2024-07-01 13:39:18,017] Trial 1 finished with value: 0.9522459219327691 and parameters: {'learning_rate': 0.24459599875357346, 'n_estimators': 17, 'max_depth': 3, '

Optimization Time: 134.7297 seconds


In [23]:
# Access best trial results
best_trial = study.best_trial
print(best_trial.params)

{'learning_rate': 0.24459599875357346, 'n_estimators': 17, 'max_depth': 3, 'reg_lambda': 0.690995856935781, 'reg_alpha': 0.7178216537253156}


In [24]:
# for analysis
joblib.dump(study, 'randopt_study.pkl')
joblib.dump(storage, "randopt_storage.pkl")

['randopt_storage.pkl']

In [25]:
# read study in case required
study = joblib.load('randopt_study.pkl')

best_trial = study.best_trial

#### Evaluation

#### 5 fold cross validation

In [26]:
model = XGBClassifier(objective='binary:logistic', **best_trial.params)

In [27]:
cross_val(verbose=True, model=model)


Train accuracy: 	0.9589
Train recall: 		0.9457
Train precision: 	0.9532
Train f1: 		0.9494
Train ROC score: 	0.9942

Val accuracy: 	0.9561
Val recall: 		0.9451
Val precision: 	0.9435
Val f1: 		0.9443
Val ROC score: 	0.9916

Train accuracy: 	0.9609
Train recall: 		0.9496
Train precision: 	0.9532
Train f1: 		0.9514
Train ROC score: 	0.9946

Val accuracy: 	0.9466
Val recall: 		0.9365
Val precision: 	0.9350
Val f1: 		0.9357
Val ROC score: 	0.9908

Train accuracy: 	0.9596
Train recall: 		0.9484
Train precision: 	0.9512
Train f1: 		0.9498
Train ROC score: 	0.9943

Val accuracy: 	0.9601
Val recall: 		0.9425
Val precision: 	0.9599
Val f1: 		0.9511
Val ROC score: 	0.9921

Train accuracy: 	0.9616
Train recall: 		0.9529
Train precision: 	0.9533
Train f1: 		0.9531
Train ROC score: 	0.9945

Val accuracy: 	0.9472
Val recall: 		0.9322
Val precision: 	0.9322
Val f1: 		0.9322
Val ROC score: 	0.9899

Train accuracy: 	0.9625
Train recall: 		0.9513
Train precision: 	0.9553
Train f1: 		0.9533
Train ROC sc

(0.9607007046887339, 0.9522459219327691)

In [28]:
# further split train set to train and val set

X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

In [29]:
model.fit(X_train_val, y_train_val)

In [30]:
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(acc)

0.9479377958079783


#### Test Set inference

In [31]:
y_pred=model.predict(X_test)

In [32]:
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9523809523809523


#### Save Model

In [33]:
model.save_model('randopt_xgb_model.xgb')



In [34]:
# Load model
'''
from xgboost import XGBClassifier

# Load the saved model object
loaded_model = XGBClassifier()
loaded_model.load_model('model.xgb')
'''

"\nfrom xgboost import XGBClassifier  # Or XGBRegressor depending on your model type\n\n# Load the saved model object\nloaded_model = XGBClassifier()  # Or XGBRegressor\nloaded_model.load_model('model.xgb')\n"