In [1]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import xlrd
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import math
import random
from utils import *

In [2]:
data = pd.read_excel('Titanic_dataset.xls') 

In [3]:
na_value = data.isna()
na_counts = na_value.sum()
print(na_counts)

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [4]:
columns_to_drop = ['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
data = data.drop(columns_to_drop, axis=1)

data['age'].fillna(data['age'].median(), inplace=True)
data['fare'].fillna(data['fare'].mean(), inplace=True)
data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)

data = pd.get_dummies(data, columns=['sex', 'embarked'])

In [5]:
X = data.drop('survived', axis=1)
y = data['survived']


# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)  

In [6]:
# model = xgb.XGBClassifier() 

In [7]:
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
n_estimators = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
max_depths = [3, 5, 7, 9]
# reg_alphas = [0, 0.1, 0.5, 1, 2]    , 'reg_alpha': ra

param_space = []

for lr in learning_rates:
    for ne in n_estimators:
        for md in max_depths:
            # for ra in reg_alphas:
                config = {'learning_rate': lr, 'n_estimators': ne, 'max_depth': md}
                param_space.append(config)

In [8]:
len(param_space)

280

In [9]:
def ucb_selection(rewards, attempts, t, c):
    ucb_values = []
    n = len(rewards)

    for i in range(n):
        if attempts[i] == 0: # Exploration term when no attempts have been made
            ucb_value = float('inf')
        else:
            exploitation = rewards[i] / attempts[i]
            exploration = c * math.sqrt(math.log(t) / attempts[i])
            ucb_value = exploitation + exploration

        ucb_values.append(ucb_value)

    selected_index = max(range(n), key=lambda i: ucb_values[i])

    return selected_index 

In [10]:
# def ucb_selection(rewards, attempts, t, c):
#     ucb_values = []
#     n = len(rewards)
#     epsilon = 1e-6  # to avoid zero division

#     for i in range(n):
#         exploitation = rewards[i] / (attempts[i] + epsilon)
#         exploration = c * math.sqrt(math.log(t) / (attempts[i] + epsilon))
#         ucb_value = exploitation + exploration
#         ucb_values.append(ucb_value)

#     selected_index = max(range(n), key=lambda i: ucb_values[i])

#     return selected_index


In [14]:
rewards = [0] * len(param_space)
attempts = [0] * len(param_space)
best_config_ucb = None
best_error_ucb = float('inf')

best_config_random = None
best_error_random = float('inf')

max_iterations = 140
c = 2  # Exploration parameter 

cv = 5  # Number of cross-validation folds 

for t in range(1, max_iterations + 1):
    # UCB strategy
    config_idx_ucb = ucb_selection(rewards, attempts, t, c)
    config_ucb = param_space[config_idx_ucb]
    ucb_model = xgb.XGBClassifier(**config_ucb)

    ucb_model.fit(X_train, y_train) 
    y_pred = ucb_model.predict(X_test)

    error_ucb = accuracy_score(y_test,y_pred)
    val_err_ucb = 1 - error_ucb

    rewards[config_idx_ucb] += error_ucb
    attempts[config_idx_ucb] += 1

    if val_err_ucb < best_error_ucb:
        best_config_ucb = config_ucb
        best_error_ucb = val_err_ucb 

# Random Strategy
config_random = random.choice(param_space)
rand_model = xgb.XGBClassifier(**config_random)

rand_model.fit(X_train, y_train)
y_pred_rand = rand_model.predict(X_test)

error_rand = accuracy_score(y_test,y_pred)
val_err_rand = 1 - error_rand

if val_err_rand < best_error_random:
    best_config_random = config_random
    best_error_random = val_err_rand

print("Dataset: Titanic Dataset")

print("\nUCB Strategy:")
print("Best validation error:", best_error_ucb)
print("Best hyperparameter configuration:", best_config_ucb)

print("\nRandom Strategy:")
print("Best validation error:", best_error_random)
print("Best hyperparameter configuration:", best_config_random) 

Dataset: Titanic Dataset

UCB Strategy:
Best validation error: 0.1870229007633588
Best hyperparameter configuration: {'learning_rate': 0.1, 'n_estimators': 300, 'max_depth': 9}

Random Strategy:
Best validation error: 0.20610687022900764
Best hyperparameter configuration: {'learning_rate': 0.2, 'n_estimators': 100, 'max_depth': 9}


### Train the final model using the best hyperparameters from UCB strategy

In [15]:
ucb_test = xgb.XGBClassifier(**best_config_ucb)
ucb_test.fit(X_train, y_train)

### Evaluate performance on the test set using the best hyperparameters from UCB strategy

In [16]:
test_predictions_ucb = ucb_test.predict(X_test)
test_accuracy_ucb = accuracy_score(y_test, test_predictions_ucb)
test_precision_ucb = precision_score(y_test, test_predictions_ucb)
test_recall_ucb = recall_score(y_test, test_predictions_ucb)
test_f1_ucb = f1_score(y_test, test_predictions_ucb)
test_roc_auc_ucb = roc_auc_score(y_test, ucb_test.predict_proba(X_test)[:, 1])

### Evaluate performance on the test set using the best hyperparameters from random strategy


In [18]:
test_predictions_random = y_pred_rand
test_accuracy_random = accuracy_score(y_test, test_predictions_random)
test_precision_random = precision_score(y_test, test_predictions_random)
test_recall_random = recall_score(y_test, test_predictions_random)
test_f1_random = f1_score(y_test, test_predictions_random)
test_roc_auc_random = roc_auc_score(y_test, rand_model.predict_proba(X_test)[:, 1])

In [19]:
print("UCB-selected hyperparameters: ", best_config_ucb)
print("UCB-selected performance:")
print(f"  - Accuracy: {test_accuracy_ucb}")
print(f"  - Precision: {test_precision_ucb}")
print(f"  - Recall: {test_recall_ucb}")
print(f"  - F1-score: {test_f1_ucb}")
print(f"  - ROC AUC: {test_roc_auc_ucb}")

print("\nRandom-selected hyperparameters: ", best_config_random)
print("Random-selected performance:")
print(f"  - Accuracy: {test_accuracy_random}")
print(f"  - Precision: {test_precision_random}")
print(f"  - Recall: {test_recall_random}")
print(f"  - F1-score: {test_f1_random}")
print(f"  - ROC AUC: {test_roc_auc_random}")

UCB-selected hyperparameters:  {'learning_rate': 0.1, 'n_estimators': 300, 'max_depth': 9}
UCB-selected performance:
  - Accuracy: 0.8129770992366412
  - Precision: 0.8349514563106796
  - Recall: 0.7288135593220338
  - F1-score: 0.7782805429864253
  - ROC AUC: 0.8628472222222222

Random-selected hyperparameters:  {'learning_rate': 0.2, 'n_estimators': 100, 'max_depth': 9}
Random-selected performance:
  - Accuracy: 0.8015267175572519
  - Precision: 0.83
  - Recall: 0.7033898305084746
  - F1-score: 0.7614678899082568
  - ROC AUC: 0.8626706685499059
