In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import xlrd
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import math
import random
from utils import *

In [13]:
data = pd.read_excel('Titanic_dataset.xls') 

In [14]:
na_value = data.isna()
na_counts = na_value.sum()
print(na_counts)

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [15]:
columns_to_drop = ['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
data = data.drop(columns_to_drop, axis=1)

data['age'].fillna(data['age'].median(), inplace=True)
data['fare'].fillna(data['fare'].mean(), inplace=True)
data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)

data = pd.get_dummies(data, columns=['sex', 'embarked'])

In [16]:
X = data.drop('survived', axis=1)
y = data['survived']


# Split the data into train, validation, and test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)

In [17]:
model = xgb.XGBClassifier()

In [18]:
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.3]
n_estimators = [50, 100, 150, 200, 250, 300]
max_depths = [3, 4, 5, 6, 7, 8]
reg_alphas = [0, 0.1, 0.5, 1, 2]

param_space = []

for lr in learning_rates:
    for ne in n_estimators:
        for md in max_depths:
            for ra in reg_alphas:
                config = {'learning_rate': lr, 'n_estimators': ne, 'max_depth': md, 'reg_alpha': ra}
                param_space.append(config)

In [19]:
len(param_space)

900

In [20]:
def ucb_selection(rewards, attempts, t):
    n_configs = len(rewards)
    ucb_values = []
    
    for i in range(n_configs):
        if attempts[i] == 0:
            ucb_values.append(float('inf'))  # Assign a high value to unexplored configurations
        else:
            ucb_values.append(rewards[i] + math.sqrt(2 * math.log(t) / (attempts[i] + 1e-8)))  # Add epsilon value
        
    best_config = max(range(n_configs), key=lambda x: ucb_values[x])
    return best_config 

In [21]:
rewards = [0] * len(param_space)
attempts = [0] * len(param_space)
best_config_ucb = None
best_error_ucb = float('inf')

best_config_random = None
best_error_random = float('inf')

max_iterations = 70

# Perform cross-validation for evaluation
cv = 5  # Number of cross-validation folds

for t in range(1, max_iterations + 1):
    # UCB Strategy
    config_idx_ucb = ucb_selection(rewards, attempts, t)
    config_ucb = param_space[config_idx_ucb]
    model.set_params(**config_ucb)
    
    # Train the model on the training set
    model.fit(X_train, y_train)
    
    # Perform cross-validation on the validation set
    scores = cross_val_score(model, X_val, y_val, cv=cv)
    error_ucb = np.mean(scores)
    
    rewards[config_idx_ucb] += error_ucb
    attempts[config_idx_ucb] += 1
    
    if error_ucb < best_error_ucb:
        best_config_ucb = config_ucb
        best_error_ucb = error_ucb
    
    # Random Strategy
    config_random = random.choice(param_space)
    model.set_params(**config_random)
    
    # Train the model on the training set
    model.fit(X_train, y_train)
    
    # Perform cross-validation on the validation set
    scores = cross_val_score(model, X_val, y_val, cv=cv)
    error_random = np.mean(scores)
    
    if error_random < best_error_random:
        best_config_random = config_random
        best_error_random = error_random


print("Dataset: Titanic Dataset")

print("\nUCB Strategy:")
print("Best validation error:", best_error_ucb)
print("Best hyperparameter configuration:", best_config_ucb)
    
print("\nRandom Strategy:")
print("Best validation error:", best_error_random)
print("Best hyperparameter configuration:", best_config_random)

Dataset: Titanic Dataset

UCB Strategy:
Best validation error: 0.6904761904761905
Best hyperparameter configuration: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 6, 'reg_alpha': 0}

Random Strategy:
Best validation error: 0.680952380952381
Best hyperparameter configuration: {'learning_rate': 0.05, 'n_estimators': 250, 'max_depth': 3, 'reg_alpha': 0}


In [67]:
# rewards = [0] * len(param_space)
# attempts = [0] * len(param_space)
# best_config_ucb = None
# best_score_ucb = 0

# best_config_random = None
# best_score_random = 0

# max_iterations = 30

# for t in range(1, max_iterations + 1):
#     # UCB Strategy
#     config_idx_ucb = ucb_selection(rewards, attempts, t)
#     config_ucb = param_space[config_idx_ucb]
#     model.set_params(**config_ucb)
#     model.fit(X_train, y_train)
    
#     # Evaluate performance on validation set using multiple metrics
#     metrics_ucb = {
#         'accuracy': accuracy_score(y_val, model.predict(X_val)),
#         'precision': precision_score(y_val, model.predict(X_val)),
#         'recall': recall_score(y_val, model.predict(X_val)),
#         'f1': f1_score(y_val, model.predict(X_val)),
#         'roc_auc': roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
#     }
    
#     # Calculate mean performance score
#     score_ucb = np.mean(list(metrics_ucb.values()))
    
#     rewards[config_idx_ucb] += score_ucb
#     attempts[config_idx_ucb] += 1
    
#     if score_ucb > best_score_ucb:
#         best_config_ucb = config_ucb
#         best_score_ucb = score_ucb
    
#     # Random Strategy
#     config_random = random.choice(param_space)
#     model.set_params(**config_random)
#     model.fit(X_train, y_train)
    
#     # Evaluate performance on validation set using multiple metrics
#     metrics_random = {
#         'accuracy': accuracy_score(y_val, model.predict(X_val)),
#         'precision': precision_score(y_val, model.predict(X_val)),
#         'recall': recall_score(y_val, model.predict(X_val)),
#         'f1': f1_score(y_val, model.predict(X_val)),
#         'roc_auc': roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
#     }
    
#     # Calculate mean performance score
#     score_random = np.mean(list(metrics_random.values()))
    
#     if score_random > best_score_random:
#         best_config_random = config_random
#         best_score_random = score_random

In [68]:
# Train the final model using the best hyperparameters from UCB strategy
model.set_params(**best_config_ucb)
model.fit(X_trainval, y_trainval)

In [69]:
# Evaluate performance on the test set using the best hyperparameters from UCB strategy
test_predictions_ucb = model.predict(X_test)
test_accuracy_ucb = accuracy_score(y_test, test_predictions_ucb)
test_precision_ucb = precision_score(y_test, test_predictions_ucb)
test_recall_ucb = recall_score(y_test, test_predictions_ucb)
test_f1_ucb = f1_score(y_test, test_predictions_ucb)
test_roc_auc_ucb = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

In [70]:
# Train the final model using the best hyperparameters from random strategy
model.set_params(**best_config_random)
model.fit(X_trainval, y_trainval)

In [71]:
# Evaluate performance on the test set using the best hyperparameters from random strategy
test_predictions_random = model.predict(X_test)
test_accuracy_random = accuracy_score(y_test, test_predictions_random)
test_precision_random = precision_score(y_test, test_predictions_random)
test_recall_random = recall_score(y_test, test_predictions_random)
test_f1_random = f1_score(y_test, test_predictions_random)
test_roc_auc_random = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

In [72]:
print("UCB-selected hyperparameters: ", best_config_ucb)
print("UCB-selected performance:")
print(f"  - Accuracy: {test_accuracy_ucb}")
print(f"  - Precision: {test_precision_ucb}")
print(f"  - Recall: {test_recall_ucb}")
print(f"  - F1-score: {test_f1_ucb}")
print(f"  - ROC AUC: {test_roc_auc_ucb}")

print("\nRandom-selected hyperparameters: ", best_config_random)
print("Random-selected performance:")
print(f"  - Accuracy: {test_accuracy_random}")
print(f"  - Precision: {test_precision_random}")
print(f"  - Recall: {test_recall_random}")
print(f"  - F1-score: {test_f1_random}")
print(f"  - ROC AUC: {test_roc_auc_random}")


UCB-selected hyperparameters:  {'learning_rate': 0.01, 'n_estimators': 50, 'max_depth': 6, 'reg_alpha': 0}
UCB-selected performance:
  - Accuracy: 0.7595419847328244
  - Precision: 0.8666666666666667
  - Recall: 0.5508474576271186
  - F1-score: 0.6735751295336788
  - ROC AUC: 0.85172434086629

Random-selected hyperparameters:  {'learning_rate': 0.05, 'n_estimators': 100, 'max_depth': 5, 'reg_alpha': 0.5}
Random-selected performance:
  - Accuracy: 0.767175572519084
  - Precision: 0.8607594936708861
  - Recall: 0.576271186440678
  - F1-score: 0.6903553299492386
  - ROC AUC: 0.8666137005649719
