### Import Necessary Libraries

In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import xlrd
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import math
import random
from utils import * 

# Part 1

### Read the dataset

In [81]:
data = pd.read_excel('Titanic_dataset.xls')

### Check for missing values

In [82]:
na_value = data.isna()
na_counts = na_value.sum()
print(na_counts)

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


### Let's drop irrelevant columns, handle missing values and perform one-hot encoding for categorical variables

In [83]:
columns_to_drop = ['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
data = data.drop(columns_to_drop, axis=1)

data['age'].fillna(data['age'].median(), inplace=True)
data['fare'].fillna(data['fare'].mean(), inplace=True)
data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)

data = pd.get_dummies(data, columns=['sex', 'embarked'])

# Part 2

In [84]:
X = data.drop('survived', axis=1)
y = data['survived']

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42) 

### Choose a gradient boosting model library and initialize the model with default hyperparameters. 

In [49]:
model = xgb.XGBClassifier()

### Define the hyperparameter search space based on the available hyperparameters of the chosen model.

In [61]:
learning_rates = [0.01, 0.1, 0.2]
n_estimators = [100, 200, 300]
max_depths = [3, 5, 7]
reg_alphas = [0, 0.1, 0.5]

param_space = []

for lr in learning_rates:
    for ne in n_estimators:
        for md in max_depths:
            for ra in reg_alphas:
                config = {'learning_rate': lr, 'n_estimators': ne, 'max_depth': md, 'reg_alpha': ra}
                param_space.append(config)


### Implement the UCB algorithm to select hyperparameters based on past performance.  

In [62]:
def ucb_selection(rewards, attempts, t):
    n_configs = len(rewards)
    ucb_values = []
    
    for i in range(n_configs):
        if attempts[i] == 0:
            ucb_values.append(float('inf'))  # Assign a high value to unexplored configurations
        else:
            ucb_values.append(rewards[i] + math.sqrt(2 * math.log(t) / (attempts[i] + 1e-8)))  # Add epsilon value
        
    best_config = max(range(n_configs), key=lambda x: ucb_values[x])
    return best_config

### Perform the hyperparameter tuning loop using the UCB algorithm.

In [75]:
rewards = [0] * len(param_space)
attempts = [0] * len(param_space)
best_config = None
best_score = 0
max_iterations = 30

for t in range(1, max_iterations + 1):
    # Select hyperparameters using UCB algorithm
    config_idx = ucb_selection(rewards, attempts, t)
    print("config_idx:", config_idx)  # Added print statement
    config = param_space[config_idx]

    # Train and evaluate the model with the selected hyperparameters
    print("config:", config)  # Added print statement
    model.set_params(**config)
    model.fit(X_train, y_train)
    
    # Evaluate performance on multiple metrics using cross-validation
    metrics = {
        'accuracy': cross_val_score(model, X_trainval, y_trainval, cv=5),
        'precision': cross_val_score(model, X_trainval, y_trainval, cv=5, scoring='precision'),
        'recall': cross_val_score(model, X_trainval, y_trainval, cv=5, scoring='recall'),
        'f1': cross_val_score(model, X_trainval, y_trainval, cv=5, scoring='f1'),
        'roc_auc': cross_val_score(model, X_trainval, y_trainval, cv=5, scoring='roc_auc')
    }
    
    # Calculate mean performance scores across all metrics
    score = np.mean(list(metrics.values()))
    
    # Update rewards and attempts
    rewards[config_idx] += score
    attempts[config_idx] += 1
    
    # Track the best hyperparameter configuration and score
    if score > best_score:
        best_config = config
        best_score = score

# Train the final model using the best hyperparameters
model.set_params(**best_config)
model.fit(X_trainval, y_trainval)

config_idx: 0
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 3, 'reg_alpha': 0}
config_idx: 1
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 3, 'reg_alpha': 0.1}
config_idx: 2
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 3, 'reg_alpha': 0.5}
config_idx: 3
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 5, 'reg_alpha': 0}
config_idx: 4
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 5, 'reg_alpha': 0.1}
config_idx: 5
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 5, 'reg_alpha': 0.5}
config_idx: 6
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 7, 'reg_alpha': 0}
config_idx: 7
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 7, 'reg_alpha': 0.1}
config_idx: 8
config: {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 7, 'reg_alpha': 0.5}
config_idx: 9
config: {'learning_rate': 0.01, 'n_estimators': 200, 'max_depth': 3, 'reg_alpha': 0

## Evaluate the final model on the test set

In [76]:
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)
test_roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

### Comparison with Random Strategy: Implement the random strategy for hyperparameter selection and compare its performance with the UCB algorithm.

In [77]:
random_config = random.choice(param_space)
model.set_params(**random_config)
model.fit(X_trainval, y_trainval)

## Evaluate the model selected by random strategy on the test set

In [78]:
random_test_predictions = model.predict(X_test)
random_test_accuracy = accuracy_score(y_test, random_test_predictions)
random_test_precision = precision_score(y_test, random_test_predictions)
random_test_recall = recall_score(y_test, random_test_predictions)
random_test_f1 = f1_score(y_test, random_test_predictions)
random_test_roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

## Results and Evaluation: Present the results, including the dataset, UCB algorithm implementation, and performance comparison between UCB and random.


In [79]:
print("Dataset: Titanic Dataset")
print("\nUCB-selected hyperparameters: ", best_config)
print("UCB-selected performance:")
print(f"  - Accuracy: {test_accuracy}")
print(f"  - Precision: {test_precision}")
print(f"  - Recall: {test_recall}")
print(f"  - F1-score: {test_f1}")
print(f"  - ROC AUC: {test_roc_auc}")

print("\nRandom-selected hyperparameters: ", random_config)
print("Random-selected performance:")
print(f"  - Accuracy: {random_test_accuracy}")
print(f"  - Precision: {random_test_precision}")
print(f"  - Recall: {random_test_recall}")
print(f"  - F1-score: {random_test_f1}")
print(f"  - ROC AUC: {random_test_roc_auc}")

Dataset: Titanic Dataset

UCB-selected hyperparameters:  {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 7, 'reg_alpha': 0.1}
UCB-selected performance:
  - Accuracy: 0.7938931297709924
  - Precision: 0.8809523809523809
  - Recall: 0.6271186440677966
  - F1-score: 0.7326732673267327
  - ROC AUC: 0.8694091337099811

Random-selected hyperparameters:  {'learning_rate': 0.2, 'n_estimators': 200, 'max_depth': 7, 'reg_alpha': 0}
Random-selected performance:
  - Accuracy: 0.7862595419847328
  - Precision: 0.7924528301886793
  - Recall: 0.711864406779661
  - F1-score: 0.75
  - ROC AUC: 0.857962570621469


## Visualization: Plotting the performance metrics for different hyperparameter configurations
