In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
pd.set_option("display.precision", 2)

This notebook tries to solve the Titanic problem using Bayesian optimisation to do hyperparameter tuning

## 1) Preprocessing the data

First of all, we need to get the data into a format that we can actually use for models \
Therefore, we remove missing values and redundant columns etc

In [109]:
data = pd.read_csv("../Data/Titanic/train.csv")

In [110]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [111]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [112]:
# select target
y = data.Survived

# drop target from the data
X = data.drop(['Survived'], axis = 1)

In [113]:
# split data into train and test
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [114]:
# where do we have null values?

cols_with_null = [col for col in X_train.columns if X_train[col].isnull().any()]
#print("null values in ", cols_with_null)

Null values present in Age, Embarked and Cabin
* Cabin is null if Pclass != 1 --> drop
* Drop PassengerId, Ticket, Name --> all are non-useful columns
* For age, just give people the median age
* Drop Embarked (for now)

In [115]:
# dropping relevant columns
cols_to_drop = ['Cabin', 'PassengerId', 'Name', 'Ticket', 'Embarked']
reduced_X_train = X_train.drop(cols_to_drop, axis = 1)
reduced_X_valid = X_valid.drop(cols_to_drop, axis = 1)

In [116]:
# # ONE-HOT ENCODING

encoder = OneHotEncoder(sparse = False)

label_X_train = reduced_X_train.copy()
label_X_valid = reduced_X_valid.copy()

# train onehot encoder to the data
train_encoded = pd.DataFrame(encoder.fit_transform(np.asarray(label_X_train['Sex']).reshape(-1, 1)), dtype = int)
valid_encoded = pd.DataFrame(encoder.transform(np.asarray(label_X_valid['Sex']).reshape(-1, 1)), dtype = int)

train_encoded.index = label_X_train.index
valid_encoded.index = label_X_valid.index

# remove the sex column from label_X_train and add back columns for Male and Female
num_X_train = label_X_train.drop(['Sex'], axis = 1)
num_X_valid = label_X_valid.drop(['Sex'], axis = 1)

encoded_X_train = pd.concat([num_X_train, train_encoded], axis = 1)
encoded_X_train = encoded_X_train.rename(columns = {0 : 'Male', 1 : 'Female'})

encoded_X_valid = pd.concat([num_X_valid, valid_encoded], axis = 1)
encoded_X_valid = encoded_X_valid.rename(columns = {0 : 'Male', 1 : 'Female'})
encoded_X_train.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Female
140,3,,0,2,15.25,1,0
439,2,31.0,0,0,10.5,0,1
817,2,31.0,1,1,37.0,0,1
378,3,20.0,0,0,4.01,0,1
491,3,21.0,0,0,7.25,0,1


In [117]:
final_X_train = encoded_X_train.copy()
final_X_valid = encoded_X_valid.copy()

# replace values in the age column with median age
final_X_train['Age'] = encoded_X_train['Age'].fillna(encoded_X_train['Age'].median())
final_X_valid['Age'] = encoded_X_valid['Age'].fillna(encoded_X_valid['Age'].median())

In [118]:
# delete all variables that aren't useful anymore
del(label_X_train, label_X_valid, reduced_X_train, reduced_X_valid, \
    cols_with_null, cols_to_drop, train_encoded, valid_encoded, \
   num_X_train, num_X_valid, encoded_X_train, encoded_X_valid)

In [119]:
final_X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Female
140,3,29.0,0,2,15.25,1,0
439,2,31.0,0,0,10.5,0,1
817,2,31.0,1,1,37.0,0,1
378,3,20.0,0,0,4.01,0,1
491,3,21.0,0,0,7.25,0,1


## Bayesian Optimisation of a xgBoost method

This uses a library to carry out Bayesian optimisation to progressively improve an xgBoost method

In [120]:
from xgboost import XGBClassifier

# setting up a default classifier

classifier1 = XGBClassifier(use_label_encoder=False,eval_metric='logloss').fit(final_X_train, y_train)

# making predictions using that default classifier

default_predictions = classifier1.predict(final_X_valid)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [121]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(default_predictions, y_valid)
acc = cm.diagonal().sum()/cm.sum()
print("accuracy with default = ", acc)

accuracy with default =  0.8324022346368715


In [122]:
from bayes_opt import BayesianOptimization
import xgboost as xgb
from sklearn.metrics import f1_score

In [123]:
# convert data to a matrix to use with xgb 
def func_to_tune(max_depth, n_estimators, learning_rate):
    params = {'max_depth': int(max_depth),
                'n_estimators': int(n_estimators),
                'learning_rate': learning_rate}
    trained_model = XGBClassifier(learning_rate=params['learning_rate'], \
        max_depth=params['max_depth'], \
            n_estimators=params['n_estimators'], use_label_encoder=False, objective='binary:logistic', \
                eval_metric='logloss', \
                    verbose=0, silent=True).fit(final_X_train, y_train)
    predictions = trained_model.predict(final_X_valid)
    score = f1_score(y_valid, predictions)
    return score

In [124]:
import optuna

In [125]:
def objective(trial):
    rf_max_depth = trial.suggest_int('max_depth', 3, 10)
    rf_n_estimators = trial.suggest_int('n_estimators', 50, 150)
    classifier_obj = RandomForestClassifier(
                                   max_depth=rf_max_depth, 
                                   n_estimators=rf_n_estimators).fit(final_X_train, y_train)
    predictions = classifier_obj.predict(final_X_valid)
    score = f1_score(y_valid, predictions)
    return score 
    

In [126]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

[32m[I 2022-03-19 15:49:58,081][0m A new study created in memory with name: no-name-5772a3e4-826a-42f2-a1c8-e954c7082a8e[0m
[32m[I 2022-03-19 15:49:58,326][0m Trial 0 finished with value: 0.7559055118110236 and parameters: {'max_depth': 6, 'n_estimators': 115}. Best is trial 0 with value: 0.7559055118110236.[0m
[32m[I 2022-03-19 15:49:58,567][0m Trial 1 finished with value: 0.7906976744186046 and parameters: {'max_depth': 10, 'n_estimators': 112}. Best is trial 1 with value: 0.7906976744186046.[0m
[32m[I 2022-03-19 15:49:58,796][0m Trial 2 finished with value: 0.7596899224806202 and parameters: {'max_depth': 6, 'n_estimators': 122}. Best is trial 1 with value: 0.7906976744186046.[0m
[32m[I 2022-03-19 15:49:58,932][0m Trial 3 finished with value: 0.8 and parameters: {'max_depth': 10, 'n_estimators': 70}. Best is trial 3 with value: 0.8.[0m
[32m[I 2022-03-19 15:49:59,105][0m Trial 4 finished with value: 0.7596899224806202 and parameters: {'max_depth': 5, 'n_estimators': 

In [132]:
import plotly
from optuna.visualization import plot_optimization_history
plot_optimization_history(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.