In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics

%matplotlib inline
pd.set_option("display.precision", 2)

This notebook attempts to solve the Titanic problem with a grid search approach for training our models

## 1) Preprocessing the data

First of all, we need to get the data into a format that we can actually use for models \
Therefore, we remove missing values and redundant columns etc

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [5]:
# select target
y = data.Survived

# drop target from the data
X = data.drop(['Survived'], axis = 1)

In [6]:
# split data into train and test
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [7]:
# where do we have null values?

cols_with_null = [col for col in X_train.columns if X_train[col].isnull().any()]
#print("null values in ", cols_with_null)

Null values present in Age, Embarked and Cabin
* Cabin is null if Pclass != 1 --> drop
* Drop PassengerId, Ticket, Name --> all are non-useful columns
* For age, just give people the median age
* Drop Embarked (for now)

In [8]:
# dropping relevant columns
cols_to_drop = ['Cabin', 'PassengerId', 'Name', 'Ticket', 'Embarked']
reduced_X_train = X_train.drop(cols_to_drop, axis = 1)
reduced_X_valid = X_valid.drop(cols_to_drop, axis = 1)

In [9]:
# # LABEL ENCODING

# from sklearn.preprocessing import LabelEncoder

# object_cols = [col for col in reduced_X_train.columns if reduced_X_train[col].dtype == "object"]
# label_encoder = LabelEncoder()

# label_X_train = reduced_X_train.copy()
# label_X_valid = reduced_X_valid.copy()
# for col in set(object_cols):
#     print(col)
#     label_X_train[col] = label_encoder.fit_transform(reduced_X_train[col])
#     label_X_valid[col] = label_encoder.transform(reduced_X_valid[col]) # Your code here

# # ONE-HOT ENCODING
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse = False)

label_X_train = reduced_X_train.copy()
label_X_valid = reduced_X_valid.copy()

# train onehot encoder to the data
train_encoded = pd.DataFrame(encoder.fit_transform(np.asarray(label_X_train['Sex']).reshape(-1, 1)), dtype = int)
valid_encoded = pd.DataFrame(encoder.transform(np.asarray(label_X_valid['Sex']).reshape(-1, 1)), dtype = int)

train_encoded.index = label_X_train.index
valid_encoded.index = label_X_valid.index

# remove the sex column from label_X_train and add back columns for Male and Female
num_X_train = label_X_train.drop(['Sex'], axis = 1)
num_X_valid = label_X_valid.drop(['Sex'], axis = 1)

encoded_X_train = pd.concat([num_X_train, train_encoded], axis = 1)
encoded_X_train = encoded_X_train.rename(columns = {0 : 'Male', 1 : 'Female'})

encoded_X_valid = pd.concat([num_X_valid, valid_encoded], axis = 1)
encoded_X_valid = encoded_X_valid.rename(columns = {0 : 'Male', 1 : 'Female'})
encoded_X_train.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Female
140,3,,0,2,15.25,1,0
439,2,31.0,0,0,10.5,0,1
817,2,31.0,1,1,37.0,0,1
378,3,20.0,0,0,4.01,0,1
491,3,21.0,0,0,7.25,0,1


In [10]:
final_X_train = encoded_X_train.copy()
final_X_valid = encoded_X_valid.copy()

# replace values in the age column with median age
final_X_train['Age'] = encoded_X_train['Age'].fillna(encoded_X_train['Age'].median())
final_X_valid['Age'] = encoded_X_valid['Age'].fillna(encoded_X_valid['Age'].median())

In [11]:
# delete all variables that aren't useful anymore
del(label_X_train, label_X_valid, reduced_X_train, reduced_X_valid, \
    cols_with_null, cols_to_drop, train_encoded, valid_encoded, \
   num_X_train, num_X_valid, encoded_X_train, encoded_X_valid)

In [12]:
final_X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Female
140,3,29.0,0,2,15.25,1,0
439,2,31.0,0,0,10.5,0,1
817,2,31.0,1,1,37.0,0,1
378,3,20.0,0,0,4.01,0,1
491,3,21.0,0,0,7.25,0,1


### 1) Basic Random Forest Classifier

In [13]:
# from sklearn.ensemble import RandomForestClassifier

In [14]:
# model1 = RandomForestClassifier(n_estimators = 80, \
#                                max_depth = 5, \
#                                random_state = 0)

In [15]:
# model1.fit(final_X_train, y_train)

In [16]:
# predictions1 = model1.predict(final_X_valid)

In [17]:
# # calculate the error
# score_train = metrics.f1_score(np.asarray(y_valid), predictions1)
# accuracy_score = metrics.accuracy_score(np.asarray(y_valid), predictions1)
# print("f1 score = ", score_train)
# print("accuracy score = ", accuracy_score)

#### Now improve the random forest classifier using a grid search

In [18]:
# from sklearn.model_selection import GridSearchCV

In [19]:
# param_grid = { 
#     'criterion' : ['gini'],
#     'n_estimators': [70, 80, 90, 100, 110, 120],
#     'max_features': ['auto', 'log2'],
#     'max_depth' : [3, 5, 7, 9, 11],
#                 }
# randomForest_CV = GridSearchCV(estimator = RandomForestClassifier(random_state = 0), param_grid = param_grid, \
#                                cv = 3, verbose = True, scoring = "f1")
# randomForest_CV.fit(final_X_train, y_train)
# print(randomForest_CV.best_params_)

# # Get the parameters from the best fit
# criterion = randomForest_CV.best_params_.get('criterion')
# nRF = randomForest_CV.best_params_.get('n_estimators')
# max_features = randomForest_CV.best_params_.get('max_features')
# max_depth = randomForest_CV.best_params_.get('max_depth')

# best_model = RandomForestClassifier(n_estimators=nRF, max_depth=max_depth, \
#                                     max_features=max_features, criterion=criterion, random_state = 0)
# best_model.fit(final_X_train, y_train)

In [20]:
# predictions_rf = best_model.predict(final_X_valid)
# print("f1 score =", metrics.f1_score(np.asarray(y_valid), predictions_rf))
# print("accuracy score = ", metrics.f1_score(np.asarray(y_valid), predictions_rf))

Finding that the grid search actually produces a worse answer than just guessing a single tree - is it actually working correctly? Are we doing cross-fold validation incorrectly? 

### 2) XGB Boost Method

All seems to work fairly fine, but would be good to actually tune hyperparameters to see if we can get a better model \
What about an XG Boost method?

In [21]:
# # basic XGBoost model
# import xgboost as xgb
# from xgboost import XGBClassifier

# xgboost_searcher = XGBClassifier(use_label_encoder=False,random_state=0,n_estimators=110,learning_rate=0.2,max_depth=3)
# xgboost_searcher.fit(final_X_train, y_train)
# predictions2 = xgboost_searcher.predict(final_X_valid)

In [22]:
# print("f1 score =", metrics.f1_score(np.asarray(y_valid), predictions2))
# print("accuracy score = ", metrics.accuracy_score(np.asarray(y_valid), predictions2))

In [23]:
# param_grid = { 
#     'learning_rate' : [0.1, 0.2, 0.5],
#     'max_depth': [3, 5, 7],   
#     'n_estimators' : [80, 90, 100, 110, 120, 130]
#                 }
# xgBoost_CV = GridSearchCV(estimator = XGBClassifier(use_label_encoder = False, eval_metric = 'logloss', random_state = 0), \
# param_grid = param_grid, cv = 3, verbose = 1, scoring = "f1")
# xgBoost_CV.fit(final_X_train, y_train)

# print(xgBoost_CV.best_params_)

# # # Get the parameters from the best fit
# learning = xgBoost_CV.best_params_.get('learning_rate')
# depth = xgBoost_CV.best_params_.get('max_depth')
# nestimators = xgBoost_CV.best_params_.get('n_estimators')

# # # Train the classifier on all the data for predicting the test data (use best hyperparams)
# bestXGB = XGBClassifier(learning_rate=learning, max_depth=depth, n_estimators=nestimators,\
#                         use_label_encoder=False, eval_metric = 'logloss', random_state = 0)

# bestXGB.fit(final_X_train, y_train)

# testPredictionsXGB = bestXGB.predict(final_X_valid)

In [24]:
# print("f1 score =", metrics.f1_score(np.asarray(y_valid), testPredictionsXGB))
# print("accuracy score = ", metrics.accuracy_score(np.asarray(y_valid), testPredictionsXGB))

### 3) XGBoost with Bayesian hyperparameter tuning

Here, we use an XGBoost method and use a Bayesian optimisation routine to choose our hyperparameters