In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier

%matplotlib inline
pd.set_option("display.precision", 2)

  from pandas import MultiIndex, Int64Index


This notebook attempts to solve the Titanic problem with a grid search approach for training our models

## 1) Preprocessing the data

First of all, we need to get the data into a format that we can actually use for models \
Therefore, we remove missing values and redundant columns etc

In [55]:
data = pd.read_csv("../Data/Titanic/train.csv")

In [56]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [57]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [58]:
# select target
y = data.Survived

# drop target from the data
X = data.drop(['Survived'], axis = 1)

In [59]:
# split data into train and test
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [60]:
# where do we have null values?

cols_with_null = [col for col in X_train.columns if X_train[col].isnull().any()]
#print("null values in ", cols_with_null)

Null values present in Age, Embarked and Cabin
* Cabin is null if Pclass != 1 --> drop
* Drop PassengerId, Ticket, Name --> all are non-useful columns
* For age, just give people the median age
* Drop Embarked (for now)

In [61]:
# dropping relevant columns
cols_to_drop = ['Cabin', 'PassengerId', 'Name', 'Ticket', 'Embarked']
reduced_X_train = X_train.drop(cols_to_drop, axis = 1)
reduced_X_valid = X_valid.drop(cols_to_drop, axis = 1)

In [62]:
# # ONE-HOT ENCODING

encoder = OneHotEncoder(sparse = False)

label_X_train = reduced_X_train.copy()
label_X_valid = reduced_X_valid.copy()

# train onehot encoder to the data
train_encoded = pd.DataFrame(encoder.fit_transform(np.asarray(label_X_train['Sex']).reshape(-1, 1)), dtype = int)
valid_encoded = pd.DataFrame(encoder.transform(np.asarray(label_X_valid['Sex']).reshape(-1, 1)), dtype = int)

train_encoded.index = label_X_train.index
valid_encoded.index = label_X_valid.index

# remove the sex column from label_X_train and add back columns for Male and Female
num_X_train = label_X_train.drop(['Sex'], axis = 1)
num_X_valid = label_X_valid.drop(['Sex'], axis = 1)

encoded_X_train = pd.concat([num_X_train, train_encoded], axis = 1)
encoded_X_train = encoded_X_train.rename(columns = {0 : 'Male', 1 : 'Female'})

encoded_X_valid = pd.concat([num_X_valid, valid_encoded], axis = 1)
encoded_X_valid = encoded_X_valid.rename(columns = {0 : 'Male', 1 : 'Female'})
encoded_X_train.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Female
140,3,,0,2,15.25,1,0
439,2,31.0,0,0,10.5,0,1
817,2,31.0,1,1,37.0,0,1
378,3,20.0,0,0,4.01,0,1
491,3,21.0,0,0,7.25,0,1


In [63]:
final_X_train = encoded_X_train.copy()
final_X_valid = encoded_X_valid.copy()

# replace values in the age column with median age
final_X_train['Age'] = encoded_X_train['Age'].fillna(encoded_X_train['Age'].median())
final_X_valid['Age'] = encoded_X_valid['Age'].fillna(encoded_X_valid['Age'].median())

In [64]:
# delete all variables that aren't useful anymore
del(label_X_train, label_X_valid, reduced_X_train, reduced_X_valid, \
    cols_with_null, cols_to_drop, train_encoded, valid_encoded, \
   num_X_train, num_X_valid, encoded_X_train, encoded_X_valid)

In [65]:
final_X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Female
140,3,29.0,0,2,15.25,1,0
439,2,31.0,0,0,10.5,0,1
817,2,31.0,1,1,37.0,0,1
378,3,20.0,0,0,4.01,0,1
491,3,21.0,0,0,7.25,0,1


#### 1) Random Forest Classifier
Optimised using Grid Search

In [69]:
# set of parameters to tune over
param_grid = { 
    'n_estimators': [70, 80, 90, 100, 110, 120],
    'max_depth' : [1, 3, 5, 7, 9, 11],
                }

randomForest_CV = GridSearchCV(estimator = RandomForestClassifier(random_state = 0), param_grid = param_grid, \
                               cv = 5, verbose = True, scoring = "f1", refit = True)

# carries out a grid search with cross validation, using f1_score for the scoring because the data isn't balanced very well

randomForest_CV.fit(final_X_train, y_train)

# Get the parameters from the best fit
nRF = randomForest_CV.best_params_.get('n_estimators')
max_depth = randomForest_CV.best_params_.get('max_depth')

best_model = randomForest_CV.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [70]:
print("best params = ", randomForest_CV.best_params_)
print("best score = ", randomForest_CV.best_score_)
ind_of_best = randomForest_CV.best_index_


best params =  {'max_depth': 11, 'n_estimators': 120}
best score =  0.7622528007356084


### 2) XGB Boost Method

All seems to work fairly fine, but would be good to actually tune hyperparameters to see if we can get a better model \
What about an XG Boost method?

In [71]:
param_grid = { 
    'learning_rate' : [0.1, 0.2, 0.5],
    'max_depth': [3, 5, 7],   
    'n_estimators' : [80, 90, 100, 110, 120, 130]
                }
xgBoost_CV = GridSearchCV(estimator = XGBClassifier(use_label_encoder = False, eval_metric = 'logloss', random_state = 0), \
param_grid = param_grid, cv = 3, verbose = 1, scoring = "f1", refit = True)
xgBoost_CV.fit(final_X_train, y_train)

# # Get the parameters from the best fit
learning = xgBoost_CV.best_params_.get('learning_rate')
depth = xgBoost_CV.best_params_.get('max_depth')
nestimators = xgBoost_CV.best_params_.get('n_estimators')

bestXGB = xgBoost_CV.best_estimator_

Fitting 3 folds for each of 54 candidates, totalling 162 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

In [72]:
print("best params = ", xgBoost_CV.best_params_)
print("best score = ", xgBoost_CV.best_score_)

best params =  {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 110}
best score =  0.7406092224752444
