# Titanic dataset - Predict survival

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
%matplotlib inline
#plt.style.use('fivethirtyeight')
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
df = pd.read_csv('/Users/gautamborgohain/Downloads/titanic_train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [None]:
# df = pd.get_dummies(df,columns=['Sex','Embarked'])
# df.drop(['PassengerId','Name','Ticket','Cabin'], axis = 1, inplace=True)


In [3]:
df.Cabin.fillna(0, inplace=True)

Seperate out the Cabin attribute to see if it improves performance

In [4]:
df['Cabin_Alp'] = [' '.join(re.findall(r'([A-Z]+)[0-9]',str(cabin))) for cabin in df.Cabin]
df['Cabin_Num'] = [''.join(re.findall(r'[A-Z]([0-9]+)',str(cabin))) for cabin in df.Cabin]

In [5]:
df.Cabin_Num = pd.to_numeric(df.Cabin_Num)
df.Cabin_Num.fillna(0, inplace=True)

In [6]:
df = pd.get_dummies(df,columns=['Sex','Embarked','Cabin_Alp'])
df.drop(['PassengerId','Name','Ticket','Cabin'], axis = 1, inplace=True)

In [7]:
df.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_Num',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Cabin_Alp_', 'Cabin_Alp_A', 'Cabin_Alp_B', 'Cabin_Alp_B B',
       'Cabin_Alp_B B B', 'Cabin_Alp_B B B B', 'Cabin_Alp_C', 'Cabin_Alp_C C',
       'Cabin_Alp_C C C', 'Cabin_Alp_D', 'Cabin_Alp_D D', 'Cabin_Alp_E',
       'Cabin_Alp_F', 'Cabin_Alp_G'],
      dtype='object')

# XGBoost

In [8]:
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

import xgboost as xgb
y = np.ravel(df.Survived)
X = df.drop('Survived', axis = 1,inplace= False)
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size = 0.3, random_state=1)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((623, 25), (268, 25), (623,), (268,))

In [9]:
# Default parameters
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [10]:
from sklearn import metrics
predicted = xgb_model.predict(X_test)       
metrics.accuracy_score(Y_test, predicted)

0.76865671641791045

In [11]:
from sklearn import cross_validation
scores = cross_validation.cross_val_score(xgb_model, X_train, Y_train, cv=5)
scores.mean()

0.84120798771121363

Tuning the parameters

In [12]:
xgb_model = xgb.XGBClassifier(gamma=0,learning_rate=0.01,max_depth=6,reg_alpha=0, reg_lambda=1.0)
xgb_model.fit(X_train,Y_train)
predicted = xgb_model.predict(X_test)       
metrics.accuracy_score(Y_test, predicted)

0.79104477611940294

In [14]:
scores = cross_validation.cross_val_score(xgb_model, X_train, Y_train, cv=5)
scores.mean()

0.8220323604710702

It has been overtrained as i decreased the learning rate too much

# Random Forrest

In [15]:
df = pd.read_csv('/Users/gautamborgohain/Downloads/titanic_train.csv')
df.Cabin.fillna(0, inplace=True)
df['Cabin_Alp'] = [' '.join(re.findall(r'([A-Z]+)[0-9]',str(cabin))) for cabin in df.Cabin]
df['Cabin_Num'] = [''.join(re.findall(r'[A-Z]([0-9]+)',str(cabin))) for cabin in df.Cabin]
df.Cabin_Num = pd.to_numeric(df.Cabin_Num)
df = pd.get_dummies(df,columns=['Sex','Embarked','Cabin_Alp'])
df.drop(['PassengerId','Name','Ticket','Cabin'], axis = 1, inplace=True)
df.fillna(0, inplace=True)

y = np.ravel(df.Survived)
X = df.drop('Survived', axis = 1,inplace= False)
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size = 0.3, random_state=1)

Tune the parameters in the classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier
randomf_model = RandomForestClassifier(n_estimators = 500, random_state = 1,max_features = 7,max_depth = 10)
randomf_model.fit(X_train, Y_train)
predicted = randomf_model.predict(X_test)
metrics.accuracy_score(Y_test, predicted) 

0.79104477611940294

In [17]:
scores = cross_validation.cross_val_score(randomf_model, X_train, Y_train, cv=5)
scores.mean()

0.83808458781362005