# Titanic Data Challenge

In [135]:
datafile = "titanicdata.htm"

In [136]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
with open (datafile,encoding='ISO-8859-1') as f:
    soup = BeautifulSoup(f,"html.parser")

In [137]:
table = soup.find('table')

In [138]:
## Read table as a dataframe
## read_html returns a list. Here we have only table and hence we need just the 1st element 
data = pd.read_html(str(table), flavor='bs4')[0]
data.head(5)

Unnamed: 0,Name,Age,Class/Dept,Ticket,Joined,Job,Boat [Body],Unnamed: 7
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27,3rd Class Passenger,2699£18 15s 9d,Cherbourg,,15,
1,"ABBING, Mr Anthony",42,3rd Class Passenger,5547£7 11s,Southampton,Blacksmith,,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39,3rd Class Passenger,CA2673£20 5s,Southampton,,A,
3,"ABBOTT, Mr Rossmore Edward",16,3rd Class Passenger,CA2673£20 5s,Southampton,Jeweller,[190],
4,"ABBOTT, Mr Eugene Joseph",13,3rd Class Passenger,CA2673£20 5s,Southampton,Scholar,,


In [139]:
def cleanup(value):
    return value.encode('ascii', errors='replace').replace("."," ")
def remove_nas(value):
    value=str(value)
    return value.replace("nan"," ")

data["Boat [Body]"] = data["Boat [Body]"].apply(remove_nas)
#data['Name'] = data['Name'].apply(cleanup)
data['Age'] = data['Age'].apply(pd.to_numeric, errors='coerce')
data = data[["Name","Age","Class/Dept","Boat [Body]"]]

data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body]
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190]
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,


## Add attributes

In [140]:
def checkPass(class_type):
    if "Passenger" in class_type:
        return "Passenger"
    else:
        return "Crew"

data["Crew/Pass"] = data["Class/Dept"].apply(checkPass)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger


In [141]:
def extractClass(class_type):
    if "Passenger" in class_type:
        return class_type.split(" ")[0]
    else:
        return "Crew"
data["Class"] = data['Class/Dept'].apply(extractClass)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd


In [142]:
def checkAge(value):
    if value < 18:
        return "Child"
    else:
        return "Adult"
    
data["Child/Adult"] = data["Age"].apply(checkAge)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Child/Adult
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child


In [143]:
def checkGender(name):
    firstname = name[name.index(",")+2:]
    salutation = firstname.split(" ")[0]
    if salutation in ["Mr","Master"]:
        return "Male"
    else:
        return "Female"
    
data["Gender"] = data['Name'].apply(checkGender)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Child/Adult,Gender
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult,Male
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult,Male
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult,Female
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child,Male
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child,Male


In [144]:
def checkSurvival(boat):
    if boat.strip()=="" or "[" in boat:
        return 0
    else:
        return 1
data['Survived'] = data['Boat [Body]'].apply(checkSurvival)
data.head()

Unnamed: 0,Name,Age,Class/Dept,Boat [Body],Crew/Pass,Class,Child/Adult,Gender,Survived
0,"ABÄ«-AL-MUNÃ , Mr NÄsÄ«f QÄsim",27.0,3rd Class Passenger,15,Passenger,3rd,Adult,Male,1
1,"ABBING, Mr Anthony",42.0,3rd Class Passenger,,Passenger,3rd,Adult,Male,0
2,"ABBOTT, Mrs Rhoda Mary 'Rosa'",39.0,3rd Class Passenger,A,Passenger,3rd,Adult,Female,1
3,"ABBOTT, Mr Rossmore Edward",16.0,3rd Class Passenger,[190],Passenger,3rd,Child,Male,0
4,"ABBOTT, Mr Eugene Joseph",13.0,3rd Class Passenger,,Passenger,3rd,Child,Male,0


In [145]:
#REMOVE REC|ORDS WITH NULL VALUES
#len(data)
data = data.dropna()
len(data)

2426

In [146]:
data.groupby(['Crew/Pass'])['Survived'].sum()*100/data.groupby(['Crew/Pass'])['Survived'].count()

Crew/Pass
Crew         13.636364
Passenger    36.425339
Name: Survived, dtype: float64

In [147]:
def compare(group,data):
    return data.groupby([group])['Survived'].sum()*100/data.groupby([group])['Survived'].count()

compare('Class', data)

Class
1st     59.701493
2nd     39.084507
3rd     24.328147
Crew    13.636364
Name: Survived, dtype: float64

In [148]:
compare('Gender',data)

Gender
Female    60.424028
Male      15.645161
Name: Survived, dtype: float64

In [149]:
trainingData = data[['Age','Crew/Pass','Class','Child/Adult','Gender','Survived']]
trainingData.head()

Unnamed: 0,Age,Crew/Pass,Class,Child/Adult,Gender,Survived
0,27.0,Passenger,3rd,Adult,Male,1
1,42.0,Passenger,3rd,Adult,Male,0
2,39.0,Passenger,3rd,Adult,Female,1
3,16.0,Passenger,3rd,Child,Male,0
4,13.0,Passenger,3rd,Child,Male,0


## Try CatBoost

In [158]:
from catboost import CatBoostClassifier

In [178]:
cat_features_index = [1,2,3,4]
#cat_features_index = np.where(trainingData.dtypes != float)[0]
x = trainingData.drop('Survived',axis=1)
y = trainingData.Survived

# Split into train and test 
from sklearn import model_selection
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(x,y,train_size=.85,random_state=1234)

#let us make the catboost model, use_best_model params will make the model prevent overfitting
clf = CatBoostClassifier(eval_metric='Accuracy',use_best_model=True,random_seed=42)

def checkCatAccuracy(clf):
    clf = clf.fit(xtrain, ytrain, cat_features=cat_features_index, eval_set=(xtest,ytest))
    predict = clf.predict(xtest)
    return accuracy_score(ytest, predict)



In [182]:
# checkCatAccuracy(clf)
print('the test accuracy is :{:.6f}'.format(accuracy_score(ytest,clf.predict(xtest))))

the test accuracy is :0.829670


In [184]:
from sklearn.metrics import accuracy_score
def checkAccuracy(clf):
    clf = clf.fit(xtrain, ytrain)
    predict = clf.predict(xtest)
    return accuracy_score(ytest, predict)

In [185]:
def catToNum(series):
    series = series.astype('category')
    return series.cat.codes

catData = trainingData[['Crew/Pass','Child/Adult','Gender', 'Class']].apply(catToNum)
trainingData[['Crew/Pass','Child/Adult','Gender', 'Class']] = catData
trainingData.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Unnamed: 0,Age,Crew/Pass,Class,Child/Adult,Gender,Survived
0,27.0,1,2,0,1,1
1,42.0,1,2,0,1,0
2,39.0,1,2,0,0,1
3,16.0,1,2,1,1,0
4,13.0,1,2,1,1,0


In [186]:
from sklearn import model_selection
xtrain,xtest,ytrain,ytest = model_selection.train_test_split(x,y,train_size=.85,random_state=1234)



In [187]:
xtrain.head()

Unnamed: 0,Age,Crew/Pass,Class,Child/Adult,Gender
1939,26.0,0,3,0,1
2257,27.0,1,2,0,0
229,28.0,0,3,0,1
2032,40.0,0,3,0,1
1014,44.0,1,0,0,0


## Simple Decision Trees

In [188]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

In [189]:
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [190]:
checkAccuracy(clf)

0.8104395604395604

## Gradient Boosting trees

In [191]:
from xgboost.sklearn import XGBClassifier

In [192]:
clf = XGBClassifier()

In [193]:
checkAccuracy(clf)

  if diff:


0.8159340659340659

## Random Forests

In [194]:
# Interesting observation -- Gradient Boosted trees can deal wit null values in the dataset.. But Random forests cannot
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 1000)

In [195]:
clf
#n_estimators is used to specify the no of trees in the classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [196]:
# with the default no of trees in the model, accuracy score was 79 %
checkAccuracy(clf)

0.8076923076923077

## Hyper Parameter Tuning

In [160]:
# A brute force technique to find the optimal combination of parameters
#pip install git+https://github.com/hyperopt/hyperopt.git
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [165]:
# optimise the parameters of the gradient boosted trees
space = {
    'n_estimators' : hp.quniform('n_estimators', 100,1000,1),
    'learning_rate' : hp.quniform('learning_rate', 0.025, 0.5, 0.025),
    'max_depth' : hp.quniform('max_depth', 1,13,1),
    'min_child_weight' : hp.quniform('min_child_weight', 1,6,1),
    'subsample' : hp.quniform('subsample', 0.5,1,0.05),
    'gamma' : hp.quniform('gamma', 0.5,1,0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5,1,0.05),
    'nthread' : 6,
    'silent' : 1
}

In [177]:
def score(space):
    space['n_estimators'] = int(space['n_estimators'])
    space['max_depth'] = int(space['max_depth'])
    clf = XGBClassifier(space)
    return {'loss':1-checkAccuracy(clf), 'status':STATUS_OK}

In [175]:
trials = Trials()

In [178]:
best = fmin(score, space, algo=tpe.suggest, trials = trials, max_evals=250)

XGBoostError: b"Invalid Parameter format for max_depth expect int but value='{'colsample_bytree': 0.9, 'gamma': 0.55, 'learning_rate': 0.47500000000000003, 'max_depth': 8, 'min_child_weight': 3.0, 'n_estimators': 576, 'nthread': 6, 'silent': 1, 'subsample': 0.5}'"

In [128]:
print (best)

NameError: name 'best' is not defined