In [1]:
import os

import graphviz
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.utils import resample

In [2]:
# Load the Titanic training data; the first CSV column (PassengerId) becomes the index.
df = pd.read_csv('train.csv', index_col = 0)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Before we begin feature engineering, we split the train data into training data and test data:

In [3]:
# Target vector: 0 = did not survive, 1 = survived.
y = df['Survived']

In [4]:
# Raw feature columns used for modelling; Parch, Ticket and Cabin are left out.
X = df[['Pclass', 'Name', 'Sex', 'Age', "SibSp", "Fare", "Embarked"]]

In [5]:
y.shape

(891,)

In [6]:
X.shape

(891, 7)

In [7]:
# Stratify on the target so the training and test parts keep the same
# survived / not-survived ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [8]:
y_train.value_counts()

0    412
1    256
Name: Survived, dtype: int64

In [9]:
y_test.value_counts()

0    137
1     86
Name: Survived, dtype: int64

In [10]:
X_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
487,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35.0,1,90.0,S
239,2,"Pengelly, Mr. Frederick William",male,19.0,0,10.5,S
723,2,"Gillespie, Mr. William Henry",male,34.0,0,13.0,S
185,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4.0,0,22.025,S
57,2,"Rugg, Miss. Emily",female,21.0,0,10.5,S


In [11]:
y_train.head()

PassengerId
487    1
239    0
723    0
185    1
57     1
Name: Survived, dtype: int64

In [12]:
X_train.shape

(668, 7)

### Feature engineering: 
* *scale* the Pclass (primarily to aid the SVM we run later)
* *one-hot-encode* the category of sex (turn it from category to binary)
* *impute* the missing age values with the mean age of the training passengers who share the same title group and sex
* fill in the missing embarkation point (two passengers) with 'S'
* create a new binary column indicating whether the passenger embarked at Cherbourg (C)
* *bin* the ages to convert from scalar to category and *one-hot-encode* to assign a 1 or 0 to each age bracket

(as a function so that we can later call it on the test data):

In [13]:
# Scale Pclass to the range [0, 1]. The scaler is fitted on the training rows
# only, so the same fitted object can be applied to the test rows later.
# It is fitted on a one-column DataFrame (not a bare array) because that is
# also what the test-data cell passes to scaler.transform().
scaler = MinMaxScaler()
scaler.fit(X_train[['Pclass']])

# Re-bind X_train instead of writing a column into the split result, and do
# not reuse the name `X`, which still holds the full feature frame.
X_train = X_train.assign(
    Pclass_scaled=scaler.transform(X_train[['Pclass']]).ravel()
)



In [14]:
# Lookup table that collapses every raw title found in a passenger name into
# one of six broader groups. It is written group-first and then inverted,
# so `newtitles` maps raw title -> group.
_title_groups = {
    "Official": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Higher":   ["Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"],
    "Mrs":      ["Mme", "Ms", "Mrs"],
    "Miss":     ["Mlle", "Miss"],
    "Mr":       ["Mr"],
    "Master":   ["Master"],
}
newtitles = {
    raw_title: group
    for group, raw_titles in _title_groups.items()
    for raw_title in raw_titles
}

In [15]:
# Names look like "Surname, Title. Given names". Pull out the title and collapse
# it into the broader groups of `newtitles`; it is used to impute missing ages.
def _raw_title(full_name):
    """Return the stripped text after the first comma in the part before the first full stop."""
    before_stop = full_name.split('.')[0]
    return before_stop.split(',')[1].strip()

X_train['Title'] = X_train['Name'].apply(_raw_title).map(newtitles)

In [16]:
# Mean age per (Title, Sex) group in the training data. These group means are
# the values hard-coded in get_new_ages for imputing missing ages.
mean_ages = X_train.groupby(['Title','Sex'])[['Age']].mean()
mean_ages

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Title,Sex,Unnamed: 2_level_1
Higher,female,48.0
Higher,male,42.333333
Master,male,3.8668
Miss,female,21.533784
Mr,male,32.458054
Mrs,female,36.406977
Official,female,49.0
Official,male,46.583333


In [17]:
def cleaning(dataframe, age_bins=(0, 12, 25, 45, 80)):
    """Turn a raw feature frame into the numeric matrix the models are fitted on.

    The same steps are applied to the training and the test split, and the set
    of output columns does not depend on which categories happen to occur in
    `dataframe`. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns 'Name', 'Sex', 'Age' and 'Embarked'. 'Pclass' is
        dropped; the caller is expected to have added 'Pclass_scaled' already.
        A 'Title' column, if present, is ignored and rebuilt from 'Name'.
    age_bins : sequence of numbers, optional
        Bin edges handed to `pd.cut` for the age brackets.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'C' (1 if embarked at Cherbourg), one 0/1 column per
        age bracket except the first, then the remaining input columns with
        'Sex' recoded to 1 = male / 0 = female.
    """
    # Title group per passenger, derived from "Surname, Title. Given names".
    titles = dataframe['Name'].apply(
        lambda name: name.split('.')[0].split(',')[1].strip()
    ).map(newtitles)

    # Same 0/1 coding as get_dummies(..., drop_first=True), but it cannot fail
    # or flip meaning when only one sex is present in the frame.
    sex = (dataframe['Sex'] == 'male').astype(int)

    # Fill missing ages from the (title, sex) group means. Rows are passed as
    # plain tuples so get_new_ages can index them by position.
    ages = pd.Series(
        [get_new_ages(row) for row in zip(titles, sex, dataframe['Age'])],
        index=dataframe.index,
        dtype=float,
    )

    # Categorical bins give one dummy column per bracket even when a bracket
    # is empty; an age that could not be imputed gets 0 in every bracket.
    age_grp = pd.cut(ages, bins=list(age_bins), duplicates='drop')
    age_dummies = pd.get_dummies(age_grp, drop_first=True).astype(int)
    age_dummies.columns = [str(bracket) for bracket in age_dummies.columns]

    # A missing embarkation point is treated as 'S', i.e. not Cherbourg.
    cherbourg = (dataframe['Embarked'] == 'C').astype(int).rename('C')

    remaining = dataframe.drop(
        columns=['Pclass', 'Name', 'Age', 'Embarked', 'Title'], errors='ignore'
    ).assign(Sex=sex)
    return pd.concat([cherbourg, age_dummies, remaining], axis=1)

In [18]:
# Mean age per (Title, Sex) group, copied from the `mean_ages` table computed
# on the training data. Sex uses the notebook's 0/1 coding: 1 = male, 0 = female.
_GROUP_MEAN_AGES = {
    ('Master', 1): 3.8668,
    ('Miss', 0): 21.5338,
    ('Mr', 1): 32.458,
    ('Mrs', 0): 36.407,
    ('Official', 0): 49.0,
    ('Official', 1): 46.5833,
    ('Higher', 0): 48.0,
    ('Higher', 1): 42.3333,
}


def get_new_ages(cols, default_age=float('nan')):
    """Return a passenger's age, imputing it from title and sex when missing.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Exactly three values in the order (Title, Sex, Age), with Sex coded
        as 1 = male, 0 = female.
    default_age : float, optional
        Returned when Age is missing and the (Title, Sex) pair has no group
        mean. Defaults to NaN so that such rows stay visibly missing instead
        of silently becoming None.

    Returns
    -------
    float
        The given Age if present, otherwise the mean age of the passenger's
        (Title, Sex) group, otherwise `default_age`.
    """
    # Unpack by iteration rather than cols[0]/cols[1]/cols[2]: integer keys on
    # a Series with string labels are no longer treated as positions by pandas.
    Title, Sex, Age = cols
    if not pd.isnull(Age):
        return Age
    return _GROUP_MEAN_AGES.get((Title, Sex), default_age)

In [19]:
### Breaking down Feature Engineering ###
# Work on a copy so X_train itself stays untouched and the cell can be re-run.
X_train2 = X_train.copy()

# Sex as 0/1 (1 = male): same coding as get_dummies(..., drop_first=True),
# but stored as a plain integer column.
X_train2['Sex'] = (X_train2['Sex'] == 'male').astype(int)

# Missing ages are imputed from the (Title, Sex) group means. Rows are handed
# over as plain tuples, which get_new_ages can index by position.
X_train2['New_Age'] = pd.Series(
    [get_new_ages(row)
     for row in zip(X_train2['Title'], X_train2['Sex'], X_train2['Age'])],
    index=X_train2.index,
    dtype=float,
)

# Assign the result back: fillna(..., inplace=True) on a selected column is
# not guaranteed to write through to the frame.
X_train2['Embarked'] = X_train2['Embarked'].fillna('S')

# Age brackets -> 0/1 columns; the first bracket, (0, 12], is the dropped
# baseline. Keeping the bins categorical (no astype(str)) means an age that
# could not be imputed gets 0 everywhere instead of its own 'nan' column.
age_grp = pd.cut(X_train2['New_Age'], bins = [0,12,25,45,80], duplicates='drop')
dummies1 = pd.get_dummies(age_grp, drop_first=True).astype(int)
dummies1.columns = [str(bracket) for bracket in dummies1.columns]

# Of the embarkation points only the Cherbourg indicator is kept.
dummies2 = (X_train2['Embarked'] == 'C').astype(int).to_frame('C')

# Final layout: C, age brackets, then the remaining numeric columns.
X_train2 = pd.concat(
    [
        dummies2,
        dummies1,
        X_train2.drop(columns=['Age', 'Embarked', 'Pclass', 'New_Age', 'Title', 'Name']),
    ],
    axis=1,
)

In [20]:
X_train2.head()

Unnamed: 0_level_0,C,"(12, 25]","(25, 45]","(45, 80]",Sex,SibSp,Fare,Pclass_scaled
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
487,0,0,1,0,0,1,90.0,0.0
239,0,1,0,0,1,0,10.5,0.5
723,0,0,1,0,1,0,13.0,0.5
185,0,0,0,0,0,0,22.025,1.0
57,0,1,0,0,0,0,10.5,0.5


In [None]:
# Cleaned training matrix: the frame every model below is fitted on.
cXtr = cleaning(X_train)

In [None]:
cXtr.head()

### Next, we define, fit and examine model:

In [None]:
m = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)

In [None]:
m.fit(cXtr, y_train)

In [None]:
m.score(cXtr, y_train) # Calculate the accuracy of the model

In [None]:
m.coef_

So boarding at Cherbourg has a positive influence, Pclass has a negative influence, age a smaller negative influence, and sex (in this case, "maleness") a stronger negative influence on the chance of survival.

In [None]:
# A hypothetical passenger, "Dave", with one value per model feature. The model
# is fitted on every column of cXtr, so a row with fewer values is rejected.
# The column names follow the engineered training matrix.
Dave = pd.DataFrame(
    [{
        'C': 0,                # did not board at Cherbourg
        '(12, 25]': 0,
        '(25, 45]': 1,         # aged 25-45
        '(45, 80]': 0,
        'Sex': 1,              # male
        'SibSp': 0,            # assumption: no siblings/spouse aboard
        'Fare': 13.0,          # assumption: an example fare - adjust as needed
        'Pclass_scaled': 0.5,  # second class
    }],
    columns=cXtr.columns,
)

In [None]:
m.predict_proba(Dave) # predict probability of Dave first dying and then surviving (as survived = 0 --> "died")

so Dave has around a (xx)% chance of survival based on the input values we used

In [None]:
ypredtr = m.predict(cXtr)

In [None]:
ypredtr

In [None]:
confusion_matrix(y_pred = ypredtr, y_true = y_train)

In [None]:
precision_score(y_pred = ypredtr, y_true = y_train)

In [None]:
recall_score(y_pred = ypredtr, y_true = y_train)

In [None]:
yprobtr = m.predict_proba(cXtr)

In [None]:
yprobtr

In [None]:
# ROC curve of the positive class ("survived", column 1 of predict_proba) on
# the training data, drawn with scikit-learn and matplotlib only.
fpr, tpr, _ = roc_curve(y_train, yprobtr[:, 1])

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve of class 1 (area = {auc(fpr, tpr):0.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='chance level')  # diagonal = random guessing
ax.set(title='ROC curve', xlabel='False Positive Rate', ylabel='True Positive Rate')
ax.legend(loc='lower right');

### Validate the model

In [None]:
cross_val_score(m, cXtr, y_train, cv=5, scoring = 'accuracy')

This produces a wider range of scores than we would like. Let's try bootstrapping:

In [None]:
# Bootstrap the accuracy: refit the model on 1000 resamples of the training
# data (drawn with replacement) and record the accuracy on each resample.
# NOTE(review): every score is measured on the very sample the model was just
# fitted on, so the resulting intervals describe in-sample accuracy.
boots = []
m2 = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)

for i in range(1000):
    # Seed each draw with the iteration number so a re-run reproduces the
    # same resamples and therefore the same confidence intervals.
    Xb, yb = resample(cXtr, y_train, random_state=i)
    m2.fit(Xb, yb)
    boots.append(m2.score(Xb, yb))

In [None]:
boots.sort()

In [None]:
# Cut the 100 lowest and the 100 highest of the 1000 sorted scores.
ci80 = boots[100:-100]
ci80_low, ci80_high = ci80[0], ci80[-1]
print(f"80% confidence interval: {ci80_low:5.4} -{ci80_high:5.4}")

In [None]:
# Cut the 50 lowest and the 50 highest of the 1000 sorted scores.
ci90 = boots[50:-50]
ci90_low, ci90_high = ci90[0], ci90[-1]
print(f"90% confidence interval: {ci90_low:5.4} -{ci90_high:5.4}")

In [None]:
# Cut the 25 lowest and the 25 highest of the 1000 sorted scores.
ci95 = boots[25:-25]
ci95_low, ci95_high = ci95[0], ci95[-1]
print(f"95% confidence interval: {ci95_low:5.4} -{ci95_high:5.4}")

In [None]:
# Cut the 5 lowest and the 5 highest of the 1000 sorted scores.
ci99 = boots[5:-5]
ci99_low, ci99_high = ci99[0], ci99[-1]
print(f"99% confidence interval: {ci99_low:5.4} -{ci99_high:5.4}")

### Let's test the same model on the X_test sample. First we clean it using the function:

In [None]:
# Apply the scaler that was fitted on the training data; it is only used to
# transform here and is not refitted on the test rows.
X_test['Pclass_scaled'] = scaler.transform(X_test[['Pclass']])

X_test.head()

In [None]:
# Pass a copy so the cleaning step cannot alter X_test itself.
cXte = cleaning(X_test.copy())


In [None]:
cXte.head(10)

### Tune the hyperparameters for our Logistic Regression model: we do a Grid Search selecting different C-values (levels of regularisation)

In [None]:
# Candidate values for C, the inverse regularisation strength
# (smaller C = stronger regularisation).
params = {
    'C': [20.0, 15.0, 10.0, 5.0, 1.0, 0.1, 0.01, 0.001]
}
# The `iid` argument has been removed from GridSearchCV and passing it raises
# a TypeError on current scikit-learn. Its False setting (plain mean over the
# folds) is what current versions do anyway.
g = GridSearchCV(m, param_grid=params, cv=5)
g.fit(cXtr, y_train)
g.score(cXtr, y_train)

In [None]:
g.best_params_

Check score for Logistic Regression model on test data:

In [None]:
m.score(cXte, y_test)

In [None]:
m.coef_

# Next we will apply a random forest to our data set:

First, a decision tree:

In [None]:
# Depth-limited decision tree. The seed fixes how ties between equally good
# splits are broken, so a re-run produces the same tree and the same scores.
t = DecisionTreeClassifier(max_depth=5, random_state=42)
t.fit(cXtr, y_train)

In [None]:
t.score(cXtr, y_train)

We visualise this using graphviz:

In [None]:
# Export the fitted tree in Graphviz DOT format. The feature labels are taken
# from the training matrix so there is exactly one label per model input
# (a list of a different length is rejected by export_graphviz).
tree = export_graphviz(t, out_file=None,
                class_names=["Did not survive", "Survived"],
                feature_names=[str(col) for col in cXtr.columns],
                impurity=False,
                filled=True)

# The context manager closes the file as soon as the DOT text is written.
with open('titanic.dot', 'w') as dot_file:
    dot_file.write(tree)

graph = graphviz.Source(tree)
graph.render('tree')  # creates PDF
graph # displays in jupyter

Then expand this to a Random Forest:

In [None]:
# Random forest with default settings. The seed fixes the bootstrap samples
# and feature subsets, so the scores below are reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(cXtr, y_train)
rf.score(cXtr, y_train)

### To tune the hyperparameters for our Random Forest, we do a Grid Search, selecting different levels of maximum tree depth and different numbers of trees:

In [None]:
# Grid of maximum tree depths and forest sizes to try.
params = {
    'max_depth': [1, 2, 4, 6, 7, 8, 9, 10],
    'n_estimators': [1, 3, 5, 7, 10, 15, 20]
}
# The `iid` argument has been removed from GridSearchCV and passing it raises
# a TypeError on current scikit-learn. Its False setting (plain mean over the
# folds) is what current versions do anyway.
# Note: `g` now refers to the random-forest search, replacing the earlier
# logistic-regression search.
g = GridSearchCV(rf, param_grid=params, cv=5)
g.fit(cXtr, y_train)
g.score(cXtr, y_train)

In [None]:
g.best_params_

Scores on the test data for the decision tree, the random forest and the tuned random forest:

In [None]:
t.score(cXte, y_test)

In [None]:
rf.score(cXte, y_test)

In [None]:
g.score(cXte, y_test)

In [None]:
# m_poly = PolynomialFeatures(2)
# n = m_poly.fit_transform(X_train)

## Now let's try a Support Vector Machine (SVC):

In [None]:
sv = SVC()

In [None]:
sv.fit(cXtr, y_train)

In [None]:
sv.score(cXtr, y_train)

In [None]:
sv.score(cXte, y_test)