# Titanic ML model

This notebook will guide you through my process of creating a ML model to predict who died and who survived the Titanic.

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [5]:
data = pd.read_csv("/Users/pedro/github/intro-statistical-learning/data/titanic/train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Stratified sampling

During data exploration, proxy class seemed to be a rather influential feature, thus I want to make sure these are stratified proportionally in my training and test groups.

In [7]:
split = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state = 69)
for train_index, test_index in split.split(data, data.Pclass):
        strat_train_set = data.loc[train_index]
        strat_test_set = data.loc[test_index]

Let's check if the proportions were maintained

In [8]:
strat_test_set.Pclass.value_counts() / len(strat_test_set)

3    0.553073
1    0.240223
2    0.206704
Name: Pclass, dtype: float64

In [11]:
strat_train_set.Pclass.value_counts() /len(strat_train_set)

3    0.550562
1    0.242978
2    0.206461
Name: Pclass, dtype: float64

In [12]:
#Original proportions
data.Pclass.value_counts() / len(data)

3    0.551066
1    0.242424
2    0.206510
Name: Pclass, dtype: float64

Ja! Alles gut!

In [116]:
#We create a copy of our training set to manipulate it as we wish
df = strat_train_set.copy()
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
456,457,0,1,"Millet, Mr. Francis Davis",male,65.00,0,0,13509,26.5500,E38,S
494,495,0,3,"Stanley, Mr. Edward Roland",male,21.00,0,0,A/4 45380,8.0500,,S
611,612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.0500,,S
136,137,1,1,"Newsom, Miss. Helen Monypeny",female,19.00,0,2,11752,26.2833,D47,S
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.00,4,2,347082,31.2750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
281,282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28.00,0,0,347464,7.8542,,S
303,304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.3500,E101,Q
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.00,0,0,7552,10.5167,,S
378,379,0,3,"Betros, Mr. Tannous",male,20.00,0,0,2648,4.0125,,C


# Transformations
Now let's deal with nans, scaling, and encoding our ordinal and categorical variables so that we can later test out different algorithms with any of our features

In [124]:
from sklearn.preprocessing import OneHotEncoder

df_cat = df[['Sex']]

cat_encoder = OneHotEncoder()
sex_1hot = cat_encoder.fit_transform(df_cat)
cat_encoder.categories_

[array(['female', 'male'], dtype=object)]

In [90]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import CategoricalImputer

#First we create the pipeline for numerical variables
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

#Now we create the full pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

num_att = ['Age', 'SibSp', 'Parch', 'Fare']
cat_att = ['Sex']
ord_att = ['Pclass']


full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_att),
    ('cat', OneHotEncoder(), cat_att),
    ('ord', OrdinalEncoder(), ord_att)
])

df_prepd = full_pipeline.fit_transform(df)

#Couldn't fit Embarkment because of its NaN. Follow the link for info on how to impute categorical variables to most frequent
# https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn


NameError: name 'num' is not defined

# Modeling

In [141]:
from sklearn.linear_model import LinearRegression
df_prepd.shape


(712, 7)

In [153]:
linreg = LinearRegression()

df_features=['Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Pclass']

linreg.fit(df_prepd, df.Survived) #_labels is referring to our dependent variable, what we are trying to predict

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [161]:
some_data = df.iloc[:5]
some_labels = df.iloc[:5]
some_data_prepd = full_pipeline.transform(some_data)
print(linreg.predict(some_data_prepd))
print(list(some_labels))

[0.23408705 0.1578793  0.11449608 1.0173357  0.06898139]
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [149]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(df_prepd, df_labels)

ValueError: could not convert string to float: 'Age'