I'm doing the Titanic project for Kaggle.

The aim is to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc).

In [58]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [59]:
# Load the passenger data
passengers = pd.read_csv('train.csv')
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [60]:
# Update sex column to numerical
passengers['Sex'] = passengers['Sex'].replace(['female'],0)
passengers['Sex'] = passengers['Sex'].replace(['male'],1)

In [61]:
passengers.Age.isna().sum()

177

In [62]:
# Fill the nan values in the age column
passengers['Age'] = passengers['Age'].replace(np.nan,np.mean(passengers['Age']))

In [63]:
passengers.Age.isna().sum()

0

In [64]:
passengers.Pclass.unique()

array([3, 1, 2])

In [65]:
# one hot encode the class columns
passengers['FirstClass'] = passengers['Pclass'].apply(lambda x: 1 if x==1 else 0)
passengers['SecondClass'] = passengers['Pclass'].apply(lambda x: 1 if x==2 else 0)
passengers['ThirdClass'] = passengers['Pclass'].apply(lambda x: 1 if x==3 else 0)

passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass,ThirdClass
0,1,0,3,"Braund, Mr. Owen Harris",1,22.000000,1,0,A/5 21171,7.2500,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.000000,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.000000,1,0,113803,53.1000,C123,S,1,0,0
4,5,0,3,"Allen, Mr. William Henry",1,35.000000,0,0,373450,8.0500,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.000000,0,0,211536,13.0000,,S,0,1,0
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.000000,0,0,112053,30.0000,B42,S,1,0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,29.699118,1,2,W./C. 6607,23.4500,,S,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",1,26.000000,0,0,111369,30.0000,C148,C,1,0,0


In [66]:
#Figure out what correlates with survival

In [67]:
passengers[passengers.columns].corr()['Survived'].sort_values(ascending=False)

Survived       1.000000
FirstClass     0.285904
Fare           0.257307
SecondClass    0.093349
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.069809
ThirdClass    -0.322308
Pclass        -0.338481
Sex           -0.543351
Name: Survived, dtype: float64

I'm going to try a few different ML models: Logistic Regression, Random Forest, Naive Bayes, SVM. And then rank them.

In [68]:
# Select the desired features i.e. the numerical data
features = passengers[['Sex','Age','SibSp','Parch','Fare','FirstClass','SecondClass','ThirdClass']]
survival = passengers['Survived']
features

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,ThirdClass
0,1,22.000000,1,0,7.2500,0,0,1
1,0,38.000000,1,0,71.2833,1,0,0
2,0,26.000000,0,0,7.9250,0,0,1
3,0,35.000000,1,0,53.1000,1,0,0
4,1,35.000000,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...
886,1,27.000000,0,0,13.0000,0,1,0
887,0,19.000000,0,0,30.0000,1,0,0
888,0,29.699118,1,2,23.4500,0,0,1
889,1,26.000000,0,0,30.0000,1,0,0


Logistic Regression first.

In [69]:
# Perform train, test, split
train_features,test_features,train_labels,test_labels = train_test_split(features,survival)

# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

model = LogisticRegression()
model.fit(train_features,train_labels)
model.score(train_features,train_labels)

0.7979041916167665

In [70]:
# Score the model on the test data
log_reg_score = model.score(test_features,test_labels)
log_reg_score

0.7668161434977578

Random Forest:

In [71]:
train_features,test_features,train_labels,test_labels = train_test_split(features,survival)

model = RandomForestClassifier()
model.fit(train_features,train_labels)
model.score(train_features,train_labels)

0.9820359281437125

In [72]:
random_forest_score = model.score(test_features,test_labels)
random_forest_score

0.820627802690583

Naive Bayes:

In [73]:
train_features,test_features,train_labels,test_labels = train_test_split(features,survival)

model = MultinomialNB()
model.fit(train_features,train_labels)
model.score(train_features,train_labels)

0.6976047904191617

In [74]:
bayes_score = model.score(test_features,test_labels)
bayes_score

0.6816143497757847

SVC:

In [75]:
train_features,test_features,train_labels,test_labels = train_test_split(features,survival)

scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

model = LinearSVC()
model.fit(train_features,train_labels)
model.score(train_features,train_labels)



0.7964071856287425

In [81]:
svc_score = model.score(test_features,test_labels)
svc_score

0.8071748878923767

Now let's rank them:

In [82]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Linear SVC'],
    'Score': [log_reg_score, random_forest_score, bayes_score,svc_score]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
1,Random Forest,0.820628
3,Linear SVC,0.807175
0,Logistic Regression,0.766816
2,Naive Bayes,0.681614
