# Titanic - Machine Learning from Disaster

Start here! Predict survival on the Titanic and get familiar with ML basics

https://www.kaggle.com/competitions/titanic

In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, precision_score, recall_score, \
    fbeta_score, roc_auc_score, precision_recall_curve, auc

In [2]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
df['Age'] = df['Age'].fillna(df['Age'].median())

In [6]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [7]:
assert(df.isnull().sum().all() == 0)

In [8]:
y = df['Survived']
X = df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])

In [9]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,28.0,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [10]:
X = pd.get_dummies(X, columns=['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked'], drop_first=True)
X.head()

Unnamed: 0,Age,Fare,Sex_male,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,22.0,7.25,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,38.0,71.2833,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,26.0,7.925,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,35.0,53.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,35.0,8.05,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
X_train

Unnamed: 0,Age,Fare,Sex_male,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
440,45.0,26.2500,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1
539,22.0,49.5000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
285,33.0,8.6625,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
221,27.0,13.0000,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
116,70.5,7.7500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,28.0,22.5250,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
790,28.0,7.7500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
476,34.0,21.0000,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
500,17.0,8.6625,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# Gradient Boosting

In [12]:
gb = CatBoostClassifier(task_type='GPU')
gb.fit(X_train, y_train)
gb_predicted = gb.predict(X_test)

Learning rate set to 0.0346
0:	learn: 0.6708544	total: 32.9ms	remaining: 32.9s
1:	learn: 0.6481148	total: 52.8ms	remaining: 26.4s
2:	learn: 0.6301217	total: 77ms	remaining: 25.6s
3:	learn: 0.6110259	total: 96.1ms	remaining: 23.9s
4:	learn: 0.5941102	total: 116ms	remaining: 23s
5:	learn: 0.5776866	total: 135ms	remaining: 22.3s
6:	learn: 0.5646061	total: 154ms	remaining: 21.9s
7:	learn: 0.5577762	total: 171ms	remaining: 21.1s
8:	learn: 0.5452087	total: 190ms	remaining: 20.9s
9:	learn: 0.5353203	total: 209ms	remaining: 20.7s
10:	learn: 0.5250777	total: 229ms	remaining: 20.6s
11:	learn: 0.5135302	total: 248ms	remaining: 20.5s
12:	learn: 0.5050293	total: 268ms	remaining: 20.3s
13:	learn: 0.5004184	total: 283ms	remaining: 20s
14:	learn: 0.4926568	total: 303ms	remaining: 19.9s
15:	learn: 0.4850407	total: 325ms	remaining: 20s
16:	learn: 0.4781301	total: 344ms	remaining: 19.9s
17:	learn: 0.4721792	total: 362ms	remaining: 19.8s
18:	learn: 0.4657965	total: 381ms	remaining: 19.7s
19:	learn: 0.4609

In [13]:
gb.score(X_test, y_test)

0.8246268656716418

# Random Forest

In [14]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)

In [15]:
rf.score(X_test, y_test)

0.8097014925373134

# Logistic Regression

In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

array([[ 1.17050467, -0.14362802, -1.38246698, ..., -0.04009635,
        -0.2953981 ,  0.61507608],
       [-0.59171014,  0.28316014, -1.38246698, ..., -0.04009635,
        -0.2953981 , -1.62581514],
       [ 0.25108825, -0.46647261,  0.72334458, ..., -0.04009635,
        -0.2953981 , -1.62581514],
       ...,
       [ 0.32770628, -0.23999954,  0.72334458, ..., -0.04009635,
        -0.2953981 ,  0.61507608],
       [-0.97480031, -0.46647261,  0.72334458, ..., -0.04009635,
        -0.2953981 ,  0.61507608],
       [-1.97083477, -0.09085314,  0.72334458, ..., -0.04009635,
         3.38526218, -1.62581514]])

In [17]:
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
lr_predicted = lr.predict(X_test_scaled)

In [18]:
lr.score(X_test_scaled, y_test)

0.832089552238806

# Final model

In [19]:
X.insert(loc=17, column='Parch_9', value=0)

In [20]:
final_model = CatBoostClassifier(task_type='GPU')
final_model.fit(X, y)

Learning rate set to 0.033926
0:	learn: 0.6723711	total: 20.2ms	remaining: 20.2s
1:	learn: 0.6485542	total: 39.5ms	remaining: 19.7s
2:	learn: 0.6307569	total: 57.9ms	remaining: 19.2s
3:	learn: 0.6116633	total: 77.8ms	remaining: 19.4s
4:	learn: 0.5944660	total: 96.6ms	remaining: 19.2s
5:	learn: 0.5776872	total: 116ms	remaining: 19.2s
6:	learn: 0.5648808	total: 135ms	remaining: 19.2s
7:	learn: 0.5528352	total: 154ms	remaining: 19.1s
8:	learn: 0.5417655	total: 178ms	remaining: 19.6s
9:	learn: 0.5304710	total: 201ms	remaining: 19.9s
10:	learn: 0.5206853	total: 226ms	remaining: 20.3s
11:	learn: 0.5130449	total: 245ms	remaining: 20.1s
12:	learn: 0.5071031	total: 262ms	remaining: 19.9s
13:	learn: 0.4998294	total: 278ms	remaining: 19.6s
14:	learn: 0.4928892	total: 297ms	remaining: 19.5s
15:	learn: 0.4843223	total: 316ms	remaining: 19.4s
16:	learn: 0.4786698	total: 335ms	remaining: 19.4s
17:	learn: 0.4734806	total: 354ms	remaining: 19.3s
18:	learn: 0.4699760	total: 371ms	remaining: 19.1s
19:	le

<catboost.core.CatBoostClassifier at 0x254ed38b4f0>

# Predict

In [21]:
df_test = pd.read_csv('data/test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [22]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [23]:
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mode()[0])

In [24]:
assert(df_test.isnull().sum().all() == 0)

In [25]:
X_ans = pd.DataFrame()
X_ans['PassengerId'] = df_test['PassengerId']

X_ans

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [26]:
X_predict = df_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [27]:
X_predict = pd.get_dummies(X_predict, columns=['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked'], drop_first=True)
X_predict

Unnamed: 0,Age,Fare,Sex_male,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_Q,Embarked_S
0,34.5,7.8292,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,47.0,7.0000,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,62.0,9.6875,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,27.0,8.6625,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,22.0,12.2875,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,27.0,8.0500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
414,39.0,108.9000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
415,38.5,7.2500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
416,27.0,8.0500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [28]:
assert(X_predict.shape[1] == X.shape[1])

In [29]:
X_ans['Survived'] = final_model.predict(X_predict)
X_ans

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [30]:
X_ans.to_csv('submission.csv', index=False)