In [97]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, precision_score, recall_score, \
    fbeta_score, roc_auc_score, precision_recall_curve, auc

In [99]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [103]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [104]:
df['Age'] = df['Age'].fillna(df['Age'].median())

In [106]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [107]:
assert(df.isnull().sum().all() == 0)

In [111]:
y = df['Survived']
X = df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])

In [113]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,28.0,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [114]:
X = pd.get_dummies(X, columns=['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked'], drop_first=True)
X.head()

Unnamed: 0,Age,Fare,Sex_male,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
0,22.0,7.25,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,38.0,71.2833,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,26.0,7.925,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,35.0,53.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,35.0,8.05,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
X_train

Unnamed: 0,Age,Fare,Sex_male,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_Q,Embarked_S
320,22.0,7.2500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
388,28.0,7.7292,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
871,47.0,52.5542,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1
521,22.0,7.8958,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
510,29.0,7.7500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,39.0,7.9250,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
819,10.0,27.9000,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1
690,31.0,57.0000,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
765,51.0,77.9583,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


# Gradient Boosting

In [116]:
gb = CatBoostClassifier(task_type='GPU')
gb.fit(X_train, y_train)
gb_predicted = gb.predict(X_test)

Learning rate set to 0.0346
0:	learn: 0.6720644	total: 25.7ms	remaining: 25.7s
1:	learn: 0.6497034	total: 50.5ms	remaining: 25.2s
2:	learn: 0.6314897	total: 72ms	remaining: 23.9s
3:	learn: 0.6120556	total: 97.5ms	remaining: 24.3s
4:	learn: 0.5961282	total: 126ms	remaining: 25s
5:	learn: 0.5802370	total: 160ms	remaining: 26.5s
6:	learn: 0.5660750	total: 184ms	remaining: 26.1s
7:	learn: 0.5526015	total: 208ms	remaining: 25.8s
8:	learn: 0.5410371	total: 232ms	remaining: 25.6s
9:	learn: 0.5309120	total: 255ms	remaining: 25.2s
10:	learn: 0.5210753	total: 273ms	remaining: 24.6s
11:	learn: 0.5129427	total: 292ms	remaining: 24s
12:	learn: 0.5047735	total: 310ms	remaining: 23.5s
13:	learn: 0.4962037	total: 329ms	remaining: 23.2s
14:	learn: 0.4896049	total: 345ms	remaining: 22.7s
15:	learn: 0.4822966	total: 367ms	remaining: 22.6s
16:	learn: 0.4749724	total: 386ms	remaining: 22.3s
17:	learn: 0.4710554	total: 407ms	remaining: 22.2s
18:	learn: 0.4663609	total: 443ms	remaining: 22.9s
19:	learn: 0.46

In [117]:
gb.score(X_test, y_test)

0.8470149253731343

# Random Forest

In [119]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)

In [120]:
rf.score(X_test, y_test)

0.7761194029850746

# Logistic Regression

In [121]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

array([[-0.5716604 , -0.51666622,  0.73626029, ..., -0.04009635,
        -0.28235209,  0.6125977 ],
       [-0.10809063, -0.50670515,  0.73626029, ..., -0.04009635,
         3.54167732, -1.63239269],
       [ 1.35988031,  0.42506664, -1.35821532, ..., -0.04009635,
        -0.28235209,  0.6125977 ],
       ...,
       [ 0.12369426,  0.51748094,  0.73626029, ..., -0.04009635,
        -0.28235209,  0.6125977 ],
       [ 1.66892683,  0.95313855, -1.35821532, ..., -0.04009635,
        -0.28235209,  0.6125977 ],
       [ 1.20535705, -0.11547948,  0.73626029, ..., -0.04009635,
        -0.28235209,  0.6125977 ]])

In [123]:
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
lr_predicted = lr.predict(X_test_scaled)

In [124]:
lr.score(X_test_scaled, y_test)

0.8432835820895522

# Final model

In [148]:
X.insert(loc=17, column='Parch_9', value=0)

ValueError: cannot insert Parch_9, already exists

In [150]:
final_model = CatBoostClassifier(task_type='GPU')
final_model.fit(X, y)

Learning rate set to 0.033926
0:	learn: 0.6723711	total: 26.3ms	remaining: 26.3s
1:	learn: 0.6485542	total: 53.5ms	remaining: 26.7s
2:	learn: 0.6307569	total: 76.4ms	remaining: 25.4s
3:	learn: 0.6116633	total: 103ms	remaining: 25.6s
4:	learn: 0.5944660	total: 128ms	remaining: 25.4s
5:	learn: 0.5776872	total: 153ms	remaining: 25.4s
6:	learn: 0.5648808	total: 179ms	remaining: 25.3s
7:	learn: 0.5528352	total: 205ms	remaining: 25.5s
8:	learn: 0.5417655	total: 227ms	remaining: 25s
9:	learn: 0.5304710	total: 248ms	remaining: 24.5s
10:	learn: 0.5206853	total: 272ms	remaining: 24.5s
11:	learn: 0.5130449	total: 292ms	remaining: 24.1s
12:	learn: 0.5071031	total: 310ms	remaining: 23.5s
13:	learn: 0.4998294	total: 326ms	remaining: 23s
14:	learn: 0.4928892	total: 346ms	remaining: 22.7s
15:	learn: 0.4843223	total: 364ms	remaining: 22.4s
16:	learn: 0.4786698	total: 383ms	remaining: 22.2s
17:	learn: 0.4734806	total: 403ms	remaining: 22s
18:	learn: 0.4699760	total: 422ms	remaining: 21.8s
19:	learn: 0.4

<catboost.core.CatBoostClassifier at 0x1c87b35cf40>

# Predict

In [152]:
df_test = pd.read_csv('data/test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [153]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [155]:
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mode()[0])

In [157]:
assert(df_test.isnull().sum().all() == 0)

In [158]:
X_ans = pd.DataFrame()
X_ans['PassengerId'] = df_test['PassengerId']

X_ans

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [159]:
X_predict = df_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [160]:
X_predict = pd.get_dummies(X_predict, columns=['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked'], drop_first=True)
X_predict

Unnamed: 0,Age,Fare,Sex_male,Pclass_2,Pclass_3,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Parch_9,Embarked_Q,Embarked_S
0,34.5,7.8292,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,47.0,7.0000,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,62.0,9.6875,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,27.0,8.6625,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,22.0,12.2875,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,27.0,8.0500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
414,39.0,108.9000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
415,38.5,7.2500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
416,27.0,8.0500,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [162]:
assert(X_predict.shape[1] == X.shape[1])

In [164]:
X_ans['Survived'] = final_model.predict(X_predict)
X_ans

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [165]:
X_ans.to_csv('predictions.csv', index=False)