KAGGLE - TITANIC

In [84]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report

import pickle

In [85]:
# Seed passed as `random_state` to the HistGradientBoostingClassifier instances below.
RANDOM_STATE = 42

In [86]:
# Load the labelled Kaggle training set (path is relative to the notebook's working directory).
df = pd.read_csv('./DATA/train.csv')

In [87]:
df.shape

(891, 12)

In [88]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [89]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [90]:
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [91]:
# Model inputs and the label column; every other raw column is discarded.
features = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = ['Survived']

# Keep only the modelling columns, inputs first and the label last.
df = df.loc[:, [*features, *target]]

In [92]:
df.head()

Unnamed: 0,Sex,Age,Pclass,SibSp,Parch,Fare,Embarked,Survived
0,male,22.0,3,1,0,7.25,S,0
1,female,38.0,1,1,0,71.2833,C,1
2,female,26.0,3,0,0,7.925,S,1
3,female,35.0,1,1,0,53.1,S,1
4,male,35.0,3,0,0,8.05,S,0


In [93]:
df.isnull().sum()

Sex           0
Age         177
Pclass        0
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

In [94]:
df.dtypes

Sex          object
Age         float64
Pclass        int64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
Survived      int64
dtype: object

In [95]:
# Columns that are one-hot encoded by the preprocessor.
cat_cols = ['Sex', 'Embarked']
# Columns that are standardised by the scaler (Pclass is treated as numeric here).
num_cols = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']

In [96]:
# Separate the label from the feature matrix.
y = df['Survived']
X = df.drop(columns='Survived')

In [97]:
# Hold out 25% of the labelled rows for evaluation. `stratify=y` keeps the class
# balance of `Survived` the same in both splits, and the seed comes from the
# notebook-wide RANDOM_STATE so that changing it in one place affects the split too.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)
# X_train, y_train = X, y

In [98]:
X_train.shape, X_test.shape

((668, 7), (223, 7))

In [99]:
X_train.head()

Unnamed: 0,Sex,Age,Pclass,SibSp,Parch,Fare,Embarked
486,female,35.0,1,1,0,90.0,S
238,male,19.0,2,0,0,10.5,S
722,male,34.0,2,0,0,13.0,S
184,female,4.0,3,0,2,22.025,S
56,female,21.0,2,0,0,10.5,S


In [100]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,537.0,29.931881,14.455975,0.42,21.0,29.0,39.0,80.0
Pclass,668.0,2.312874,0.831906,1.0,2.0,3.0,3.0,3.0
SibSp,668.0,0.464072,0.999353,0.0,0.0,0.0,1.0,8.0
Parch,668.0,0.375749,0.832877,0.0,0.0,0.0,0.0,6.0
Fare,668.0,31.177469,47.457877,0.0,7.8958,13.8604,30.5,512.3292


In [101]:
# One-hot encode the categorical columns, dropping the first category of each
# feature; categories unseen at fit time are ignored at transform time.
# Missing `Embarked` values are encoded as their own `Embarked_nan` indicator.
# NOTE(review): scikit-learn 1.2+ renames `sparse` to `sparse_output` - update
# this argument when upgrading the library.
one_hot = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('OHencoder', one_hot)])

# Encoded categorical columns come first; the remaining columns are passed
# through untouched and keep their original names.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, cat_cols)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [102]:
# Fit the encoder on the training split only, then apply it to the hold-out split.
train_encoded = preprocessor.fit_transform(X_train)
encoded_names = preprocessor.get_feature_names_out()

# The transformer returns plain arrays, so wrap them back into labelled frames
# (the original row index is not carried over at this point).
X_train_oh = pd.DataFrame(train_encoded, columns=encoded_names)
X_test_oh = pd.DataFrame(preprocessor.transform(X_test), columns=encoded_names)

In [103]:
X_train_oh.head()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Embarked_nan,Age,Pclass,SibSp,Parch,Fare
0,0.0,0.0,1.0,0.0,35.0,1.0,1.0,0.0,90.0
1,1.0,0.0,1.0,0.0,19.0,2.0,0.0,0.0,10.5
2,1.0,0.0,1.0,0.0,34.0,2.0,0.0,0.0,13.0
3,0.0,0.0,1.0,0.0,4.0,3.0,0.0,2.0,22.025
4,0.0,0.0,1.0,0.0,21.0,2.0,0.0,0.0,10.5


In [104]:
# Replace the raw splits with their encoded versions and restore the original
# row labels so the features line up with y_train / y_test again.
X_train = X_train_oh.set_index(y_train.index)
X_test = X_test_oh.set_index(y_test.index)

In [105]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sex_male,668.0,0.649701,0.477421,0.0,0.0,1.0,1.0,1.0
Embarked_Q,668.0,0.080838,0.272791,0.0,0.0,0.0,0.0,1.0
Embarked_S,668.0,0.717066,0.450762,0.0,0.0,1.0,1.0,1.0
Embarked_nan,668.0,0.002994,0.054677,0.0,0.0,0.0,0.0,1.0
Age,537.0,29.931881,14.455975,0.42,21.0,29.0,39.0,80.0
Pclass,668.0,2.312874,0.831906,1.0,2.0,3.0,3.0,3.0
SibSp,668.0,0.464072,0.999353,0.0,0.0,0.0,1.0,8.0
Parch,668.0,0.375749,0.832877,0.0,0.0,0.0,0.0,6.0
Fare,668.0,31.177469,47.457877,0.0,7.8958,13.8604,30.5,512.3292


In [106]:
# Learn the scaling statistics from the training split only.
scaler = StandardScaler().fit(X_train[num_cols])

# Apply the same fitted scaler to both splits, overwriting the numeric columns.
for split in (X_train, X_test):
    split[num_cols] = scaler.transform(split[num_cols])

In [107]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sex_male,668.0,0.6497006,0.477421,0.0,0.0,1.0,1.0,1.0
Embarked_Q,668.0,0.08083832,0.272791,0.0,0.0,0.0,0.0,1.0
Embarked_S,668.0,0.7170659,0.450762,0.0,0.0,1.0,1.0,1.0
Embarked_nan,668.0,0.002994012,0.054677,0.0,0.0,0.0,0.0,1.0
Age,537.0,-4.32098e-17,1.000932,-2.043404,-0.618444,-0.064523,0.627877,3.466719
Pclass,668.0,-2.227094e-16,1.000749,-1.579334,-0.376375,0.826584,0.826584,0.826584
SibSp,668.0,1.030447e-17,1.000749,-0.46472,-0.46472,-0.46472,0.536677,7.546455
Parch,668.0,9.141058000000001e-17,1.000749,-0.451483,-0.451483,-0.451483,-0.451483,6.757861
Fare,668.0,6.26578e-17,1.000749,-0.657443,-0.490943,-0.365167,-0.014286,10.146098


HistGradientBoosting

In [None]:
# Base estimator for the hyper-parameter search.
clf = HistGradientBoostingClassifier(max_iter=10000, random_state=RANDOM_STATE)

# 3 x 4 x 3 = 36 candidate combinations.
params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_regularization': [0, 0.01, 0.1, 0.2],
    'max_depth': [10, 15, 20],
}

# Exhaustive search using GridSearchCV's default cross-validation and scoring.
# NOTE(review): with max_iter=10000 this cell is slow - consider n_jobs=-1.
clf_grid = GridSearchCV(estimator=clf, param_grid=params, verbose=2)

clf_grid.fit(X_train, y_train)

In [72]:
clf_grid.best_estimator_.get_params()

{'categorical_features': None,
 'early_stopping': 'auto',
 'l2_regularization': 0,
 'learning_rate': 0.01,
 'loss': 'log_loss',
 'max_bins': 255,
 'max_depth': 10,
 'max_iter': 10000,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 20,
 'monotonic_cst': None,
 'n_iter_no_change': 10,
 'random_state': 42,
 'scoring': 'loss',
 'tol': 1e-07,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [108]:
# Hyper-parameters copied from the grid-search result shown above, so the
# expensive search does not have to be re-run.
best_params = dict(learning_rate=0.01, l2_regularization=0, max_depth=10)

clf = HistGradientBoostingClassifier(
    loss='log_loss',
    max_iter=10000,
    random_state=RANDOM_STATE,
    **best_params,
)
clf.fit(X_train, y_train)

In [109]:
# Predict on the hold-out split to evaluate the fitted model.
y_pred = clf.predict(X_test)

In [110]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       137
           1       0.74      0.69      0.71        86

    accuracy                           0.78       223
   macro avg       0.77      0.77      0.77       223
weighted avg       0.78      0.78      0.78       223



Logistic regression (TBD — the commented-out cells below are still placeholder copies of the HistGradientBoosting code)

In [None]:
# clf = HistGradientBoostingClassifier(random_state=RANDOM_STATE, max_iter=10000)

# params = {'learning_rate' : [0.01, 0.1, 0.2],
#           'l2_regularization' : [0, 0.01, 0.1, 0.2],
#           "max_depth": [10, 15, 20]}

# clf_grid = GridSearchCV(clf, 
#                         param_grid = params,
#                         verbose = 2)

# clf_grid.fit(X_train, y_train)

In [None]:
# clf_grid.best_estimator_.get_params()

In [None]:
# clf_hist_gb = HistGradientBoostingClassifier(loss = 'log_loss', 
#                                             learning_rate = 0.05,
#                                             l2_regularization = 0.1,
#                                             max_depth = None,
#                                             random_state=RANDOM_STATE, max_iter=10000)
# clf.fit(X_train, y_train)

In [None]:
# y_pred = clf.predict(X_test)

In [70]:
# print(classification_report(y_test, y_pred))

Train on the whole training set and export the preprocessors and model

In [112]:
# Refit the encoder on every labelled row (train + hold-out) and encode them.
full_encoded = preprocessor.fit(X).transform(X)
X_oh = pd.DataFrame(full_encoded, columns=preprocessor.get_feature_names_out())

# From here on X is the encoded frame, re-labelled with the original row index.
# Note that this overwrites the raw X, so the cell cannot be re-run on its own.
X = X_oh.set_index(y.index)

# Refit the scaler on the full data as well, then scale the numeric columns.
X[num_cols] = scaler.fit_transform(X[num_cols])

# Final model: same estimator settings, trained on all labelled rows.
clf.fit(X, y)

In [113]:
# Persist the fitted encoder, scaler and model so the prediction cells below can
# reload them. Each file is opened in a `with` block so the handle is flushed and
# closed deterministically instead of being left to the garbage collector.
filename = "OUTPUT/preprocessor.pkl"
with open(filename, 'wb') as fh:
    pickle.dump(preprocessor, fh)

filename = "OUTPUT/scaler.pkl"
with open(filename, 'wb') as fh:
    pickle.dump(scaler, fh)

filename = "OUTPUT/clf.pkl"
with open(filename, 'wb') as fh:
    pickle.dump(clf, fh)

Apply to the test data

In [114]:
# Load the Kaggle test set; note that this rebinds `df`, which previously held the training data.
df = pd.read_csv('./DATA/test.csv')

In [115]:
# Column groups are restated here, matching the lists used during training.
cat_cols = ['Sex', 'Embarked']
num_cols = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']
features = [*cat_cols, *num_cols]

# NOTE(review): this column order differs from the training frame; presumably
# fine because the ColumnTransformer selects DataFrame columns by name - confirm
# for the installed scikit-learn version.
X_test = df.loc[:, features]

In [116]:
# Reload the fitted encoder, scaler and model written by the export cell.
# Files are opened in `with` blocks so the handles are always closed.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook (or another trusted source) produced.
filename = "OUTPUT/preprocessor.pkl"
with open(filename, 'rb') as fh:
    preprocessor = pickle.load(fh)

filename = "OUTPUT/scaler.pkl"
with open(filename, 'rb') as fh:
    scaler = pickle.load(fh)

filename = "OUTPUT/clf.pkl"
with open(filename, 'rb') as fh:
    clf = pickle.load(fh)

In [117]:
# Encode the test rows with the already-fitted preprocessor (no refitting here).
test_encoded = preprocessor.transform(X_test)
X_test_oh = pd.DataFrame(test_encoded, columns=preprocessor.get_feature_names_out())

# X_test now refers to the encoded frame (the same object as X_test_oh).
X_test = X_test_oh

In [118]:
# Scale the numeric columns with the reloaded scaler (transform only, no refit).
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [120]:
# Predict survival for the test rows with the reloaded model.
y_pred = clf.predict(X_test)

Export test predictions

In [121]:
# Build the submission table: passenger ids from the test file plus the
# predicted label, written without the row index.
output = df[['PassengerId']].assign(Survived=y_pred)
output.to_csv('OUTPUT/submission.csv', index=False)