# Titanic - Machine Learning from Disaster

### Summary
* [Introduction](#Introduction)
* [Data exploration](#Data-exploration)
    * [Count of passengers split by sex and passenger's class](#Count-of-passengers-split-by-sex-and-passenger's-class)
    * [Survival rate per passenger's class](#Survival-rate-per-passenger's-class)
    * [Number of survived and non survived passengers](#Number-of-survived-and-non-survived-passengers)
    * [Passengers age](#Passengers-age)
    * [Pairplot](#Pairplot)
* [Feature engineering](#Feature-engineering)
* [Logistic regression](#Logistic-regression)  
* [Decision tree](#Decission-tree)
* [Random forest](#Random-forest)
* [Ensemble models](#Ensemble-models)
* [Kaggle submission](#Kaggle-submission)

## Introduction

Solution for [Titanic - challenge](https://www.kaggle.com/c/titanic/overview). Data description can be found [here](https://www.kaggle.com/c/titanic/data).

In [None]:
# Import required packages
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, KBinsDiscretizer, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
# Global variables
# Seed handed to train_test_split (as random_state) so the split is repeatable.
RANDOM_STATE = 42

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="../data/train.csv", sep=",")
df.head(n=5)

### Split Train / Test (Validation) data

In [None]:
# Separate the target column from the features, then hold out 20 % of the rows.
y = df['Survived']
X = df.drop(columns=['Survived'])
X_train, X_test, y_train_true, y_test_true = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

## Data exploration


In [None]:
# pip install --no-input plotly.express

In [None]:
# pip freeze > requirements.txt

In [None]:
import plotly.express as px
import plotly.graph_objects as go

### Number of survived and non survived passengers

In [None]:
# Count passengers per outcome.
tmp = df.groupby("Survived").size().reset_index(name="count")
# Derive the slice labels from the Survived values themselves, so each label is
# tied to its row by key rather than by the position of an entry in a dict.
outcome_labels = tmp["Survived"].map({0: 'Non-survived', 1: 'Survived'})
fig = px.pie(tmp, values='count', names=outcome_labels, title='Survived vs Not-survived')
fig.show()

### Count of passengers split by sex and passenger's class

In [None]:
# Passenger counts for every (class, sex) combination, drawn as bars coloured by sex
tmp = df.groupby(by=["Pclass", "Sex"]).size().reset_index(name="count")
fig = px.bar(
    data_frame=tmp,
    x="Pclass",
    y="count",
    color="Sex",
    title="Count of passengers split by sex and passenger's class",
)
fig.show()
# TODO: switch colors

### Survival rate per passenger's class

In [None]:
# One row per (class, sex, outcome) with its passenger count ...
tmp = df.groupby(by=["Pclass", "Sex", "Survived"]).size().reset_index(name="count")
# ... the size of the whole (class, sex) group ...
tmp['total'] = tmp.groupby(by=["Pclass", "Sex"])['count'].transform('sum')
# ... and the share of that group falling into the row's outcome.
tmp['rate'] = tmp['count'].div(tmp['total'])

# Per-sex views, plus the survivor rows used for plotting.
male = tmp.loc[tmp['Sex'].eq('male')]
female = tmp.loc[tmp['Sex'].eq('female')]
male_survived = male[male['Survived'].eq(1)]
female_survived = female[female['Survived'].eq(1)]

In [None]:
# One bar series per sex: survival rate (y) for each passenger class (x)
male_bars = go.Bar(name='Male', x=male_survived['Pclass'], y=male_survived['rate'])
female_bars = go.Bar(name='Female', x=female_survived['Pclass'], y=female_survived['rate'])

fig = go.Figure(
    data=[male_bars, female_bars],
    layout=go.Layout(title="Survival rate per passenger's class"),
)

# Place the two series side by side within each class
fig.update_layout(barmode='group')
fig.show()

### Passengers age

In [None]:
# Age distribution, with one colour per value of the Survived column
fig = px.histogram(
    data_frame=df,
    x="Age",
    color="Survived",
    title='Passengers age',
)
fig.show()

# TODO: change legend
# TODO: increase bar width

### Pairplot

In [None]:
import seaborn as sns

# Restrict the frame to the columns of interest and colour the plots by outcome
pairplot_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
tmp = df[pairplot_columns]
sns.pairplot(data=tmp, hue='Survived')

## Feature engineering

In [None]:
def cabin_to_deck_mapper(d: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``Cabin`` column with a flag telling whether it starts with 'A'.

    Only the first character of each cabin code is inspected. Rows whose cabin
    is missing compare unequal to ``'A'`` and therefore yield ``False``.

    Args:
        d: Frame containing a string-valued ``Cabin`` column.

    Returns:
        A new frame in which ``Cabin`` holds the boolean flag. The frame passed
        in is left untouched, so the function is safe to call on data the
        caller still needs (e.g. the column slice a transformer receives).
    """
    is_deck_a = d['Cabin'].str[0] == 'A'
    # assign() builds a new DataFrame instead of writing into the caller's one.
    return d.assign(Cabin=is_deck_a)

In [None]:
# Simple test for cabin_to_deck_mapper function
import pandas.testing as testing

# Cabin codes on several decks plus one missing value.
# np.nan is used because the np.NAN alias no longer exists in NumPy 2.x.
actual_input = pd.DataFrame([
    ['A1'],
    ['B2'],
    ['C5'],
    ['E9'],
    [np.nan],
], index=[0, 1, 2, 3, 4], columns=['Cabin'])
# Only the cabin starting with 'A' is flagged; the missing cabin maps to False.
expected_output = pd.DataFrame([
    [True],
    [False],
    [False],
    [False],
    [False],
], index=[0, 1, 2, 3, 4], columns=['Cabin'])

actual_output = cabin_to_deck_mapper(actual_input)

testing.assert_frame_equal(expected_output, actual_output)

In [None]:
# Age: fill missing values with the mean, then one-hot encode three quantile bins
age_imputer = SimpleImputer(strategy='mean')  # TODO: replace by mean-age-by-pclass
age_binner = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile')
age_pipeline = make_pipeline(age_imputer, age_binner)

In [None]:
# Fare: fill missing values with the mean, then apply min-max scaling
fare_imputer = SimpleImputer(strategy='mean')
fare_scaler = MinMaxScaler()
fare_pipeline = make_pipeline(fare_imputer, fare_scaler)

In [None]:
# Per-column preprocessing; columns not listed here are dropped.
# The one-hot encoder is left at its default output format: its `sparse`
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and later removed,
# so passing either name ties the notebook to a version range. Instead,
# `sparse_threshold=0` makes the ColumnTransformer always return a dense array.
pipeline = ColumnTransformer([
    ('age_transformer', age_pipeline, ['Age']),
    ('sex_transformer', OneHotEncoder(handle_unknown='ignore'), ['Sex']),
    ('fare_transformer', fare_pipeline, ['Fare']),
    ('cabin_transformer', FunctionTransformer(cabin_to_deck_mapper), ['Cabin']),
    ('pass_through', 'passthrough', ['Pclass', 'SibSp']),
], remainder='drop', sparse_threshold=0)

In [None]:
# Fit the preprocessing on the training split only, then apply it to both splits
X_train_fe = pipeline.fit(X_train).transform(X_train)
X_test_fe = pipeline.transform(X_test)

## Logistic regression

In [None]:
# Train a logistic regression with default settings on the engineered features
# (fit() returns the estimator itself, so m1 is the fitted model)
m1 = LogisticRegression().fit(X_train_fe, y_train_true)

# Predict the labels of the hold-out split
y_test_pred = m1.predict(X_test_fe)

#### Calculate scores

In [None]:
from sklearn import metrics

# Compare the logistic regression predictions with the true hold-out labels
accuracy = metrics.accuracy_score(y_test_true, y_test_pred)
print("accuracy_score: %.4f" % accuracy)
precision = metrics.precision_score(y_test_true, y_test_pred)
print("precision_score: %.4f" % precision)
recall = metrics.recall_score(y_test_true, y_test_pred)
print("recall_score: %.4f" % recall)
f1 = metrics.f1_score(y_test_true, y_test_pred)
print("f1_score: %.4f" % f1)

#### Confusion matrix

In [None]:
import matplotlib.pyplot as plt

# Confusion matrix of the logistic regression on the hold-out split
metrics.ConfusionMatrixDisplay.from_estimator(
    estimator=m1,
    X=X_test_fe,
    y=y_test_true,
)
plt.show()

In [None]:
# ROC curve of the logistic regression on the hold-out split
# TODO: add description and title
metrics.RocCurveDisplay.from_estimator(estimator=m1, X=X_test_fe, y=y_test_true)

### Cross validation

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the logistic regression on the training split,
# scored by accuracy and keeping the per-fold training scores as well
cross_accuracy_log = cross_validate(
    estimator=m1,
    X=X_train_fe,
    y=y_train_true,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)

In [None]:
# Report the accuracy measured on the held-out folds. `train_score` only tells
# how well each fold's model fits the rows it was trained on, so it is not the
# cross-validation estimate this message announces.
cv_test_accuracy = cross_accuracy_log['test_score']
print("Cross-validation mean %.3f +- %.3f" % (100 * cv_test_accuracy.mean(), 100 * cv_test_accuracy.std()))

## Decission tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Depth-limited decision tree. The shared seed is passed so that repeated runs
# of the notebook build the same tree, as is already done for the data split.
m2 = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
m2.fit(X_train_fe, y_train_true)

# make predictions
y_test_pred = m2.predict(X_test_fe)

#### Calculate scores

In [None]:
# Compare the decision tree predictions with the true hold-out labels
accuracy = metrics.accuracy_score(y_test_true, y_test_pred)
print("accuracy_score: %.4f" % accuracy)
precision = metrics.precision_score(y_test_true, y_test_pred)
print("precision_score: %.4f" % precision)
recall = metrics.recall_score(y_test_true, y_test_pred)
print("recall_score: %.4f" % recall)
f1 = metrics.f1_score(y_test_true, y_test_pred)
print("f1_score: %.4f" % f1)

#### Confusion matrix

In [None]:
# Confusion matrix of the decision tree on the hold-out split
metrics.ConfusionMatrixDisplay.from_estimator(
    estimator=m2,
    X=X_test_fe,
    y=y_test_true,
)
plt.show()

In [None]:
# ROC curve of the decision tree on the hold-out split
# TODO: add description and title
metrics.RocCurveDisplay.from_estimator(estimator=m2, X=X_test_fe, y=y_test_true)

In [None]:
# Draw the fitted decision tree on a 30x20 figure
plt.figure(figsize=(30, 20))
t = plot_tree(decision_tree=m2)
# TODO: set feature names, e.g. plot_tree(m2, feature_names=[...], class_names=[...])

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of depth-limited trees. The shared seed is passed so that
# repeated runs of the notebook train the same forest, as is already done for
# the data split.
m3 = RandomForestClassifier(max_depth=5, random_state=RANDOM_STATE)
m3.fit(X_train_fe, y_train_true)

# make predictions
y_test_pred = m3.predict(X_test_fe)

#### Calculate scores

In [None]:
# Compare the random forest predictions with the true hold-out labels
accuracy = metrics.accuracy_score(y_test_true, y_test_pred)
print("accuracy_score: %.4f" % accuracy)
precision = metrics.precision_score(y_test_true, y_test_pred)
print("precision_score: %.4f" % precision)
recall = metrics.recall_score(y_test_true, y_test_pred)
print("recall_score: %.4f" % recall)
f1 = metrics.f1_score(y_test_true, y_test_pred)
print("f1_score: %.4f" % f1)

#### Confusion matrix

In [None]:
# Confusion matrix of the random forest on the hold-out split
metrics.ConfusionMatrixDisplay.from_estimator(
    estimator=m3,
    X=X_test_fe,
    y=y_test_true,
)
plt.show()

In [None]:
# ROC curve of the random forest on the hold-out split
# TODO: add description and title
metrics.RocCurveDisplay.from_estimator(estimator=m3, X=X_test_fe, y=y_test_true)

## Ensemble models

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the three previously defined classifiers into one voting ensemble
models = [
    ('logreg', m1),
    ('tree', m2),
    ('randomforest', m3),
]
model_ensemble = VotingClassifier(models)
# fit() returns the estimator, not predictions, so its result is not stored
# under the prediction variable.
model_ensemble.fit(X_train_fe, y_train_true)

# make predictions
y_test_pred = model_ensemble.predict(X_test_fe)

#### Calculate scores

In [None]:
# Compare the ensemble predictions with the true hold-out labels
accuracy = metrics.accuracy_score(y_test_true, y_test_pred)
print("accuracy_score: %.4f" % accuracy)
precision = metrics.precision_score(y_test_true, y_test_pred)
print("precision_score: %.4f" % precision)
recall = metrics.recall_score(y_test_true, y_test_pred)
print("recall_score: %.4f" % recall)
f1 = metrics.f1_score(y_test_true, y_test_pred)
print("f1_score: %.4f" % f1)

#### Confusion matrix

In [None]:
# Confusion matrix of the voting ensemble on the hold-out split
metrics.ConfusionMatrixDisplay.from_estimator(
    estimator=model_ensemble,
    X=X_test_fe,
    y=y_test_true,
)
plt.show()

## Kaggle submission

In [None]:
# Read the Kaggle test CSV and keep a fixed set of feature columns
X_data_kaggle = pd.read_csv(filepath_or_buffer="../data/test.csv", sep=",")
kaggle_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin', 'Name']
X_test_kaggle = X_data_kaggle.loc[:, kaggle_columns]
# X_test_kaggle.isna().sum()

In [None]:
# Fill missing fares with the mean fare of the Kaggle test set.
fare_mean = X_test_kaggle['Fare'].mean()
# Rebind the frame instead of calling fillna(inplace=True) on a column
# selection: that chained form does not update the frame under pandas
# Copy-on-Write.
X_test_kaggle = X_test_kaggle.fillna({'Fare': fare_mean})
# X_test_kaggle.isna().sum()

In [None]:
# Run the Kaggle rows through the fitted preprocessing and the ensemble,
# then pair every prediction with its passenger id
X_test_kaggle_fe = pipeline.transform(X_test_kaggle)
kaggle_predictions = model_ensemble.predict(X_test_kaggle_fe)
df_kaggle = pd.DataFrame({
    "PassengerId": X_data_kaggle['PassengerId'],
    "Survived": kaggle_predictions,
})

In [None]:
from pathlib import Path

# Write the submission file, creating the output directory first so the export
# does not fail on a fresh checkout where ./output does not exist yet.
output_path = Path("./output/titanic_predictions.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_kaggle.to_csv(output_path, index=False)

### Submission result

Submission score **0.77751**   
position 4130 out of 13765 submissions