In [289]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

TITANIC_FOLDER_PATH = path = os.path.join("datasets", "titanic")


def load_titanic_data(filename):
    file_path = os.path.join(TITANIC_FOLDER_PATH, filename)
    return pd.read_csv(file_path)

In [290]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

titanic_data = train_data.drop("Survived", axis=1)
titanic_labels = train_data["Survived"].copy()

In [325]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('category_encoder', OneHotEncoder())
])

from sklearn.base import BaseEstimator, TransformerMixin


class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        new_data = X.copy()
        new_data.loc[new_data["Fare"] == 0, "Fare"] = np.NaN

        new_data['Title'] = new_data['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
        new_data = new_data.replace({
            'Title': {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'the Countess': 'Mrs', 'Don': 'Mr', 'Mme': 'Mrs',
                      'Ms': 'Miss', 'Lady': 'Miss', 'Sir': 'Mr', 'Capt': 'Mr', 'Jonkheer': 'Mr', 'Dona': 'Mrs',
                      'Dr': 'Other',
                      'Rev': 'Other'}
        })

        new_data['Ticket_2letter'] = new_data['Ticket'] \
            .apply(lambda ticket: ticket[:2])
        new_data['Ticket_len'] = new_data['Ticket'] \
            .apply(lambda ticket: len(ticket))

        new_data['Cabin_sum'] = new_data[~new_data['Cabin'].isna()]['Cabin'] \
            .apply(lambda cabin: len(str(cabin).split()))

        new_data['Deck'] = new_data[~new_data['Cabin'].isna()]['Cabin'] \
            .apply(lambda cabin: str(cabin)[:1])
        idx = new_data[new_data['Deck'] == 'T'].index
        new_data.loc[idx, 'Deck'] = 'A'
        new_data['Deck'] = new_data['Deck'].replace(['A', 'B', 'C'], 'ABC')
        new_data['Deck'] = new_data['Deck'].replace(['D', 'E'], 'DE')
        new_data['Deck'] = new_data['Deck'].replace(['F', 'G'], 'FG')

        new_data['Family_Size'] = new_data['SibSp'] + new_data[
            'Parch'] + 1  #the 1 is the person. he's part of the family
        new_data['Family_Type'] = pd.cut(new_data['Family_Size'], [0, 1, 4, 7, 11],
                                         labels=['Alone', 'Small', 'Big', 'Very Big'])

        new_data["Age_Group"] = pd.cut(new_data["Age"], [0, 15, 30, 45, 60, 120],
                                       labels=['Child', 'Young Adult', 'Adult', 'Old', 'Very Old'])

        new_data["Fare_Group"] = pd.qcut(new_data["Fare"], 4, labels=['Cheap', 'Standard', 'Expensive',
                                                                      'Luxury'])

        new_data['Ticket_Frequency'] = new_data.groupby('Ticket')['Ticket'].transform('count')

        return new_data


class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[num_attribs + cat_attribs]

In [326]:
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma='auto')


def train_models_with_features(data, labels, num_attribs, cat_attribs):
    full_transformer_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs)
    ])

    full_pipeline = Pipeline([
        ('feature_adder', FeatureAdder()),
        ('feature_selector', FeatureSelector()),
        ('transformer', full_transformer_pipeline),
    ])

    titanic_data_prepared = full_pipeline.fit_transform(data)
    forest_scores = cross_val_score(forest_clf, titanic_data_prepared, labels, cv=10)
    print("Forest mean:", forest_scores.mean())
    svm_scores = cross_val_score(svm_clf, titanic_data_prepared, labels, cv=10)
    print("SVM mean:", svm_scores.mean())

In [327]:
num_attribs = ["Cabin_sum", "Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Sex", "Title", "Embarked", "Age_Group", "Fare_Group", "Pclass", "Deck", "Family_Type"]

full_transformer_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

full_pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('feature_selector', FeatureSelector()),
    ('transformer', full_transformer_pipeline),
])

titanic_train_prepared = full_pipeline.fit_transform(titanic_data)

In [328]:
titanic_train_prepared.shape

(891, 34)

In [300]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

classifier = Sequential()
classifier.add(Dense(activation="relu", input_dim=39, units=11, kernel_initializer="uniform"))
classifier.add(Dense(activation="relu", units=11, kernel_initializer="uniform"))
classifier.add(Dropout(0.5))
classifier.add(Dense(activation="relu", units=11, kernel_initializer="uniform"))
classifier.add(Dropout(0.5))
classifier.add(Dense(activation="relu", units=5, kernel_initializer="uniform"))
classifier.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
classifier.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_55 (Dense)            (None, 11)                440       
                                                                 
 dense_56 (Dense)            (None, 11)                132       
                                                                 
 dropout_14 (Dropout)        (None, 11)                0         
                                                                 
 dense_57 (Dense)            (None, 11)                132       
                                                                 
 dropout_15 (Dropout)        (None, 11)                0         
                                                                 
 dense_58 (Dense)            (None, 5)                 60        
                                                                 
 dense_59 (Dense)            (None, 1)               

In [301]:
titanic_train_prepared

array([[-0.15126507, -0.56573646,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.15126507,  0.66386103,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.15126507, -0.25833709, -0.4745452 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.15126507, -0.1046374 ,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.15126507, -0.25833709, -0.4745452 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.15126507,  0.20276197, -0.4745452 , ...,  0.        ,
         0.        ,  0.        ]])

In [302]:
history = classifier.fit(titanic_train_prepared, titanic_labels, batch_size=10, epochs=100,
                         validation_split=0.2, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [303]:

import tensorflow as tf
# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=[titanic_train_prepared.shape[1]]),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),

    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(titanic_train_prepared, titanic_labels, epochs=200, batch_size=16, validation_split=0.2)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [304]:
titanic_test_prepared = full_pipeline.transform(test_data)

In [305]:
prediction = model.predict(titanic_test_prepared)
prediction



array([[2.63272203e-04],
       [7.81693980e-02],
       [5.28526980e-05],
       [7.22878724e-02],
       [5.17197810e-02],
       [1.58897936e-02],
       [8.89636159e-01],
       [2.96888652e-06],
       [9.99987304e-01],
       [1.31059787e-04],
       [5.00156097e-02],
       [8.27250183e-02],
       [1.00000000e+00],
       [2.00152940e-07],
       [1.00000000e+00],
       [9.98469055e-01],
       [5.33046958e-04],
       [2.11237058e-01],
       [2.27365433e-03],
       [7.16455588e-06],
       [9.70086873e-01],
       [9.93071079e-01],
       [9.98498559e-01],
       [9.70983624e-01],
       [1.00000000e+00],
       [7.32112038e-10],
       [1.00000000e+00],
       [2.00629279e-01],
       [9.92795050e-01],
       [1.53304773e-05],
       [4.88535479e-09],
       [1.71974771e-05],
       [7.65161335e-01],
       [2.08102111e-02],
       [5.21818222e-03],
       [1.41992345e-01],
       [7.67724812e-02],
       [7.23269908e-03],
       [2.99965851e-02],
       [7.16581881e-01],


In [306]:
prediction.round().astype('int32')

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
    

In [307]:
neural_submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": prediction.round().astype('int32').flatten()
})
neural_submission.to_csv("titanic_submission_clean_neural_more_features.csv", index=False)

# Look again at the random forest

In [337]:
from sklearn.model_selection import GridSearchCV

num_attribs = ["Cabin_sum" ,'Fare']
cat_attribs = ["Sex", "Title", "Embarked", 'Fare_Group', "Age_Group", "Family_Type", "Pclass", 'Deck']

full_transformer_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

full_pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('feature_selector', FeatureSelector()),
    ('transformer', full_transformer_pipeline),
])

titanic_train_prepared = full_pipeline.fit_transform(titanic_data)

param_grid = [
    {'n_estimators': [200, 220, 250], 'max_features': [20, 25], 'max_depth': [6], 'oob_score':[True]}
]

grid_search = GridSearchCV(forest_clf, param_grid, cv=10, verbose=2)
grid_search.fit(titanic_train_prepared, titanic_labels)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.4s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.4s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.4s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.4s
[CV] END max_depth=

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             param_grid=[{'max_depth': [6], 'max_features': [20, 25],
                          'n_estimators': [200, 220, 250],
                          'oob_score': [True]}],
             verbose=2)

In [338]:
grid_search.best_score_

0.8575031210986266

In [339]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=6, max_features=25, n_estimators=200,
                       oob_score=True, random_state=42)

In [340]:
best_forest = grid_search.best_estimator_
best_forest.fit(titanic_train_prepared, titanic_labels)

RandomForestClassifier(max_depth=6, max_features=25, n_estimators=200,
                       oob_score=True, random_state=42)

In [341]:
titanic_test_prepared = full_pipeline.transform(test_data)
titanic_predictions_forest = best_forest.predict(titanic_test_prepared)
titanic_predictions_forest

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [342]:
one_hot_categories = list(np.concatenate(full_pipeline['transformer'].named_transformers_["cat"]['category_encoder'].categories_).flat)
attributes = num_attribs + one_hot_categories
sorted(zip(best_forest.feature_importances_, attributes), reverse=True)

[(0.2968046008892959, 'Mr'),
 (0.16403392518969107, 'Fare'),
 (0.1046225997143548, 3),
 (0.09730793994210152, 'female'),
 (0.08877689127410013, 'male'),
 (0.025840830095268327, 1),
 (0.02352074304260052, 'Other'),
 (0.021175985867049198, 'Small'),
 (0.020334945820044473, 'Big'),
 (0.016261749085270276, 'Adult'),
 (0.01593554338197907, 'Master'),
 (0.01522658037098654, 'DE'),
 (0.01233917077695509, 'S'),
 (0.011440157622866718, 'Child'),
 (0.011125412690640617, 'Standard'),
 (0.0086432931509413, 2),
 (0.007611517364045636, 'Expensive'),
 (0.007226565559973198, 'C'),
 (0.0071305205298273795, 'Young Adult'),
 (0.0066900287881416715, 'ABC'),
 (0.005960510365325999, 'Old'),
 (0.005066360421312534, 'Very Old'),
 (0.0043256492662989855, 'Luxury'),
 (0.0042538089936097956, 'Alone'),
 (0.004177343774371802, 'Cabin_sum'),
 (0.003131742365449428, 'Q'),
 (0.0030566323845118946, 'Very Big'),
 (0.0026980431697345023, 'Miss'),
 (0.00248205766017053, 'Cheap'),
 (0.0017987010700181397, 'Mrs'),
 (0.0010

In [343]:
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": titanic_predictions_forest
})
submission.to_csv("titanic_submission_forest_with_deck.csv", index=False)