In [122]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

TITANIC_FOLDER_PATH = path = os.path.join("datasets", "titanic")


def load_titanic_data(filename):
    file_path = os.path.join(TITANIC_FOLDER_PATH, filename)
    return pd.read_csv(file_path)

In [123]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

titanic_data = train_data.copy()
titanic_labels = train_data["Survived"].copy()

In [124]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('category_encoder', OneHotEncoder())
])

from sklearn.base import BaseEstimator, TransformerMixin


class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        new_data = X.copy()
        new_data.loc[new_data["Fare"] == 0, "Fare"] = np.NaN

        new_data['Title'] = new_data['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
        new_data = new_data.replace({
            'Title': {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'the Countess': 'Mrs', 'Don': 'Mr', 'Mme': 'Mrs',
                      'Ms': 'Miss', 'Lady': 'Miss', 'Sir': 'Mr', 'Capt': 'Mr', 'Jonkheer': 'Mr', 'Dona': 'Mrs',
                      'Dr': 'Other',
                      'Rev': 'Other'}
        })

        new_data['Ticket_2letter'] = new_data['Ticket'] \
            .apply(lambda ticket: ticket[:2])
        new_data['Ticket_len'] = new_data['Ticket'] \
            .apply(lambda ticket: len(ticket))

        new_data['Cabin_sum'] = new_data[~new_data['Cabin'].isna()]['Cabin'] \
            .apply(lambda cabin: len(str(cabin).split()))

        new_data['Deck'] = new_data[~new_data['Cabin'].isna()]['Cabin'] \
            .apply(lambda cabin: str(cabin)[:1])
        idx = new_data[new_data['Deck'] == 'T'].index
        new_data.loc[idx, 'Deck'] = 'A'
        new_data['Deck'] = new_data['Deck'].replace(['A', 'B', 'C'], 'ABC')
        new_data['Deck'] = new_data['Deck'].replace(['D', 'E'], 'DE')
        new_data['Deck'] = new_data['Deck'].replace(['F', 'G'], 'FG')

        new_data['Family_Size'] = new_data['SibSp'] + new_data[
            'Parch'] + 1  #the 1 is the person. he's part of the family
        new_data['Family_Type'] = pd.cut(new_data['Family_Size'], [0, 1, 4, 7, 11],
                                         labels=['Alone', 'Small', 'Big', 'Very Big'])

        new_data["Age_Group"] = pd.cut(new_data["Age"], [0, 15, 30, 45, 60, 120],
                                       labels=['Child', 'Young Adult', 'Adult', 'Old', 'Very Old'])

        new_data["Fare_Group"] = pd.qcut(new_data["Fare"], 4, labels=['Cheap', 'Standard', 'Expensive',
                                                                      'Luxury'])

        new_data['Ticket_Frequency'] = new_data.groupby('Ticket')['Ticket'].transform('count')

        new_data['Lastname'] = new_data['Name'].str.split(', ').str[0]

        return new_data


class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[num_attribs + cat_attribs]

In [125]:
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma='auto')


def train_models_with_features(data, labels, num_attribs, cat_attribs):
    full_transformer_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs)
    ])

    full_pipeline = Pipeline([
        ('feature_adder', FeatureAdder()),
        ('feature_selector', FeatureSelector()),
        ('transformer', full_transformer_pipeline),
    ])

    titanic_data_prepared = full_pipeline.fit_transform(data)
    forest_scores = cross_val_score(forest_clf, titanic_data_prepared, labels, cv=10)
    print("Forest mean:", forest_scores.mean())
    svm_scores = cross_val_score(svm_clf, titanic_data_prepared, labels, cv=10)
    print("SVM mean:", svm_scores.mean())

In [126]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [127]:
num_attribs = ["Cabin_sum", "Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Sex", "Title", "Embarked", "Age_Group", "Fare_Group", "Pclass", "Deck", "Family_Type"]

full_transformer_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

full_pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('feature_selector', FeatureSelector()),
    ('transformer', full_transformer_pipeline),
])

titanic_train_prepared = full_pipeline.fit_transform(titanic_data)

In [128]:
titanic_train_prepared.shape

(891, 34)

# Look again at the random forest

In [129]:
from sklearn.model_selection import GridSearchCV

num_attribs = ["Cabin_sum" ,'Fare']
cat_attribs = ["Sex", "Title", "Embarked", 'Fare_Group', "Age_Group", "Family_Type", "Pclass", 'Deck']

full_transformer_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

full_pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('feature_selector', FeatureSelector()),
    ('transformer', full_transformer_pipeline),
])

titanic_train_prepared = full_pipeline.fit_transform(titanic_data)

param_grid = [
    {'n_estimators': [200, 220, 250], 'max_features': [20, 25], 'max_depth': [6], 'oob_score':[True]}
]

grid_search = GridSearchCV(forest_clf, param_grid, cv=10, verbose=2)
grid_search.fit(titanic_train_prepared, titanic_labels)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.2s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.2s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.2s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.3s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.2s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.2s
[CV] END max_depth=6, max_features=20, n_estimators=200, oob_score=True; total time=   0.2s
[CV] END max_depth=

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             param_grid=[{'max_depth': [6], 'max_features': [20, 25],
                          'n_estimators': [200, 220, 250],
                          'oob_score': [True]}],
             verbose=2)

In [130]:
grid_search.best_score_

0.8575031210986266

In [131]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=6, max_features=25, n_estimators=200,
                       oob_score=True, random_state=42)

In [132]:
best_forest = grid_search.best_estimator_
best_forest.fit(titanic_train_prepared, titanic_labels)

RandomForestClassifier(max_depth=6, max_features=25, n_estimators=200,
                       oob_score=True, random_state=42)

In [133]:
titanic_test_prepared = full_pipeline.transform(test_data)
titanic_predictions_forest = best_forest.predict(titanic_test_prepared)
titanic_predictions_forest

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [134]:
one_hot_categories = list(np.concatenate(full_pipeline['transformer'].named_transformers_["cat"]['category_encoder'].categories_).flat)
attributes = num_attribs + one_hot_categories
sorted(zip(best_forest.feature_importances_, attributes), reverse=True)

[(0.2968046008892959, 'Mr'),
 (0.16403392518969107, 'Fare'),
 (0.1046225997143548, 3),
 (0.09730793994210152, 'female'),
 (0.08877689127410013, 'male'),
 (0.025840830095268327, 1),
 (0.02352074304260052, 'Other'),
 (0.021175985867049198, 'Small'),
 (0.020334945820044473, 'Big'),
 (0.016261749085270276, 'Adult'),
 (0.01593554338197907, 'Master'),
 (0.01522658037098654, 'DE'),
 (0.01233917077695509, 'S'),
 (0.011440157622866718, 'Child'),
 (0.011125412690640617, 'Standard'),
 (0.0086432931509413, 2),
 (0.007611517364045636, 'Expensive'),
 (0.007226565559973198, 'C'),
 (0.0071305205298273795, 'Young Adult'),
 (0.0066900287881416715, 'ABC'),
 (0.005960510365325999, 'Old'),
 (0.005066360421312534, 'Very Old'),
 (0.0043256492662989855, 'Luxury'),
 (0.0042538089936097956, 'Alone'),
 (0.004177343774371802, 'Cabin_sum'),
 (0.003131742365449428, 'Q'),
 (0.0030566323845118946, 'Very Big'),
 (0.0026980431697345023, 'Miss'),
 (0.00248205766017053, 'Cheap'),
 (0.0017987010700181397, 'Mrs'),
 (0.0010

In [135]:
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": titanic_predictions_forest
})
submission.to_csv("titanic_submission_forest_with_deck.csv", index=False)

# Cat Boost Test

In [136]:
from sklearn.model_selection import GridSearchCV

num_attribs = ["Cabin_sum" ,'Fare']
cat_attribs = ["Sex", "Title", "Embarked", 'Fare_Group', "Age_Group", "Family_Type", "Pclass", 'Deck']

full_transformer_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)
])

full_pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('feature_selector', FeatureSelector()),
    ('transformer', full_transformer_pipeline),
])

titanic_train_prepared = full_pipeline.fit_transform(titanic_data)

In [137]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier()

# Define parameters' grid
grid = {'verbose': [False],
        'thread_count': [-1],
        'depth': [4, 5],
        'iterations': [2000, 3000],
        'learning_rate': [0.0001, 0.0003]}

# Define GridSearchCV
grid_search = GridSearchCV(cat_model, grid, cv=10, verbose=2)
grid_search.fit(titanic_train_prepared, titanic_labels)


Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.4s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.6s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.5s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.5s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.4s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.4s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.4s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, verbose=False; total time=   2.4s
[CV] END depth=4, iterations=2000, learning_rate=0.0001, thread_count=-1, v

GridSearchCV(cv=10,
             estimator=<catboost.core.CatBoostClassifier object at 0x0000027CB718A760>,
             param_grid={'depth': [4, 5], 'iterations': [2000, 3000],
                         'learning_rate': [0.0001, 0.0003],
                         'thread_count': [-1], 'verbose': [False]},
             verbose=2)

In [142]:
grid_search.best_score_

0.8316479400749064

In [144]:
grid_search.best_params_

{'depth': 5,
 'iterations': 3000,
 'learning_rate': 0.0001,
 'thread_count': -1,
 'verbose': False}

In [138]:
cat_model = CatBoostClassifier(depth=5, iterations=3000, learning_rate=0.0003, thread_count=-1, verbose=1)

cat_model.fit(titanic_train_prepared, titanic_labels)

0:	learn: 0.6929378	total: 1.44ms	remaining: 4.33s
1:	learn: 0.6927513	total: 2.93ms	remaining: 4.4s
2:	learn: 0.6925535	total: 4.5ms	remaining: 4.5s
3:	learn: 0.6924191	total: 5.58ms	remaining: 4.18s
4:	learn: 0.6922186	total: 7.02ms	remaining: 4.21s
5:	learn: 0.6920205	total: 8.61ms	remaining: 4.3s
6:	learn: 0.6918321	total: 10.2ms	remaining: 4.36s
7:	learn: 0.6916669	total: 11.7ms	remaining: 4.38s
8:	learn: 0.6914909	total: 13.3ms	remaining: 4.41s
9:	learn: 0.6913258	total: 14.4ms	remaining: 4.3s
10:	learn: 0.6911449	total: 16ms	remaining: 4.36s
11:	learn: 0.6909848	total: 17.7ms	remaining: 4.4s
12:	learn: 0.6907892	total: 19.4ms	remaining: 4.45s
13:	learn: 0.6906176	total: 20.9ms	remaining: 4.46s
14:	learn: 0.6904309	total: 22.5ms	remaining: 4.49s
15:	learn: 0.6902187	total: 24.1ms	remaining: 4.49s
16:	learn: 0.6900608	total: 25.6ms	remaining: 4.49s
17:	learn: 0.6898667	total: 27ms	remaining: 4.48s
18:	learn: 0.6896795	total: 28.8ms	remaining: 4.51s
19:	learn: 0.6894916	total: 30.3

<catboost.core.CatBoostClassifier at 0x27cb7144130>

In [139]:
titanic_test_prepared = full_pipeline.transform(test_data)
titanic_predictions_cat_boost = cat_model.predict(titanic_test_prepared)
titanic_predictions_cat_boost


array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [140]:
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": titanic_predictions_cat_boost
})
submission.to_csv("titanic_submission_cat_boost.csv", index=False)


In [141]:
one_hot_categories = list(np.concatenate(full_pipeline['transformer'].named_transformers_["cat"]['category_encoder'].categories_).flat)
attributes = num_attribs + one_hot_categories
sorted(zip(cat_model.get_feature_importance(prettified=False), attributes), reverse=True)

[(30.976273479637197, 'Mr'),
 (13.753163307672638, 3),
 (13.075947589939458, 'female'),
 (12.172297900804093, 'male'),
 (5.3888867839533585, 'Fare'),
 (3.510714971601796, 1),
 (3.2595940695215244, 'Big'),
 (2.4913264490403986, 'Small'),
 (2.0827041889854367, 'Master'),
 (1.6621176376838998, 'DE'),
 (1.4953655831317876, 'Child'),
 (1.2944725976141116, 2),
 (1.2538275226220903, 'ABC'),
 (0.9717386348619882, 'Luxury'),
 (0.8881868467124051, 'Other'),
 (0.7750029068724756, 'Very Big'),
 (0.7614084589405686, 'Expensive'),
 (0.6197091417266098, 'Alone'),
 (0.5829428409860452, 'Standard'),
 (0.5175073860097128, 'Cheap'),
 (0.42875477170465537, 'S'),
 (0.3941103756750995, 'Adult'),
 (0.3506058640490094, 'Cabin_sum'),
 (0.31687445184754903, 'C'),
 (0.312305923956325, 'Old'),
 (0.2044558115600337, 'Young Adult'),
 (0.10949098209712961, 'Mrs'),
 (0.10836945074004231, 'Miss'),
 (0.10713271037112032, 'Q'),
 (0.07332595660601103, 'Very Old'),
 (0.06138540307545065, 'FG')]