# Imports

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Project-local helpers. The star import hides which names come from this
# module; later cells use DataFrameSelector, PipelineLabelBinarizer, FillNaWith,
# CabinTransformer, unwrap_labels and predict without importing them anywhere
# else, so they are presumably defined in utils.data_handling.
from utils.data_handling import *

# scikit-learn building blocks for splitting, preprocessing, pipelines and models.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (successor: sklearn.impute.SimpleImputer) — confirm the
# pinned scikit-learn version before upgrading.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import os

from statsmodels.graphics.mosaicplot import mosaic

# NOTE(review): this binds the top-level `matplotlib` package to `plt`; the
# conventional `plt` alias refers to `matplotlib.pyplot`. `plt` is not used in
# the cells visible here — confirm which binding was intended.
import matplotlib as plt

# Load and Split Data

In [2]:
# Read the labelled passenger records from disk.
csv_path = os.path.join("datasets", "titanic-train.csv")
passengers_raw = pd.read_csv(csv_path)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
train_set, test_set = train_test_split(
    passengers_raw,
    test_size=0.2,
    random_state=32,
)
print(len(train_set), "train +", len(test_set), "test")

# Remove the PassengerId, Name and Ticket columns from both partitions.
unused_columns = ["PassengerId", "Name", "Ticket"]
train_set, test_set = (
    partition.drop(unused_columns, axis=1)
    for partition in (train_set, test_set)
)

712 train + 179 test


# Feature Engineering Pipeline

In [3]:
# Numeric features: pick the columns, replace missing values with the column
# median, then standardise (StandardScaler defaults: zero mean, unit variance).
# DataFrameSelector is a project helper — presumably it extracts the named
# DataFrame columns; confirm in utils.data_handling.
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# Categorical features ("Pclass", "Sex", "Embarked") each get their own
# pipeline. Every encoder is bound to a notebook-level name because the
# training cell reads its `classes_` to label the encoded columns.
class_encoder = PipelineLabelBinarizer()
class_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Pclass"])),
    ('label_binarizer', class_encoder)
])

sex_encoder = PipelineLabelBinarizer()
sex_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Sex"])),
    ('label_binarizer', sex_encoder)
])

# "Embarked" is passed through FillNaWith first, using that helper's default
# fill value (not visible here).
embarked_encoder = PipelineLabelBinarizer()
embarked_pipeline = Pipeline([
    ('fillna', FillNaWith(["Embarked"])),
    ('selector', DataFrameSelector(["Embarked"])),
    ('label_binarizer', embarked_encoder)
])

# "Cabin" is passed through FillNaWith with '' as the explicit fill argument
# and then handed to the project's CabinTransformer. The transformer is kept
# in a name so its get_classes() can be read after fitting.
cabin_transformer = CabinTransformer()
cabin_pipeline = Pipeline([
    ('fillna', FillNaWith(["Cabin"], '')),
    ('selector', DataFrameSelector(["Cabin"])),
    ('cabin_transformer', cabin_transformer)
])

# Concatenate the outputs of all feature pipelines, in this order.
combined_features = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("class_pipeline", class_pipeline),
    ("sex_pipeline", sex_pipeline),
    ("embarked_pipeline", embarked_pipeline),
    ("cabin_pipeline", cabin_pipeline)
])

# Feature extraction followed by the classifier. The forest is seeded (same
# value as the train/test split) so that Restart & Run All reproduces the same
# model, scores and feature importances; without a seed every run differs.
full_pipeline = Pipeline([
    ('features', combined_features),
    ('clf', RandomForestClassifier(max_features=16, n_estimators=70, random_state=32))
])

# Train Model

In [4]:
# Encode the "Survived" target; the fitted encoder is reused by the
# evaluation cell to transform the held-out labels.
labelEncoder = LabelBinarizer()
survived_train = train_set["Survived"].values
train_set_labels = unwrap_labels(labelEncoder.fit_transform(survived_train))

# Fit feature extraction and classifier together on the training split.
full_pipeline = full_pipeline.fit(train_set, train_set_labels)

# Human-readable names for the feature columns: numeric attributes first, then
# the classes learned by each categorical encoder, then the cabin classes.
# NOTE(review): this is meant to line up with the FeatureUnion output columns —
# confirm each encoder emits exactly one column per entry in `classes_`.
encoded_class_names = [
    class_name
    for encoder in (class_encoder, sex_encoder, embarked_encoder)
    for class_name in encoder.classes_
]
attributes = num_attribs + encoded_class_names + cabin_transformer.get_classes()

# Evaluate Model

In [5]:
# Encode the held-out targets with the encoder that was fitted on the
# training labels.
survived_test = test_set["Survived"].values
test_set_labels = unwrap_labels(labelEncoder.transform(survived_test))

# Left as the cell's final bare expression so the notebook also displays the
# value returned by predict() beneath whatever it prints.
predict(full_pipeline, test_set, np.asarray(test_set_labels))

Correct:  142
%Correct:  79.3296089385
ROC AUC:  0.866979655712


0.86697965571205016

# Grid Search

In [6]:
# Hyper-parameter candidates for the random-forest step ('clf') of the pipeline.
param_grid = [
    {'clf__n_estimators': [50, 60], 'clf__max_features': [12]},
    # Wider grids, currently disabled:
    # {'clf__n_estimators': [50, 60, 70], 'clf__max_features': [12, 14, 16]},
    # {'clf__bootstrap': [False], 'clf__n_estimators': [20, 50, 80], 'clf__max_features': [4, 10, 16]}
]

# 5-fold cross-validated search scored with accuracy. For 0/1 labels and 0/1
# predictions the mean squared error equals the misclassification rate, so
# accuracy ranks the candidates exactly as 'neg_mean_squared_error' did while
# making best_score_ / cv_results_ readable as a classification metric
# (assumes unwrap_labels keeps LabelBinarizer's 0/1 encoding).
grid_search = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(train_set, train_set_labels)

# Evaluate the refitted best candidate on the held-out split; labels are passed
# as an array, matching the call in the evaluation cell.
predict(grid_search.best_estimator_, test_set, np.asarray(test_set_labels))
print("Best Params: ", grid_search.best_params_)

feature_importances = grid_search.best_estimator_.named_steps['clf'].feature_importances_
# Sort on the importance only. Sorting the raw (importance, name) tuples would
# fall back to comparing names whenever two importances tie, and `attributes`
# mixes ints (Pclass values) with strings, which raises TypeError in Python 3.
# NOTE(review): zip() silently truncates to the shorter input — confirm that
# len(attributes) matches the number of feature columns.
named_feature_importances = sorted(
    zip(feature_importances, attributes),
    key=lambda pair: pair[0],
    reverse=True,
)
print("Feature Importances: ")
named_feature_importances

Correct:  145
%Correct:  81.0055865922
ROC AUC:  0.872717788211
Best Params:  {'clf__max_features': 12, 'clf__n_estimators': 60}
Feature Importances: 


[(0.27952681992288586, 'female'),
 (0.2169199996672522, 'Age'),
 (0.20147457463252882, 'Fare'),
 (0.06741481157515003, 3),
 (0.06470184462025115, 'X'),
 (0.050746103843586146, 'SibSp'),
 (0.026833408835072455, 'Parch'),
 (0.014694437652850664, 'T'),
 (0.014106234574671955, 'Q'),
 (0.012022461456745891, 'male'),
 (0.010575733921454698, 2),
 (0.0094473118490239153, 1),
 (0.005591789076296292, 'cabin_number'),
 (0.0050890905319605764, 'B'),
 (0.0049958489888534572, 'Null'),
 (0.0047444199623553861, 'S'),
 (0.0032257152887395098, 'D'),
 (0.0031956073175367652, 'A'),
 (0.0028782566932597227, 'C'),
 (0.0010851510124744461, 'E'),
 (0.00059693331908708045, 'F'),
 (0.00013344525796297162, 'G'),
 (0.0, 'C')]

# Submission

In [15]:
import datetime

# Refit the pipeline on every labelled passenger (training and held-out rows)
# before predicting the unlabelled submission set.
# NOTE(review): this refits `full_pipeline` as it was originally configured;
# `grid_search.best_estimator_` is not used here — confirm that is intended.
passengers_raw_labels = unwrap_labels(LabelBinarizer().fit_transform(passengers_raw["Survived"].values))
full_pipeline = full_pipeline.fit(passengers_raw, passengers_raw_labels)

# Build the path with os.path.join, the same way the training CSV is located.
submission_data_raw = pd.read_csv(os.path.join("datasets", "titanic-test.csv"))
predictions = full_pipeline.predict(submission_data_raw)

# Two-column frame: the passenger id and the predicted "Survived" value.
submission = pd.DataFrame()
submission['PassengerId'] = submission_data_raw.PassengerId
submission['Survived'] = predictions

# Timestamped file name (second resolution) keeps separate runs in separate files.
now = datetime.datetime.now()
filename = now.strftime("%Y%m%d-%H%M%S") + '-submission_clucas.csv'

# to_csv does not create missing directories, so make sure the output folder
# exists; exist_ok=True makes this a no-op on later runs.
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)
submission.to_csv(path_or_buf=os.path.join(submissions_dir, filename), index=False)