# Imports

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from utils.data_loading import load_titanic_data as load
from utils.data_handling import *

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

from statsmodels.graphics.mosaicplot import mosaic

import matplotlib as plt

# Load and Split Data

In [2]:
passengers_raw = load()
train_set, test_set = train_test_split(passengers_raw, test_size=0.2, random_state=32)
print(len(train_set), "train +", len(test_set), "test")
train_set = train_set.drop(["PassengerId", "Name", "Ticket"], axis=1)
test_set = test_set.drop(["PassengerId", "Name", "Ticket"], axis=1)

712 train + 179 test


# Feature Engineering Pipeline

In [3]:
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# cat_attribs = ["Pclass", "Sex", "Embarked"]
class_encoder = LabelBinarizer()
class_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Pclass"])),
    ('label_binarizer', class_encoder)
])

sex_encoder = LabelBinarizer()
sex_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Sex"])),
    ('label_binarizer', sex_encoder)
])

embarked_encoder = LabelBinarizer()
embarked_pipeline = Pipeline([
    ('fillna', FillNaWith(["Embarked"])),
    ('selector', DataFrameSelector(["Embarked"])),
    ('label_binarizer', embarked_encoder)
])

cabin_transformer = CabinTransformer()
cabin_pipeline = Pipeline([
    ('fillna', FillNaWith(["Cabin"], '')),
    ('selector', DataFrameSelector(["Cabin"])),
    ('cabin_transformer', cabin_transformer)
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("class_pipeline", class_pipeline),
    ("sex_pipeline", sex_pipeline),
    ("embarked_pipeline", embarked_pipeline),
    ("cabin_pipeline", cabin_pipeline)
])

# Train Model

In [4]:
labelEncoder = LabelBinarizer();
train_set_labels = unwrap_labels(labelEncoder.fit_transform(train_set["Survived"].values))
train_set_prepared = full_pipeline.fit_transform(train_set)

classifier = RandomForestClassifier()
classifier.fit(train_set_prepared, train_set_labels)

attributes = num_attribs + list(class_encoder.classes_) + list(sex_encoder.classes_) + list(embarked_encoder.classes_) + cabin_transformer.get_classes()

# Evaluate Model

In [13]:
test_set_labels = unwrap_labels(labelEncoder.transform(test_set["Survived"].values))
test_set_prepared = full_pipeline.transform(test_set)

predict(classifier, test_set_prepared, np.asarray(test_set_labels))

Correct:  142
%Correct:  79.3296089385
ROC AUC:  0.852699530516


0.85269953051643188

# Cross Validation

In [14]:
scores = cross_val_score(classifier, train_set_prepared, train_set_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

Scores:  [ 0.45329841  0.44405304  0.39361095  0.44405304  0.51730613  0.47471266
  0.51730613  0.48932261  0.41111323  0.39361095]
Mean:  0.453838714467
Standard deviation:  0.0436154802596


# Grid Search

In [15]:
param_grid = [
    {'n_estimators': [50, 60, 70], 'max_features': [12, 14, 16]},
    {'bootstrap': [False], 'n_estimators': [20, 50, 80], 'max_features': [4, 10, 16]}
]

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_set_prepared, train_set_labels)

predict(grid_search.best_estimator_, test_set_prepared, test_set_labels)
print("Best Params: ", grid_search.best_params_)

feature_importances = grid_search.best_estimator_.feature_importances_
named_feature_importances = sorted(zip(feature_importances, attributes), reverse=True)
print("Feature Importances: ")
named_feature_importances

Correct:  145
%Correct:  81.0055865922
ROC AUC:  0.874739175796
RSME:  0.874739175796
Best Params:  {'max_features': 16, 'n_estimators': 70}
Feature Importances: 


[(0.27811704269110515, 'female'),
 (0.22249126960548299, 'Age'),
 (0.19429401336535027, 'Fare'),
 (0.080828913329487537, 3),
 (0.069199654330933183, 'X'),
 (0.044757544975735586, 'SibSp'),
 (0.025866261363280015, 'Parch'),
 (0.013366829469114936, 'Q'),
 (0.01325620271317958, 'male'),
 (0.011037317201710339, 2),
 (0.0082787528452681943, 1),
 (0.0067568866148619636, 'T'),
 (0.0066571901379337741, 'Null'),
 (0.0045681345133907741, 'D'),
 (0.00424681225249558, 'B'),
 (0.0040133042750309265, 'S'),
 (0.0037846886018857804, 'A'),
 (0.003701016806624441, 'C'),
 (0.003467173635608192, 'cabin_number'),
 (0.00080790283707000976, 'E'),
 (0.0002691505287352045, 'F'),
 (0.00023393790571566599, 'G'),
 (0.0, 'C')]