# Imports

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from utils.data_handling import *

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import os

from statsmodels.graphics.mosaicplot import mosaic

import matplotlib as plt

# Load and Split Data

In [3]:
# Location of the training CSV, relative to the notebook's working directory.
csv_path = os.path.join("datasets", "titanic-train.csv")
passengers_raw = pd.read_csv(csv_path)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
split_parts = train_test_split(passengers_raw, test_size=0.2, random_state=32)
print(len(split_parts[0]), "train +", len(split_parts[1]), "test")

# Remove the same three columns from both partitions so they keep an identical layout.
dropped_columns = ["PassengerId", "Name", "Ticket"]
train_set, test_set = (part.drop(dropped_columns, axis=1) for part in split_parts)

712 train + 179 test


# Feature Engineering Pipeline

In [4]:
# Numeric columns: select them, fill gaps with the column median, then standardize.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22
# in favor of `sklearn.impute.SimpleImputer`; this cell relies on the older API.
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# Categorical columns (Pclass, Sex, Embarked) each get their own encoder
# instance, held in a named variable so it can be inspected after fitting.
class_encoder = PipelineLabelBinarizer()
class_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Pclass"])),
    ('label_binarizer', class_encoder)
])

sex_encoder = PipelineLabelBinarizer()
sex_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Sex"])),
    ('label_binarizer', sex_encoder)
])

# Embarked is passed through FillNaWith before encoding; no fill value is
# given here, so the helper's own default applies (see utils.data_handling).
embarked_encoder = PipelineLabelBinarizer()
embarked_pipeline = Pipeline([
    ('fillna', FillNaWith(["Embarked"])),
    ('selector', DataFrameSelector(["Embarked"])),
    ('label_binarizer', embarked_encoder)
])

# Cabin is passed through FillNaWith with '' as the second argument
# (presumably the fill value — confirm against the helper) before the
# project-specific CabinTransformer derives features from it.
cabin_transformer = CabinTransformer()
cabin_pipeline = Pipeline([
    ('fillna', FillNaWith(["Cabin"], '')),
    ('selector', DataFrameSelector(["Cabin"])),
    ('cabin_transformer', cabin_transformer)
])

# Concatenate the per-column outputs side by side, in this order.
combined_features = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("class_pipeline", class_pipeline),
    ("sex_pipeline", sex_pipeline),
    ("embarked_pipeline", embarked_pipeline),
    ("cabin_pipeline", cabin_pipeline)
])

# Feature engineering followed by the classifier. The forest is seeded (same
# seed as the train/test split) so repeated runs, including clones made by a
# grid search, build the same trees and report the same scores.
full_pipeline = Pipeline([
    ('features', combined_features),
    ('clf', RandomForestClassifier(max_features=16, n_estimators=70, random_state=32))
])

# Train Model

In [5]:
# Encode the target column, then pass it through the project's unwrap_labels helper.
labelEncoder = LabelBinarizer()
survived_encoded = labelEncoder.fit_transform(train_set["Survived"].values)
train_set_labels = unwrap_labels(survived_encoded)

# Fit every transformer and the classifier on the training partition.
full_pipeline = full_pipeline.fit(train_set, train_set_labels)

# Collect one display name per engineered feature: numeric columns first,
# then the fitted classes of each encoder, then the cabin transformer's classes.
# NOTE(review): scikit-learn's LabelBinarizer emits a single column for a
# two-class input; if PipelineLabelBinarizer does the same for "Sex", the two
# names added for it would shift every later name by one — confirm.
attributes = list(num_attribs)
for fitted_encoder in (class_encoder, sex_encoder, embarked_encoder):
    attributes.extend(fitted_encoder.classes_)
attributes.extend(cabin_transformer.get_classes())

# Evaluate Model

In [12]:
# Encode the held-out target with the encoder that was fitted on the training labels.
survived_test_encoded = labelEncoder.transform(test_set["Survived"].values)
test_set_labels = unwrap_labels(survived_test_encoded)

# Score the fitted pipeline on the held-out partition; as the cell's last
# expression, whatever predict returns is displayed.
predict(full_pipeline, test_set, np.asarray(test_set_labels))

Correct:  143
%Correct:  79.8882681564
ROC AUC:  0.867175273865


0.86717527386541482

# Grid Search

In [19]:
# Candidate hyper-parameters for the forest; the `clf__` prefix routes each
# setting to the pipeline step named 'clf'.
param_grid = [
    {'clf__n_estimators': [50, 60], 'clf__max_features': [12]},
    # Larger grids, currently disabled:
    # {'clf__n_estimators': [50, 60, 70], 'clf__max_features': [12, 14, 16]},
    # {'clf__bootstrap': [False], 'clf__n_estimators': [20, 50, 80], 'clf__max_features': [4, 10, 16]}
]

# 5-fold cross-validated search. Accuracy is used for scoring because the
# estimator is a classifier; a regression metric such as
# 'neg_mean_squared_error' only ranks candidates the same way when the labels
# are exactly 0/1.
grid_search = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(train_set, train_set_labels)

# Report hold-out performance of the best pipeline found by the search.
predict(grid_search.best_estimator_, test_set, test_set_labels)
print("Best Params: ", grid_search.best_params_)

feature_importances = grid_search.best_estimator_.named_steps['clf'].feature_importances_

# zip() stops at the shorter input, so a mismatch between the number of names
# and the number of features would otherwise go unnoticed.
if len(attributes) != len(feature_importances):
    print("Warning: ", len(attributes), "attribute names for", len(feature_importances), "features")

# Sort on the importance value only: `attributes` can mix ints (Pclass values)
# and strings, and comparing whole tuples would raise TypeError as soon as two
# importances are equal.
named_feature_importances = sorted(
    zip(feature_importances, attributes),
    key=lambda pair: pair[0],
    reverse=True,
)
print("Feature Importances: ")
named_feature_importances

Correct:  145
%Correct:  81.0055865922
ROC AUC:  0.86860980699
Best Params:  {'clf__max_features': 12, 'clf__n_estimators': 60}
Feature Importances: 


[(0.26944648581150227, 'female'),
 (0.22352443056248064, 'Age'),
 (0.19597210198978307, 'Fare'),
 (0.0687068592817945, 3),
 (0.061389701166082791, 'X'),
 (0.05041275774687614, 'SibSp'),
 (0.02905851760390318, 'Parch'),
 (0.01890909693337332, 'T'),
 (0.014141356068807476, 'Q'),
 (0.013280059219022056, 'male'),
 (0.01114567079973326, 2),
 (0.010974791166024896, 1),
 (0.0066402678074940127, 'Null'),
 (0.0064782035510926152, 'D'),
 (0.0057781604421310315, 'B'),
 (0.0035382512946244412, 'cabin_number'),
 (0.0031563024739848285, 'A'),
 (0.0026103539799442309, 'S'),
 (0.0024891590112556582, 'C'),
 (0.0012889998654350213, 'E'),
 (0.00081259634452738694, 'F'),
 (0.00019551334106591845, 'G'),
 (5.0363539061249508e-05, 'C')]