# Imports

In [127]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# The star import stays first so the explicit imports below win on any name clash.
from utils.data_handling import *
from utils.feature_engineering import FeatureExtractor

from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.classifier import StackingClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import datetime
import os

# Load and Split Data

In [128]:
# Read the labelled Titanic data; `passengers_raw` is left unmodified so that
# later cells can refit on the complete frame.
csv_path = os.path.join("datasets", "titanic-train.csv")
passengers_raw = pd.read_csv(csv_path)

# 80/20 hold-out split; the fixed random_state keeps the partition reproducible.
split_parts = train_test_split(passengers_raw, test_size=0.2, random_state=32)
print(len(split_parts[0]), "train +", len(split_parts[1]), "test")

# Drop the same identifier / free-text columns from both halves of the split.
unused_columns = ["PassengerId", "Name", "Ticket"]
train_set, test_set = (part.drop(unused_columns, axis=1) for part in split_parts)

712 train + 179 test


# Feature Engineering and Model Pipeline

In [138]:
# One FeatureExtractor instance supplies the feature-union step of the pipeline;
# it is kept in a variable because a later cell calls get_attributes() on it.
featureExtractor = FeatureExtractor()

# Level-0 learners of the stack. All three are seeded for reproducibility, and
# the SVC is built with probability=True because the stack is configured with
# use_probas=True below.
clf1 = RandomForestClassifier(n_estimators=70, max_depth=6, max_features=14, random_state=42)
clf2 = SVC(probability=True, random_state=42)
clf3 = DecisionTreeClassifier(max_depth=8, max_features=12, random_state=42)

# Level-1 (meta) learner that combines the base learners' outputs.
lr = LogisticRegression(C=0.1)

sclf = StackingClassifier(
    classifiers=[clf1, clf2, clf3],
    meta_classifier=lr,
    use_probas=True,
)

# End-to-end estimator: feature extraction followed by the stacked classifier.
# The step names ('features', 'clf') are the prefixes used by the grid-search
# parameter keys.
feature_step = ('features', featureExtractor.get_feature_union())
classifier_step = ('clf', sclf)
full_pipeline = Pipeline([feature_step, classifier_step])

# Grid Search

In [141]:
# Derive the label vectors here so this cell runs top-to-bottom on a fresh
# kernel instead of depending on `train_set_labels` / `test_set_labels` having
# been created by cells that appear further down the notebook.
# LabelBinarizer / unwrap_labels / predict are expected to come from the
# `utils.data_handling` star import -- TODO confirm.
labelEncoder = LabelBinarizer()
train_set_labels = unwrap_labels(labelEncoder.fit_transform(train_set["Survived"].values))
test_set_labels = unwrap_labels(labelEncoder.transform(test_set["Survived"].values))

# Each list currently holds a single value; the trailing comments list other
# candidate values (presumably explored in earlier runs). Keys follow the
# pattern <pipeline step>__<stacked estimator name>__<parameter>.
param_grid = [
    {'clf__randomforestclassifier__n_estimators': [70], # 65, 70, 75, 80
     'clf__randomforestclassifier__max_features': [12], # 10, 12, 14, 16
     'clf__randomforestclassifier__max_depth': [12], # 6, 8, 16, 20
     'clf__decisiontreeclassifier__max_depth': [8], # 4, 6, 10, 16
     'clf__decisiontreeclassifier__max_features': [12], # 4, 8, 12, 16 
     'clf__meta-logisticregression__C': [0.1], # 0.03, 0.1, 0.3, 1, 10,
    }
]

# 5-fold cross-validated search over the grid, fitted on the training split only.
grid_search = GridSearchCV(full_pipeline, param_grid, cv=5)
grid_search.fit(train_set, train_set_labels)

# Score the best pipeline found on the held-out split, then report its parameters.
predict(grid_search.best_estimator_, test_set, test_set_labels)
print("Best Params: ", grid_search.best_params_)


Correct:  143
%Correct:  79.8882681564
ROC AUC:  0.872130933751
Best Params:  {'clf__decisiontreeclassifier__max_depth': 8, 'clf__decisiontreeclassifier__max_features': 12, 'clf__meta-logisticregression__C': 0.1, 'clf__randomforestclassifier__max_depth': 12, 'clf__randomforestclassifier__max_features': 12, 'clf__randomforestclassifier__n_estimators': 70}


# Train Model

In [135]:
# Encode the training targets. The fitted encoder is kept in `labelEncoder`
# so the evaluation cell can apply the identical mapping to the test targets.
labelEncoder = LabelBinarizer()
survived_train = train_set["Survived"].values
train_set_labels = unwrap_labels(labelEncoder.fit_transform(survived_train))

# NOTE(review): `train_set` still carries the "Survived" column when it is
# handed to the pipeline -- confirm FeatureExtractor never selects it as a feature.
full_pipeline = full_pipeline.fit(train_set, train_set_labels)

# Attribute list reported by the feature extractor (not used by any cell
# visible in this notebook; presumably kept for inspection).
attributes = featureExtractor.get_attributes()

# Evaluate Model

In [136]:
# Encode the held-out targets with the encoder fitted on the training targets.
survived_test = test_set["Survived"].values
test_set_labels = unwrap_labels(labelEncoder.transform(survived_test))

# Report hold-out performance; the call is left as the last expression so its
# return value is displayed as the cell output.
predict(full_pipeline, test_set, np.asarray(test_set_labels))

Correct:  141
%Correct:  78.7709497207
ROC AUC:  0.860915492958


0.86091549295774639

# Submission

In [137]:
# Refit the pipeline on every labelled passenger (train + hold-out) before
# predicting the competition data.
# NOTE(review): this refits `full_pipeline` in place, so re-running the
# evaluation cell afterwards would score a model that has seen the test split.
passengers_raw_labels = unwrap_labels(LabelBinarizer().fit_transform(passengers_raw["Survived"].values))
full_pipeline = full_pipeline.fit(passengers_raw, passengers_raw_labels)

# Unlabelled competition data; built with os.path.join like the training path.
submission_data_raw = pd.read_csv(os.path.join("datasets", "titanic-test.csv"))
predictions = full_pipeline.predict(submission_data_raw)

# Two-column frame: passenger id plus the predicted label.
submission = pd.DataFrame({
    'PassengerId': submission_data_raw.PassengerId,
    'Survived': predictions,
})

# Create the output directory on first use so to_csv cannot fail on a fresh
# checkout; the timestamp prefix keeps successive submissions from overwriting
# each other.
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

now = datetime.datetime.now()
filename = now.strftime("%Y%m%d-%H%M%S") + '-submission_clucas.csv'
submission.to_csv(path_or_buf=os.path.join(submissions_dir, filename), index=False)