# Imports

In [417]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from utils.data_handling import *
from utils.feature_engineering import FeatureExtractor

from sklearn.pipeline import Pipeline
from mlxtend.classifier import StackingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error

from xgboost import XGBClassifier

import pandas as pd
import numpy as np
import os

# Load and Split Data

In [418]:
# Read the labelled passenger records from the local datasets folder.
csv_path = os.path.join("datasets", "titanic-train.csv")
passengers_raw = pd.read_csv(csv_path)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_set, test_set = train_test_split(
    passengers_raw,
    test_size=0.2,
    random_state=32,
)

# Encode the "Survived" column. The encoder is fitted on the training rows
# only and then re-applied, already fitted, to the held-out rows.
# NOTE(review): LabelBinarizer and unwrap_labels are not imported by name;
# presumably they come from `utils.data_handling` via the star import — confirm.
labelEncoder = LabelBinarizer()
train_set_labels = unwrap_labels(
    labelEncoder.fit_transform(train_set["Survived"].values)
)
test_set_labels = unwrap_labels(
    labelEncoder.transform(test_set["Survived"].values)
)

# Report the size of each split.
print("{} train + {} test".format(len(train_set), len(test_set)))

712 train + 179 test


# Feature Engineering and Stacked Model Pipeline

In [419]:
# Feature extraction stage shared by every model below.
featureExtractor = FeatureExtractor(useOrdinals=True)

# --- Level-0 (base) learners -------------------------------------------------
# All tree ensembles use 65 estimators and a fixed seed for reproducibility.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=65,
    max_features=22,  # assumes the feature union yields >= 22 columns — TODO confirm
    max_depth=18,
    min_samples_leaf=2,
)
ada = AdaBoostClassifier(
    random_state=42,
    n_estimators=65,
    learning_rate=0.8,
)
gb = GradientBoostingClassifier(
    random_state=42,
    n_estimators=65,
    max_depth=6,
)
et = ExtraTreesClassifier(
    random_state=42,
    n_estimators=65,
    max_depth=6,
    min_samples_leaf=2,
)
xgb = XGBClassifier(
    random_state=42,
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=65,
    max_depth=6,
    min_child_weight=1,
)

# The linear SVC is instantiated but currently NOT part of the stack below.
svc = SVC(random_state=42, probability=True, kernel='linear', C=0.01)

# --- Level-1 (meta) learner --------------------------------------------------
lr = LogisticRegression(C=0.3)

# Stack the base learners; use_probas=True asks the stacker to hand the base
# learners' predicted probabilities (not hard labels) to the meta-classifier.
base_classifiers = [rf, xgb, ada, gb, et]  # svc intentionally left out
sclf = StackingClassifier(
    classifiers=base_classifiers,
    meta_classifier=lr,
    use_probas=True,
)

# End-to-end estimator: raw passenger frame -> features -> stacked prediction.
pipeline_steps = [
    ('features', featureExtractor.get_feature_union()),
    ('clf', sclf),
]
full_pipeline = Pipeline(pipeline_steps)

# Grid Search

In [420]:
# Hyper-parameter grid for the stacked pipeline. Every list currently holds a
# single value, so the search cross-validates exactly one configuration; the
# trailing comments keep the candidate values that were explored earlier.
stack_params = {
    # random forest
    'clf__randomforestclassifier__n_estimators': [65],  # 65, 70, 75, 80
    'clf__randomforestclassifier__max_features': [22],  # 10, 12, 14, 16
    'clf__randomforestclassifier__max_depth': [18],  # 6, 8, 16, 20
    # svc is not part of the stack right now:
    # 'clf__svc__C': [0.01],
    # AdaBoost
    'clf__adaboostclassifier__n_estimators': [65],
    'clf__adaboostclassifier__learning_rate': [0.8],
    # gradient boosting
    'clf__gradientboostingclassifier__n_estimators': [65],
    'clf__gradientboostingclassifier__max_depth': [6],
    # extra trees
    'clf__extratreesclassifier__n_estimators': [65],
    'clf__extratreesclassifier__max_depth': [6],
    # XGBoost
    'clf__xgbclassifier__learning_rate': [0.1],
    'clf__xgbclassifier__n_estimators': [65],
    'clf__xgbclassifier__max_depth': [6],  # 4, 6, 10, 16
    # meta-classifier
    'clf__meta-logisticregression__C': [0.3],  # 0.03, 0.1, 0.3, 1, 10,
}
param_grid = [stack_params]

# 5-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(train_set, train_set_labels)

# Score the best refitted pipeline on the held-out split, then show which
# parameter combination was selected.
best_pipeline = grid_search.best_estimator_
predict(best_pipeline, test_set, test_set_labels)
print("Best Params: ", grid_search.best_params_)

Correct:  148
%Correct:  82.6815642458
ROC AUC:  0.857524778299
Best Params:  {'clf__adaboostclassifier__learning_rate': 0.8, 'clf__adaboostclassifier__n_estimators': 65, 'clf__extratreesclassifier__max_depth': 6, 'clf__extratreesclassifier__n_estimators': 65, 'clf__gradientboostingclassifier__max_depth': 6, 'clf__gradientboostingclassifier__n_estimators': 65, 'clf__meta-logisticregression__C': 0.3, 'clf__randomforestclassifier__max_depth': 18, 'clf__randomforestclassifier__max_features': 22, 'clf__randomforestclassifier__n_estimators': 65, 'clf__xgbclassifier__learning_rate': 0.1, 'clf__xgbclassifier__max_depth': 6, 'clf__xgbclassifier__n_estimators': 65}


# Train Model

In [414]:
# Fit the complete pipeline (feature extraction + stacked classifier) on the
# training split only; the held-out split stays untouched for evaluation.
full_pipeline = full_pipeline.fit(X=train_set, y=train_set_labels)

# Ask the feature extractor for its attribute listing. This is deliberately
# done after fitting, in case the listing depends on the fitted state.
attributes = featureExtractor.get_attributes()

# Evaluate Model

In [415]:
# Convert the held-out labels to a NumPy array before handing them to the
# project's `predict` helper.
test_labels_array = np.asarray(test_set_labels)

# Kept as the last expression so the helper's return value is displayed as the
# cell output (judging by the recorded output, this looks like the ROC AUC —
# confirm against the helper).
predict(full_pipeline, test_set, test_labels_array)

Correct:  148
%Correct:  82.6815642458
ROC AUC:  0.857524778299


0.85752477829942619

# Submission

In [416]:
import datetime

# Refit the pipeline on ALL labelled passengers (train + held-out split) so the
# final model uses every available example before predicting the submission set.
# A fresh encoder is used here, so this cell does not depend on the encoder
# fitted in the split cell.
passengers_raw_labels = unwrap_labels(LabelBinarizer().fit_transform(passengers_raw["Survived"].values))
full_pipeline = full_pipeline.fit(passengers_raw, passengers_raw_labels)

# Load the unlabelled competition data; the path is built with os.path.join to
# match how the training CSV is located.
submission_data_raw = pd.read_csv(os.path.join("datasets", "titanic-test.csv"))
predictions = full_pipeline.predict(submission_data_raw)

# Two-column frame: passenger id plus the predicted label.
submission = pd.DataFrame()
submission['PassengerId'] = submission_data_raw.PassengerId
submission['Survived'] = predictions

# Make sure the output directory exists; otherwise to_csv fails on a fresh
# checkout where "submissions" has not been created yet.
submission_dir = "submissions"
os.makedirs(submission_dir, exist_ok=True)

# Timestamped file name so repeated runs never overwrite an earlier submission.
now = datetime.datetime.now()
filename = now.strftime("%Y%m%d-%H%M%S") + '-submission_clucas.csv'
submission.to_csv(path_or_buf=os.path.join(submission_dir, filename), index=False)