In [1]:
import os
import pandas as pd
from pandas import Series, DataFrame

import numpy as np

In [2]:
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif

# Dimensionality Reduction


# Modeling Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Model Selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Model Performance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict

In [3]:
# Work from one directory above the notebook's start directory (presumably the
# project root): the cells below import the `src` package and read
# 'data/04-processed/titanic.csv' relative to it.
# Guarded on PATH_ROOT so that re-running this cell in the same kernel does not
# climb one more directory up each time, which an unguarded os.chdir("..") does.
if 'PATH_ROOT' not in globals():
    os.chdir("..")
    PATH_ROOT = os.getcwd()
else:
    # Re-run in the same kernel: go back to the directory recorded the first time.
    os.chdir(PATH_ROOT)

In [4]:
from src.data.obtain import get_raw_data
from src.data.scrub import scrub_raw_data

In [5]:
# Load the modelling table: reuse the processed CSV when it is already on disk,
# otherwise fetch the raw data and scrub it.
processed_csv = 'data/04-processed/titanic.csv'

if os.path.exists(processed_csv):
    print("Retrieving cleaned data from backup.")
    df = pd.read_csv(processed_csv)
else:
    print("Importing and scrubbing raw data.")
    df_raw = get_raw_data()
    df = scrub_raw_data(df_raw)

Retrieving cleaned data from backup.


In [6]:
# Separate the target column (y) from the predictors (X = every other column).
# drop() already returns a new frame and y is an explicit copy, so neither
# object shares data with df.
y = df['survived'].copy()
X = df.drop('survived', axis=1)

# Baseline Model

In [7]:
# Baseline inputs: missing values become 0, every feature is standardised, and
# the rows are split 80% / 20% into training and test sets.
# NOTE(review): the scaler is fitted on all rows before the split, so test rows
# influence the scaling statistics. X_scaled is reused by the repeated-split
# loop further down, so it is kept as is here; the grid-search pipelines scale
# inside the pipeline instead.
SEED = 0  # fixed so the baseline split, and therefore its score, is reproducible
X_scaled = StandardScaler().fit_transform(X.fillna(0))
X_tr, X_te, y_tr, y_te = train_test_split(
    X_scaled, y, train_size=0.8, test_size=0.2, random_state=SEED)

In [8]:
# Baseline model: logistic regression with default settings, fitted on the
# training split. fit() returns the estimator itself, so it can be chained.
lr_0 = LogisticRegression().fit(X_tr, y_tr)
lr_0  # display the fitted estimator and its parameters

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Predict the held-out rows with the baseline model and show its accuracy
# on that single split.
y_pr = lr_0.predict(X_te)
accuracy_score(y_true=y_te, y_pred=y_pr)

0.78244274809160308

In [10]:
# Repeat the baseline on five different random splits to show how much a
# single-split accuracy moves around, i.e. why cross-validated output is needed.
# Loop-local names are used so the baseline split and model from the cells above
# (X_tr, X_te, y_tr, y_te, lr_0) are not overwritten, and every split is seeded
# so the printed scores are the same on each run.
for seed in range(5):
    X_tr_i, X_te_i, y_tr_i, y_te_i = train_test_split(
        X_scaled, y, train_size=0.8, test_size=0.2, random_state=seed)
    lr_i = LogisticRegression().fit(X_tr_i, y_tr_i)
    # Built-in round() works whether accuracy_score returns a NumPy or a plain
    # float; arguments are passed as (y_true, y_pred), matching the baseline cell.
    print(round(accuracy_score(y_te_i, lr_i.predict(X_te_i)), 2))

0.76
0.77
0.76
0.77
0.79


# Grid Search for Logistic Regression

In [11]:
from src.models.train import run_classifier

In [12]:
# Pipeline for the logistic-regression grid search: standardise, then fit.
# The solver is pinned to 'liblinear' because the grid below searches both the
# 'l1' and 'l2' penalties. liblinear supports both (and is the solver shown in
# this notebook's recorded output), whereas newer scikit-learn releases default
# to 'lbfgs', which does not accept 'l1'.
pipe_lr = \
    Pipeline([
        ('scale', StandardScaler()),
        ('model', LogisticRegression(solver='liblinear'))
    ])

# Hyper-parameters to search; the 'model__' prefix routes each entry to the
# pipeline step named 'model'.
grid_lr = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__penalty': ['l1', 'l2'],
    'model__class_weight': [None, 'balanced']
}

In [15]:
# ---------------------------------------------------------------
# RUN GRID SEARCHES OVER 10 RANDOM SPLITS
# ---------------------------------------------------------------
# List comprehensions are used instead of map() so that list_models is always a
# real list: it is iterated here and indexed by the split-selection cell below,
# whereas a lazy map object is exhausted after one pass and cannot be indexed.
list_models = [
    run_classifier(
        X=X,
        y=y,
        UNCORR=X.columns.tolist(),
        TRAIN_SIZE=0.75,
        CLF=pipe_lr,
        GRID=grid_lr,
        SCORING='roc_auc')
    for _ in range(10)
]

# One column per run, one row per result entry whose key contains 'score'.
df_ten_models = pd.concat(
    [Series({k: model[k] for k in model.keys() if 'score' in k})
     for model in list_models],
    axis=1)

df_ten_models.columns = ['split_{}'.format(i) for i in df_ten_models.columns]


# ---------------------------------------------------------------
# PERFORMANCE ANALYSIS
# ---------------------------------------------------------------
# Box plot of each remaining score across the ten runs; the train score and
# test accuracy rows are left out of the plot but stay in the table below.
(df_ten_models
 .T
 .drop(['train_score', 'test_score__accuracy'], axis=1)
 .plot.box(vert=False, title="Model Performance over ten random splits", xlim=(0, 1))
);

df_ten_models.round(2)

Unnamed: 0,split_0,split_1,split_2,split_3,split_4,split_5,split_6,split_7,split_8,split_9
test_score__accuracy,0.8,0.78,0.78,0.77,0.76,0.76,0.78,0.77,0.79,0.8
test_score__f1,0.74,0.7,0.72,0.74,0.68,0.7,0.7,0.68,0.68,0.75
test_score__precision,0.75,0.68,0.68,0.68,0.72,0.67,0.71,0.7,0.76,0.72
test_score__recall,0.73,0.73,0.78,0.8,0.65,0.73,0.69,0.66,0.62,0.79
test_score__roc_auc,0.79,0.77,0.78,0.78,0.74,0.76,0.76,0.75,0.75,0.79
train_score,0.84,0.84,0.85,0.84,0.85,0.84,0.84,0.85,0.84,0.84


In [16]:
# Pick one of the ten runs interactively and show its five best grid-search
# parameter sets by mean cross-validated score.
# NOTE(review): the prompt blocks "Restart & Run All" and makes the selected
# model depend on manual input.

# raw_input exists only on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

choice = int(read_line("Pick a split number from the table above.\nValid entries (0-9)\n"))
# Reject out-of-range entries explicitly: a negative number would otherwise
# silently index from the end of the list.
if not 0 <= choice < len(list_models):
    raise ValueError(
        "Split number must be between 0 and {}, got {}".format(len(list_models) - 1, choice))

SELECTED_MODEL = list_models[choice]
# Presumably the fitted GridSearchCV returned by run_classifier; it exposes
# cv_results_ and best_estimator_.
gscv = SELECTED_MODEL.get('model')

print("GridSearch Results...\n")
(DataFrame(gscv.cv_results_)
 .set_index('params')
 .loc[:, ['mean_train_score', 'mean_test_score', 'mean_fit_time']]
 .sort_values('mean_test_score', ascending=False)
 .round(2)
 .head()
)

Pick a split number from the table above.
Valid entries (0-9)
9
GridSearch Results...



Unnamed: 0_level_0,mean_train_score,mean_test_score,mean_fit_time
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"{u'model__C': 10, u'model__class_weight': u'balanced', u'model__penalty': u'l1'}",0.85,0.84,0.02
"{u'model__C': 10, u'model__class_weight': u'balanced', u'model__penalty': u'l2'}",0.85,0.84,0.0
"{u'model__C': 1, u'model__class_weight': u'balanced', u'model__penalty': u'l2'}",0.85,0.84,0.0
"{u'model__C': 1, u'model__class_weight': u'balanced', u'model__penalty': u'l1'}",0.85,0.84,0.0
"{u'model__C': 1, u'model__class_weight': None, u'model__penalty': u'l2'}",0.85,0.84,0.0


In [37]:
# Show the 'model' step (the logistic regression) of the best pipeline found
# by the grid search of the selected run.
best_lr_pipeline = SELECTED_MODEL.get('model').best_estimator_
best_lr_pipeline.named_steps.get('model')

LogisticRegression(C=10, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [39]:
# Coefficients of the selected logistic regression, rounded to 3 decimals.
# coef_[0] is the first row of the coefficient matrix.
lr_best = SELECTED_MODEL.get('model').best_estimator_.named_steps.get('model')
np.round(lr_best.coef_[0], 3)

array([-0.565, -0.405, -0.048, -0.005, -0.291, -0.46 ,  1.152, -0.101,
       -0.129, -0.414,  0.234,  0.   , -0.283])

# Grid Search for Decision Trees

In [40]:
# Decision-tree pipeline: standardise, keep the k best features as scored by
# f_classif, then fit the tree.
steps_dt = [
    ('scale', StandardScaler()),
    ('select', SelectKBest(score_func=f_classif)),
    ('model', DecisionTreeClassifier()),
]
pipe_dt = Pipeline(steps_dt)

# Hyper-parameters to search; the part before '__' names the pipeline step
# that receives the parameter.
grid_dt = dict(
    select__k=[5, 9, 'all'],
    model__max_depth=[3, 5, 7],
    model__min_samples_split=[20, 40, 80],
    model__class_weight=['balanced', None],
)

In [42]:
# ---------------------------------------------------------------
# RUN GRID SEARCHES OVER 10 RANDOM SPLITS
# ---------------------------------------------------------------
# List comprehensions are used instead of map() so that list_models is always a
# real list: it is iterated here and indexed by the final-model cell below,
# whereas a lazy map object is exhausted after one pass and cannot be indexed.
list_models = [
    run_classifier(
        X=X,
        y=y,
        UNCORR=X.columns.tolist(),
        TRAIN_SIZE=0.8,
        CLF=pipe_dt,
        GRID=grid_dt,
        SCORING='roc_auc')
    for _ in range(10)
]

# One column per run, one row per result entry whose key contains 'score'.
df_ten_models = pd.concat(
    [Series({k: model[k] for k in model.keys() if 'score' in k})
     for model in list_models],
    axis=1)

df_ten_models.columns = ['split_{}'.format(i) for i in df_ten_models.columns]

# ---------------------------------------------------------------
# PERFORMANCE ANALYSIS
# ---------------------------------------------------------------
# Box plot of each remaining score across the ten runs; the train score and
# test accuracy rows are left out of the plot but stay in the table below.
(df_ten_models
 .T
 .drop(['train_score', 'test_score__accuracy'], axis=1)
 .plot.box(vert=False, title="Model Performance over ten random splits", xlim=(0, 1))
);

df_ten_models.round(2)

Unnamed: 0,split_0,split_1,split_2,split_3,split_4,split_5,split_6,split_7,split_8,split_9
test_score__accuracy,0.79,0.81,0.76,0.77,0.81,0.76,0.83,0.78,0.78,0.75
test_score__f1,0.74,0.73,0.64,0.67,0.76,0.67,0.76,0.69,0.71,0.67
test_score__precision,0.73,0.73,0.7,0.71,0.79,0.78,0.8,0.68,0.76,0.67
test_score__recall,0.74,0.72,0.59,0.64,0.73,0.59,0.72,0.7,0.67,0.67
test_score__roc_auc,0.78,0.79,0.72,0.74,0.8,0.74,0.81,0.76,0.76,0.74
train_score,0.85,0.84,0.85,0.84,0.83,0.84,0.83,0.85,0.84,0.85


In [43]:
# ---------------------------------------------------------------
# FINAL MODEL
# ---------------------------------------------------------------
# Pick one of the ten runs interactively and show its five best grid-search
# parameter sets by mean cross-validated score.
# NOTE(review): the prompt blocks "Restart & Run All" and makes the selected
# model depend on manual input.

# raw_input exists only on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

choice = int(read_line("Pick a split number from the table above.\nValid entries (0-9)\n"))
# Reject out-of-range entries explicitly: a negative number would otherwise
# silently index from the end of the list.
if not 0 <= choice < len(list_models):
    raise ValueError(
        "Split number must be between 0 and {}, got {}".format(len(list_models) - 1, choice))

SELECTED_MODEL = list_models[choice]
# Presumably the fitted GridSearchCV returned by run_classifier; it exposes
# cv_results_ and best_estimator_.
gscv = SELECTED_MODEL.get('model')

print("GridSearch Results...\n")
(DataFrame(gscv.cv_results_)
 .set_index('params')
 .loc[:, ['mean_train_score', 'mean_test_score', 'mean_fit_time']]
 .sort_values('mean_test_score', ascending=False)
 .round(2)
 .head()
)

Pick a split number from the table above.
Valid entries (0-9)
4
GridSearch Results...



Unnamed: 0_level_0,mean_train_score,mean_test_score,mean_fit_time
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"{u'model__class_weight': None, u'model__min_samples_split': 80, u'model__max_depth': 7, u'select__k': u'all'}",0.87,0.83,0.0
"{u'model__class_weight': u'balanced', u'model__min_samples_split': 80, u'model__max_depth': 7, u'select__k': u'all'}",0.87,0.83,0.0
"{u'model__class_weight': None, u'model__min_samples_split': 80, u'model__max_depth': 5, u'select__k': u'all'}",0.86,0.83,0.0
"{u'model__class_weight': u'balanced', u'model__min_samples_split': 20, u'model__max_depth': 3, u'select__k': u'all'}",0.84,0.83,0.0
"{u'model__class_weight': u'balanced', u'model__min_samples_split': 40, u'model__max_depth': 3, u'select__k': u'all'}",0.84,0.83,0.0


In [44]:
# Show the 'model' step (the decision tree) of the best pipeline found by the
# grid search of the selected run.
best_dt_pipeline = SELECTED_MODEL.get('model').best_estimator_
best_dt_pipeline.named_steps.get('model')

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=80,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [51]:
# Feature importances of the selected decision tree, labelled by feature name
# and sorted from most to least important.
# The tree is fitted on the output of the pipeline's 'select' step, so when the
# grid search picks k=5 or k=9 it has fewer importances than X has columns, and
# labelling them with all of X.columns raises a length-mismatch error.
# get_support() returns the boolean mask of the columns SelectKBest kept.
# Assumes run_classifier passes the columns to the pipeline in X.columns order
# (it is called with UNCORR=X.columns.tolist()) - TODO confirm.
dt_pipeline = SELECTED_MODEL.get('model').best_estimator_
kept_columns = X.columns[dt_pipeline.named_steps.get('select').get_support()]
Series(dt_pipeline.named_steps.get('model').feature_importances_,
       index=kept_columns.tolist()).round(2).sort_values(ascending=False)

gender                  0.56
pclass_3                0.16
age                     0.13
fare                    0.08
cabinnumber__is_null    0.05
embarked_C              0.02
pclass_2                0.00
pclass_1                0.00
embarked_S              0.00
embarked_Q              0.00
age__is_null            0.00
parch                   0.00
sibsp                   0.00
dtype: float64