# Imports

In [1]:
import os
import numpy as np

import pandas as pd
from pandas import Series, DataFrame

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams

In [2]:
os.chdir("..")
PATH_ROOT = os.getcwd()

In [3]:
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif

# Modeling Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Model Selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Model Performance
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict

In [4]:
from src.data.obtain import get_raw_data
from src.data.scrub import scrub_raw_data

In [5]:
from src.models.persist import persist
from src.data.obtain import json_write

# Settings

In [6]:
plt.style.use('seaborn-white')
sns.set_context("talk")

rcParams['figure.figsize'] = 12, 5 
rcParams['font.family'] = 'Roboto'

font_title = {
    'size': 18, 
    'weight': "bold", 
    'name': 'Montserrat'
}

font_axes = {
    'size': 14, 
    'weight': "bold", 
    'name': 'Montserrat'
}

font_text = {
    'size': 14, 
    'weight': 400, 
    'name': 'Roboto'
}

%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Get Data

In [8]:
if not os.path.exists('data/04-processed/titanic.csv'):
    print("Importing and scrubbing raw data.")
    df_raw = get_raw_data()
    df = scrub_raw_data(df_raw)
else:
    print("Retrieving cleaned data from backup.")
    df = pd.read_csv('data/04-processed/titanic.csv')

Importing and scrubbing raw data.
Declaring an empty dictionary to persist information we'll need to scrub new data in production.
Fixing column names. (Removing special characters, converting to lowercase. Renaming long columns)
The following columns have missing data: 
['age', 'fare', 'cabinnumber', 'embarked']
Age is approximately normally distributed, but Fare is skewed.
Using the mean for Age and Median for Fare to impute missing data.
Cabin Number has over 70% values missing. Dropping this variable.
Embarked has only 2 values missing. Imputing with Mode.
Creating a column for Gender
Creating Dummies for Embarked and Passenger Class. 
Done. Now dropping these.
Dropping cabinnumber, ticket and name as they have no predictive value. (Too many uniques)
Downcasting numerics to occupy less space.
Backing up the data. 
Okay, all Done. Happy Exploring!


In [9]:
X = df.copy().drop(['survived'], axis=1).fillna(0)
y = df['survived'].copy()

# Train-Test Split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.8, test_size=0.2)

# Baseline Model

In [12]:
scaler = StandardScaler() 
scaler.fit(X_tr)

X_tr__scaled = scaler.transform(X_tr)

### Persist the Scaler

In [13]:
json_write(persist(scaler), '/home/src/data/scaler_params.json')

### Instantiate the model object, fit the data

In [14]:
lr_0 = LogisticRegression()
lr_0.fit(X_tr, y_tr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
X_te__scaled = scaler.transform(X_te)

y_pr = lr_0.predict(X_te__scaled)
print("Baseline Accuracy: {}".format(accuracy_score(y_te, y_pr)))

Baseline Accuracy: 0.74427480916


In [16]:
print("Without cross-validation, performance metrics arent reliable.")
for i in range(5):
    """
    Loop to prove the need for cross-validated output
    """    
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.8, test_size=0.2)
    scaler.fit(X_tr)
    
    X_tr = scaler.transform(X_tr)
    X_te = scaler.transform(X_te)

    lr_0 = LogisticRegression()
    lr_0.fit(X_tr, y_tr)
    print accuracy_score(lr_0.predict(X_te), y_te).round(2)

Without cross-validation, performance metrics arent reliable.
0.79
0.8
0.76
0.8
0.8


## Persisting the Baseline Model

In [17]:
print("Model Coefficients: \n{}".format(lr_0.coef_[0].round(2).tolist()))
print
print("Model Parameters: \n{}".format(lr_0.get_params()))

Model Coefficients: 
[-0.48, -0.32, 0.02, 0.1, -0.07, -0.08, -0.35, 0.17, 1.18, 0.12, 0.01, -0.11, 0.3, 0.1, -0.34]

Model Parameters: 
{'warm_start': False, 'C': 1.0, 'n_jobs': 1, 'verbose': 0, 'intercept_scaling': 1, 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'solver': 'liblinear', 'class_weight': None}


In [18]:
json_write(persist(lr_0), '/home/src/data/lr0_params.json')

---

# Grid Search for Logistic Regression

In [None]:
X_tr, X_te, y_tr, y_te = train_test_split(X_scaled, y, train_size=0.7, test_size=0.2)

# Set up the Pipeline
pipe_lr = \
    Pipeline([
        ('scale', StandardScaler()),
        ('model', LogisticRegression())
    ])

# Set up the Grid Search
grid_lr = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__penalty':['l1', 'l2'],
    'model__class_weight':[None, 'balanced']
}

# Run the Grid Search
gscv_lr = GridSearchCV(
    estimator=pipe_lr, 
    param_grid=grid_lr, 
    scoring='recall', 
    cv=5)

gscv_lr.fit(X_tr, y_tr)
y_pr = gscv_lr.best_estimator_.predict(X_te)

print("Best score:\n{}\n".format(gscv_lr.best_score_.round(2)))
print("Best Params: \n{}\n".format(gscv_lr.best_params_))
print("OOS Accuracy: \n{}\n".format(accuracy_score(y_pr, y_te).round(2)))
print("Classification Report:")
print(classification_report(y_pr, y_te))
print("Coefficients: \n")
Series(gscv_lr.best_estimator_.named_steps.get('model').coef_[0], index=X.columns.tolist()).round(2).sort_values()

## Automated Running

In [None]:
from src.models.train import run_classifier

In [None]:
list_models = \
map(lambda i:
    run_classifier(
        X=X, 
        y=y,
        UNCORR=X.columns.tolist(),
        TRAIN_SIZE=0.75,
        CLF=pipe_lr, 
        GRID=grid_lr, 
        SCORING='roc_auc'),
    range(10))

df_ten_models = \
(pd.concat(
    map(lambda model:
        Series({k:model[k] for k in model.keys() if 'score' in k}), list_models),
    axis=1))

df_ten_models.columns = map(lambda i: 'split_{}'.format(i), df_ten_models)

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))

(df_ten_models
 .T
 .drop(['train_score', 'test_score__accuracy'], axis=1)
 .plot.box(vert=False, xlim=(0.55, 0.85), ax=ax)
);

ax.set_title("Model Performance over ten random splits\n", fontdict=font_title);
ax.set_xlabel("\nScore", fontdict=font_axes)
ax.set_ylabel("Performance Metric\n", fontdict=font_axes)


ax.grid(True, linestyle=":", alpha=0.6)

ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.savefig("reports/figures/01-model-performance-LR.png", bbox_inches='tight', pad_inches=0.5)
;

In [None]:
df_ten_models.round(2).T

In [None]:
choice = int(raw_input("Pick a split number from the table above.\nValid entries (0-9)\n"))
SELECTED_MODEL = list_models[choice]
gscv = SELECTED_MODEL.get('model')

print "GridSearch Results...\n"
(DataFrame(gscv.cv_results_)
 .set_index('params')
 .loc[:, ['mean_train_score', 'mean_test_score', 'mean_fit_time']]
 .sort_values('mean_test_score', ascending=False)
 .round(2)
 .head()
)

In [None]:
SELECTED_MODEL.get('model').best_estimator_.named_steps.get('model').get_params()

In [None]:
Series(SELECTED_MODEL.get('model').best_estimator_.named_steps.get('model').coef_[0].round(3),
       index=X.columns).round(2).sort_values()

# Grid Search for Decision Trees

In [None]:
pipe_dt = \
Pipeline([
    ('scale', StandardScaler()),
    ('select', SelectKBest(score_func=f_classif)),
    ('model', DecisionTreeClassifier())
])

grid_dt = {
    'select__k': [5, 9, 'all'],
    'model__max_depth':[3, 5, 7],
    'model__min_samples_split': [20, 40, 80],
    'model__class_weight': ['balanced', None]
}

In [None]:
# ---------------------------------------------------------------
# RUN GRID SEARCHES OVER 10 RANDOM SPLITS
# ---------------------------------------------------------------

list_models = \
map(lambda i:
    run_classifier(
        X=X, 
        y=y,
        UNCORR=X.columns.tolist(),
        TRAIN_SIZE=0.8,
        CLF=pipe_dt, 
        GRID=grid_dt, 
        SCORING='roc_auc'),
    range(10))

df_ten_models = \
(pd.concat(
    map(lambda model:
        Series({k:model[k] for k in model.keys() if 'score' in k}), list_models),
    axis=1))

df_ten_models.columns = map(lambda i: 'split_{}'.format(i), df_ten_models)
df_ten_models.round(2)

# ---------------------------------------------------------------
# PERFORMANCE ANALYSIS
# ---------------------------------------------------------------

(df_ten_models
 .T
 .drop(['train_score', 'test_score__accuracy'], axis=1)
 .plot.box(vert=False, title="Model Performance over ten random splits", xlim=(0, 1))
);

df_ten_models.round(2)

In [None]:
# ---------------------------------------------------------------
# FINAL MODEL
# ---------------------------------------------------------------

choice = int(raw_input("Pick a split number from the table above.\nValid entries (0-9)\n"))
SELECTED_MODEL = list_models[choice]
gscv = SELECTED_MODEL.get('model')

print "GridSearch Results...\n"
(DataFrame(gscv.cv_results_)
 .set_index('params')
 .loc[:, ['mean_train_score', 'mean_test_score', 'mean_fit_time']]
 .sort_values('mean_test_score', ascending=False)
 .round(2)
 .head()
)

In [None]:
SELECTED_MODEL.get('model').best_estimator_.named_steps.get('model')

In [None]:
Series(SELECTED_MODEL.get('model').best_estimator_.named_steps.get('model').feature_importances_,
       index=X.columns.tolist()).round(2).sort_values(ascending=False)