In [2]:
from __future__ import print_function

In [3]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns

import pickle

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [4]:
from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_curve, auc

## load and split dataset

In [5]:
df = pd.read_csv('analytical_base_table.csv')

In [6]:
df.shape

(14068, 26)

In [7]:
y = df.status

X = df.drop('status', axis=1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1234, stratify=df.status)

print(len(X_train), len(X_test))

11254 2814


## make model pipelines

In [11]:
pipelines = {
    'l1':make_pipeline(StandardScaler(), LogisticRegression(penalty='l1', random_state=123)),
    'l2':make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', random_state=123)),
    'rf':make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
    'gb':make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123))
}

In [13]:
#hyperparameters
hp_l1 = {
    'logisticregression__C':np.linspace(1e-3, 1e3, 10)
}
hp_l2 = {
    'logisticregression__C':np.linspace(1e-3, 1e3, 10)
}
hp_rf = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 0.33]
}
hp_gb = {
    'gradientboostingclassifier__n_estimators': [100, 200],
    'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [1, 3, 5]
}

hyperparamteters = {
    'l1':hp_l1,
    'l2':hp_l2,
    'rf':hp_rf,
    'gb':hp_gb
}

## fit model and cross-validate

In [17]:
fitted_models = {}

for name, pipeline in pipelines.items():
    clf = GridSearchCV(pipeline, hyperparamteters[name], cv=10, n_jobs=-1)
    
    clf.fit(X_train,y_train)
    
    fitted_models[name]=clf
    
    print(name, "fitted")

rf fitted
l2 fitted
gb fitted
l1 fitted


In [18]:
for name, model in fitted_models.items():
    print(name, model.best_score_)

rf 0.979296250222
l2 0.849386884663
gb 0.975475386529
l1 0.849386884663


In [19]:
for name, model in fitted_models.items():
    pred = model.predict_proba(X_test)
    pred = [p[1] for p in pred]
    
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    print( name, auc(fpr, tpr) )

rf 0.991243277606
l2 0.901538837374
gb 0.989466254607
l1 0.901542307444


In [20]:
#save the winning model
with open('final_model.pkl', 'wb') as f:
    pickle.dump(fitted_models['rf'].best_estimator_, f)