In [140]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn import grid_search

from sklearn import linear_model
from sklearn import ensemble

In [141]:
# Run identity: the model name and a second-resolution local timestamp are
# combined later to build the model and submission file names.
model_name = 'stack_gb'
TIMESTAMP_FORMAT = "%Y-%m-%d-%H-%M-%S"
timestamp = time.strftime(TIMESTAMP_FORMAT, time.localtime())

In [142]:
# Pre-computed feature tables for the training and test splits.
TRAINING_FEATURES_CSV = 'data/training_features.csv'
TEST_FEATURES_CSV = 'data/test_features.csv'

training_features = pd.read_csv(TRAINING_FEATURES_CSV)
test_features = pd.read_csv(TEST_FEATURES_CSV)

In [143]:
# Training inputs: every column from position 2 onward is used as a feature.
# `.ix` is deprecated (removed in pandas 1.0); `.iloc` does the same purely
# positional slice without the label/position ambiguity.
# NOTE(review): presumably the two skipped leading columns are the row ID and
# the target -- confirm against data/training_features.csv.
X_train = training_features.iloc[:, 2:]

# Training target: the SeriousDlqin2yrs column.
y_train = training_features.SeriousDlqin2yrs

In [144]:
# Test inputs: same positional column selection as the training frame
# (everything from column position 2 onward).
# `.ix` is deprecated (removed in pandas 1.0); `.iloc` is the positional
# equivalent.
X_test = test_features.iloc[:, 2:]

# SeriousDlqin2yrs column of the test frame.
# NOTE(review): whether this column holds real labels for the test split
# cannot be told from this notebook -- confirm before using it for scoring.
y_test = test_features.SeriousDlqin2yrs

In [145]:
# Earlier submissions that correspond to the models loaded below:
#   submission-rfc-tuned-2017-01-07-22-54-35.csv
#   submission-gb-tuned-2017-01-08-00-18-20.csv
#   submission-ada-tuned-2017-01-08-16-05-11.csv
#   submission-vote-ensemble-2017-01-08-16-11-39.csv

# Load the four previously fitted base models from disk, in the same order
# as the names on the left-hand side.
rfc_mdl, gb_mdl, ada_mdl, ens_mdl = map(joblib.load, (
    'models/rfc-tuned-2017-01-07-22-54-35.pkl',
    'models/gb-tuned-2017-01-08-00-18-20.pkl',
    'models/ada-tuned-2017-01-08-16-05-11.pkl',
    'models/vote-ensemble-2017-01-08-16-11-39.pkl',
))

In [146]:
# Class-label predictions of each base model on the training inputs.
rfc_train_pred, gb_train_pred, ada_train_pred, ens_train_pred = (
    base_mdl.predict(X_train)
    for base_mdl in (rfc_mdl, gb_mdl, ada_mdl, ens_mdl)
)

# Per-class probability predictions of each base model on the training inputs;
# these are what the stacked feature frame is built from.
(rfc_train_pred_proba,
 gb_train_pred_proba,
 ada_train_pred_proba,
 ens_train_pred_proba) = (
    base_mdl.predict_proba(X_train)
    for base_mdl in (rfc_mdl, gb_mdl, ada_mdl, ens_mdl)
)

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.8s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    3.3s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:    5.2s
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:    7.5s
[Parallel(n_jobs=16)]: Done 2418 tasks      | elapsed:   10.3s
[Parallel(n_jobs=16)]: Done 3168 tasks      | elapsed:   13.4s
[Parallel(n_jobs=16)]: Done 4018 tasks      | elapsed:   16.9s
[Parallel(n_jobs=16)]: Done 4968 tasks      | elapsed:   20.9s
[Parallel(n_jobs=16)]: Done 5000 out of 5000 | elapsed:   21.0s finished
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.8s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.8s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    3.2s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed

In [147]:
# Class-label predictions of each base model on the test inputs.
rfc_test_pred, gb_test_pred, ada_test_pred, ens_test_pred = (
    base_mdl.predict(X_test)
    for base_mdl in (rfc_mdl, gb_mdl, ada_mdl, ens_mdl)
)

# Per-class probability predictions of each base model on the test inputs;
# these are what the stacked test feature frame is built from.
(rfc_test_pred_proba,
 gb_test_pred_proba,
 ada_test_pred_proba,
 ens_test_pred_proba) = (
    base_mdl.predict_proba(X_test)
    for base_mdl in (rfc_mdl, gb_mdl, ada_mdl, ens_mdl)
)

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.3s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    2.4s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:    3.7s
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:    5.4s
[Parallel(n_jobs=16)]: Done 2418 tasks      | elapsed:    7.4s
[Parallel(n_jobs=16)]: Done 3168 tasks      | elapsed:    9.7s
[Parallel(n_jobs=16)]: Done 4018 tasks      | elapsed:   12.2s
[Parallel(n_jobs=16)]: Done 4968 tasks      | elapsed:   15.1s
[Parallel(n_jobs=16)]: Done 5000 out of 5000 | elapsed:   15.3s finished
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.3s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    2.4s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed

In [148]:
# The meta-model is trained against the original training target.
y_train_stack = y_train

# Meta-features: the per-class probabilities of the four base models placed
# side by side, two columns (class 0, class 1) per model.
train_stack_columns = ['rfc_0', 'rfc_1', 'gb_0', 'gb_1', 'ada_0', 'ada_1', 'ens_0', 'ens_1']
train_stack_values = np.hstack([
    rfc_train_pred_proba,
    gb_train_pred_proba,
    ada_train_pred_proba,
    ens_train_pred_proba,
])
X_train_stack = pd.DataFrame(train_stack_values, columns=train_stack_columns)

# Preview the first rows as the cell output.
X_train_stack.head()

Unnamed: 0,rfc_0,rfc_1,gb_0,gb_1,ada_0,ada_1,ens_0,ens_1
0,0.598,0.402,0.620199,0.379801,0.500112,0.499888,0.599872,0.400128
1,0.9716,0.0284,0.932363,0.067637,0.506503,0.493497,0.877131,0.122869
2,0.9304,0.0696,0.648939,0.351061,0.501832,0.498168,0.668132,0.331868
3,0.9964,0.0036,0.976785,0.023215,0.509138,0.490862,0.91278,0.08722
4,0.903,0.097,0.901542,0.098458,0.506518,0.493482,0.845318,0.154682


In [149]:
# Target column carried over unchanged from the test frame.
y_test_stack = y_test

# Meta-features for the test split: the per-class probabilities of the four
# base models placed side by side, two columns (class 0, class 1) per model.
test_stack_columns = ['rfc_0', 'rfc_1', 'gb_0', 'gb_1', 'ada_0', 'ada_1', 'ens_0', 'ens_1']
test_stack_values = np.hstack([
    rfc_test_pred_proba,
    gb_test_pred_proba,
    ada_test_pred_proba,
    ens_test_pred_proba,
])
X_test_stack = pd.DataFrame(test_stack_values, columns=test_stack_columns)

# Preview the first rows as the cell output.
X_test_stack.head()

Unnamed: 0,rfc_0,rfc_1,gb_0,gb_1,ada_0,ada_1,ens_0,ens_1
0,0.9838,0.0162,0.944072,0.055928,0.507468,0.492532,0.887375,0.112625
1,0.9414,0.0586,0.956867,0.043133,0.50677,0.49323,0.890358,0.109642
2,0.9782,0.0218,0.98624,0.01376,0.511088,0.488912,0.917212,0.082788
3,0.885,0.115,0.933952,0.066048,0.506121,0.493879,0.86584,0.13416
4,0.795,0.205,0.928321,0.071679,0.506962,0.493038,0.849081,0.150919


In [150]:
# Meta-classifier: gradient boosting fitted on the stacked base-model
# probabilities. The seed is fixed so refits are repeatable; verbose=True
# prints the per-iteration training loss.
gb_params = dict(n_estimators=250, verbose=True, random_state=42)
clf = ensemble.GradientBoostingClassifier(**gb_params)

# Kept as the last expression so the fitted estimator's repr is displayed.
clf.fit(X_train_stack, y_train_stack)

      Iter       Train Loss   Remaining Time 
         1           0.3377           39.79s
         2           0.2942           39.28s
         3           0.2633           39.78s
         4           0.2393           39.51s
         5           0.2199           38.97s
         6           0.2036           38.76s
         7           0.1899           38.80s
         8           0.1781           38.87s
         9           0.1680           39.28s
        10           0.1591           39.44s
        20           0.1127           37.65s
        30           0.0987           37.59s
        40           0.0934           36.80s
        50           0.0911           35.44s
        60           0.0897           33.47s
        70           0.0890           31.75s
        80           0.0887           29.83s
        90           0.0882           28.04s
       100           0.0878           26.10s
       200           0.0844            8.35s


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=250,
              presort='auto', random_state=42, subsample=1.0, verbose=True,
              warm_start=False)

In [151]:
# Persist the fitted meta-classifier under a name built from the model name
# and the run timestamp.
flename = 'models/%s-%s.pkl' % (model_name, timestamp)

joblib.dump(clf, flename)

# Append "<pickle path>,<estimator repr>" to the running model log.
# NOTE(review): the estimator repr itself contains commas and line breaks, so
# an entry spans several lines and is not a strict two-column CSV row; the
# format is left unchanged here to stay consistent with existing log entries.
log = '%s,%s\n' % (flename, clf)

# The context manager flushes and closes the handle even if write() raises,
# replacing the manual flush()/close() pair.
with open('model_log.csv', 'a') as fle:
    fle.write(log)

# Single-argument call form prints the same text on Python 2 and Python 3.
print(flename)

models/stack_gb-2017-01-08-20-10-57.pkl


In [152]:
# Score the stacked test features with the meta-classifier. The inputs are
# cast to float32 before prediction; the result holds one column per class.
X_submission = X_test_stack
submission_inputs = X_submission.astype(np.float32)
y_submission = clf.predict_proba(submission_inputs)

In [153]:
# Assemble the submission table: one row per test record with its integer ID
# and the predicted probability taken from column 1 of predict_proba
# (the second class column).
submission = pd.DataFrame()
# `np.int` was only an alias of the builtin `int` and is removed in
# NumPy >= 1.24; the builtin yields the same integer dtype.
submission['Id'] = test_features.ID.astype(int)
submission['Probability'] = y_submission[:, 1]

# Preview the first rows as the cell output.
submission.head()

Unnamed: 0,Id,Probability
0,1,0.002858
1,2,0.027574
2,3,0.004759
3,4,0.04755
4,5,0.128248


In [154]:
# Write the submission table to a CSV named after the model and the run
# timestamp; index=False keeps the DataFrame row index out of the file.
flename = 'submissions/submission-%s-%s.csv' % (model_name, timestamp)
submission.to_csv(flename, index=False)

# Single-argument call form prints the same text on Python 2 and Python 3
# (the bare `print flename` statement is a SyntaxError on Python 3).
print(flename)

submissions/submission-stack_gb-2017-01-08-20-10-57.csv
