In [66]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn import grid_search

from sklearn import ensemble
from sklearn import linear_model
from sklearn import naive_bayes

In [67]:
# Run identifiers; later cells interpolate both into the model and submission file names.
model_name = 'vote-ensemble'
# Local time at cell execution, formatted as YYYY-MM-DD-HH-MM-SS.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")

In [68]:
# Load the pre-computed feature tables (paths are relative to the working directory).
# The two reads are independent of each other.
test_features = pd.read_csv('data/test_features.csv')
training_features = pd.read_csv('data/training_features.csv')

In [69]:
# Feature matrix: every column from position 2 onward (the first two columns are skipped).
# `.iloc` replaces the deprecated `.ix` indexer (removed in pandas 1.0). With string
# column labels `.ix[:, 2:]` already resolved positionally, so the selection is unchanged.
X_full = training_features.iloc[:, 2:]
# Target vector.
y_full = training_features.SeriousDlqin2yrs

# Stratified 80/20 split with a fixed seed, so both partitions keep the class ratio of
# y_full and the split is reproducible.
# NOTE(review): X_test_full / y_test_full are not used by any cell visible here.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full, y_full, test_size=0.2, stratify=y_full, random_state=42)

In [70]:
# Ensemble member 1: gradient boosting with 250 estimators.
# verbose=True prints the per-iteration training loss while fitting.
clf1 = ensemble.GradientBoostingClassifier(
    n_estimators=250,
    random_state=42,
    verbose=True)

In [71]:
# Ensemble member 2: random forest with 5000 trees.
# n_jobs=-1 lets scikit-learn use every available core; verbose=True logs progress.
clf2 = ensemble.RandomForestClassifier(
    n_estimators=5000,
    random_state=42,
    n_jobs=-1,
    verbose=True)

In [72]:
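# clf3 (BaggingClassifier) is disabled: it is not among the estimators passed to the VotingClassifier.
#clf3 = ensemble.BaggingClassifier(n_estimators=250, verbose=True, random_state=42)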

In [73]:
# Ensemble member: AdaBoost with 100 estimators and a fixed seed.
clf4 = ensemble.AdaBoostClassifier(
    n_estimators=100,
    random_state=42)

In [74]:
# Ensemble member: logistic regression; no settings passed other than
# the parallelism (n_jobs=-1) and the seed.
clf5 = linear_model.LogisticRegression(
    random_state=42,
    n_jobs=-1)

In [75]:
# Ensemble member: Gaussian naive Bayes, constructed with no arguments (library defaults).
clf6 = naive_bayes.GaussianNB()

In [None]:
# Soft-voting ensemble over the five base classifiers. voting='soft' combines the
# members' predicted class probabilities; the weights give the gradient-boosting
# model five times the weight of each other member.
clf = ensemble.VotingClassifier(
    estimators=[
        ('gbc', clf1),
        ('rfc', clf2),
        ('ada', clf4),
        ('linear', clf5),
        ('nb', clf6),
    ],
    voting='soft',
    weights=[5, 1, 1, 1, 1])
# Fit on the training partition only; kept as the cell's final expression.
clf.fit(X_train_full, y_train_full)

      Iter       Train Loss   Remaining Time 
         1           0.4567            1.80m
         2           0.4386            1.80m
         3           0.4261            1.77m
         4           0.4162            1.78m
         5           0.4084            1.76m
         6           0.4023            1.74m
         7           0.3969            1.74m
         8           0.3923            1.73m
         9           0.3890            1.71m
        10           0.3857            1.71m
        20           0.3676            1.67m
        30           0.3601            1.63m


In [78]:
# Persist the fitted ensemble under a file name unique to this run.
flename = 'models/%s-%s.pkl' % (model_name, timestamp)

joblib.dump(clf, flename)

# Append one "<model path>,<estimator repr>" record to the running model log.
# NOTE(review): the estimator repr may itself contain commas and newlines, so the
# records in model_log.csv are not strictly CSV-parseable. The format is kept unchanged
# here so existing log entries stay consistent.
log = '%s,%s\n' % (flename, clf)

# The context manager closes (and therefore flushes) the file even if the write raises,
# which the previous manual open/flush/close sequence did not guarantee.
with open('model_log.csv', 'a') as fle:
    fle.write(log)

# Parenthesised single-argument form prints identically on Python 2 and Python 3.
print(flename)

models/vote-ensemble-2017-01-08-20-45-31.pkl


In [79]:
# Select the same column range (position 2 onward) that was used for the training features.
# `.iloc` replaces the deprecated `.ix` indexer (removed in pandas 1.0). With string
# column labels `.ix[:, 2:]` already resolved positionally, so the selection is unchanged.
X_submission = test_features.iloc[:, 2:]
# Per-class probabilities from the fitted ensemble; features are cast to float32 first.
y_submission = clf.predict_proba(X_submission.astype(np.float32))

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    1.6s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:    2.7s
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:    4.1s
[Parallel(n_jobs=16)]: Done 2418 tasks      | elapsed:    5.8s
[Parallel(n_jobs=16)]: Done 3168 tasks      | elapsed:    7.7s
[Parallel(n_jobs=16)]: Done 4018 tasks      | elapsed:    9.8s
[Parallel(n_jobs=16)]: Done 4968 tasks      | elapsed:   12.2s
[Parallel(n_jobs=16)]: Done 5000 out of 5000 | elapsed:   12.3s finished


In [80]:
# Build the submission table: one row per test record, columns in the order Id, Probability.
submission = pd.DataFrame()
# `np.int` was only an alias of the builtin `int` (deprecated in NumPy 1.20, removed in
# 1.24); using `int` directly yields the same dtype and keeps working on current NumPy.
submission['Id'] = test_features.ID.astype(int)
# Column 1 of predict_proba is the probability of the second entry in clf.classes_
# (presumably the positive SeriousDlqin2yrs class; confirm against clf.classes_).
submission['Probability'] = y_submission[:, 1]
# Preview the first rows as the cell output.
submission.head()

Unnamed: 0,Id,Probability
0,1,0.093797
1,2,0.092693
2,3,0.065735
3,4,0.120535
4,5,0.125164


In [81]:
# Write the submission to a CSV file named after the model and run timestamp.
flename = 'submissions/submission-%s-%s.csv' % (model_name, timestamp)
# index=False omits the DataFrame index, so only Id and Probability are written.
submission.to_csv(flename, index=False)
# Parenthesised single-argument form prints identically on Python 2 and Python 3.
print(flename)

submissions/submission-vote-ensemble-2017-01-08-20-45-31.csv
