In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn import grid_search

from sklearn import ensemble

In [2]:
# Identifiers for this run; later cells combine them into output file names
# for the pickled model and the submission CSV.
model_name = 'rfc-tuned'
# Local wall-clock time, formatted as YYYY-MM-DD-HH-MM-SS.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

In [3]:
# Read the training and test feature tables (paths are relative to the
# notebook's working directory).
training_features, test_features = map(
    pd.read_csv,
    ('data/training_features.csv', 'data/test_features.csv'),
)

In [4]:
# Features are every column from position 2 onward; the target is the
# SeriousDlqin2yrs column.
# NOTE(review): presumably the two skipped leading columns are a row id and
# the target itself -- confirm against the CSV header.
# `.iloc` replaces the deprecated `.ix` indexer (removed in pandas 1.0); for an
# integer slice over string-labelled columns both select by position.
X_full = training_features.iloc[:, 2:]
y_full = training_features.SeriousDlqin2yrs

# Hold out 20% of the rows, stratified on the target, with a fixed seed so
# the split is reproducible.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full, y_full, test_size=0.2, stratify=y_full, random_state=42)

In [5]:
# Disabled alternative kept as a no-op string expression: it would load a
# previously pickled model instead of fitting a new one.
'''clf = joblib.load('models/rfc-2017-01-07-20-54-40.pkl')'''

# Candidate forest sizes for the search. The commented-out grid is an
# alternative with smaller forests, currently disabled.
#tuned_parameters = [{'n_estimators': [10, 25, 50, 100, 250, 500, 1000]}]
tuned_parameters = [{'n_estimators': [1000, 1500, 2000, 3000, 5000]}]

# Base estimator: the forest itself uses all cores (n_jobs=-1); the seed is
# fixed for reproducibility.
forest = ensemble.RandomForestClassifier(random_state=42, n_jobs=-1)

# 5-fold cross-validated search scored by ROC AUC; verbose=10 logs every fit.
# `clf` ends up bound to the search object, which later cells use directly.
clf = grid_search.GridSearchCV(
    estimator=forest,
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=5,
    verbose=10,
)
clf.fit(X_train_full, y_train_full)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] n_estimators=1000 ...............................................
[CV] ...................... n_estimators=1000, score=0.858641 -  32.6s
[CV] n_estimators=1000 ...............................................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   32.6s


[CV] ...................... n_estimators=1000, score=0.847258 -  33.7s
[CV] n_estimators=1000 ...............................................
[CV] ...................... n_estimators=1000, score=0.839012 -  34.1s
[CV] n_estimators=1000 ...............................................
[CV] ...................... n_estimators=1000, score=0.842395 -  34.1s
[CV] n_estimators=1000 ...............................................


[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  2.2min


[CV] ...................... n_estimators=1000, score=0.848850 -  33.2s
[CV] n_estimators=1500 ...............................................
[CV] ...................... n_estimators=1500, score=0.858436 -  49.7s
[CV] n_estimators=1500 ...............................................
[CV] ...................... n_estimators=1500, score=0.848059 -  49.5s
[CV] n_estimators=1500 ...............................................


[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:  4.5min


[CV] ...................... n_estimators=1500, score=0.839291 -  49.9s
[CV] n_estimators=1500 ...............................................
[CV] ...................... n_estimators=1500, score=0.842785 -  50.5s
[CV] n_estimators=1500 ...............................................
[CV] ...................... n_estimators=1500, score=0.848748 -  49.5s
[CV] n_estimators=2000 ...............................................
[CV] ...................... n_estimators=2000, score=0.858448 - 1.1min
[CV] n_estimators=2000 ...............................................
[CV] ...................... n_estimators=2000, score=0.848046 - 1.1min
[CV] n_estimators=2000 ...............................................


[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:  9.2min


[CV] ...................... n_estimators=2000, score=0.839590 - 1.1min
[CV] n_estimators=2000 ...............................................
[CV] ...................... n_estimators=2000, score=0.842775 - 1.1min
[CV] n_estimators=2000 ...............................................
[CV] ...................... n_estimators=2000, score=0.848597 - 1.1min
[CV] n_estimators=3000 ...............................................
[CV] ...................... n_estimators=3000, score=0.858547 - 1.7min
[CV] n_estimators=3000 ...............................................
[CV] ...................... n_estimators=3000, score=0.848290 - 1.7min
[CV] n_estimators=3000 ...............................................


[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed: 15.9min


[CV] ...................... n_estimators=3000, score=0.839641 - 1.7min
[CV] n_estimators=3000 ...............................................
[CV] ...................... n_estimators=3000, score=0.843121 - 1.7min
[CV] n_estimators=3000 ...............................................
[CV] ...................... n_estimators=3000, score=0.849178 - 1.7min
[CV] n_estimators=5000 ...............................................
[CV] ...................... n_estimators=5000, score=0.858654 - 2.8min
[CV] n_estimators=5000 ...............................................
[CV] ...................... n_estimators=5000, score=0.848248 - 2.8min
[CV] n_estimators=5000 ...............................................
[CV] ...................... n_estimators=5000, score=0.839823 - 2.8min
[CV] n_estimators=5000 ...............................................
[CV] ...................... n_estimators=5000, score=0.843528 - 2.8min
[CV] n_estimators=5000 ...............................................


[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed: 32.2min


[CV] ...................... n_estimators=5000, score=0.849406 - 2.7min


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 35.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [1000, 1500, 2000, 3000, 5000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=10)

In [12]:
# Display the parameter combination selected by the grid search.
# NOTE(review): the recorded output picks n_estimators=5000, the largest value
# in the grid, yet the logged per-fold scores for 1000 vs 5000 trees differ
# only around the fourth decimal while fit time grows from ~33s to ~2.8min --
# consider whether the larger forest is worth the cost.
clf.best_params_

{'n_estimators': 5000}

In [14]:
# Persist the fitted search object and append an entry to the model log.
flename = 'models/%s-%s.pkl' % (model_name, timestamp)

joblib.dump(clf, flename)

# str(clf) spans several lines and contains commas, so writing it raw splits
# one log entry across many CSV rows and columns. Collapse it to a single
# line and emit it as one quoted CSV field (embedded double quotes doubled).
clf_description = ' '.join(str(clf).split()).replace('"', '""')
log = '%s,"%s"\n' % (flename, clf_description)

# The context manager flushes and closes the file even if the write fails.
with open('model_log.csv', 'a') as fle:
    fle.write(log)

# Call form of print works on both Python 2 and Python 3.
print(flename)

models/rfc-tuned-2017-01-07-22-54-35.pkl


In [8]:
# Select the test-set feature columns by position (column 2 onward, matching
# the slice used for the training features) and predict class probabilities.
# `.iloc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
X_submission = test_features.iloc[:, 2:]
# Features are cast to float32 before prediction; predict_proba returns one
# column per class.
y_submission = clf.predict_proba(X_submission.astype(np.float32))

In [9]:
# Assemble the submission table: the test-set ID as an integer and the
# predicted probability taken from column 1 of predict_proba's output.
# The builtin `int` replaces `np.int`, which was only an alias for it and was
# removed in NumPy 1.24. `columns=` pins the column order explicitly.
submission = pd.DataFrame(
    {
        'Id': test_features.ID.astype(int),
        'Probability': y_submission[:, 1],
    },
    columns=['Id', 'Probability'],
)
submission.head()

Unnamed: 0,Id,Probability
0,1,0.0162
1,2,0.0586
2,3,0.0218
3,4,0.115
4,5,0.205


In [11]:
# Write the submission table to a CSV named after the model and run
# timestamp; the DataFrame index is not written.
flename = 'submissions/submission-%s-%s.csv' % (model_name, timestamp)
submission.to_csv(flename, index=False)
# Call form of print works on both Python 2 and Python 3.
print(flename)

submissions/submission-rfc-tuned-2017-01-07-22-54-35.csv
