In [32]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [38]:
# Run identifiers: both values are interpolated into the model and submission
# file names written by later cells.
model_name = 'rfc'
# Local wall-clock time at the moment this cell runs, second resolution.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

In [39]:
# Read the training and test feature tables from CSV files under `data/`
# (paths are relative to the notebook's working directory).
training_features, test_features = map(
    pd.read_csv,
    ('data/training_features.csv', 'data/test_features.csv'),
)

In [40]:
# Feature matrix: every column from position 2 onward.
# `.iloc` replaces the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0. With the string column labels produced by
# `read_csv`, `.ix[:, 2:]` was already a positional slice, so the selection
# is unchanged.
# NOTE(review): the two skipped columns are presumably a row id and the
# target -- confirm against the CSV header.
X_full = training_features.iloc[:, 2:]

# Target: the `SeriousDlqin2yrs` column.
y_full = training_features.SeriousDlqin2yrs

# Hold out 20% of the rows for evaluation; stratifying on the target keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# repeatable.
# NOTE(review): `train_test_split` is imported above from
# `sklearn.cross_validation`, which was removed in scikit-learn 0.20; newer
# versions provide it in `sklearn.model_selection`.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full, y_full, test_size=0.2, stratify=y_full, random_state=42)

In [41]:
# Alternative to refitting: load a previously saved model from disk, e.g.
#   rfc = joblib.load('models/rfc-2017-01-07-20-54-40.pkl')

# Fit a 100-tree random forest on the training split. `verbose=True` makes
# scikit-learn print the [Parallel(...)] progress lines seen in the output.
rfc = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    verbose=True,
    random_state=42,
).fit(X_train_full, y_train_full)

# Score the held-out split. Column 1 of `predict_proba` is the probability of
# the second entry in `rfc.classes_` (presumably the delinquent class for a
# 0/1 target -- confirm).
y_pred_proba = rfc.predict_proba(X_test_full)
positive_scores = y_pred_proba[:, 1]

# Last expression of the cell, so the hold-out ROC AUC is displayed.
roc_auc_score(y_test_full, positive_scores)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   16.9s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   34.8s finished
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.8s finished


0.84013661149857977

In [44]:
# Persist the fitted model under a name built from the model tag and the run
# timestamp, e.g. models/rfc-<timestamp>.pkl.
flename = 'models/%s-%s.pkl' % (model_name, timestamp)
joblib.dump(rfc, flename)

# Append one "<model file>,<estimator repr>" record to the running model log.
# NOTE(review): the estimator repr itself contains commas (and may span
# several lines), so rows in model_log.csv are not strictly parseable as CSV.
log = '%s,%s\n' % (flename, rfc)

# The context manager flushes and closes the file even if the write raises,
# replacing the manual open/flush/close sequence.
with open('model_log.csv', 'a') as fle:
    fle.write(log)

# Parenthesised form prints the same text on Python 2 and is valid on Python 3.
print(flename)

models/rfc-2017-01-07-21-17-53.pkl


In [48]:
# Select the submission features: every column from position 2 onward, the
# same positional slice used for the training features.
# `.iloc` replaces the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0. With string column labels the slice is identical.
X_submission = test_features.iloc[:, 2:]

# Class probabilities for each test row; the features are cast to float32
# before being handed to the forest.
y_submission = rfc.predict_proba(X_submission.astype(np.float32))

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.6s finished


In [46]:
# Assemble the submission table: one row per test record with its id and the
# predicted probability taken from column 1 of `y_submission`.
submission = pd.DataFrame()
# Builtin `int` replaces `np.int`, which was only an alias for it; the alias
# was deprecated in NumPy 1.20 and removed in 1.24. The resulting dtype is
# unchanged.
submission['Id'] = test_features.ID.astype(int)
submission['Probability'] = y_submission[:, 1]

# Preview the first rows (last expression, so the notebook renders it).
submission.head()

Unnamed: 0,Id,Probability
0,1,0.01
1,2,0.08
2,3,0.04
3,4,0.05
4,5,0.13


In [47]:
# Write the submission next to earlier runs, tagged with the same model name
# and timestamp as the saved model; the DataFrame index is not written.
submission_path = 'submissions/submission-%s-%s.csv' % (model_name, timestamp)
submission.to_csv(submission_path, index=False)