In [31]:
import time

import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [2]:
# Read the training table from disk; the 'TARGET' column holds the label.
trainDataFrame = pd.read_csv('./data/trainLevel1Preds.csv')
# Labels on their own, and every remaining column as the feature matrix.
trainLabels = trainDataFrame.TARGET
trainFeatures = trainDataFrame.drop('TARGET', axis=1)

In [4]:
# Sanity check: labels and features must have one row per training row.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(trainDataFrame.shape)
print(trainLabels.shape)
print(trainFeatures.shape)
# Labels and features must still share the same index after the column split.
np.array_equal(trainLabels.index, trainFeatures.index)

(76020, 189)
(76020,)
(76020, 188)


True

In [27]:
# Partition the training rows by class label so each class can be sampled separately.
dataTarget0 = trainDataFrame.loc[trainDataFrame['TARGET'] == 0]
dataTarget1 = trainDataFrame.loc[trainDataFrame['TARGET'] == 1]

def getBalancedTrainAndValidationSets(n_train0=36500, n_train1=1500,
                                      data0=None, data1=None, random_state=None):
    """Draw a freshly shuffled train/validation split, stratified by class.

    Each class frame is shuffled independently; the first ``n_train0`` /
    ``n_train1`` shuffled rows form the training set and all remaining rows
    form the validation set.

    Parameters
    ----------
    n_train0 : int
        Number of class-0 rows placed in the training set.
    n_train1 : int
        Number of class-1 rows placed in the training set.
    data0, data1 : pandas.DataFrame or None
        Rows of class 0 / class 1, each containing a 'TARGET' column.
        When None, the module-level ``dataTarget0`` / ``dataTarget1`` are used.
    random_state : int or None
        Seed for a reproducible shuffle. None draws from numpy's global RNG.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature frames (without 'TARGET') and the matching 'TARGET' series.
    """
    if data0 is None:
        data0 = dataTarget0
    if data1 is None:
        data1 = dataTarget1

    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # DataFrame.reindex returns a NEW frame, so calling it without keeping the
    # result shuffles nothing. Shuffle by position and keep the result; this
    # leaves the input frames untouched and also works with a non-unique index.
    shuffled0 = data0.iloc[rng.permutation(len(data0))]
    shuffled1 = data1.iloc[rng.permutation(len(data1))]

    trn = pd.concat([shuffled0.iloc[:n_train0], shuffled1.iloc[:n_train1]])
    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)

    # Everything not used for training becomes validation data.
    val = pd.concat([shuffled0.iloc[n_train0:], shuffled1.iloc[n_train1:]])
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    return X_train, y_train, X_val, y_val

In [28]:
# Repeated hold-out evaluation: every round asks
# getBalancedTrainAndValidationSets() for a split, refits the classifier on the
# training part and records ROC AUC on both parts.
n_folds = 5
clf = GradientBoostingClassifier(max_features=7)
trn_scores = []
val_scores = []
start = time.time()
for i in range(n_folds):
    X_train, y_train, X_val, y_val = getBalancedTrainAndValidationSets()
    clf.fit(X_train, y_train)
    # ROC AUC measures ranking quality, so score the class-1 probability
    # (column 1 of predict_proba) rather than the hard 0/1 output of predict().
    trn_score = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
    val_score = roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1])
    # Single-argument print(...) gives identical output on Python 2 and Python 3.
    print("train auc: %.4f" % trn_score)
    print("validation auc: %.4f" % val_score)
    trn_scores.append(trn_score)
    val_scores.append(val_score)

print("-----------------------")
print("Train Mean: %.4f" % np.mean(trn_scores))
print("Validation Mean: %.4f" % np.mean(val_scores))
print("Total Time (mins): %.1f" % ((time.time() - start) / 60.))

train auc: 0.8715
validation auc: 0.8626
train auc: 0.8713
validation auc: 0.8620
train auc: 0.8714
validation auc: 0.8624
train auc: 0.8732
validation auc: 0.8625
train auc: 0.8735
validation auc: 0.8631
-----------------------
Train Mean: 0.8722
Validation Mean: 0.8625
Total Time (mins): 5.8


In [19]:
# Shapes of the most recent train/validation split left behind by the evaluation loop.
# NOTE(review): the recorded output (4500 / 4508 rows) does not match the
# 36500 + 1500 training rows the split function takes by default; it appears to
# come from an earlier run. Re-run this cell to refresh it.
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(4500, 188)
(4500,)
(4508, 188)
(4508,)


In [20]:
# Confusion matrix of hard class predictions on the latest validation split
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_val, y_pred=clf.predict(X_val))

array([[2596,  404],
       [ 510,  998]])

### Train on all data and save `feature_importances_`

In [30]:
# Final model: same hyper-parameters as the evaluation runs, fitted on every training row.
model = GradientBoostingClassifier(max_features=7)
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X=trainFeatures, y=trainLabels)


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=7, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [32]:
# Pair each feature column with its importance from the fitted model and order
# the pairs from most to least important.
feat_imps_for_level1preds_w_gbc_mf7_all_trainset_sorted = sorted(
    zip(trainFeatures.columns, model.feature_importances_),
    key=lambda name_and_importance: name_and_importance[1],
    reverse=True,
)

In [33]:
# Display the (feature name, importance) pairs, sorted by descending importance.
feat_imps_for_level1preds_w_gbc_mf7_all_trainset_sorted

[('rfc_5', 0.022271219436144626),
 ('xgb_4', 0.020745370418321536),
 ('rfc_17', 0.017732956220901371),
 ('xgb_9', 0.017092360257458746),
 ('xgb_11', 0.016114674394330394),
 ('svm_16', 0.0160096314446889),
 ('rfc_2', 0.015823938142816171),
 ('xgb_17', 0.015756475145877455),
 ('xgb_12', 0.015720812054844834),
 ('rfc_18', 0.01550412648583859),
 ('rfc_13', 0.015199231880057527),
 ('xgb_8', 0.014816468129873922),
 ('svm_15', 0.0132701715292295),
 ('lr_19', 0.013166665953654186),
 ('fnn_10', 0.012978292938358766),
 ('rfc_3', 0.012093217772584007),
 ('gbc_7', 0.011777018041079928),
 ('gbc_2', 0.011646806421891431),
 ('svm_12', 0.011437337872007118),
 ('xgb_10', 0.011201223101746436),
 ('xgb_14', 0.011070345072233558),
 ('xgb_2', 0.011054223494070066),
 ('xgb_5', 0.010904807731260135),
 ('xgb_6', 0.01078255866133758),
 ('svm_17', 0.01054030719700017),
 ('gbc_18', 0.010350142038380193),
 ('svm_0', 0.010195283386965192),
 ('svm_9', 0.0099980222730563176),
 ('svm_5', 0.0094899618469975345),
 ('et

In [34]:
# Persist the sorted (feature, importance) list to disk; joblib.dump returns
# the list of file names it wrote, which the notebook displays.
joblib.dump(
    value=feat_imps_for_level1preds_w_gbc_mf7_all_trainset_sorted,
    filename='./data/feat_imps_for_level1preds_w_gbc_mf7_all_trainset_sorted',
)

['./data/feat_imps_for_level1preds_w_gbc_mf7_all_trainset_sorted']