In [1]:
import numpy as np
import pandas as pd
from __future__ import division
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn import linear_model
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import auc



In [2]:
train = pd.read_csv('../../finalData/train_sub200_all.csv')
test = pd.read_csv('../../finalData/test_sub200_all.csv')

In [3]:
train_y = train['vio3']
test_y = test['vio3']

In [4]:
train_X = train[train.columns[1:201]]
test_X = test[test.columns[1:201]]

In [5]:
gbm = xgb.XGBClassifier()
gbm_params = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [300, 500],
    'max_depth': [2, 3, 8, 12],
}
cv = StratifiedKFold(train_y)
grid = GridSearchCV(gbm, gbm_params,scoring='roc_auc',cv=cv,verbose=10,n_jobs=7)
grid.fit(train_X, train_y)

print (grid.best_params_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=300, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=500, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=500, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=500, learning_rate=0.05, max_depth=2 ...............
[CV] n_estimators=300, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=2, score=0.838722 - 1.4min
[CV] n_estimators=300, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=2, score=0.842578 - 1.4min
[CV] n_estimators=300, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=2, score=0.836284 - 1.4min
[CV] n_estimators=500, learning_rate=0.05, max_depth=3 ..

[Parallel(n_jobs=7)]: Done   4 tasks      | elapsed:  2.0min


[CV]  n_estimators=500, learning_rate=0.05, max_depth=2, score=0.840613 - 2.3min
[CV]  n_estimators=500, learning_rate=0.05, max_depth=2, score=0.836633 - 2.3min
[CV] n_estimators=500, learning_rate=0.05, max_depth=3 ...............
[CV] n_estimators=300, learning_rate=0.05, max_depth=8 ...............
[CV]  n_estimators=500, learning_rate=0.05, max_depth=2, score=0.843279 - 2.4min
[CV] n_estimators=300, learning_rate=0.05, max_depth=8 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=3, score=0.831085 - 2.0min
[CV] n_estimators=300, learning_rate=0.05, max_depth=8 ...............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=3, score=0.846720 - 2.1min
[CV] n_estimators=500, learning_rate=0.05, max_depth=8 ...............
[CV]  n_estimators=500, learning_rate=0.05, max_depth=3, score=0.840513 - 3.5min
[CV] n_estimators=500, learning_rate=0.05, max_depth=8 ...............
[CV]  n_estimators=500, learning_rate=0.05, max_depth=3, score=0.846339 - 3.6min
[CV] n_

[Parallel(n_jobs=7)]: Done  11 tasks      | elapsed:  5.6min


[CV]  n_estimators=500, learning_rate=0.05, max_depth=3, score=0.829278 - 3.5min
[CV] n_estimators=300, learning_rate=0.05, max_depth=12 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=8, score=0.811515 - 8.9min
[CV] n_estimators=300, learning_rate=0.05, max_depth=12 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=8, score=0.844583 - 9.5min
[CV] n_estimators=300, learning_rate=0.05, max_depth=12 ..............
[CV]  n_estimators=300, learning_rate=0.05, max_depth=8, score=0.813096 - 9.1min
[CV] n_estimators=500, learning_rate=0.05, max_depth=12 ..............
[CV]  n_estimators=500, learning_rate=0.05, max_depth=8, score=0.814277 -14.1min
[CV] n_estimators=500, learning_rate=0.05, max_depth=12 ..............
[CV]  n_estimators=500, learning_rate=0.05, max_depth=8, score=0.813384 -14.4min
[CV] n_estimators=500, learning_rate=0.05, max_depth=12 ..............
[CV]  n_estimators=500, learning_rate=0.05, max_depth=8, score=0.842095 -15.2min
[CV] n_

[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed: 20.1min


[CV]  n_estimators=300, learning_rate=0.1, max_depth=2, score=0.836382 - 1.5min
[CV] n_estimators=300, learning_rate=0.1, max_depth=2 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=2, score=0.844199 - 1.5min
[CV] n_estimators=300, learning_rate=0.1, max_depth=2 ................
[CV]  n_estimators=300, learning_rate=0.05, max_depth=12, score=0.799031 -18.7min
[CV] n_estimators=500, learning_rate=0.1, max_depth=2 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=2, score=0.839136 - 1.5min
[CV] n_estimators=500, learning_rate=0.1, max_depth=2 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=2, score=0.843162 - 2.5min
[CV] n_estimators=500, learning_rate=0.1, max_depth=2 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=2, score=0.836830 - 2.5min
[CV] n_estimators=300, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=3, score=0.835821 - 2.2min
[CV] n_estim

[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed: 31.0min


[CV]  n_estimators=300, learning_rate=0.1, max_depth=3, score=0.845501 - 2.1min
[CV] n_estimators=500, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=300, learning_rate=0.05, max_depth=12, score=0.839340 -20.5min
[CV] n_estimators=500, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=3, score=0.825838 - 2.2min
[CV] n_estimators=300, learning_rate=0.1, max_depth=8 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=3, score=0.832438 - 3.6min
[CV] n_estimators=300, learning_rate=0.1, max_depth=8 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=3, score=0.842317 - 3.6min
[CV] n_estimators=300, learning_rate=0.1, max_depth=8 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=3, score=0.822305 - 3.6min
[CV] n_estimators=500, learning_rate=0.1, max_depth=8 ................
[CV]  n_estimators=300, learning_rate=0.1, max_depth=8, score=0.813718 - 8.5min
[CV] n_estim

[Parallel(n_jobs=7)]: Done  48 out of  48 | elapsed: 65.0min finished


{'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 2}


In [6]:
preds_probs = grid.best_estimator_.predict(test_X)
preds_raw = grid.best_estimator_.predict_proba(test_X)
sum(preds_raw == test_y)/len(test_y)

TypeError: invalid type comparison

In [None]:
from sklearn import metrics

In [None]:
metrics.roc_auc_score(y_true = test_y, y_score = preds_probs[:,1])