In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score, roc_curve
from collections import Counter
import random
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
import warnings
warnings.filterwarnings("default", "", DeprecationWarning, "", 0)

In [3]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain["label"].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['label'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "n_estimators: %d" % cvresult.shape[0]
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['label'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['label'], dtrain_predprob)

In [4]:
train = pd.DataFrame.from_csv("../data/features/block_0_10000")
labels = pd.DataFrame.from_csv("../data/train.txt", sep='\t')
test = pd.DataFrame.from_csv("../data/features/block_0_20000")
labels_test = pd.DataFrame.from_csv("../data/train.txt", sep='\t')
evals = pd.DataFrame.from_csv("../data/features/block_0_30000")
labels_test = pd.DataFrame.from_csv("../data/train.txt", sep='\t')

In [5]:
train["label"] = labels["human-generated"]
test["label"] = labels_test["human-generated"]
evals["label"] = labels["human-generated"]

In [6]:
label_column = train["label"]
label_column_test = test["label"]
label_column_eval = evals["label"]
features = train.drop(["label"], axis=1)
features_test = test.drop(["label"], axis=1)
features_eval = evals.drop(["label"], axis=1)

In [7]:
dtrain = xgb.DMatrix(features.values, label_column.values, feature_names=features.columns)
deval = xgb.DMatrix(features_eval.values, label_column_eval.values, feature_names=features_eval.columns)
dtest = xgb.DMatrix(features_test.values, feature_names=features_test.columns)

# dtrain = xgb.DMatrix(features_test.values, label_column_test.values, feature_names=features_test.columns)
# dtest = xgb.DMatrix(features.values, feature_names=features.columns)

In [8]:
param = {'eta':0.1, 'max_depth':5, 'min_child_weight':1, 'gamma':0.1,
         'silent':0, 'subsample':0.8, 'colsample_bytree': 0.8,  
         'objective':'binary:logistic', 'eval_metric':'auc'}

In [9]:
bst = xgb.train(param, dtrain, num_boost_round=16)
preds = bst.predict(dtest)
print roc_auc_score(label_column_test, preds)

0.675731594868


In [10]:
xgb.cv(param, dtrain, metrics=("auc"), shuffle=True, num_boost_round=10)

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.634107,0.008499,0.685703,0.004771
1,0.651291,0.010905,0.710904,0.00222
2,0.656231,0.005924,0.723176,0.001878
3,0.663925,0.004143,0.733925,0.004162
4,0.664821,0.001064,0.743344,0.005925
5,0.663885,0.00055,0.751952,0.006136
6,0.663475,0.001395,0.759131,0.005349
7,0.663857,0.002058,0.764271,0.005933
8,0.664329,0.002051,0.77065,0.006131
9,0.664867,0.004934,0.778056,0.005915


In [11]:
bst_early = xgb.train(param, dtrain, evals=[(deval,"ev1")], num_boost_round=200, early_stopping_rounds=5)
preds = bst_early.predict(dtest)
print roc_auc_score(label_column_test, preds)

[0]	ev1-auc:0.635119
Will train until ev1-auc hasn't improved in 5 rounds.
[1]	ev1-auc:0.663674
[2]	ev1-auc:0.671708
[3]	ev1-auc:0.672349
[4]	ev1-auc:0.673063
[5]	ev1-auc:0.673889
[6]	ev1-auc:0.674935
[7]	ev1-auc:0.674331
[8]	ev1-auc:0.675056
[9]	ev1-auc:0.674902
[10]	ev1-auc:0.67559
[11]	ev1-auc:0.67474
[12]	ev1-auc:0.674801
[13]	ev1-auc:0.675025
[14]	ev1-auc:0.67536
[15]	ev1-auc:0.676632
[16]	ev1-auc:0.677555
[17]	ev1-auc:0.678774
[18]	ev1-auc:0.678403
[19]	ev1-auc:0.678621
[20]	ev1-auc:0.678906
[21]	ev1-auc:0.679342
[22]	ev1-auc:0.678234
[23]	ev1-auc:0.678612
[24]	ev1-auc:0.678375
[25]	ev1-auc:0.6778
[26]	ev1-auc:0.678664
Stopping. Best iteration:
[21]	ev1-auc:0.679342

0.676423765019


In [12]:
bst_early.best_ntree_limit

22

In [36]:
xgb1 = XGBClassifier(
 learning_rate = 0.3,
 n_estimators=30,
 max_depth=6,
 min_child_weight=2,
 gamma=0.3,
 subsample=0.9,
 colsample_bytree=0.6,
 reg_alpha=100,
 objective= 'binary:logistic',
 scale_pos_weight=1)

In [37]:
param_test1 = {
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
}

gsearch1 = GridSearchCV(estimator = xgb1, param_grid = param_test1, scoring='roc_auc', verbose=1, 
                        n_jobs=6,iid=False, cv=5)

gsearch1.fit(train[features.columns],train["label"])
print gsearch1.grid_scores_
print gsearch1.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=6)]: Done  25 out of  25 | elapsed:  2.5min finished


[mean: 0.66857, std: 0.01114, params: {'reg_lambda': 1e-05}, mean: 0.66857, std: 0.01114, params: {'reg_lambda': 0.01}, mean: 0.66883, std: 0.01088, params: {'reg_lambda': 0.1}, mean: 0.66849, std: 0.01016, params: {'reg_lambda': 1}, mean: 0.66868, std: 0.00785, params: {'reg_lambda': 100}]
{'reg_lambda': 0.1} 0.668828425957


In [15]:
%%timeit
2

100000000 loops, best of 3: 10.4 ns per loop
