In [1]:
import multiprocessing

import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation, metrics 
from sklearn.grid_search import GridSearchCV 

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
# Load the Kaggle training and test sets from the local checkout.
kagTrainDat = pd.read_csv('~/Documents/git/DataMining/train.csv')
kagTestDat = pd.read_csv('~/Documents/git/DataMining/test.csv')

# 'ACTION' is the target column; every other training column is a feature.
y_train = kagTrainDat['ACTION']
# .loc with a boolean column mask selects the same columns as the old .ix
# call; .ix itself is deprecated and no longer exists in current pandas.
X_train = kagTrainDat.loc[:, kagTrainDat.columns != 'ACTION']

# The test file has an 'id' column; keep every column except that one.
X_test = kagTestDat.loc[:, kagTestDat.columns != 'id']

In [3]:
# Baseline classifier: depth-5 trees, up to 1500 boosting rounds, and one
# worker thread per CPU reported by multiprocessing.
gbm = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=1500,
    nthread=multiprocessing.cpu_count(),
)

In [4]:
# Wrap the raw training arrays in xgboost's DMatrix container.
xg_train = xgb.DMatrix(X_train.values, label=y_train.values)

# 5-fold cross-validation scored by AUC, reusing the classifier's own
# parameters. Boosting runs for at most n_estimators rounds with a 100-round
# early-stopping window; progress is reported every 500 rounds.
gbm_params = gbm.get_params()
cv_result = xgb.cv(
    gbm_params,
    xg_train,
    num_boost_round=gbm_params['n_estimators'],
    nfold=5,
    metrics="auc",
    early_stopping_rounds=100,
    verbose_eval=500,
)

Will train until cv error hasn't decreased in 100 rounds.
[0]	cv-test-auc:0.6240458+0.009494636199454945	cv-train-auc:0.6530128+0.007535541955294244


[500]	cv-test-auc:0.8418338000000001+0.007877956318741558	cv-train-auc:0.979636+0.0008614042024508625


Stopping. Best iteration:
[840] cv-mean:0.8473200000000001	cv-std:0.011482912278686102


In [4]:
def grid_search(estimator, test_parameters, scoring='roc_auc', n_jobs=4, cv=5):
    """Build an unfitted GridSearchCV over ``test_parameters``.

    Parameters
    ----------
    estimator
        The estimator to tune; forwarded as ``estimator``.
    test_parameters : dict
        Candidate values per parameter name; forwarded as ``param_grid``.
    scoring : str, optional
        Scoring metric handed to GridSearchCV (default ``'roc_auc'``).
    n_jobs : int, optional
        Number of parallel jobs handed to GridSearchCV (default 4). Lower it
        when the estimator is itself multi-threaded to avoid oversubscribing
        the CPU.
    cv : int, optional
        Cross-validation setting handed to GridSearchCV (default 5).

    Returns
    -------
    GridSearchCV
        The configured search object; the caller is expected to ``.fit`` it.
    """
    return GridSearchCV(estimator=estimator, param_grid=test_parameters,
                        scoring=scoring, n_jobs=n_jobs, iid=False, cv=cv)

In [5]:
# Coarse tuning pass over tree depth and minimum child weight, with the number
# of trees fixed at 840 (the best iteration reported by the xgb.cv run above).
gbm2 = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=840,
    nthread=multiprocessing.cpu_count(),
)

param_test1 = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
}

grid_search1 = grid_search(gbm2, param_test1)
grid_search1.fit(X_train, y_train)

# Show every candidate's score together with the winning configuration.
(grid_search1.grid_scores_,
 grid_search1.best_params_,
 grid_search1.best_score_)

([mean: 0.83189, std: 0.01255, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: 0.82965, std: 0.01321, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.82935, std: 0.01218, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: 0.84935, std: 0.01385, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: 0.84689, std: 0.01375, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: 0.84456, std: 0.01183, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: 0.85196, std: 0.01305, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: 0.85146, std: 0.01268, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: 0.84867, std: 0.01356, params: {'min_child_weight': 5, 'max_depth': 7}],
 {'max_depth': 7, 'min_child_weight': 1},
 0.85195922314451311)

In [None]:
# Finer pass around the best coarse result (max_depth 7, min_child_weight 1).
param_test2 = {
    'max_depth': list(range(6, 8, 1)),
    'min_child_weight': list(range(1, 2, 1))
}

# Tune gbm2 (840 trees), not gbm (1500 trees): the coarse search and the final
# model both use 840 trees, so scores here stay comparable with grid_search1.
grid_search2 = grid_search(gbm2, param_test2)

grid_search2.fit(X_train, y_train)
grid_search2.grid_scores_, grid_search2.best_params_, grid_search2.best_score_

In [6]:
# Final model: 840 trees with the depth / child-weight pair chosen by the
# coarse grid search, trained on the full training set.
gbm_final = xgb.XGBClassifier(
    min_child_weight=1,
    max_depth=7,
    n_estimators=840,
    nthread=multiprocessing.cpu_count(),
)
gbm_final.fit(X_train, y_train, eval_metric='auc')

# Predicted labels, plus the second column of predict_proba, for the test set.
predictions = gbm_final.predict(X_test)
predict_prob = gbm_final.predict_proba(X_test)[:, 1]

# Submitted score is the mean of the predicted label and the probability.
# NOTE(review): despite its name, y_test holds model output, not ground truth.
# Blending hard labels with probabilities is unusual - confirm it is intended.
y_test = (predictions + predict_prob) / 2.0

In [7]:
# One blended score per test row, indexed by the test set's 'id' column.
submission = pd.Series(y_test, index=kagTestDat['id'], name='Action')

# Write the submission as a comma-separated file with a header row and the
# index included.
submission.to_csv(
    "~/Documents/git/DataMining/submission_xgboost.csv",
    sep=',',
    header=True,
    index=True,
)
