In [1]:
import multiprocessing

import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation, metrics 
from sklearn.grid_search import GridSearchCV 

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
# Load the Kaggle training and test sets from the local checkout.
kagTrainDat = pd.read_csv('~/Documents/git/DataMining/train.csv')
kagTestDat = pd.read_csv('~/Documents/git/DataMining/test.csv')

# The target is the 'ACTION' column; every other training column is a feature.
# `.loc` with a boolean column mask replaces the removed `DataFrame.ix`
# indexer and selects exactly the same columns.
y_train = kagTrainDat['ACTION']
X_train = kagTrainDat.loc[:, kagTrainDat.columns != 'ACTION']

# 'id' is excluded from the test features; it is used later as the
# index of the submission.
X_test = kagTestDat.loc[:, kagTestDat.columns != 'id']

In [3]:
# Baseline classifier: depth-5 trees with a deliberately large round budget
# (1500) that the cross-validation run below trims via early stopping.
gbm = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=1500,
    nthread=multiprocessing.cpu_count(),
)

In [4]:
# 5-fold cross-validation of the baseline model, scored by AUC.
# Training stops once the CV score has not improved for 100 rounds, and
# progress is reported every 500 rounds.
xg_train = xgb.DMatrix(X_train.values, label=y_train.values)

baseline_params = gbm.get_params()
cv_result = xgb.cv(
    baseline_params,
    xg_train,
    num_boost_round=baseline_params['n_estimators'],
    nfold=5,
    metrics="auc",
    early_stopping_rounds=100,
    verbose_eval=500,
)


Will train until cv error hasn't decreased in 100 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
[0]	cv-test-auc:0.624009+0.009532923811716922	cv-train-auc:0.6529736+0.007525845031622675


[500]	cv-test-auc:0.8412712000000001+0.008099015678463647	cv-train-auc:0.9801028+0.0010994886811604811


Stopping. Best iteration:
[678] cv-mean:0.8453262	cv-std:0.010205374376278427


In [5]:
def grid_search(estimator, test_parameters):
    """Build (but do not fit) a grid search over ``test_parameters``.

    Args:
        estimator: The model object whose parameters are searched.
        test_parameters: Mapping of parameter name to the list of candidate
            values, passed through as ``param_grid``.

    Returns:
        A ``GridSearchCV`` configured for ROC-AUC scoring with 5-fold
        cross-validation, ``iid=False`` and one job per CPU reported by
        ``multiprocessing.cpu_count()``.
    """
    search_options = dict(
        estimator=estimator,
        param_grid=test_parameters,
        scoring='roc_auc',
        n_jobs=multiprocessing.cpu_count(),
        iid=False,
        cv=5,
    )
    return GridSearchCV(**search_options)

In [6]:
# Coarse search over tree shape. n_estimators=678 is the best iteration
# reported by the baseline cross-validation run above.
gbm2 = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=678,
    nthread=multiprocessing.cpu_count(),
)

# Candidate values on a step-2 grid.
param_test1 = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
}

grid_search1 = grid_search(gbm2, param_test1)
grid_search1.fit(X_train, y_train)

# Display per-combination scores, the winning combination and its score.
grid_search1.grid_scores_, grid_search1.best_params_, grid_search1.best_score_

([mean: 0.82604, std: 0.01384, params: {'min_child_weight': 1, 'max_depth': 3},
  mean: 0.82530, std: 0.01366, params: {'min_child_weight': 3, 'max_depth': 3},
  mean: 0.82516, std: 0.01200, params: {'min_child_weight': 5, 'max_depth': 3},
  mean: 0.84940, std: 0.01306, params: {'min_child_weight': 1, 'max_depth': 5},
  mean: 0.84612, std: 0.01357, params: {'min_child_weight': 3, 'max_depth': 5},
  mean: 0.84293, std: 0.01143, params: {'min_child_weight': 5, 'max_depth': 5},
  mean: 0.85233, std: 0.01365, params: {'min_child_weight': 1, 'max_depth': 7},
  mean: 0.85222, std: 0.01283, params: {'min_child_weight': 3, 'max_depth': 7},
  mean: 0.84868, std: 0.01340, params: {'min_child_weight': 5, 'max_depth': 7}],
 {'max_depth': 7, 'min_child_weight': 1},
 0.85232618392450132)

In [7]:
# Finer search next to the coarse optimum (max_depth=7, min_child_weight=1):
# also try depth 6, keeping min_child_weight at 1.
param_test2 = {
    'max_depth': [6, 7],
    'min_child_weight': [1],
}

grid_search2 = grid_search(gbm2, param_test2)
grid_search2.fit(X_train, y_train)

# Display per-combination scores, the winning combination and its score.
grid_search2.grid_scores_, grid_search2.best_params_, grid_search2.best_score_

([mean: 0.85081, std: 0.01430, params: {'min_child_weight': 1, 'max_depth': 6},
  mean: 0.85233, std: 0.01365, params: {'min_child_weight': 1, 'max_depth': 7}],
 {'max_depth': 7, 'min_child_weight': 1},
 0.85232618392450132)

In [11]:
# Re-estimate the number of boosting rounds for the tree shape chosen by the
# grid searches (max_depth=7, min_child_weight=1), again starting from a
# 1500-round budget trimmed by early stopping.
gbm_test = xgb.XGBClassifier(nthread=multiprocessing.cpu_count(),
                             n_estimators=1500,
                             max_depth=7,
                             min_child_weight=1)

# Use this cell's own DMatrix and estimator for every argument so the cell
# does not depend on `xg_train` / `gbm` left over from earlier cells.
xg_train2 = xgb.DMatrix(X_train.values, label=y_train.values)
gbm_test_params = gbm_test.get_params()
cv_result2 = xgb.cv(gbm_test_params, xg_train2,
                    num_boost_round=gbm_test_params['n_estimators'],
                    nfold=5, metrics="auc",
                    early_stopping_rounds=100,
                    verbose_eval=500)

Will train until cv error hasn't decreased in 100 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
[0]	cv-test-auc:0.6421134000000001+0.012265641452447565	cv-train-auc:0.6841790000000001+0.01028755693058367


Stopping. Best iteration:
[279] cv-mean:0.8436060000000001	cv-std:0.006814960924319401


In [13]:
# Refit on the full training set with the tuned tree shape.
# n_estimators=279 is the best iteration reported by the preceding CV run.
gbm_test = xgb.XGBClassifier(
    max_depth=7,
    min_child_weight=1,
    n_estimators=279,
    nthread=multiprocessing.cpu_count(),
)
gbm_test.fit(X_train, y_train, eval_metric='auc')

# Score the model on the data it was trained on. This measures fit, not
# generalisation; compare with the cross-validated AUC.
# Column 1 of predict_proba is presumably the positive class (ACTION == 1).
train_predict_prob = gbm_test.predict_proba(X_train)[:, 1]
train_auc = metrics.roc_auc_score(y_train, train_predict_prob)

print("AUC Score (Train): %f" % train_auc)

AUC Score (Train): 0.986453


In [17]:
# Search the `gamma` parameter with the tree shape and round count held
# fixed at the previously selected values.
gbm3 = xgb.XGBClassifier(
    max_depth=7,
    min_child_weight=1,
    n_estimators=279,
    nthread=multiprocessing.cpu_count(),
)

# Candidates 0.0 .. 0.4 in steps of 0.1.
param_test3 = {
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

grid_search3 = grid_search(gbm3, param_test3)
grid_search3.fit(X_train, y_train)

# Display per-candidate scores, the winning gamma and its score.
grid_search3.grid_scores_, grid_search3.best_params_, grid_search3.best_score_

([mean: 0.84969, std: 0.01399, params: {'gamma': 0.0},
  mean: 0.84780, std: 0.01541, params: {'gamma': 0.1},
  mean: 0.84991, std: 0.01364, params: {'gamma': 0.2},
  mean: 0.84973, std: 0.01353, params: {'gamma': 0.3},
  mean: 0.84884, std: 0.01482, params: {'gamma': 0.4}],
 {'gamma': 0.2},
 0.84990665772170337)

In [18]:
# Refit on the full training set with gamma=0.2, the winner of the gamma
# grid search, added to the tuned tree shape.
gbm_test = xgb.XGBClassifier(
    max_depth=7,
    min_child_weight=1,
    gamma=0.2,
    n_estimators=279,
    nthread=multiprocessing.cpu_count(),
)
gbm_test.fit(X_train, y_train, eval_metric='auc')

# AUC on the training data itself; this measures fit, not generalisation.
train_predict_prob = gbm_test.predict_proba(X_train)[:, 1]
train_auc = metrics.roc_auc_score(y_train, train_predict_prob)

print("AUC Score (Train): %f" % train_auc)

AUC Score (Train): 0.985838


In [22]:
# Re-estimate the number of boosting rounds now that gamma=0.2 is part of
# the configuration, starting from a 1500-round budget with early stopping.
gbm_test = xgb.XGBClassifier(nthread=multiprocessing.cpu_count(),
                             n_estimators=1500,
                             max_depth=7,
                             min_child_weight=1,
                             gamma=0.2)

# Use this cell's own DMatrix and estimator for every argument so the cell
# does not depend on `xg_train` / `gbm` left over from earlier cells.
xg_train3 = xgb.DMatrix(X_train.values, label=y_train.values)
gbm_test_params = gbm_test.get_params()
cv_result3 = xgb.cv(gbm_test_params, xg_train3,
                    num_boost_round=gbm_test_params['n_estimators'],
                    nfold=5, metrics="auc",
                    early_stopping_rounds=100,
                    verbose_eval=500)

Will train until cv error hasn't decreased in 100 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
[0]	cv-test-auc:0.6416242000000001+0.012908669960921597	cv-train-auc:0.6834988+0.01067304909386254


[500]	cv-test-auc:0.8454118000000002+0.010237635242574346	cv-train-auc:0.9971966+0.0001894735865496684


Stopping. Best iteration:
[523] cv-mean:0.8456368000000001	cv-std:0.010235853855932082


In [23]:
# Search row and column subsampling rates with the tree shape fixed.
# NOTE(review): gamma=0.2, selected by the earlier gamma search, is not set
# on this estimator - confirm whether that omission is intentional.
gbm4 = xgb.XGBClassifier(
    max_depth=7,
    min_child_weight=1,
    n_estimators=279,
    nthread=multiprocessing.cpu_count(),
)

# 8 x 8 grid: both rates range from 0.3 to 1.0 in steps of 0.1.
param_test4 = {
    'subsample': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

grid_search4 = grid_search(gbm4, param_test4)
grid_search4.fit(X_train, y_train)

# Display per-combination scores, the winning combination and its score.
grid_search4.grid_scores_, grid_search4.best_params_, grid_search4.best_score_

([mean: 0.84409, std: 0.00959, params: {'colsample_bytree': 0.3, 'subsample': 0.3},
  mean: 0.84951, std: 0.00877, params: {'colsample_bytree': 0.3, 'subsample': 0.4},
  mean: 0.85001, std: 0.01008, params: {'colsample_bytree': 0.3, 'subsample': 0.5},
  mean: 0.85022, std: 0.00877, params: {'colsample_bytree': 0.3, 'subsample': 0.6},
  mean: 0.85250, std: 0.00912, params: {'colsample_bytree': 0.3, 'subsample': 0.7},
  mean: 0.85142, std: 0.01072, params: {'colsample_bytree': 0.3, 'subsample': 0.8},
  mean: 0.85175, std: 0.01237, params: {'colsample_bytree': 0.3, 'subsample': 0.9},
  mean: 0.84725, std: 0.00915, params: {'colsample_bytree': 0.3, 'subsample': 1.0},
  mean: 0.84611, std: 0.00826, params: {'colsample_bytree': 0.4, 'subsample': 0.3},
  mean: 0.85420, std: 0.00970, params: {'colsample_bytree': 0.4, 'subsample': 0.4},
  mean: 0.85509, std: 0.00905, params: {'colsample_bytree': 0.4, 'subsample': 0.5},
  mean: 0.85514, std: 0.01044, params: {'colsample_bytree': 0.4, 'subsample'

In [25]:
# Refit with row/column subsampling added to the tuned configuration.
# colsample_bytree=0.5 and subsample=0.7 are presumably the winners of the
# subsampling grid search; its saved output is truncated, so verify.
gbm_test = xgb.XGBClassifier(nthread=multiprocessing.cpu_count(),
                             n_estimators=279,
                             max_depth=7,
                             min_child_weight=1,
                             gamma=0.2,
                             colsample_bytree=0.5,
                             subsample=0.7)

# Fit the algorithm on the data
gbm_test.fit(X_train, y_train, eval_metric='auc')

# Predict training set (AUC on the training data measures fit only):
train_predict_prob = gbm_test.predict_proba(X_train)[:, 1]

print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, train_predict_prob))

# Raise the round budget on the same estimator so the cross-validation
# below can look for a new early-stopping point.
gbm_test.n_estimators = 1500

# Use this cell's own DMatrix and estimator for every argument so the cell
# does not depend on `xg_train` / `gbm` left over from earlier cells.
xg_train3 = xgb.DMatrix(X_train.values, label=y_train.values)
gbm_test_params = gbm_test.get_params()
cv_result3 = xgb.cv(gbm_test_params, xg_train3,
                    num_boost_round=gbm_test_params['n_estimators'],
                    nfold=5, metrics="auc",
                    early_stopping_rounds=100,
                    verbose_eval=500)

Will train until cv error hasn't decreased in 100 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
[0]	cv-test-auc:0.5997776+0.017198154419588173	cv-train-auc:0.6239564+0.005930525123460827


AUC Score (Train): 0.979738


[500]	cv-test-auc:0.855729+0.011887590369793182	cv-train-auc:0.9951886+0.00018396369206993685


Stopping. Best iteration:
[609] cv-mean:0.8567316	cv-std:0.012131000661116139


In [19]:
# Final model used for the submission.
# NOTE(review): this configuration stops at the gamma-tuning stage. It does
# not include the subsample / colsample_bytree settings or the larger round
# count explored in the cell above - confirm which configuration is meant
# to be submitted.
gbm_final = xgb.XGBClassifier(
    max_depth=7,
    min_child_weight=1,
    gamma=0.2,
    n_estimators=279,
    nthread=multiprocessing.cpu_count(),
)
gbm_final.fit(X_train, y_train, eval_metric='auc')

# Hard class predictions are computed but only the probabilities are used
# for the submission.
predictions = gbm_final.predict(X_test)

# Column 1 of predict_proba is presumably the positive class (ACTION == 1).
predict_prob = gbm_final.predict_proba(X_test)[:, 1]
y_test = predict_prob

In [20]:
# Assemble the submission: predicted probabilities named 'Action', indexed
# by the test set's 'id' column.
submission = pd.Series(y_test, index=kagTestDat['id'], name='Action')

# Write a comma-separated file with a header row and the id index included.
submission.to_csv(
    "~/Documents/git/DataMining/submission_xgboost.csv",
    sep=',',
    header=True,
    index=True,
)