In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import zipfile
import sklearn
import xgboost as xgb
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score

In [2]:
# Read the training set straight out of the zipped CSV. Both the archive and
# the member stream are closed as soon as the frame is in memory; `z` stays
# bound (to the closed archive) in case a later cell refers to it.
with zipfile.ZipFile('train.csv.zip') as z:
    with z.open('train.csv') as train_csv:
        df = pd.read_csv(train_csv)

In [3]:
df['var3'].describe()

count     76020.000000
mean      -1523.199277
std       39033.462364
min     -999999.000000
25%           2.000000
50%           2.000000
75%           2.000000
max         238.000000
Name: var3, dtype: float64

In [4]:
# describe() above reports -999999 as the minimum of var3 while all three
# quartiles are 2; that outlier looks like a missing-value marker, so it is
# swapped for 2.
df['var3'] = df['var3'].replace(-999999, 2)

In [5]:
# Predictors are every column except the last one; the label is TARGET.
# NOTE(review): any leading identifier column (the submission cell reads
# `test.ID`) stays among the candidate features here - confirm that is wanted.
feature_columns = df.columns[:-1]
x = df[feature_columns]
y = df['TARGET']

In [6]:
# Univariate feature selection: score every column with f_classif and keep
# the 100 best. The fitted selector is reused later to transform the test set.
selectK = SelectKBest(score_func=f_classif, k=100)
x_sel = selectK.fit(x, y).transform(x)
# Column names of the retained features (boolean support mask over x.columns).
features = x.columns[selectK.get_support()]

 190 193 221 223 235 239 245 249 262 263 304 308 316 320 328 350] are constant.


In [7]:
# x_train, x_test, y_train, y_test = cross_validation.train_test_split(x_sel, y, random_state=1126, stratify=y, test_size=0.3)

In [8]:
# Baseline XGBoost classifier with hand-picked settings; it is the model
# scored by cross-validation in the next cell.
baseline_params = {
    'max_depth': 5,
    'n_estimators': 525,
    'learning_rate': 0.02,
    'nthread': 4,
    'subsample': 0.95,
    'colsample_bytree': 0.85,
    'seed': 4242,
}
clf = xgb.XGBClassifier(**baseline_params)

In [9]:
# 4-fold stratified cross-validation of the baseline classifier, scored by
# ROC AUC. The folds are shuffled, so the shuffle is seeded: without a
# random_state every run draws different folds and the printed AUC cannot be
# reproduced or compared between runs.
cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=1126)
scores = cross_validation.cross_val_score(clf, x_sel, y, cv=cv, scoring='roc_auc')
# Mean and standard deviation of the per-fold AUC.
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

KeyboardInterrupt: 

In [None]:
# Create the stratified 70/30 hold-out split this cell needs. Without it the
# names x_train/x_test/y_train/y_test do not exist yet when the notebook is
# run top to bottom on a fresh kernel, and the fit below raises NameError.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x_sel, y, random_state=1126, stratify=y, test_size=0.3)
# Fit on the training part, tracking AUC on the hold-out part; training stops
# once that AUC has not improved for 50 consecutive rounds.
clf.fit(x_train, y_train, early_stopping_rounds=50, eval_metric="auc", eval_set=[(x_test, y_test)])

In [None]:
# Load the test set from its zipped CSV, closing the archive and member
# stream once the frame has been read.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)
# Apply the same var3 clean-up that was applied to the training frame, so any
# -999999 marker in the test data is seen by the model as the value 2, exactly
# as it was during training.
test = test.replace(to_replace={'var3': {-999999: 2}})
# Reduce to the columns chosen by the fitted selector, then predict
# per-class probabilities.
sel_test = selectK.transform(test)
y_pred = clf.predict_proba(sel_test)

In [None]:
# Submission file: one row per test ID with the predicted probability taken
# from the second column of predict_proba's output.
positive_proba = y_pred[:, 1]
submission = pd.DataFrame({"ID": test.ID, "TARGET": positive_proba})
submission.to_csv("submission.csv", index=False)

## Grid Search

In [10]:
import sys
class flushfile(object):
    """File-like proxy that flushes the wrapped stream after every write.

    Any attribute not defined on the proxy itself is looked up on the
    wrapped stream, so the proxy can stand in for it (e.g. as sys.stdout).
    """

    def __init__(self, f):
        # f: the underlying file-like object; it must provide write() and flush().
        self.f = f

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If `f` itself is
        # missing (instance created without __init__, e.g. by copy/pickle),
        # fail cleanly instead of recursing through `self.f` forever.
        if name == 'f':
            raise AttributeError(name)
        # Use getattr() rather than object.__getattribute__(): the latter
        # skips the wrapped object's own dynamic lookup, so delegation broke
        # whenever the wrapped object was itself a proxy (for example a
        # second flushfile created by re-running the cell).
        return getattr(self.f, name)

    def write(self, x):
        # Write, then flush immediately so the text shows up without delay.
        self.f.write(x)
        self.f.flush()

    def flush(self):
        self.f.flush()
# Install the auto-flushing proxy on stdout, but only once: re-running this
# cell would otherwise stack a new proxy on top of the previous one each time.
# The check is by class name because re-running the cell also redefines
# `flushfile`, so isinstance() against the new class would not recognise a
# proxy installed by an earlier run.
if type(sys.stdout).__name__ != 'flushfile':
    sys.stdout = flushfile(sys.stdout)

### Grid Search #1

In [15]:
from sklearn.grid_search import GridSearchCV
# Coarse search: learning rate fixed at 0.1, wide sweep over the number of
# trees and the tree depth (3 x 3 = 9 candidates).
xgb_params = dict(
    learning_rate=[0.1],
    n_estimators=[100, 500, 1000],
    max_depth=[3, 7, 11],
)
clf = xgb.XGBClassifier()
# StratifiedKFold with its defaults; candidates are ranked by ROC AUC.
cv = cross_validation.StratifiedKFold(y)
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=1,
)

In [12]:
# Run the search over xgb_params on the selected features. The log below
# records 27 fits taking about 31.7 minutes with n_jobs=1.
grid.fit(x_sel, y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] n_estimators=100, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=100, learning_rate=0.1, max_depth=3, score=0.831969 -   6.1s
[CV] n_estimators=100, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=100, learning_rate=0.1, max_depth=3, score=0.834196 -   6.9s
[CV] n_estimators=100, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=100, learning_rate=0.1, max_depth=3, score=0.842906 -   6.8s
[CV] n_estimators=500, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=3, score=0.826961 -  29.5s


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    6.1s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   49.7s


[CV] n_estimators=500, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=3, score=0.830471 -  29.5s
[CV] n_estimators=500, learning_rate=0.1, max_depth=3 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=3, score=0.838799 -  29.8s
[CV] n_estimators=1000, learning_rate=0.1, max_depth=3 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=3, score=0.821769 -  56.9s
[CV] n_estimators=1000, learning_rate=0.1, max_depth=3 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=3, score=0.823983 -  57.0s
[CV] n_estimators=1000, learning_rate=0.1, max_depth=3 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=3, score=0.830945 -  57.8s
[CV] n_estimators=100, learning_rate=0.1, max_depth=7 ................
[CV]  n_estimators=100, learning_rate=0.1, max_depth=7, score=0.828725 -  14.1s
[CV] n_estimators=100, learning_rate=0.1, max_depth=7 ................
[CV]  n_estimators=1

[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:  5.4min


[CV] n_estimators=500, learning_rate=0.1, max_depth=7 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=7, score=0.807271 - 1.1min
[CV] n_estimators=500, learning_rate=0.1, max_depth=7 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=7, score=0.809292 - 1.1min
[CV] n_estimators=500, learning_rate=0.1, max_depth=7 ................
[CV]  n_estimators=500, learning_rate=0.1, max_depth=7, score=0.816808 - 1.1min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=7 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=7, score=0.793121 - 2.2min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=7 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=7, score=0.792721 - 2.4min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=7 ...............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=7, score=0.802070 - 2.3min
[CV] n_estimators=100, learning_rate=0.1, max_depth=11 ...............
[CV]  n_estimators=1

[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed: 13.3min
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed: 21.7min


[CV] n_estimators=1000, learning_rate=0.1, max_depth=11 ..............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=11, score=0.779233 - 3.4min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=11 ..............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=11, score=0.783280 - 3.3min
[CV] n_estimators=1000, learning_rate=0.1, max_depth=11 ..............
[CV]  n_estimators=1000, learning_rate=0.1, max_depth=11, score=0.794498 - 3.3min


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 31.7min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=3, shuffle=False, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 500, 1000], 'learning_rate': [0.1], 'max_depth': [3, 7, 11]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=10)

In [13]:
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

### Grid Search #2

In [20]:
# Second pass: lower learning rate (0.05) with smaller ensembles and
# shallower trees than the first search (3 x 3 = 9 candidates).
xgb_params = dict(
    learning_rate=[0.05],
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
)
clf = xgb.XGBClassifier()
# StratifiedKFold with its defaults; candidates are ranked by ROC AUC.
cv = cross_validation.StratifiedKFold(y)
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=1,
)

In [21]:
# Run the second search on the selected features. The log below records
# 27 fits taking about 7.1 minutes with n_jobs=1.
grid.fit(x_sel, y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] n_estimators=50, learning_rate=0.05, max_depth=3 ................
[CV]  n_estimators=50, learning_rate=0.05, max_depth=3, score=0.819960 -   4.1s
[CV] n_estimators=50, learning_rate=0.05, max_depth=3 ................
[CV]  n_estimators=50, learning_rate=0.05, max_depth=3, score=0.818120 -   4.2s
[CV] n_estimators=50, learning_rate=0.05, max_depth=3 ................
[CV]  n_estimators=50, learning_rate=0.05, max_depth=3, score=0.831482 -   4.0s
[CV] n_estimators=100, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=100, learning_rate=0.05, max_depth=3, score=0.828535 -   9.0s


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    4.1s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   21.6s


[CV] n_estimators=100, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=100, learning_rate=0.05, max_depth=3, score=0.827822 -   9.4s
[CV] n_estimators=100, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=100, learning_rate=0.05, max_depth=3, score=0.838799 -  11.3s
[CV] n_estimators=200, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=3, score=0.832084 -  14.2s
[CV] n_estimators=200, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=3, score=0.834375 -  13.4s
[CV] n_estimators=200, learning_rate=0.05, max_depth=3 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=3, score=0.842502 -  15.5s
[CV] n_estimators=50, learning_rate=0.05, max_depth=5 ................
[CV]  n_estimators=50, learning_rate=0.05, max_depth=5, score=0.828902 -   6.6s
[CV] n_estimators=50, learning_rate=0.05, max_depth=5 ................
[CV]  n_estimators

[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:   56.8s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:  1.8min


[CV] n_estimators=100, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=100, learning_rate=0.05, max_depth=5, score=0.832732 -  12.5s
[CV] n_estimators=100, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=100, learning_rate=0.05, max_depth=5, score=0.834168 -  13.7s
[CV] n_estimators=100, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=100, learning_rate=0.05, max_depth=5, score=0.843696 -  13.1s
[CV] n_estimators=200, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=5, score=0.831739 -  28.6s
[CV] n_estimators=200, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=5, score=0.835096 -  23.9s
[CV] n_estimators=200, learning_rate=0.05, max_depth=5 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=5, score=0.844106 -  24.3s
[CV] n_estimators=50, learning_rate=0.05, max_depth=7 ................
[CV]  n_estimator

[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:  3.3min
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:  5.2min


[CV] n_estimators=200, learning_rate=0.05, max_depth=7 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=7, score=0.828685 -  42.1s
[CV] n_estimators=200, learning_rate=0.05, max_depth=7 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=7, score=0.832836 -  37.0s
[CV] n_estimators=200, learning_rate=0.05, max_depth=7 ...............
[CV]  n_estimators=200, learning_rate=0.05, max_depth=7, score=0.841887 -  35.6s


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  7.1min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=3, shuffle=False, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100, 200], 'learning_rate': [0.05], 'max_depth': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=10)

In [22]:
grid.best_params_

{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}

### Grid Search #3

In [24]:
# Third pass: adds row and column subsampling to the search and pins the
# booster seed. 3 x 3 x 4 x 5 = 180 candidates.
xgb_params = dict(
    learning_rate=[0.05],
    n_estimators=[200, 300, 400],
    max_depth=[4, 5, 6],
    subsample=[0.9, 0.925, 0.95, 0.975],
    colsample_bytree=[0.8, 0.825, 0.85, 0.875, 0.9],
    seed=[1126],
)
clf = xgb.XGBClassifier()
# StratifiedKFold with its defaults; candidates are ranked by ROC AUC.
cv = cross_validation.StratifiedKFold(y)
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=1,
)

In [25]:
# Run the third search on the selected features. The log below records
# 540 fits taking about 248.9 minutes with n_jobs=1.
grid.fit(x_sel, y)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=4, score=0.832325 -  15.3s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=4, score=0.834789 -  16.7s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=4, score=0.843784 -  18.9s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4, score=0.832733 -  18.2s


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   15.3s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  1.2min


[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4, score=0.834688 -  16.2s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4, score=0.843864 -  21.0s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4, score=0.832289 -  19.0s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4, score=0.835278 -  17.2s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimat

[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:  3.5min


[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=4, score=0.830988 -  24.4s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=4, score=0.834061 -  22.6s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=4, score=0.843782 -  28.2s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=4, score=0.831721 -  22.2s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300,

[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:  5.4min
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:  8.4min


[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=4, score=0.830365 -  33.3s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=4, score=0.833237 -  36.4s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=4, score=0.843293 -  36.3s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=4, score=0.831023 -  31.9s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400,

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 12.4min
[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed: 16.7min


[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5, score=0.834174 -  20.5s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5, score=0.844361 -  20.9s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5, score=0.831129 -  18.3s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5, score=0.835017 -  16.8s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimat

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed: 19.6min
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed: 24.5min


[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=5, score=0.828103 -  34.5s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=5, score=0.831853 -  34.3s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=5, score=0.841603 -  33.0s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=5, score=0.829961 -  32.7s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=400,

[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed: 30.5min
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed: 34.8min


[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=6, score=0.828457 -  27.1s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=6, score=0.830798 -  27.2s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=6, score=0.842057 -  27.0s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.8, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=6, score=0.829148 -  27.0s
[CV] colsample_bytree=0.8, learning_rate=0.05, n_estimators=300,

[Parallel(n_jobs=1)]: Done  97 tasks       | elapsed: 40.8min
[Parallel(n_jobs=1)]: Done 112 tasks       | elapsed: 49.5min


[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4, score=0.834698 -  17.2s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4, score=0.843943 -  15.3s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4, score=0.832367 -  15.6s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4, score=0.834984 -  15.7s
[CV] colsample_bytree=0.825, learning_ra

[Parallel(n_jobs=1)]: Done 127 tasks       | elapsed: 53.9min
[Parallel(n_jobs=1)]: Done 144 tasks       | elapsed: 61.9min


[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=5, score=0.830795 -  17.9s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=5, score=0.834502 -  19.9s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=5, score=0.844268 -  20.4s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5, score=0.831838 -  18.6s
[CV] colsample_bytree=0.825, learning_rate=0.05,

[Parallel(n_jobs=1)]: Done 161 tasks       | elapsed: 68.0min
[Parallel(n_jobs=1)]: Done 180 tasks       | elapsed: 78.5min


[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=6, score=0.831325 -  21.1s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=6, score=0.833507 -  26.4s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.9, seed=1126, max_depth=6, score=0.843312 -  20.1s
[CV] colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.825, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=6, score=0.830121 -  23.8s
[CV] colsample_bytree=0.825, learning_rate=0.05,

[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed: 86.5min
[Parallel(n_jobs=1)]: Done 220 tasks       | elapsed: 98.3min


[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4, score=0.834661 -  14.2s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=4, score=0.844245 -  14.3s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4, score=0.832627 -  15.0s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=4, score=0.835526 -  14.4s
[CV] colsample_bytree=0.85, learning_rate=0.05, 

[Parallel(n_jobs=1)]: Done 241 tasks       | elapsed: 104.7min
[Parallel(n_jobs=1)]: Done 264 tasks       | elapsed: 115.0min


[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5, score=0.829253 -  30.0s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5, score=0.833544 -  27.7s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5, score=0.843720 -  27.6s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=5, score=0.830852 -  28.5s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estima

[Parallel(n_jobs=1)]: Done 287 tasks       | elapsed: 128.1min
[Parallel(n_jobs=1)]: Done 312 tasks       | elapsed: 140.9min


[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6, score=0.826278 -  47.8s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6, score=0.827856 -  48.8s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6, score=0.840139 -  48.7s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.85, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=6, score=0.827403 -  48.3s
[CV] colsample_bytree=0.85, learning_rate=0.05, n_estima

[Parallel(n_jobs=1)]: Done 337 tasks       | elapsed: 152.3min
[Parallel(n_jobs=1)]: Done 364 tasks       | elapsed: 163.6min


[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5, score=0.834474 -  18.2s
[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.925, seed=1126, max_depth=5, score=0.844270 -  22.2s
[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5, score=0.830683 -  19.9s
[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=200, subsample=0.95, seed=1126, max_depth=5, score=0.834878 -  18.5s
[CV] colsample_bytree=0.875, learning_ra

[Parallel(n_jobs=1)]: Done 391 tasks       | elapsed: 176.3min
[Parallel(n_jobs=1)]: Done 420 tasks       | elapsed: 191.4min


[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6, score=0.826049 -  45.1s
[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6, score=0.828338 -  38.7s
[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.9, seed=1126, max_depth=6, score=0.840630 -  38.4s
[CV] colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=6 
[CV]  colsample_bytree=0.875, learning_rate=0.05, n_estimators=400, subsample=0.925, seed=1126, max_depth=6, score=0.827990 -  38.1s
[CV] colsample_bytree=0.875, learning_rate=0.05,

[Parallel(n_jobs=1)]: Done 449 tasks       | elapsed: 204.4min
[Parallel(n_jobs=1)]: Done 480 tasks       | elapsed: 218.0min


[CV] colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5, score=0.829279 -  33.8s
[CV] colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5, score=0.832707 -  33.0s
[CV] colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.9, seed=1126, max_depth=5, score=0.844233 -  32.3s
[CV] colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=5 
[CV]  colsample_bytree=0.9, learning_rate=0.05, n_estimators=300, subsample=0.925, seed=1126, max_depth=5, score=0.830467 -  31.8s
[CV] colsample_bytree=0.9, learning_rate=0.05, n_estimators=300,

[Parallel(n_jobs=1)]: Done 511 tasks       | elapsed: 233.7min
[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed: 248.9min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=3, shuffle=False, random_state=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'colsample_bytree': [0.8, 0.825, 0.85, 0.875, 0.9], 'learning_rate': [0.05], 'n_estimators': [200, 300, 400], 'subsample': [0.9, 0.925, 0.95, 0.975], 'seed': [1126], 'max_depth': [4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=10)

In [26]:
# Keep the winning estimator from the finished search for the cells below.
best_clf = grid.best_estimator_
# Call-style print: works under both Python 2 and Python 3 and matches the
# other print(...) calls in this notebook.
print(grid.best_params_)

{'colsample_bytree': 0.875,
 'learning_rate': 0.05,
 'max_depth': 4,
 'n_estimators': 200,
 'seed': 1126,
 'subsample': 0.975}

In [34]:
# Re-score the grid-search winner with 4-fold stratified cross-validation.
# The folds are shuffled, so the shuffle is seeded: without a random_state
# every run draws different folds and the printed AUC cannot be reproduced.
cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=1126)
scores = cross_validation.cross_val_score(best_clf, x_sel, y, cv=cv, scoring='roc_auc')
# Mean and standard deviation of the per-fold AUC.
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Auc: 0.839 (+/- 0.006)


### Early Stopping

In [56]:
# Stratified hold-out split: 70% of the rows for training and 30% held back;
# the held-back part is passed as eval_set to the early-stopping fits below.
holdout_parts = cross_validation.train_test_split(
    x_sel,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1125,
)
x_train, x_test, y_train, y_test = holdout_parts

In [57]:
# Refit the grid-search winner on the training split only, tracking AUC on
# the hold-out split and stopping after 25 rounds without improvement.
early_stop_options = dict(
    early_stopping_rounds=25,
    eval_metric="auc",
    eval_set=[(x_test, y_test)],
    verbose=True,
)
best_clf.fit(x_train, y_train, **early_stop_options)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.875,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1126, silent=True, subsample=0.975)

### Manual refit with tuned parameters

In [60]:
# Show the winning parameter set again for reference. Call-style print works
# under both Python 2 and Python 3 and matches the other print(...) calls in
# this notebook.
print(grid.best_params_)

{'colsample_bytree': 0.875, 'learning_rate': 0.05, 'n_estimators': 200, 'subsample': 0.975, 'seed': 1126, 'max_depth': 4}


In [72]:
# Hand-configured classifier based on the parameter set printed above: same
# depth, tree count, subsampling and seed, but with learning_rate 0.045
# instead of 0.05 and an explicit nthread=4.
manual_params = {
    'max_depth': 4,
    'n_estimators': 200,
    'learning_rate': 0.045,
    'nthread': 4,
    'subsample': 0.975,
    'colsample_bytree': 0.875,
    'seed': 1126,
}
clf = xgb.XGBClassifier(**manual_params)

In [73]:
# Fit the hand-configured classifier on the training split, tracking AUC on
# the hold-out split and stopping after 30 rounds without improvement.
early_stop_options = dict(
    early_stopping_rounds=30,
    eval_metric="auc",
    eval_set=[(x_test, y_test)],
    verbose=True,
)
clf.fit(x_train, y_train, **early_stop_options)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.875,
       gamma=0, learning_rate=0.045, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=200, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1126, silent=True, subsample=0.975)

In [58]:
# Load the test set from its zipped CSV, closing the archive and member
# stream once the frame has been read.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)
# Apply the same var3 clean-up that was applied to the training frame, so any
# -999999 marker in the test data is seen by the model as the value 2, exactly
# as it was during training.
test = test.replace(to_replace={'var3': {-999999: 2}})
# Reduce to the columns chosen by the fitted selector, then predict
# per-class probabilities.
# NOTE(review): predictions come from best_clf, not from the hand-configured
# `clf` fitted in the preceding cell - confirm which model is intended.
sel_test = selectK.transform(test)
y_pred = best_clf.predict_proba(sel_test)

In [59]:
# Submission file: one row per test ID with the predicted probability taken
# from the second column of predict_proba's output.
positive_proba = y_pred[:, 1]
submission = pd.DataFrame({"ID": test.ID, "TARGET": positive_proba})
submission.to_csv("submission.csv", index=False)

### Grid Search #4

In [None]:
# Fourth pass: fine sweep around the previous winner with depth fixed at 4
# and a different booster seed. 5 x 4 x 4 x 3 = 240 candidates, run on all
# available cores (n_jobs=-1).
xgb_params = dict(
    learning_rate=[0.040, 0.045, 0.050, 0.055, 0.060],
    n_estimators=[175, 200, 225, 250],
    max_depth=[4],
    subsample=[0.975, 0.980, 0.985, 0.990],
    colsample_bytree=[0.8625, 0.875, 0.8875],
    seed=[1027],
)
clf = xgb.XGBClassifier()
# StratifiedKFold with its defaults; candidates are ranked by ROC AUC.
cv = cross_validation.StratifiedKFold(y)
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=True,
    n_jobs=-1,
)