In [12]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import zipfile
import sklearn
import xgboost as xgb
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

In [2]:
# Load the training set straight from the zipped CSV.
# Context managers close both the archive and the member handle once
# pandas has finished reading (read_csv consumes the stream eagerly).
with zipfile.ZipFile('train.csv.zip') as z:
    with z.open('train.csv') as train_csv:
        df = pd.read_csv(train_csv)

In [3]:
# Summary statistics for the var3 column (look at the minimum for outliers).
df.loc[:, 'var3'].describe()

count     76020.000000
mean      -1523.199277
std       39033.462364
min     -999999.000000
25%           2.000000
50%           2.000000
75%           2.000000
max         238.000000
Name: var3, dtype: float64

In [4]:
# var3 has a minimum of -999999 while its 25th/50th/75th percentiles are all 2,
# so -999999 appears to be a missing-value sentinel; map it to 2.
# Reassigning (instead of inplace=True) keeps the step chainable and avoids
# silently mutating an object that other names may still reference.
df = df.replace(to_replace={'var3': {-999999: 2}})

In [5]:
# Label vector: the TARGET column.
y = df['TARGET']
# Feature matrix: every column except the last one
# (assumes TARGET is the last column of df -- TODO confirm).
x = df.iloc[:, :-1]

In [6]:
# Keep the 100 features with the highest ANOVA F-score against the label.
# NOTE(review): the selector is fit on all rows of x, and x_sel is later used
# inside cross-validation, so CV scores may be slightly optimistic.
selectK = SelectKBest(score_func=f_classif, k=100)
x_sel = selectK.fit_transform(x, y)
# Column names of the selected features, via the selector's boolean mask.
features = x.columns[selectK.get_support()]

 190 193 221 223 235 239 245 249 262 263 304 308 316 320 328 350] are constant.


In [7]:
# x_train, x_test, y_train, y_test = cross_validation.train_test_split(x_sel, y, random_state=1126, stratify=y, test_size=0.3)

In [8]:
# Baseline XGBoost classifier with fixed hyperparameters and a fixed seed.
clf = xgb.XGBClassifier(
    n_estimators=525,
    max_depth=5,
    learning_rate=0.02,
    subsample=0.95,
    colsample_bytree=0.85,
    nthread=4,
    seed=4242,
)

In [9]:
# 4-fold stratified cross-validation of clf on the selected features,
# scored by ROC AUC.
# random_state pins the shuffled fold assignment so that the reported
# mean/std are reproducible from run to run.
cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=1126)
scores = cross_validation.cross_val_score(clf, x_sel, y, cv=cv, scoring='roc_auc')
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

KeyboardInterrupt: 

In [None]:
# Define the stratified 70/30 hold-out split in this cell so it runs on a
# fresh kernel (the only earlier split in the notebook is commented out).
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x_sel, y, random_state=1126, stratify=y, test_size=0.3)
# Fit on the training part; stop once AUC on the hold-out part has not
# improved for 50 rounds.
clf.fit(x_train, y_train, early_stopping_rounds=50, eval_metric="auc", eval_set=[(x_test, y_test)])

In [None]:
# Load the test set from its zipped CSV, closing the archive afterwards.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)
# Apply the same var3 cleaning as the training frame; otherwise the model
# would be scored on a -999999 value it never saw during training.
test = test.replace(to_replace={'var3': {-999999: 2}})
# Reduce to the features chosen by the fitted selector, then predict
# class probabilities.
sel_test = selectK.transform(test)
y_pred = clf.predict_proba(sel_test)

In [None]:
# Build the submission table: one row per test ID with the predicted
# probability of the positive class (second column of predict_proba).
submission = pd.DataFrame({
    "ID": test["ID"],
    "TARGET": y_pred[:, 1],
})
submission.to_csv("submission.csv", index=False)

## Grid Search

In [11]:
import sys
class flushfile(object):
    """File-like proxy that flushes the wrapped stream after every write.

    Attributes that are not defined on the proxy itself are looked up on
    the wrapped stream, so the proxy can stand in for it.
    """

    def __init__(self, f):
        # f: the underlying stream; it must provide write() and flush().
        self.f = f

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup on the proxy fails.
        if name == 'f':
            # `f` is missing only when __init__ has not run; raising here
            # prevents infinite recursion through `self.f` below.
            raise AttributeError(name)
        # getattr() honours the wrapped object's own __getattr__, unlike
        # object.__getattribute__, so delegation keeps working when the
        # wrapped stream is itself a proxy (e.g. a nested flushfile).
        return getattr(self.f, name)

    def write(self, x):
        """Write x to the wrapped stream and flush it immediately."""
        self.f.write(x)
        self.f.flush()

    def flush(self):
        """Flush the wrapped stream."""
        self.f.flush()
# Route stdout through the flushing proxy so each write is flushed at once.
# Guard against re-wrapping when this cell is re-run. The check is by class
# name because re-running the cell redefines `flushfile`, which would make
# an isinstance() test against the new class fail for the old wrapper.
if type(sys.stdout).__name__ != 'flushfile':
    sys.stdout = flushfile(sys.stdout)

### Grid Search #1

In [None]:
# Grid search #1: sweep tree count and depth at a fixed learning rate of 0.1,
# scored by ROC AUC. StratifiedKFold(y) uses the library's default fold count.
cv = cross_validation.StratifiedKFold(y)
xgb_params = dict(
    learning_rate=[0.1],
    n_estimators=[100, 500, 1000],
    max_depth=[3, 7, 11],
)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=1,
    verbose=10,
)

In [None]:
# Run grid search #1 on the selected features.
grid.fit(X=x_sel, y=y)

In [None]:
# Show the parameter combination selected by grid search #1.
grid.best_params_

### Grid Search #2

In [None]:
# Grid search #2: lower learning rate (0.05) with smaller tree counts and
# depths, scored by ROC AUC.
cv = cross_validation.StratifiedKFold(y)
xgb_params = dict(
    learning_rate=[0.05],
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=1,
    verbose=10,
)

In [None]:
# Run grid search #2 on the selected features.
grid.fit(X=x_sel, y=y)

In [None]:
# Show the parameter combination selected by grid search #2.
grid.best_params_

### Grid Search #3

In [None]:
# Grid search #3: adds row/column subsampling ratios to the sweep and fixes
# the XGBoost seed through the grid itself; scored by ROC AUC.
cv = cross_validation.StratifiedKFold(y)
xgb_params = dict(
    learning_rate=[0.05],
    n_estimators=[200, 300, 400],
    max_depth=[4, 5, 6],
    subsample=[0.9, 0.925, 0.95, 0.975],
    colsample_bytree=[0.8, 0.825, 0.85, 0.875, 0.9],
    seed=[1126],
)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=1,
    verbose=10,
)

In [None]:
# Run grid search #3 on the selected features.
grid.fit(X=x_sel, y=y)

In [None]:
# Keep a handle on the refitted best estimator and show its parameters.
best_clf = grid.best_estimator_
# Function-call form of print: same output on Python 2, valid on Python 3,
# and consistent with the other print calls in this notebook.
print(grid.best_params_)

In [None]:
# 4-fold stratified cross-validation of the best grid-search estimator,
# scored by ROC AUC.
# random_state pins the shuffled fold assignment so that the reported
# mean/std are reproducible from run to run.
cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=1126)
scores = cross_validation.cross_val_score(best_clf, x_sel, y, cv=cv, scoring='roc_auc')
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

### Early Stopping

In [None]:
# Stratified hold-out split of the selected features: 30% is set aside as
# the validation part, with a fixed random_state.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x_sel,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1125,
)

In [None]:
# Refit the best estimator on the training part, monitoring AUC on the
# hold-out part and stopping after 25 rounds without improvement.
best_clf.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric="auc",
    early_stopping_rounds=25,
    verbose=True,
)

### Hand-tuned model

In [None]:
# Function-call form of print: same output on Python 2, valid on Python 3,
# and consistent with the other print calls in this notebook.
print(grid.best_params_)

In [None]:
# XGBoost classifier with explicitly chosen hyperparameters and a fixed seed.
clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.045,
    subsample=0.975,
    colsample_bytree=0.875,
    nthread=4,
    seed=1126,
)

In [None]:
# Fit on the training part, monitoring AUC on the hold-out part and
# stopping after 30 rounds without improvement.
clf.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric="auc",
    early_stopping_rounds=30,
    verbose=True,
)

In [None]:
# Load the test set from its zipped CSV, closing the archive afterwards.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)
# Apply the same var3 cleaning as the training frame; otherwise the model
# would be scored on a -999999 value it never saw during training.
test = test.replace(to_replace={'var3': {-999999: 2}})
# Reduce to the features chosen by the fitted selector, then predict
# class probabilities.
sel_test = selectK.transform(test)
# NOTE(review): this predicts with best_clf, not the `clf` fitted in the
# cell just above -- confirm which model the submission should use.
y_pred = best_clf.predict_proba(sel_test)

In [None]:
# Build the submission table: one row per test ID with the predicted
# probability of the positive class (second column of predict_proba).
submission = pd.DataFrame({
    "ID": test["ID"],
    "TARGET": y_pred[:, 1],
})
submission.to_csv("submission.csv", index=False)

### Grid Search #4

In [17]:
# Grid search #4: fine sweep at max_depth=4 over learning rate, tree count
# and subsampling ratios, using 8 shuffled stratified folds with a fixed seed.
# NOTE(review): n_jobs=-1 runs fits in parallel while the estimator keeps its
# default thread setting; if that default is "all cores" the workers may
# oversubscribe the CPU -- consider XGBClassifier(nthread=1) here.
cv = cross_validation.StratifiedKFold(
    y,
    n_folds=8,
    shuffle=True,
    random_state=1027,
)
xgb_params = dict(
    learning_rate=[0.040, 0.045, 0.050, 0.055, 0.060],
    n_estimators=[175, 200, 225, 250],
    max_depth=[4],
    subsample=[0.975, 0.980, 0.985, 0.990],
    colsample_bytree=[0.8625, 0.875, 0.8875],
    seed=[1027],
)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=10,
)

In [18]:
# Run grid search #4 on the selected features.
grid.fit(X=x_sel, y=y)

Fitting 8 folds for each of 240 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 33.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 38.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 42.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 47.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 52.0min
[Paralle

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=8, shuffle=True, random_state=1027),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'colsample_bytree': [0.8625, 0.875, 0.8875], 'learning_rate': [0.04, 0.045, 0.05, 0.055, 0.06], 'n_estimators': [175, 200, 225, 250], 'subsample': [0.975, 0.98, 0.985, 0.99], 'seed': [1027], 'max_depth': [4]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=10)

In [19]:
# Show the parameter combination selected by grid search #4.
grid.best_params_

{'colsample_bytree': 0.875,
 'learning_rate': 0.045,
 'max_depth': 4,
 'n_estimators': 250,
 'seed': 1027,
 'subsample': 0.985}