In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import zipfile
import sklearn
import xgboost as xgb
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score

In [2]:
# Read train.csv directly out of the zipped download. The context managers
# close both the archive and the member stream as soon as pandas has parsed
# the file, instead of leaving the handles open for the life of the kernel.
with zipfile.ZipFile('train.csv.zip') as z:
    with z.open('train.csv') as train_file:
        df = pd.read_csv(train_file)

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) of the var3 column.
df.var3.describe()

count    76020.000000
mean         2.716483
std          9.447971
min          0.000000
25%          2.000000
50%          2.000000
75%          2.000000
max        238.000000
Name: var3, dtype: float64

In [3]:
# Overwrite every -999999 in var3 with 2, modifying df in place.
# NOTE(review): -999999 is presumably a missing-value sentinel and 2 the
# typical value of the column — confirm against the data dictionary.
df.loc[df['var3'] == -999999, 'var3'] = 2

In [4]:
# y is the TARGET column; x is every column except the last one
# (assumes TARGET is the final column of the frame — TODO confirm).
y = df['TARGET']
x = df.iloc[:, :-1]

In [5]:
# Score every column of x against y with f_classif and keep the 100 best.
selectK = SelectKBest(score_func=f_classif, k=100).fit(x, y)
# Names of the columns that survived the selection (boolean support mask).
features = x.columns[selectK.get_support()]
# Feature matrix reduced to the selected columns.
x_sel = selectK.transform(x)

 190 193 221 223 235 239 245 249 262 263 304 308 316 320 328 350] are constant.


In [66]:
# Hold out 30% of the rows, stratified on the target, with a fixed seed.
# This split has to be live code: the clf.fit cell further down reads
# x_train / y_train / x_test / y_test, and they are defined nowhere else,
# so with this line commented out a fresh-kernel run fails with a NameError.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x_sel, y, test_size=0.3, stratify=y, random_state=1126)

In [79]:
# XGBoost classifier with hand-picked hyperparameters and a fixed seed.
clf = xgb.XGBClassifier(
    n_estimators=525,
    learning_rate=0.02,
    max_depth=5,
    subsample=0.95,
    colsample_bytree=0.85,
    nthread=4,
    seed=4242,
)

In [82]:
# 4-fold stratified cross-validation scored by ROC AUC.
# The shuffle is seeded (same seed as the hold-out split) so that the fold
# assignment, and therefore the printed score, is the same on every run;
# without random_state each execution drew different folds.
cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=1126)
scores = cross_validation.cross_val_score(clf, x_sel, y, cv=cv, scoring='roc_auc')
# Report the mean AUC across folds and its standard deviation.
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Auc: 0.839 (+/- 0.009)


In [None]:
# Fit on the training split while evaluating AUC on the held-out split;
# early_stopping_rounds=50 lets training stop when that metric stalls.
clf.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric="auc",
    early_stopping_rounds=50,
)

In [70]:
# Read test.csv out of its zip archive, closing the archive and the member
# stream once parsing is done.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_file:
        test = pd.read_csv(test_file)
# Apply the same var3 cleanup that was applied to the training frame, so the
# model never sees the -999999 value it was not trained on.
test = test.replace(to_replace={'var3': {-999999: 2}})
# Keep only the columns chosen by the fitted selector, then predict
# class probabilities (one column per class).
sel_test = selectK.transform(test)
y_pred = clf.predict_proba(sel_test)

In [74]:
# Pair each test ID with the second predict_proba column
# (presumably the probability of TARGET == 1 — confirm via clf.classes_)
# and write the result to submission.csv without the index column.
submission = pd.DataFrame({
    "ID": test["ID"],
    "TARGET": y_pred[:, 1],
})
submission.to_csv("submission.csv", index=False)

## Grid Search

In [6]:
import sys
class flushfile(object):
    """File-like proxy that flushes the wrapped stream after every write.

    Any attribute not defined on the proxy itself is looked up on the
    wrapped object, so the proxy can stand in for the original stream.

    Parameters
    ----------
    f : file-like
        Object providing ``write`` and ``flush``.
    """

    def __init__(self, f):
        self.f = f

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup on the proxy has failed.
        # If ``f`` itself is missing (e.g. an instance created without
        # __init__), bail out instead of recursing through ``self.f`` forever.
        if name == 'f':
            raise AttributeError(name)
        # Plain getattr honours the wrapped object's own __getattr__, so
        # delegation keeps working when the wrapped stream is itself a proxy
        # (for instance when stdout has been wrapped more than once).
        return getattr(self.f, name)

    def write(self, x):
        """Write ``x`` to the wrapped stream and flush it immediately."""
        self.f.write(x)
        self.f.flush()

    def flush(self):
        """Flush the wrapped stream."""
        self.f.flush()
# Replace stdout with the flushing proxy so each write is flushed at once.
sys.stdout = flushfile(f=sys.stdout)

In [7]:
from sklearn.grid_search import GridSearchCV
# Fresh classifier with library defaults; the grid below overrides the
# three searched hyperparameters.
clf = xgb.XGBClassifier()
xgb_params = {
    'learning_rate': [0.2],
    # De-duplicated: a repeated value makes the grid fit the same candidate
    # twice for every max_depth without changing the search result.
    # NOTE(review): the original list was [100, 500, 100]; if a third
    # distinct value (e.g. 1000) or a range was intended, add it here.
    'n_estimators': [100, 500],
    'max_depth': [3, 7, 11],
}
# Stratified folds on y with the library's default fold count and no shuffling.
cv = cross_validation.StratifiedKFold(y)
grid = GridSearchCV(clf, xgb_params, scoring='roc_auc', cv=cv, verbose=10, n_jobs=2)

In [None]:
# Run the grid search over the selected features against the target.
grid.fit(X=x_sel, y=y)