In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from scipy import stats
from sklearn.linear_model import ElasticNet



In [3]:
train = pd.read_csv('./train.csv')
cont = np.array([x.startswith('cont') for x in train.columns])
sc = StandardScaler()
train.ix[:,cont] = sc.fit_transform(train.ix[:,cont])
trainOHE = pd.get_dummies(train)
validation_set, train_set = train_test_split(trainOHE, test_size=.8)
print('Size of train set: {}'.format(len(train_set)))
print('Size of validation set: {}'.format(len(validation_set)))

Size of train set: 150655
Size of validation set: 37663


In [4]:
trainStd_X = trainOHE.ix[:,np.logical_and(trainOHE.columns!='id', trainOHE.columns!='loss')]
train_y = trainOHE.ix[:,trainOHE.columns=='loss']
validation_std_X = validation_set.ix[:,np.logical_and(trainOHE.columns!='id', trainOHE.columns!='loss')]
validation_y = validation_set.ix[:,trainOHE.columns=='loss']

In [57]:
# parameters
eta = stats.distributions.uniform(loc=.01, scale=.2)
min_child_weight = stats.randint(low=1,high=10)
alphas = np.logspace(-6,1,num=100)
depth = stats.randint(low=2, high=15)
n_estimators = stats.randint(75,300)

#param dict
param_distribution = {'max_depth': depth,
                      'learning_rate': eta,
                      'n_estimators':n_estimators,
                      'reg_alpha':alphas,
                      'min_child_weight': min_child_weight}

In [6]:
scoring = make_scorer(mean_absolute_error)

In [59]:
boostingRandCV = RandomizedSearchCV(XGBRegressor(nthread=1),param_distributions=param_distribution,
                                    n_jobs=-2, verbose=4,n_iter=20,scoring=scoring)

In [60]:
boostingRandCV.fit(trainStd_X, train_y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] min_child_weight=9, max_depth=11, reg_alpha=5.09413801482e-06, learning_rate=0.18029888001, n_estimators=124 
[CV] min_child_weight=9, max_depth=11, reg_alpha=5.09413801482e-06, learning_rate=0.18029888001, n_estimators=124 
[CV] min_child_weight=9, max_depth=11, reg_alpha=5.09413801482e-06, learning_rate=0.18029888001, n_estimators=124 
[CV]  min_child_weight=9, max_depth=11, reg_alpha=5.09413801482e-06, learning_rate=0.18029888001, n_estimators=124, score=1203.754056 -52.4min
[CV]  min_child_weight=9, max_depth=11, reg_alpha=5.09413801482e-06, learning_rate=0.18029888001, n_estimators=124, score=1204.679183 -52.4min
[CV] min_child_weight=5, max_depth=10, reg_alpha=0.000351119173422, learning_rate=0.156458181444, n_estimators=147 
[CV] min_child_weight=5, max_depth=10, reg_alpha=0.000351119173422, learning_rate=0.156458181444, n_estimators=147 
[CV]  min_child_weight=9, max_depth=11, reg_alpha=5.09413801482e-06, learnin

[Parallel(n_jobs=-2)]: Done  19 tasks      | elapsed: 434.2min


[CV] min_child_weight=1, max_depth=13, reg_alpha=0.104761575279, learning_rate=0.10231658606, n_estimators=237 
[CV]  min_child_weight=8, max_depth=12, reg_alpha=1.38488637139e-06, learning_rate=0.0928990888881, n_estimators=237, score=1192.445641 -73.4min
[CV]  min_child_weight=8, max_depth=12, reg_alpha=1.38488637139e-06, learning_rate=0.0928990888881, n_estimators=237, score=1189.701302 -72.5min
[CV] min_child_weight=1, max_depth=13, reg_alpha=0.104761575279, learning_rate=0.10231658606, n_estimators=237 
[CV] min_child_weight=1, max_depth=13, reg_alpha=0.104761575279, learning_rate=0.10231658606, n_estimators=237 
[CV]  min_child_weight=1, max_depth=13, reg_alpha=0.104761575279, learning_rate=0.10231658606, n_estimators=237, score=1207.137529 -86.0min
[CV] min_child_weight=9, max_depth=7, reg_alpha=0.123284673944, learning_rate=0.0642330911099, n_estimators=243 
[CV]  min_child_weight=1, max_depth=13, reg_alpha=0.104761575279, learning_rate=0.10231658606, n_estimators=237, score=11

[Parallel(n_jobs=-2)]: Done  60 out of  60 | elapsed: 1066.6min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=20, n_jobs=-2,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x116826c18>, 'min_child_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x115e2c668>, 'reg_alpha': array([  1.00000e-06,   1.17681e-06, ...,   8.49753e+00,   1.00000e+01]), 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x116826898>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x116826518>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=mak

In [None]:
testModel = XGBRegressor(max_depth=15,learning_rate=0.1,
                         n_estimators=1000)
from sklearn.model_selection import cross_val_score
score = cross_val_score(testModel, trainStd_X, train_y, scoring=scoring, n_jobs=-2)

In [63]:
score

array([ 0.53694975,  0.53889455,  0.54976885])

In [None]:
# parameters
eta = stats.distributions.uniform(loc=.01, scale=.2)
alphas = np.logspace(-6,1,num=100)
depth = stats.randint(low=2, high=15)
n_estimators = stats.randint(500,1500)

#param dict
param_distribution = {'max_depth': depth,
                      'learning_rate': eta,
                      'n_estimators':n_estimators,
                      'reg_alpha':alphas}

# run algo
boostingRandCV2 = RandomizedSearchCV(XGBRegressor(nthread=1),param_distributions=param_distribution,
                                    n_jobs=-2, verbose=4,n_iter=20,scoring=scoring)
boostingRandCV2.fit(trainStd_X, train_y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] reg_alpha=0.00559081018251, max_depth=12, learning_rate=0.0425255446743, n_estimators=619 
[CV] reg_alpha=0.00559081018251, max_depth=12, learning_rate=0.0425255446743, n_estimators=619 
[CV] reg_alpha=0.00559081018251, max_depth=12, learning_rate=0.0425255446743, n_estimators=619 
