In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import xgboost as xgb

In [12]:
!gsutil cp -r gs://smartandnimble/identifyme/xgb/toy "data/identifyme/xgb"

Copying gs://smartandnimble/identifyme/xgb/toy/test.buffer...
/ [1 files][  4.3 MiB/  4.3 MiB]                                                Copying gs://smartandnimble/identifyme/xgb/toy/train.buffer...
Copying gs://smartandnimble/identifyme/xgb/toy/trainfull.buffer...              / [2 files][  6.6 MiB/  6.6 MiB]                                                
Copying gs://smartandnimble/identifyme/xgb/toy/valid.buffer...                  / [3 files][ 11.2 MiB/ 11.2 MiB]                                                
/ [3 files][ 11.2 MiB/ 12.2 MiB]                                                / [4 files][ 12.2 MiB/ 12.2 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m -o ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://smartandnimble/identifyme/xgb/toy/y_valid_w5_ol

In [17]:
%%time
def write_submission(preds, output, sample_path='../data/sampleSubmission.csv'):
    """Write predictions to a CSV laid out like the sample submission file.

    The sample file supplies the row ids (its ``id`` column) and the
    prediction column names (every column after the first).

    Parameters
    ----------
    preds : array-like
        One row of predictions per row of the sample file, one value per
        prediction column.
    output : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    sample_path : str, optional
        Location of the sample submission CSV. Defaults to the path that was
        previously hard-coded, so existing calls behave as before.
    """
    sample = pd.read_csv(sample_path)
    # Item access instead of attribute access: a column is always reachable
    # this way, even if its name collides with a DataFrame attribute.
    submission = pd.DataFrame(
        preds, index=sample['id'].values, columns=sample.columns[1:])
    submission.to_csv(output, index_label='id')


def score(params):
    """Train one XGBoost model for a sampled parameter set and score it.

    Used as the objective function handed to ``fmin``. Reads the training and
    validation matrices and the validation labels from the module-level
    ``folder`` on every call.

    Parameters
    ----------
    params : dict
        Booster parameters plus an ``'n_estimators'`` entry giving the number
        of boosting rounds. The dict is not modified.

    Returns
    -------
    dict
        ``{'loss': <validation log loss>, 'status': STATUS_OK}``.
    """
    print("Training with params : ")
    print(params)

    # Work on a copy: 'n_estimators' is the round count rather than a booster
    # parameter, and removing it from the caller's dict would make a second
    # call with the same dict fail with KeyError.
    booster_params = dict(params)
    num_round = int(booster_params.pop('n_estimators'))

    dtrain = xgb.DMatrix(folder + "train.buffer")
    dvalid = xgb.DMatrix(folder + "valid.buffer")

    # First (only) column as a Series; equivalent to the former
    # read_csv(..., squeeze=True), which newer pandas releases no longer accept.
    y_valid = pd.read_csv(folder + "y_valid_w5_old.csv", header=None).iloc[:, 0]

    # No eval list / early stopping: every candidate trains for all num_round rounds.
    model = xgb.train(booster_params, dtrain, num_round)

    # One row per validation sample; -1 lets numpy infer the class count
    # instead of hard-coding 150.
    predictions = model.predict(dvalid).reshape((int(dvalid.num_row()), -1))

    # NOTE(review): log_loss infers the label set from y_valid, so this assumes
    # every class occurs in the validation labels - confirm, or pass labels=.
    loss = log_loss(y_valid, predictions)
    print("\tScore {0}\n\n".format(loss))
    return {'loss': loss, 'status': STATUS_OK}


def optimize(trials, max_evals=250):
    """Run a TPE search over the XGBoost parameter space, minimising ``score``.

    Parameters
    ----------
    trials : hyperopt.Trials
        Receives the history of every evaluation.
    max_evals : int, optional
        Number of parameter sets to evaluate. Defaults to 250, the value that
        was previously hard-coded.

    Returns
    -------
    dict
        The best parameter set found, with actual values (fixed entries such
        as ``'objective'`` included). It is also printed.
    """
    # Imported here because the notebook's import cell does not bring it in.
    from hyperopt import space_eval

    # NOTE(review): hp.choice treats the integer ranges below as unordered
    # categories; an ordered distribution may search them more efficiently.
    space = {
        'n_estimators': hp.choice('n_estimators', np.arange(1, 1001, dtype=int)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.choice('min_child_weight', np.arange(1, 7, dtype=int)),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'max_delta_step': hp.choice('max_delta_step', np.arange(1, 11, 2, dtype=int)),
        'num_class': 150,
        'eval_metric': 'mlogloss',  # alternative: 'merror'
        'objective': 'multi:softprob',
        'nthread': 5,
        'silent': 1,
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

    # fmin reports hp.choice parameters as indices into their option lists
    # (e.g. max_depth 0 means depth 1); space_eval maps them back to values.
    best_params = space_eval(space, best)
    print(best_params)
    return best_params


# Search history: each evaluated parameter set and its result is recorded here.
trials = Trials()

# Directory that score() reads the train/valid buffers and validation labels from.
folder = "data/identifyme/xgb/toy/"

optimize(trials)

Training with params : 
{'colsample_bytree': 0.65, 'silent': 1, 'eval_metric': 'mlogloss', 'max_delta_step': 9, 'nthread': 5, 'min_child_weight': 5, 'n_estimators': 475, 'subsample': 0.7000000000000001, 'eta': 0.375, 'objective': 'multi:softprob', 'num_class': 150, 'max_depth': 2, 'gamma': 0.8500000000000001}
	Score 1.5519879208


Training with params : 
{'colsample_bytree': 0.55, 'silent': 1, 'eval_metric': 'mlogloss', 'max_delta_step': 5, 'nthread': 5, 'min_child_weight': 1, 'n_estimators': 85, 'subsample': 0.75, 'eta': 0.25, 'objective': 'multi:softprob', 'num_class': 150, 'max_depth': 9, 'gamma': 0.9500000000000001}
	Score 1.21179822865


Training with params : 
{'colsample_bytree': 0.9500000000000001, 'silent': 1, 'eval_metric': 'mlogloss', 'max_delta_step': 9, 'nthread': 5, 'min_child_weight': 2, 'n_estimators': 315, 'subsample': 0.65, 'eta': 0.35000000000000003, 'objective': 'multi:softprob', 'num_class': 150, 'max_depth': 13, 'gamma': 0.7000000000000001}
	Score 1.25577964871




KeyboardInterrupt: 