In [1]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterSampler
from copy import deepcopy
from sklearn.metrics import accuracy_score

In [2]:
X=pd.read_hdf("X_train.hd5")
y=pd.read_hdf("y_train.hd5")
w=pd.read_hdf("w_train.hd5")
classifier = joblib.load('clf.joblib')

In [3]:
#parameter grid
param_grid = {
              "clip_weight" : [10,15,20],
              "learning_rate" : [0.001,0.1,0.5],
              "n_estimators": [300,500,800],
              "subsample" : [0.6, 0.8, 1],
              "reg_lambda":[0.1, 0.5, 1, 2, 10],
              "max_depth": [3,5,7,10],                                                                                                                                                                                                
              }

In [4]:
#10 parameter samples
sampler=ParameterSampler(param_grid,10)
samples=[params for params in sampler]

#array to store accuracy score per parameter set 
scores=np.array([]) 

In [5]:
def accuracyscore(clf,X_test,y_test,w_test,rw):
    X_test.reset_index(drop=True,inplace=True)
    y_test.reset_index(drop=True,inplace=True)
    w_test.reset_index(drop=True,inplace=True)
    #removing category 0 from y_true (and hence those rows are deleted from X_true and w_true too)
    y_test_ignore0=y_test[y_test>0]
    X_test_ignore0=X_test[y_test>0]
    w_test_ignore0=w_test[y_test>0]
    #calculating predicted probability of y and removing category 0 probability again
    y_pred_prob=clf.predict_proba(X_test_ignore0)
    y_pred_prob/=rw.reshape(1,-1) 
    y_pred_prob/=np.sum(y_pred_prob,axis=1,keepdims=True)
    y_pred_prob_ignore0=np.delete(y_pred_prob,0,axis=1)
    #obtaining y_pred (without category 0)
    y_pred_ignore0=np.argmax(y_pred_prob_ignore0,axis=1)+1
    #computing accuracy score
    y_pred=y_pred_ignore0.ravel()
    y_true=y_test_ignore0.ravel()
    w=w_test_ignore0.ravel()
    score=accuracy_score(y_true,y_pred,normalize=True,sample_weight=w)
    print score
    return score

In [6]:
for params in sampler:
    skf = StratifiedKFold(n_splits=5)
    totalscore=0.
    clip=params.pop('clip_weight')
    for train_index, test_index in skf.split(X, y):
        #obtaining training annd testing sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        w_train, w_test = w.iloc[train_index], w.iloc[test_index]
        #reweighting with clipped weights
        h=np.histogram(y_train,weights=w_train,bins=13,range=[-0.5,12.5])
        a=1./h[0]
        a/=min(a)
        rw=np.clip(a,0,clip)
        w_train*=rw[y_train]
        #creating classifier with XGBoost parameters
        clf=deepcopy(classifier)
        clf.set_params(**params)
        #computing accuracy score 
        %time clf.fit(X_train,y_train,w_train)
        totalscore+=accuracyscore(clf,X_test,y_test,w_test,rw)
    accu_score=totalscore/5.
    scores=np.append(scores,accu_score) 

CPU times: user 3min 39s, sys: 4.25 s, total: 3min 43s
Wall time: 28 s
0.422089192286
CPU times: user 3min 34s, sys: 3.63 s, total: 3min 38s
Wall time: 27.3 s
0.423067693815
CPU times: user 3min 50s, sys: 4.82 s, total: 3min 55s
Wall time: 29.5 s
0.418697307863
CPU times: user 3min 37s, sys: 3.94 s, total: 3min 41s
Wall time: 27.8 s
0.419934520352
CPU times: user 2min 54s, sys: 2.3 s, total: 2min 56s
Wall time: 22.1 s
0.421662281924
CPU times: user 6min 21s, sys: 6.81 s, total: 6min 28s
Wall time: 48.6 s
0.426081607389
CPU times: user 5min 32s, sys: 6.25 s, total: 5min 38s
Wall time: 42.4 s
0.426722404497
CPU times: user 6min 47s, sys: 5.84 s, total: 6min 53s
Wall time: 51.7 s
0.423248523178
CPU times: user 4min 37s, sys: 2.76 s, total: 4min 40s
Wall time: 35.1 s
0.425882048327
CPU times: user 5min 41s, sys: 6.27 s, total: 5min 47s
Wall time: 43.5 s
0.422510390014
CPU times: user 10min 31s, sys: 16.8 s, total: 10min 47s
Wall time: 1min 21s
0.229258331372
CPU times: user 14min 14s, sys:

In [7]:
print "Accuracy scores-"
for i in range (0,len(samples)):
    print "Parameters: ", samples[i]
    print "Accuracy: ", scores[i]

Accuracy scores-
Parameters:  {'clip_weight': 15, 'learning_rate': 0.001, 'n_estimators': 300, 'subsample': 0.6, 'reg_lambda': 10, 'max_depth': 10}
Accuracy:  0.421090199248
Parameters:  {'clip_weight': 20, 'learning_rate': 0.5, 'n_estimators': 800, 'subsample': 0.6, 'reg_lambda': 0.5, 'max_depth': 3}
Accuracy:  0.424888994681
Parameters:  {'clip_weight': 20, 'learning_rate': 0.5, 'n_estimators': 800, 'subsample': 0.6, 'reg_lambda': 10, 'max_depth': 10}
Accuracy:  0.227634670106
Parameters:  {'clip_weight': 10, 'learning_rate': 0.1, 'n_estimators': 800, 'subsample': 0.8, 'reg_lambda': 10, 'max_depth': 5}
Accuracy:  0.227634670106
Parameters:  {'clip_weight': 15, 'learning_rate': 0.5, 'n_estimators': 800, 'subsample': 1, 'reg_lambda': 0.5, 'max_depth': 3}
Accuracy:  0.227634670106
Parameters:  {'clip_weight': 20, 'learning_rate': 0.001, 'n_estimators': 800, 'subsample': 1, 'reg_lambda': 1, 'max_depth': 7}
Accuracy:  0.227634670106
Parameters:  {'clip_weight': 20, 'learning_rate': 0.1, '

In [18]:
print "Best estimator-"
print samples[np.argmax(scores)]

Best estimator-
{'clip_weight': 20, 'learning_rate': 0.5, 'n_estimators': 800, 'subsample': 0.6, 'reg_lambda': 0.5, 'max_depth': 3}
