In [10]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterSampler
from copy import deepcopy
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss

In [11]:
X=pd.read_hdf("X_train.hd5")
y=pd.read_hdf("y_train.hd5")
w=pd.read_hdf("w_train.hd5")
classifier = joblib.load('clf.joblib')

In [12]:
#parameter grid
param_grid = {
              "clip_weight" : [30,40,50,60],
              "learning_rate" : [0.001,0.1,0.5],
              "n_estimators": [300,500,800],
              "subsample" : [0.6, 0.8, 1],
              "reg_lambda":[0.1, 0.5, 1, 2, 10],  
              "max_depth": [3,5,7,10],                                                                                                                                                                                                
              }

In [13]:
#10 parameter samples
sampler=ParameterSampler(param_grid,10)
samples=[params for params in sampler]

#array to store accuracy scores 
accu_scores=np.array([]) 
accu_scores_rw=np.array([])
#array to store cross-entropy
cross_scores=np.array([])

In [14]:
for params in sampler:
    skf = StratifiedKFold(n_splits=5)
    clipweight=params.pop('clip_weight')
    for train_index, test_index in skf.split(X, y):
        
        #obtaining training and testing sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        w_train, w_test = w.iloc[train_index], w.iloc[test_index]
        #reweighting w_train with clipped weights
        h=np.histogram(y_train,weights=w_train,bins=13,range=[-0.5,12.5])
        a=1./h[0]
        a/=min(a)
        rw_train=np.clip(a,0,clipweight)
        w_train*=rw_train[y_train]
        #classifier with XGBoost parameters and training it
        clf=deepcopy(classifier)
        clf.set_params(**params)
        %time clf.fit(X_train,y_train,w_train)
        
        #test metric 1: no reweighting w_test
        #test metric 2: reweighting w_test with clip weight 80
        h=np.histogram(y_test,weights=w_test,bins=13,range=[-0.5,12.5])
        a=1./h[0]
        a/=min(a)
        rw_test=np.clip(a,0,80)
        w_test_rw=w_test*rw_test[y_test]
        
        #ignoring category 0 from y_true (and hence from X_true and w_true too)
        X_test.reset_index(drop=True,inplace=True)
        y_test.reset_index(drop=True,inplace=True)
        w_test.reset_index(drop=True,inplace=True)
        w_test_rw.reset_index(drop=True,inplace=True)
        X_test_ignore0=X_test[y_test>0]
        y_test_ignore0=y_test[y_test>0]
        w_test_ignore0=w_test[y_test>0]
        w_test_rw_ignore0=w_test_rw[y_test>0]
        #getting predicted y probability, ignoring category 0
        y_pred_prob=clf.predict_proba(X_test_ignore0)
        y_pred_prob/=rw_train.reshape(1,-1) 
        y_pred_prob/=np.sum(y_pred_prob,axis=1,keepdims=True)
        y_pred_prob_ignore0=np.delete(y_pred_prob,0,axis=1)
        y_pred_prob_ignore0/=np.sum(y_pred_prob_ignore0,axis=1,keepdims=True)
        #getting sample weight values, ignoring category 0
        weight=w_test_ignore0.ravel()
        weight_rw=w_test_rw_ignore0.ravel()
        
        #calculating accuracy score
        y_pred_ignore0=np.argmax(y_pred_prob_ignore0,axis=1)+1
        y_pred=y_pred_ignore0.ravel()
        y_true=y_test_ignore0.ravel()
        accu_scores=np.append(accu_scores,accuracy_score(y_true,y_pred,normalize=True,sample_weight=weight))
        accu_scores_rw=np.append(accu_scores_rw,accuracy_score(y_true,y_pred,normalize=True,
                                                               sample_weight=weight_rw))
        
        #calculation cross entropy score
        enc=OneHotEncoder(handle_unknown='ignore')
        y_label=enc.fit_transform(y_test_ignore0.reshape(-1,1)).toarray()
        cross_scores=np.append(cross_scores,
                               log_loss(y_label,y_pred_prob_ignore0,normalize=True,sample_weight=w_test_ignore0))

CPU times: user 6min 25s, sys: 11.1 s, total: 6min 36s
Wall time: 49.7 s
CPU times: user 6min 32s, sys: 11.2 s, total: 6min 43s
Wall time: 50.5 s
CPU times: user 6min 26s, sys: 10.1 s, total: 6min 36s
Wall time: 49.6 s
CPU times: user 6min 33s, sys: 10.9 s, total: 6min 44s
Wall time: 50.6 s
CPU times: user 6min 32s, sys: 11 s, total: 6min 43s
Wall time: 50.5 s
CPU times: user 8min 20s, sys: 7.06 s, total: 8min 27s
Wall time: 1min 3s
CPU times: user 7min 7s, sys: 3.55 s, total: 7min 11s
Wall time: 53.9 s
CPU times: user 7min 4s, sys: 3.5 s, total: 7min 8s
Wall time: 53.6 s
CPU times: user 7min 32s, sys: 4.7 s, total: 7min 36s
Wall time: 57.2 s
CPU times: user 7min 12s, sys: 3.64 s, total: 7min 16s
Wall time: 54.6 s
CPU times: user 8min 58s, sys: 10.4 s, total: 9min 9s
Wall time: 1min 8s
CPU times: user 9min 5s, sys: 11.1 s, total: 9min 16s
Wall time: 1min 9s
CPU times: user 8min 52s, sys: 10.6 s, total: 9min 2s
Wall time: 1min 7s
CPU times: user 10min 31s, sys: 15.5 s, total: 10min 47s


In [18]:
print "Scores ignoring category 0-"
print ""
scores1=np.split(accu_scores,10)
scores2=np.split(accu_scores_rw,10)
scores3=np.split(cross_scores,10)
for i in range (0,len(scores)):
    print "Parameters: ", samples[i]
    print "Accuracy with w_test: %0.5f +/- %0.5f"%(scores1[i].mean(),scores1[i].std())
    print "Accuracy with w_test_rw: %0.5f +/- %0.5f"%(scores2[i].mean(),scores2[i].std())
    print "Cross entropy: %0.5f +/- %0.5f"%(scores3[i].mean(),scores3[i].std())
    print ""

Scores ignoring category 0-

Parameters:  {'clip_weight': 40, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'reg_lambda': 2, 'max_depth': 5}
Accuracy with w_test: 0.22734 +/- 0.00205
Accuracy with w_test_rw: 0.09792 +/- 0.00055
Cross entropy: 2.31641 +/- 0.00211

Parameters:  {'clip_weight': 40, 'learning_rate': 0.5, 'n_estimators': 800, 'subsample': 0.8, 'reg_lambda': 1, 'max_depth': 3}
Accuracy with w_test: 0.34939 +/- 0.00373
Accuracy with w_test_rw: 0.15302 +/- 0.00131
Cross entropy: 2.17896 +/- 0.00266

Parameters:  {'clip_weight': 60, 'learning_rate': 0.1, 'n_estimators': 800, 'subsample': 1, 'reg_lambda': 0.5, 'max_depth': 10}
Accuracy with w_test: 0.41716 +/- 0.00367
Accuracy with w_test_rw: 0.19361 +/- 0.00143
Cross entropy: 1.89381 +/- 0.00753

Parameters:  {'clip_weight': 60, 'learning_rate': 0.5, 'n_estimators': 500, 'subsample': 1, 'reg_lambda': 2, 'max_depth': 7}
Accuracy with w_test: 0.46234 +/- 0.00366
Accuracy with w_test_rw: 0.24651 +/- 0.00308
Cross en

In [22]:
print "Best estimator:"
print samples[5]

Best estimator:
{'clip_weight': 30, 'learning_rate': 0.001, 'n_estimators': 300, 'subsample': 0.8, 'reg_lambda': 0.5, 'max_depth': 10}
