In [46]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterSampler
from copy import deepcopy
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.preprocessing import RobustScaler

In [47]:
#training inputs read from HDF5 files: X is passed to fit() as the features,
#y as the target categories and w as the per-row sample weights
X, y, w = (pd.read_hdf(path) for path in ("X_train.hd5", "y_train.hd5", "w_train.hd5"))

#template classifier; every cross-validation fold trains a deep copy of it
#NOTE(review): joblib.load unpickles the file, so only load files from a trusted source
classifier = joblib.load('clf.joblib')

In [48]:
#parameter grid: candidate values for every hyper-parameter that is sampled
#clip_weight is consumed by the notebook itself (upper bound of the class
#reweighting factors); the remaining keys are handed to the classifier via set_params
param_grid = dict(
    clip_weight=[10, 20, 30, 40],
    learning_rate=[0.001, 0.1, 0.5],
    n_estimators=[300, 500, 800],
    subsample=[0.6, 0.8, 1],
    reg_lambda=[0.1, 0.5, 1, 2, 10],
    max_depth=[3, 5, 7, 10],
)

In [49]:
#parameter samples
sampler=ParameterSampler(param_grid,3)
samples=[params for params in sampler]

print samples

#array to store accuracy scores 
accu_scores=np.array([]) 
#array to store cross-entropy
cross_scores=np.array([])

scaler=RobustScaler()

[{'clip_weight': 20, 'learning_rate': 0.5, 'n_estimators': 500, 'subsample': 0.8, 'reg_lambda': 1, 'max_depth': 5}, {'clip_weight': 30, 'learning_rate': 0.001, 'n_estimators': 300, 'subsample': 0.8, 'reg_lambda': 10, 'max_depth': 7}, {'clip_weight': 30, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'reg_lambda': 0.1, 'max_depth': 7}]


In [50]:
for params in sampler:
    
    skf = StratifiedKFold(n_splits=5)
    clipweight=params.pop('clip_weight')
    for train_index, test_index in skf.split(X, y):

        #obtaining training and testing sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        w_train, w_test = w.iloc[train_index], w.iloc[test_index]
        w_train=np.abs(w_train)
        
        #scale data
        X_train=pd.DataFrame(scaler.fit_transform(X_train))
        X_test=pd.DataFrame(scaler.transform(X_test))

        #reweighting w_train with clipped weights
        h=np.histogram(y_train,weights=w_train,bins=13,range=[-0.5,12.5])
        a=1./h[0]
        a/=min(a)
        rw=np.clip(a,0,clipweight)
        w_train*=rw[y_train]
        
        #classifier with XGBoost parameters and training it
        clf=deepcopy(classifier)
        clf.set_params(**params)
        %time clf.fit(X_train,y_train,w_train)
    
        
        #ignoring category 0 from y_true (and hence from X_true and w_true too)
        X_test.reset_index(drop=True,inplace=True)
        y_test.reset_index(drop=True,inplace=True)
        w_test.reset_index(drop=True,inplace=True)
        X_test_ignore0=X_test[y_test>0]
        y_test_ignore0=y_test[y_test>0]
        w_test_ignore0=w_test[y_test>0]
        
        #getting predicted y probability, ignoring category 0
        y_pred_prob=clf.predict_proba(X_test_ignore0)
        y_pred_prob/=rw.reshape(1,-1) 
        y_pred_prob/=np.sum(y_pred_prob,axis=1,keepdims=True)
        y_pred_prob_ignore0=np.delete(y_pred_prob,0,axis=1)
        y_pred_prob_ignore0/=np.sum(y_pred_prob_ignore0,axis=1,keepdims=True)
        
        #getting sample weight values, ignoring category 0
        weight=w_test_ignore0.ravel()
    
        #calculating accuracy score
        y_pred_ignore0=np.argmax(y_pred_prob_ignore0,axis=1)+1
        y_pred=y_pred_ignore0.ravel()
        y_true=y_test_ignore0.ravel()
        accu_scores=np.append(accu_scores,accuracy_score(y_true,y_pred,normalize=True,sample_weight=weight))
        
        print y_pred[0:20]
        print y_true[0:20]
        
        print y_pred.shape
        print y_true.shape
        
        #calculation cross entropy score
        enc=OneHotEncoder(handle_unknown='ignore')
        y_label=enc.fit_transform(y_test_ignore0.reshape(-1,1)).toarray()
        cross_scores=np.append(cross_scores,
                               log_loss(y_label,y_pred_prob_ignore0,normalize=True,sample_weight=w_test_ignore0))

CPU times: user 3min 12s, sys: 2.1 s, total: 3min 14s
Wall time: 24.4 s
[ 1  1  2  1  2  1  2  1  2  2  2 10 10  2  1  2  2  1  2  2]
[ 5  1  6  1  6  1  6  5  2  6  7  8 12  3  1  2  2  1  2  6]
(5965,)
(5965,)
CPU times: user 3min 10s, sys: 2.24 s, total: 3min 13s
Wall time: 24.2 s
[ 2  2  2  2  2  2  2  2  2  2  2  2  1  2  2  2  2  2  2 10]
[6 4 6 6 6 4 6 4 6 6 6 6 6 6 6 4 6 6 6 6]
(5964,)
(5964,)
CPU times: user 3min 7s, sys: 1.98 s, total: 3min 9s
Wall time: 23.7 s
[ 2  2  2  2  2  2  2  1  2  2  1  2  2  2  2 10  2  2  2  2]
[ 6  6  6  6  6  6  6  6  6  6  6  6  6  6  6 12 12  6  6  6]
(5960,)
(5960,)
CPU times: user 3min 20s, sys: 2.81 s, total: 3min 23s
Wall time: 25.4 s
[1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 1 2 2 2 2]
[ 5  5  5  5  5  5  5  5  5  5  5  6  6  6  6  5  6 10 10  5]
(5959,)
(5959,)
CPU times: user 3min 16s, sys: 2.21 s, total: 3min 18s
Wall time: 24.8 s
[1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2]
[ 6  6  6  6 10  6 10 10  6 10 10 10  6  6  6 10  6 10 10 10]
(5957,)
(5957,)

In [51]:
scores1=np.split(accu_scores,3)
scores2=np.split(cross_scores,3)

accu_mean=np.array([])
cross_mean=np.array([])

for i in range (0,len(scores1)):
    print "Parameters: ", samples[i]
    print "Accuracy with w_test: %0.5f +/- %0.5f"%(scores1[i].mean(),scores1[i].std())
    print "Cross entropy: %0.5f +/- %0.5f"%(scores2[i].mean(),scores2[i].std())
    print ""
    
    accu_mean=np.append(accu_mean, scores1[i].mean())
    cross_mean=np.append(cross_mean, scores2[i].mean())

Parameters:  {'clip_weight': 20, 'learning_rate': 0.5, 'n_estimators': 500, 'subsample': 0.8, 'reg_lambda': 1, 'max_depth': 5}
Accuracy with w_test: 0.42707 +/- 0.00227
Cross entropy: 1.86280 +/- 0.00631

Parameters:  {'clip_weight': 30, 'learning_rate': 0.001, 'n_estimators': 300, 'subsample': 0.8, 'reg_lambda': 10, 'max_depth': 7}
Accuracy with w_test: 0.22912 +/- 0.00206
Cross entropy: 2.38685 +/- 0.00041

Parameters:  {'clip_weight': 30, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.8, 'reg_lambda': 0.1, 'max_depth': 7}
Accuracy with w_test: 0.22912 +/- 0.00206
Cross entropy: 2.42764 +/- 0.00025



In [56]:
#finding highest accuracy and lowest cross entropy
cross_mean.sort()
accu_mean.sort()

print "Accuracy mean in sorted order: ",accu_mean
print "Cross entropy mean in sorted order: ",cross_mean

Accuracy mean in sorted order:  [ 0.2291188   0.2291188   0.42706769]
Cross entropy mean in sorted order:  [ 1.86279873  2.38685232  2.42763885]
