In [1]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterSampler
from copy import deepcopy
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.preprocessing import RobustScaler

In [2]:
# Load the training features (X), labels (y) and per-sample weights (w)
# from their HDF5 files, in that order.
X, y, w = map(pd.read_hdf, ("X_train.hd5", "y_train.hd5", "w_train.hd5"))

# Base estimator that the cross-validation loop deep-copies and re-parameterises.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load 'clf.joblib' from a trusted source.
classifier = joblib.load('clf.joblib')

In [3]:
# Candidate values for the random hyper-parameter search.
# 'clip_weight' is not an estimator parameter: the cross-validation loop pops
# it and uses it as the upper bound when clipping the per-class weights.
param_grid = dict(
    clip_weight=[10, 20, 30, 40],
    learning_rate=[0.001, 0.1, 0.5],
    n_estimators=[300, 500, 800],
    subsample=[0.6, 0.8, 1],
    reg_lambda=[0.1, 0.5, 1, 2, 10],
    max_depth=[3, 5, 7, 10],
)

In [None]:
# Draw the random subset of hyper-parameter combinations to evaluate.
# A fixed random_state matters here: with random_state=None every iteration
# over a ParameterSampler draws from the global NumPy RNG, so iterating over
# `sampler` a second time would yield different combinations than the ones
# stored in `samples` (and the run could not be reproduced after a restart).
sampler = ParameterSampler(param_grid, n_iter=3, random_state=0)

# Materialise the draw once; downstream cells report scores against this list.
samples = list(sampler)


In [5]:
# Weighted accuracy of every evaluated fold, appended to by the
# cross-validation loop (starts out as an empty float array).
accu_scores = np.empty(0, dtype=float)


# Weighted cross-entropy (log loss) of every evaluated fold, filled the same way.
cross_scores = np.empty(0, dtype=float)


# Feature scaler with library-default settings; the cross-validation loop
# re-fits it on each training fold and applies it to the matching test fold.
scaler = RobustScaler()

In [19]:
# Empty float arrays that will receive one summary statistic per sampled
# parameter set (mean / standard deviation of accuracy and cross-entropy).
accu_mean, accu_stdev, cross_mean, cross_stdev = (np.empty(0) for _ in range(4))

# One row per sampled parameter set, one column per hyper-parameter.
df = pd.DataFrame(samples)
print(df)

   clip_weight  learning_rate  max_depth  n_estimators  reg_lambda  subsample
0           20          0.001         10           300           1        0.8
1           30          0.100          7           500           2        0.6
2           40          0.100          5           800           1        0.8


In [6]:
for params in sampler:
    
    skf = StratifiedKFold(n_splits=5)
    clipweight=params.pop('clip_weight')
    for train_index, test_index in skf.split(X, y):

        #obtaining training and testing sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        w_train, w_test = w.iloc[train_index], w.iloc[test_index]
        w_train=np.abs(w_train)
        
        #scale data
        X_train=pd.DataFrame(scaler.fit_transform(X_train))
        X_test=pd.DataFrame(scaler.transform(X_test))

        #reweighting w_train with clipped weights
        h=np.histogram(y_train,weights=w_train,bins=13,range=[-0.5,12.5])
        a=1./h[0]
        a/=min(a)
        rw=np.clip(a,0,clipweight)
        w_train*=rw[y_train]
        
        #classifier with XGBoost parameters and training it
        clf=deepcopy(classifier)
        clf.set_params(**params)
        %time clf.fit(X_train,y_train,w_train)
    
        
        #ignoring category 0 from y_true (and hence from X_true and w_true too)
        X_test.reset_index(drop=True,inplace=True)
        y_test.reset_index(drop=True,inplace=True)
        w_test.reset_index(drop=True,inplace=True)
        X_test_ignore0=X_test[y_test>0]
        y_test_ignore0=y_test[y_test>0]
        w_test_ignore0=w_test[y_test>0]
        
        #getting predicted y probability, ignoring category 0
        y_pred_prob=clf.predict_proba(X_test_ignore0)
        y_pred_prob/=rw.reshape(1,-1) 
        y_pred_prob/=np.sum(y_pred_prob,axis=1,keepdims=True)
        y_pred_prob_ignore0=np.delete(y_pred_prob,0,axis=1)
        y_pred_prob_ignore0/=np.sum(y_pred_prob_ignore0,axis=1,keepdims=True)
        
        #getting sample weight values, ignoring category 0
        weight=w_test_ignore0.ravel()
    
        #calculating accuracy score
        y_pred_ignore0=np.argmax(y_pred_prob_ignore0,axis=1)+1
        y_pred=y_pred_ignore0.ravel()
        y_true=y_test_ignore0.ravel()
        accu_scores=np.append(accu_scores,accuracy_score(y_true,y_pred,normalize=True,sample_weight=weight))
        
        print y_pred[0:20]
        print y_true[0:20]
        
        print y_pred.shape
        print y_true.shape
        
        #calculation cross entropy score
        enc=OneHotEncoder(handle_unknown='ignore')
        y_label=enc.fit_transform(y_test_ignore0.reshape(-1,1)).toarray()
        cross_scores=np.append(cross_scores,
                               log_loss(y_label,y_pred_prob_ignore0,normalize=True,sample_weight=w_test_ignore0))

CPU times: user 8min 11s, sys: 12.5 s, total: 8min 24s
Wall time: 1min 3s
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[ 2  8  1  7 10  7  1  1  2 12 10  7  4  2  1  2  2  1  1  2]
(5942,)
(5942,)
CPU times: user 7min 39s, sys: 12.6 s, total: 7min 51s
Wall time: 59 s
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 4 4 9 7]
(5940,)
(5940,)
CPU times: user 8min 2s, sys: 11.8 s, total: 8min 13s
Wall time: 1min 1s
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 4 2 2 2]
(5939,)
(5939,)
CPU times: user 8min 13s, sys: 12.9 s, total: 8min 26s
Wall time: 1min 3s
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[ 4  4  4  4  4  4  4  4 10 10 11 10  2  2 11  2  2  2  2 11]
(5937,)
(5937,)
CPU times: user 7min 23s, sys: 13 s, total: 7min 36s
Wall time: 57.1 s
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[11 11 11 11 11  5 11 11 11 11 11 11 11  5 11 11 11  9 11 11]
(5935,)
(5935,)
CPU times: user 6min 49s, sys: 10.3 s, total: 6min 59s
Wall time: 52.5 s
[1 1

In [20]:
# Summarise the cross-validation scores per sampled parameter set.
#
# `accu_scores` / `cross_scores` hold the fold scores of all parameter sets back
# to back, so splitting them into len(samples) equal chunks yields one chunk per
# set (np.split raises ValueError if the lengths do not divide evenly).
# NOTE(review): if the cross-validation cell was run more than once without
# resetting the score arrays, the chunks will silently mix runs.
n_sets = len(samples)
scores1 = np.split(accu_scores, n_sets)
scores2 = np.split(cross_scores, n_sets)

# Build the summary arrays from scratch instead of appending to arrays created
# in another cell: re-running this cell then gives the same result rather than
# growing the arrays past one entry per parameter set.
accu_mean = np.array([chunk.mean() for chunk in scores1])
accu_stdev = np.array([chunk.std() for chunk in scores1])
cross_mean = np.array([chunk.mean() for chunk in scores2])
cross_stdev = np.array([chunk.std() for chunk in scores2])

for i in range(n_sets):
    print("Parameters:  %s" % (samples[i],))
    print("Accuracy with w_test: %0.5f +/- %0.5f" % (accu_mean[i], accu_stdev[i]))
    print("Cross entropy: %0.5f +/- %0.5f" % (cross_mean[i], cross_stdev[i]))
    print("")
    
    

Parameters:  {'clip_weight': 20, 'learning_rate': 0.001, 'n_estimators': 300, 'subsample': 0.8, 'reg_lambda': 1, 'max_depth': 10}
Accuracy with w_test: 0.22695 +/- 0.00173
Cross entropy: 2.22826 +/- 0.00159

Parameters:  {'clip_weight': 30, 'learning_rate': 0.1, 'n_estimators': 500, 'subsample': 0.6, 'reg_lambda': 2, 'max_depth': 7}
Accuracy with w_test: 0.22695 +/- 0.00173
Cross entropy: 2.45789 +/- 0.00039

Parameters:  {'clip_weight': 40, 'learning_rate': 0.1, 'n_estimators': 800, 'subsample': 0.8, 'reg_lambda': 1, 'max_depth': 5}
Accuracy with w_test: 0.41736 +/- 0.00420
Cross entropy: 1.89480 +/- 0.00796



In [21]:
# Attach the per-parameter-set summary statistics to the parameter table,
# one new column per statistic (row i belongs to sampled parameter set i).
for column, values in (('accu_mean', accu_mean),
                       ('accu_stdev', accu_stdev),
                       ('cross_mean', cross_mean),
                       ('cross_stdev', cross_stdev)):
    df[column] = values

print(df)

   clip_weight  learning_rate  max_depth  n_estimators  reg_lambda  subsample  \
0           20          0.001         10           300           1        0.8   
1           30          0.100          7           500           2        0.6   
2           40          0.100          5           800           1        0.8   

   accu_mean  accu_stdev  cross_mean  cross_stdev  
0   0.226947    0.001731    2.228264     0.001587  
1   0.226947    0.001731    2.457889     0.000391  
2   0.417360    0.004199    1.894798     0.007959  


In [30]:
# Rank the parameter sets from highest to lowest mean accuracy.
# sort_values returns a new frame by default, so `df` keeps its row order.
df_accu = df.sort_values(by='accu_mean', ascending=False)
print(df_accu)

   clip_weight  learning_rate  max_depth  n_estimators  reg_lambda  subsample  \
2           40          0.100          5           800           1        0.8   
0           20          0.001         10           300           1        0.8   
1           30          0.100          7           500           2        0.6   

   accu_mean  accu_stdev  cross_mean  cross_stdev  
2   0.417360    0.004199    1.894798     0.007959  
0   0.226947    0.001731    2.228264     0.001587  
1   0.226947    0.001731    2.457889     0.000391  


In [31]:
# Rank the parameter sets from lowest to highest mean cross-entropy
# (ascending is the sort_values default); `df` itself is left untouched.
df_cross = df.sort_values(by='cross_mean')
print(df_cross)

   clip_weight  learning_rate  max_depth  n_estimators  reg_lambda  subsample  \
2           40          0.100          5           800           1        0.8   
0           20          0.001         10           300           1        0.8   
1           30          0.100          7           500           2        0.6   

   accu_mean  accu_stdev  cross_mean  cross_stdev  
2   0.417360    0.004199    1.894798     0.007959  
0   0.226947    0.001731    2.228264     0.001587  
1   0.226947    0.001731    2.457889     0.000391  
