In [63]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterSampler
from copy import deepcopy
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.preprocessing import RobustScaler

In [64]:
# Training inputs read from HDF5: X is the feature table, y the targets used
# for stratification, w the sample weights passed to fit() and to the metrics.
X, y, w = (
    pd.read_hdf(path)
    for path in ("X_train.hd5", "y_train.hd5", "w_train.hd5")
)

# Pre-configured estimator that every cross-validation fold deep-copies.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load 'clf.joblib' from a trusted source.
classifier = joblib.load('clf.joblib')

In [65]:
# Hyper-parameter search space. Every key except "clip_weight" is handed to
# the classifier via set_params(); "clip_weight" is popped off first and used
# as the upper bound when the per-class training weights are clipped.
param_grid = dict(
    clip_weight=[10, 20, 30, 40],
    learning_rate=[0.1, 0.3, 0.5],
    n_estimators=[300, 500, 800],
    subsample=[0.6, 0.8, 1],
    reg_lambda=[0.1, 0.5, 1, 2, 10],
    max_depth=[3, 5, 7, 10],
)

In [66]:
# Draw the hyper-parameter configurations to evaluate.
#
# random_state is fixed for two reasons:
#   * the random search becomes reproducible across kernel restarts;
#   * an unseeded ParameterSampler draws from NumPy's global RNG each time it
#     is iterated, so `samples` and any later `for ... in sampler` loop would
#     see *different* configurations. With a fixed seed every iteration of
#     `sampler` yields exactly the configurations stored in `samples`.
sampler = ParameterSampler(param_grid, n_iter=2, random_state=0)

# Materialise the draws once; downstream cells index into this list.
samples = list(sampler)


In [67]:
# Flat per-fold accumulators, appended to once per (configuration, fold):
#   accu_scores  -- weighted accuracy
#   cross_scores -- weighted cross-entropy (log loss)
accu_scores, cross_scores = np.array([]), np.array([])

# Feature scaler shared by all folds; it is re-fitted on each training split
# (fit_transform) before being applied to the matching test split.
scaler = RobustScaler()

In [68]:
# Per-configuration summary statistics (mean / standard deviation over folds)
# for accuracy and cross-entropy; populated by the summary cell further down.
accu_mean, accu_stdev = np.array([]), np.array([])
cross_mean, cross_stdev = np.array([]), np.array([])

# One row per sampled configuration, one column per hyper-parameter.
df = pd.DataFrame(data=samples)
print(df)

   clip_weight  learning_rate  max_depth  n_estimators  reg_lambda  subsample
0           40            0.1          5           500         2.0        1.0
1           40            0.1          5           500         0.1        0.6


In [69]:
for params in sampler:
    
    skf = StratifiedKFold(n_splits=5)
    clipweight=params.pop('clip_weight')
    for train_index, test_index in skf.split(X, y):

        #obtaining training and testing sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        w_train, w_test = w.iloc[train_index], w.iloc[test_index]
        w_train=np.abs(w_train)
        
        #scale data
        X_train=pd.DataFrame(scaler.fit_transform(X_train))
        X_test=pd.DataFrame(scaler.transform(X_test))
        
        print X_train.shape

        #reweighting w_train with clipped weights
        h=np.histogram(y_train,weights=w_train,bins=13,range=[-0.5,12.5])
        a=1./h[0]
        a/=min(a)
        rw=np.clip(a,0,clipweight)
        w_train*=rw[y_train]
        
        #classifier with XGBoost parameters and training it
        clf=deepcopy(classifier)
        clf.set_params(**params)
        %time clf.fit(X_train,y_train,w_train)
    
        
        #ignoring category 0 from y_true (and hence from X_true and w_true too)
        X_test.reset_index(drop=True,inplace=True)
        y_test.reset_index(drop=True,inplace=True)
        w_test.reset_index(drop=True,inplace=True)
        X_test_ignore0=X_test[y_test>0]
        y_test_ignore0=y_test[y_test>0]
        w_test_ignore0=w_test[y_test>0]
        
        print X_test[0:5]
        
        #getting predicted y probability, ignoring category 0
        y_pred_prob=clf.predict_proba(X_test_ignore0)
        print y_pred_prob[0:5]
        y_pred_prob/=rw.reshape(1,-1) 
        y_pred_prob/=np.sum(y_pred_prob,axis=1,keepdims=True)
        y_pred_prob_ignore0=np.delete(y_pred_prob,0,axis=1)
        y_pred_prob_ignore0/=np.sum(y_pred_prob_ignore0,axis=1,keepdims=True)
        
        #getting sample weight values, ignoring category 0
        weight=w_test_ignore0.ravel()
    
        #calculating accuracy score
        y_pred_ignore0=np.argmax(y_pred_prob_ignore0,axis=1)+1
        y_pred=y_pred_ignore0.ravel()
        y_true=y_test_ignore0.ravel()
        accu_scores=np.append(accu_scores,accuracy_score(y_true,y_pred,normalize=True,sample_weight=weight))
        
        #calculation cross entropy score
        enc=OneHotEncoder(handle_unknown='ignore')
        y_label=enc.fit_transform(y_test_ignore0.reshape(-1,1)).toarray()
        cross_scores=np.append(cross_scores,
                               log_loss(y_label,y_pred_prob_ignore0,normalize=True,sample_weight=w_test_ignore0))

(119123, 16)
CPU times: user 8min 38s, sys: 12.2 s, total: 8min 50s
Wall time: 1min 6s
         0         1         2         3         4         5         6   \
0 -0.245889  0.551608  0.255154  0.551009  0.034501  0.340558 -0.974969   
1 -0.674200  0.143663  0.707600  0.297717 -0.962081  0.114367 -0.413214   
2  0.510669 -0.564997  1.140287  0.448426  0.467207  0.260538 -0.784481   
3 -0.222189  3.047801  0.153037  3.931183 -1.166030 -0.179787  0.715586   
4 -0.265030  0.637034  0.506088  0.566395 -0.502239  0.626142  0.548802   

         7         8         9         10        11        12        13  \
0  0.472595 -0.547761 -0.304186  0.422118  0.530836  0.613554  0.380569   
1 -1.292167 -0.411515  1.102561  0.022617  1.033454 -0.444124  0.976909   
2  0.553544  0.272635  0.223742  0.998752  0.643571  1.000000  0.726936   
3  0.151718 -1.146140  1.546865 -0.743949 -0.608351  2.578947  0.205969   
4 -0.507490 -0.523888  0.868350 -0.276287 -0.151916  0.364095  0.861126   

         14

In [70]:
# Summarise the per-fold scores for each hyper-parameter configuration.
print(accu_scores.shape)

# The score arrays are configuration-major, so splitting them into
# len(samples) equal chunks gives one array of fold scores per configuration.
# (The number of configurations is taken from `samples` rather than
# hard-coded, so changing the sample count needs no edit here.)
n_configs = len(samples)
scores1 = np.split(accu_scores, n_configs)
scores2 = np.split(cross_scores, n_configs)

# Recomputed from scratch (not appended) so that re-running this cell leaves
# exactly one entry per configuration.
accu_mean = np.array([fold_scores.mean() for fold_scores in scores1])
accu_stdev = np.array([fold_scores.std() for fold_scores in scores1])
cross_mean = np.array([fold_scores.mean() for fold_scores in scores2])
cross_stdev = np.array([fold_scores.std() for fold_scores in scores2])

for i in range(n_configs):
    print("%s %s" % ("Parameters: ", samples[i]))
    print("Accuracy with w_test: %0.5f +/- %0.5f" % (accu_mean[i], accu_stdev[i]))
    print("Cross entropy: %0.5f +/- %0.5f" % (cross_mean[i], cross_stdev[i]))
    print("")

(10,)
Parameters:  {'clip_weight': 40, 'learning_rate': 0.1, 'n_estimators': 500, 'subsample': 1, 'reg_lambda': 2, 'max_depth': 5}
Accuracy with w_test: 0.41844 +/- 0.00247
Cross entropy: 1.93494 +/- 0.00558

Parameters:  {'clip_weight': 40, 'learning_rate': 0.1, 'n_estimators': 500, 'subsample': 0.6, 'reg_lambda': 0.1, 'max_depth': 5}
Accuracy with w_test: 0.22685 +/- 0.00253
Cross entropy: 2.22247 +/- 0.00267



In [71]:
# Attach the per-configuration mean and standard deviation of both metrics
# to the parameter table (row i of df corresponds to entry i of each array).
for column, values in (('accu_mean', accu_mean),
                       ('accu_stdev', accu_stdev),
                       ('cross_mean', cross_mean),
                       ('cross_stdev', cross_stdev)):
    df[column] = values

In [72]:
# Rank configurations by mean accuracy, best (highest) first.
# sort_values returns a new frame; df itself is left unchanged.
df_accu = df.sort_values(by='accu_mean', ascending=False)
print(df_accu)

   clip_weight  learning_rate  max_depth  n_estimators  reg_lambda  subsample  \
0           40            0.1          5           500         2.0        1.0   
1           40            0.1          5           500         0.1        0.6   

   accu_mean  accu_stdev  cross_mean  cross_stdev  
0   0.418441    0.002470    1.934936     0.005585  
1   0.226850    0.002525    2.222472     0.002672  


In [73]:
# Rank configurations by mean cross-entropy, best (lowest) first.
# sort_values returns a new frame; df itself is left unchanged.
df_cross = df.sort_values(by='cross_mean', ascending=True)
print(df_cross)

   clip_weight  learning_rate  max_depth  n_estimators  reg_lambda  subsample  \
0           40            0.1          5           500         2.0        1.0   
1           40            0.1          5           500         0.1        0.6   

   accu_mean  accu_stdev  cross_mean  cross_stdev  
0   0.418441    0.002470    1.934936     0.005585  
1   0.226850    0.002525    2.222472     0.002672  
