## Expectation Reflection + Least Absolute Deviations

In the following, we demonstrate how to apply Expectation Reflection with Least Absolute Deviations (LAD) to classification tasks such as medical diagnosis.

We import the necessary packages into the Jupyter notebook:

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

import expectation_reflection as ER

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
# Seed NumPy's global random number generator so that repeated runs of the
# notebook draw the same random numbers.
np.random.seed(1)

First, we load the list of names of the processed data sets.

In [3]:
# Names of the data sets to evaluate, read as strings from a text file.
# `data_list` is indexed by `read_data` below to locate each data set on disk.
data_list = np.loadtxt('data_list_30sets.txt',dtype='str')
# Alternative: evaluate a hand-picked subset instead of the full list.
#data_list = ['29parkinson','30paradox2','31renal','32patientcare','33svr','34newt','35pcos']
print(data_list)

['1paradox' '2peptide' '3stigma' '4nki' '5mental' '6smoking' '7anemia'
 '8language' '9coag' '10tazamia' '11hepato' '12heat' '13ef' '14cervix'
 '15heart' '16liver' '17nwosu' '18school' '19ibs' '21survival'
 '29parkinson' '30paradox2' '31renal' '33svr' '35pcos' '36probiotic'
 '101kidney' '102breast_cancer' '103diabetes_niddk'
 '104diabetic_retinopathy']


In [4]:
def read_data(data_id):
    """Load one data set by index and return a scaled 50/50 train/test split.

    The data set name is looked up in the module-level ``data_list`` and the
    table is read from ``../classification_data/<name>/data_processed_knn3.dat``.
    The last column is the label, converted from {-1,+1} to {0,1}; all other
    columns are features.

    Parameters
    ----------
    data_id : int
        Index into ``data_list``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Features are min-max scaled; the scaler is fitted on the training
        half only and then applied to the test half.
    """
    data_name = data_list[data_id]
    print('data_name:',data_name)

    table = np.loadtxt('../classification_data/%s/data_processed_knn3.dat'%data_name)
    features = table[:,:-1]
    # convert labels from {-1,+1} to {0,1}
    labels = (table[:,-1]+1)/2.

    # presumably equalizes the number of samples per class -- see
    # `function.make_data_balance`; the class counts are printed afterwards
    features,labels = make_data_balance(features,labels)
    print(np.unique(labels,return_counts=True))

    features,labels = shuffle(features,labels,random_state=1)
    X_train,X_test,y_train,y_test = train_test_split(
        features,labels,test_size=0.5,random_state=1)

    # fit the scaling on the training half only, then apply it to both halves
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    return scaler.transform(X_train),scaler.transform(X_test),y_train,y_test

In [5]:
def measure_performance(X_train,X_test,y_train,y_test):
    """Fit ER-LAD with cross-validated l2 selection and score it on the test set.

    For every candidate ``l2`` value a 4-fold cross validation is run on the
    training data: ``ER.fit_LAD`` is fitted on each training fold and the mean
    squared difference between the second output of ``ER.predict`` (presumably
    the predicted probability -- confirm in ``expectation_reflection``) and the
    validation labels is recorded. The fold parameters and costs are averaged
    per ``l2``; the ``l2`` with the lowest average cost is selected and its
    fold-averaged parameters are used to predict on the test set.

    Parameters
    ----------
    X_train, X_test : 2-D arrays of features (samples x features).
    y_train, y_test : 1-D arrays of labels.

    Returns
    -------
    acc, roc_auc, precision, recall, f1_score : floats
        Test-set metrics. ``f1_score`` is 0.0 when precision and recall are
        both zero (instead of NaN / a division error).
    """
    n = X_train.shape[1]

    # candidate regularization strengths
    l2 = [0.0001,0.001,0.01,0.1,1.,10.,100.]
    nl2 = len(l2)

    # cross validation
    kf = 4
    kfold = KFold(n_splits=kf,shuffle=False)

    # per-fold bias, weights and validation cost (overwritten for every l2)
    h01 = np.zeros(kf)
    w1 = np.zeros((kf,n))
    cost1 = np.zeros(kf)

    # fold-averaged bias, weights and validation cost for every l2
    h0 = np.zeros(nl2)
    w = np.zeros((nl2,n))
    cost = np.zeros(nl2)
    for il2 in range(nl2):
        for i,(train_index,val_index) in enumerate(kfold.split(y_train)):
            X_train1, X_val = X_train[train_index], X_train[val_index]
            y_train1, y_val = y_train[train_index], y_train[val_index]
            h01[i],w1[i,:] = ER.fit_LAD(X_train1,y_train1,niter_max=100,l2=l2[il2])

            _,p_val_pred = ER.predict(X_val,h01[i],w1[i])
            cost1[i] = ((p_val_pred - y_val)**2).mean()

        h0[il2] = h01.mean(axis=0)
        w[il2,:] = w1.mean(axis=0)
        cost[il2] = cost1.mean()

    # optimal value of l2:
    il2_opt = np.argmin(cost)
    print('optimal l2:',l2[il2_opt])

    # performance on the held-out test set:
    y_test_pred,p_test_pred = ER.predict(X_test,h0[il2_opt],w[il2_opt,:])

    fp,tp,thresholds = roc_curve(y_test, p_test_pred, drop_intermediate=False)
    roc_auc = auc(fp,tp)

    acc = accuracy_score(y_test,y_test_pred)
    precision = precision_score(y_test,y_test_pred)
    recall = recall_score(y_test,y_test_pred)

    # Guard the harmonic mean: when no positive is predicted correctly both
    # precision and recall are 0 and the plain formula would divide by zero.
    pr_sum = precision + recall
    f1_score = 2*precision*recall/pr_sum if pr_sum > 0 else 0.0

    return acc,roc_auc,precision,recall,f1_score

In [6]:
# Evaluate every data set in `data_list` and collect one value of each metric
# per data set.
n_data = len(data_list)
acc = np.zeros(n_data)
roc_auc = np.zeros(n_data)
precision = np.zeros(n_data)
recall = np.zeros(n_data)
f1_score = np.zeros(n_data)

for data_id in range(n_data):
    X_train,X_test,y_train,y_test = read_data(data_id)
    scores = measure_performance(X_train,X_test,y_train,y_test)

    # measure_performance returns (acc, roc_auc, precision, recall, f1_score)
    (acc[data_id],roc_auc[data_id],precision[data_id],
     recall[data_id],f1_score[data_id]) = scores

    print(data_id,acc[data_id],roc_auc[data_id],precision[data_id],recall[data_id],f1_score[data_id])

data_name: 1paradox
(array([0., 1.]), array([60, 60]))
optimal l2: 0.0001
0 0.85 0.8772032902467685 0.7692307692307693 0.8695652173913043 0.8163265306122449
data_name: 2peptide
(array([0., 1.]), array([23, 23]))
optimal l2: 0.01
1 1.0 1.0 1.0 1.0 1.0
data_name: 3stigma
(array([0., 1.]), array([2725, 2725]))
optimal l2: 0.001
2 0.9922935779816514 0.9937085564494572 1.0 0.9849137931034483 0.992399565689468
data_name: 4nki
(array([0., 1.]), array([77, 77]))
optimal l2: 1.0
3 0.8441558441558441 0.8885135135135135 0.8048780487804879 0.8918918918918919 0.8461538461538461
data_name: 5mental
(array([0., 1.]), array([147, 147]))
optimal l2: 1.0
4 0.5918367346938775 0.6552238805970149 0.5346534653465347 0.8059701492537313 0.6428571428571428
data_name: 6smoking
(array([0., 1.]), array([722, 722]))
optimal l2: 0.0001
5 1.0 0.9999999999999999 1.0 1.0 1.0
data_name: 7anemia
(array([0., 1.]), array([43, 43]))
optimal l2: 0.0001
6 0.6744186046511628 0.7434782608695653 0.7368421052631579 0.608695652173

In [7]:
# Report each metric averaged over all evaluated data sets.
for label,values in (('acc_mean:',acc),
                     ('roc_mean:',roc_auc),
                     ('precision:',precision),
                     ('recall:',recall),
                     ('f1_score:',f1_score)):
    print(label,values.mean())

acc_mean: 0.8218841701104008
roc_mean: 0.8794048576053061
precision: 0.8344351375173699
recall: 0.8292308095173696
f1_score: 0.8229904538450736


In [8]:
# Save the per-data-set metrics to a text file. The metrics are written in
# the order of the tuple below: roc_auc, acc, precision, recall, f1_score.
np.savetxt('result_knn3_ER_LAD.dat',(roc_auc,acc,precision,recall,f1_score),fmt='%f')