## Expectation Reflection + ElasticNet

We import the necessary packages into the Jupyter notebook:

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

import expectation_reflection as ER
from sklearn.linear_model import ElasticNet

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
# Seed NumPy's global random number generator so that anything drawing from it
# gives the same results on every run of the notebook.
np.random.seed(1)

First, we load the list of dataset names; the processed data for each dataset are read from disk further below.

In [3]:
# The names of the datasets to evaluate are read from data_list.txt.
# To run on a hand-picked subset instead, replace the loadtxt call with an
# explicit list such as one of the following:
#data_list = ['1paradox']
#data_list = ['29parkinson','30paradox2','31renal','32patientcare','33svr','34newt','35pcos']
#
# ndmin=1 keeps data_list one-dimensional even when the file holds a single
# name; without it np.loadtxt returns a 0-d array and len(data_list) /
# data_list[data_id] further down would fail.
data_list = np.loadtxt('data_list.txt',dtype='str',ndmin=1)
print(data_list)

['1paradox' '2peptide' '3stigma' '4nki' '5mental' '6smoking' '7anemia'
 '8language' '9coag' '10tazamia' '11hepato' '12heat' '13ef' '14cervix'
 '15heart' '16liver' '17nwosu' '18school' '19ibs' '21survival' '101kidney'
 '102breast_cancer' '103diabetes_niddk' '104diabetic_retinopathy'
 '29parkinson' '30paradox2' '31renal' '33svr' '35pcos' '36probiotic']


In [4]:
def read_data(data_id):
    """Load one dataset and return a min-max scaled half/half train/test split.

    Parameters
    ----------
    data_id : int
        Position of the dataset name inside the module-level ``data_list``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature matrices scaled with a ``MinMaxScaler`` fitted on the training
        half only, together with the matching label vectors.
    """
    data_name = data_list[data_id]
    print('data_name:',data_name)

    # The last column of the file holds the labels; all columns before it are features.
    table = np.loadtxt('../data/%s/data_processed.dat'%data_name)
    features, labels = table[:,:-1], table[:,-1]

    # make_data_balance comes from the local `function` module; the class
    # counts are printed right after it so the result can be inspected.
    features, labels = make_data_balance(features,labels)
    print(np.unique(labels,return_counts=True))

    features, labels = shuffle(features,labels,random_state=1)
    X_train,X_test,y_train,y_test = train_test_split(
        features,labels,test_size=0.5,random_state=1)

    # Fit the scaler on the training half only, then apply it to both halves.
    scaler = MinMaxScaler().fit(X_train)
    return scaler.transform(X_train),scaler.transform(X_test),y_train,y_test

In [5]:
def measure_performance(X_train,X_test,y_train,y_test,
                        alpha_list=(0.0001,0.001,0.01,0.1,1.,10.,100.),
                        l1_ratio=0.5,n_splits=4):
    """Select the ElasticNet strength by cross-validation and score on the test set.

    For every ``alpha`` in ``alpha_list`` the model is fitted with
    ``ER.fit_ElasticNet`` on each K-fold training part of ``X_train`` and
    scored on the held-out part with the mean squared difference between the
    predicted probability and the label. The parameters are averaged over the
    folds, and the averaged parameters of the ``alpha`` with the lowest mean
    validation cost are used to predict on the test set.

    Parameters
    ----------
    X_train, X_test : 2-D arrays
        Training and test feature matrices (rows are samples).
    y_train, y_test : 1-D arrays
        Training and test labels.
    alpha_list : sequence of float, optional
        Candidate regularisation strengths passed as ``alpha``.
    l1_ratio : float, optional
        Value passed through as ``l1_ratio`` to ``ER.fit_ElasticNet``.
    n_splits : int, optional
        Number of cross-validation folds.

    Returns
    -------
    acc, roc_auc, precision, recall : float
        Accuracy, area under the ROC curve, precision and recall on the test set.
    """
    n_features = X_train.shape[1]
    nalpha = len(alpha_list)

    # Contiguous, unshuffled folds. random_state must NOT be passed together
    # with shuffle=False: recent scikit-learn versions raise ValueError for
    # that combination, and it never had any effect on the folds anyway.
    kfold = KFold(n_splits=n_splits,shuffle=False)

    # Fold-averaged parameters and validation cost for every candidate alpha.
    h0 = np.zeros(nalpha)
    w = np.zeros((nalpha,n_features))
    cost = np.zeros(nalpha)

    for ialpha,alpha in enumerate(alpha_list):
        # Per-fold results for the current alpha.
        h0_fold = np.zeros(n_splits)
        w_fold = np.zeros((n_splits,n_features))
        cost_fold = np.zeros(n_splits)

        for i,(train_index,val_index) in enumerate(kfold.split(y_train)):
            X_train1, X_val = X_train[train_index], X_train[val_index]
            y_train1, y_val = y_train[train_index], y_train[val_index]

            h0_fold[i],w_fold[i,:] = ER.fit_ElasticNet(
                X_train1,y_train1,niter_max=100,alpha=alpha,l1_ratio=l1_ratio)

            # ER.predict returns (predicted labels, predicted probabilities);
            # only the probabilities enter the validation cost.
            y_val_pred,p_val_pred = ER.predict(X_val,h0_fold[i],w_fold[i])
            cost_fold[i] = ((p_val_pred - y_val)**2).mean()

        h0[ialpha] = h0_fold.mean(axis=0)
        w[ialpha,:] = w_fold.mean(axis=0)
        cost[ialpha] = cost_fold.mean()

    # optimal value of alpha: the one with the smallest mean validation cost
    ialpha_opt = np.argmin(cost)
    print('optimal alpha:',alpha_list[ialpha_opt])

    # performance on the held-out test set, using the fold-averaged parameters
    y_test_pred,p_test_pred = ER.predict(X_test,h0[ialpha_opt],w[ialpha_opt,:])

    fp,tp,thresholds = roc_curve(y_test, p_test_pred, drop_intermediate=False)
    roc_auc = auc(fp,tp)

    acc = accuracy_score(y_test,y_test_pred)
    precision = precision_score(y_test,y_test_pred)
    recall = recall_score(y_test,y_test_pred)

    return acc,roc_auc,precision,recall

In [6]:
n_data = len(data_list)

# One slot per dataset for each test-set metric.
acc = np.zeros(n_data)
roc_auc = np.zeros(n_data)
precision = np.zeros(n_data)
recall = np.zeros(n_data)

for data_id in range(n_data):
    X_train,X_test,y_train,y_test = read_data(data_id)

    # measure_performance returns (accuracy, ROC AUC, precision, recall) in that order.
    scores = measure_performance(X_train,X_test,y_train,y_test)
    acc[data_id],roc_auc[data_id],precision[data_id],recall[data_id] = scores

    print(data_id,acc[data_id],roc_auc[data_id])

data_name: 1paradox
(array([0., 1.]), array([60, 60]))












optimal alpha: 0.0001
0 0.8666666666666667 0.8960047003525264
data_name: 2peptide
(array([0., 1.]), array([23, 23]))




























optimal alpha: 0.001
1 1.0 1.0
data_name: 3stigma
(array([0., 1.]), array([2725, 2725]))
optimal alpha: 0.0001
2 0.9963302752293578 0.999828686868687
data_name: 4nki
(array([0., 1.]), array([77, 77]))


































































optimal alpha: 0.0001
3 0.8051948051948052 0.8434547908232118
data_name: 5mental
(array([0., 1.]), array([147, 147]))






















optimal alpha: 0.1
4 0.6870748299319728 0.7065901517956312
data_name: 6smoking
(array([0., 1.]), array([722, 722]))










































optimal alpha: 0.001
5 0.9986149584487535 1.0
data_name: 7anemia
(array([0., 1.]), array([43, 43]))






















optimal alpha: 0.0001
6 0.8372093023255814 0.8934782608695652
data_name: 8language
(array([0., 1.]), array([267, 267]))














optimal alpha: 0.01
7 0.7415730337078652 0.7993813273340832
data_name: 9coag
(array([0., 1.]), array([504, 504]))
optimal alpha: 0.01
8 0.6031746031746031 0.6596049647177419
data_name: 10tazamia
(array([0., 1.]), array([124, 124]))








optimal alpha: 0.1
9 0.75 0.8521988030184752
data_name: 11hepato
(array([0., 1.]), array([63, 63]))










optimal alpha: 0.1
10 0.7142857142857143 0.7711693548387095
data_name: 12heat
(array([0., 1.]), array([83, 83]))
optimal alpha: 0.1
11 0.7349397590361446 0.7604651162790699
data_name: 13ef
(array([0., 1.]), array([93, 93]))












optimal alpha: 0.0001
12 1.0 1.0
data_name: 14cervix
(array([0., 1.]), array([24, 24]))






optimal alpha: 0.0001
13 0.9166666666666666 0.9851851851851852
data_name: 15heart
(array([0., 1.]), array([138, 138]))




optimal alpha: 0.01
14 0.8188405797101449 0.9109430791850451
data_name: 16liver
(array([0., 1.]), array([167, 167]))
optimal alpha: 0.001
15 0.6347305389221557 0.7499282227964399
data_name: 17nwosu
(array([0., 1.]), array([59, 59]))




















optimal alpha: 0.001
16 1.0 1.0
data_name: 18school
(array([0., 1.]), array([68, 68]))






















































optimal alpha: 0.1
17 0.8529411764705882 0.879757785467128
data_name: 19ibs
(array([0., 1.]), array([33, 33]))












optimal alpha: 0.01
18 0.8787878787878788 0.9346153846153846
data_name: 21survival
(array([0., 1.]), array([123, 123]))






















optimal alpha: 0.1
19 0.7723577235772358 0.8542780748663102
data_name: 101kidney
(array([0., 1.]), array([149, 149]))
optimal alpha: 0.01
20 0.9731543624161074 1.0
data_name: 102breast_cancer
(array([0., 1.]), array([212, 212]))






optimal alpha: 0.001
21 0.9669811320754716 0.9932263814616755
data_name: 103diabetes_niddk
(array([0., 1.]), array([252, 252]))
optimal alpha: 0.01
22 0.6944444444444444 0.7857503152585119
data_name: 104diabetic_retinopathy
(array([0., 1.]), array([536, 536]))












optimal alpha: 0.0001
23 0.7369402985074627 0.8049510602453253
data_name: 29parkinson
(array([0., 1.]), array([48, 48]))












optimal alpha: 0.1
24 0.8125 0.9453262786596119
data_name: 30paradox2
(array([0., 1.]), array([52, 52]))












optimal alpha: 0.001
25 0.8846153846153846 0.9535232383808097
data_name: 31renal
(array([0., 1.]), array([47, 47]))
















optimal alpha: 0.01
26 0.8723404255319149 0.8945454545454545
data_name: 33svr
(array([0., 1.]), array([41, 41]))












optimal alpha: 0.01
27 1.0 1.0
data_name: 35pcos
(array([0., 1.]), array([177, 177]))










optimal alpha: 0.01
28 0.8022598870056498 0.8884065372829416
data_name: 36probiotic
(array([0., 1.]), array([10, 10]))








optimal alpha: 0.01
29 0.6 0.7916666666666667


In [7]:
# Report each metric averaged over all datasets, one line per metric.
for label,values in (('acc_mean:',acc),
                     ('roc_mean:',roc_auc),
                     ('precision:',precision),
                     ('recall:',recall)):
    print(label,values.mean())

acc_mean: 0.831754148224419
roc_mean: 0.8851426607171398
precision: 0.8408168770179886
recall: 0.81955475605573


In [8]:
#np.savetxt('ER_ElasticNet_result.dat',(roc_auc,acc,precision,recall),fmt='%f')