In [1]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
import expectation_reflection as ER

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Seed NumPy's global random generator so that the np.random calls made later
# in this notebook (e.g. np.random.choice) draw a repeatable sequence.
np.random.seed(1)

In [None]:
# Base name of the data set; it is substituted into the input file names
# ('../<name>_X.txt', '../<name>_y.txt') and the output file names.
data_name = 'diabetes'

In [3]:
# Read the feature matrix and the label vector from the parent directory,
# then display the distinct label values together with their counts.
features_path = '../%s_X.txt' % data_name
labels_path = '../%s_y.txt' % data_name
X = np.loadtxt(features_path)
y = np.loadtxt(labels_path)
np.unique(y, return_counts=True)

(array([0., 1.]), array([481, 252]))

In [4]:
# Reorder samples and labels together so each row of X stays paired with its y.
# NOTE(review): no random_state is passed, so the resulting order presumably
# depends on the global NumPy seed set earlier -- confirm.
X, y = shuffle(X, y)

### Prediction

In [5]:
def inference(X_train,y_train,X_test,y_test,method='ER'):
    """Fit one classifier on the training split and return its test accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to the chosen model's fit call.
    X_test, y_test
        Held-out features and labels used to score the predictions.
    method : str, default 'ER'
        Which model to use:
        'ER'      -- ER.fit / ER.predict with classtype='binary',
        'LR'      -- LogisticRegressionCV (cv=4, l2 penalty, liblinear solver),
        'LRnonCV' -- LogisticRegression with the liblinear solver.

    Returns
    -------
    The value of accuracy_score(y_test, y_pred).

    Raises
    ------
    ValueError
        If `method` is not one of the names listed above.
    """
    known_methods = ('ER', 'LR', 'LRnonCV')
    # Validate before doing any work: previously an unrecognized name left
    # `model` unbound and failed later with an unrelated UnboundLocalError.
    if method not in known_methods:
        raise ValueError(
            "unknown method %r; expected one of %s" % (method, ', '.join(known_methods)))

    if method == 'ER':
        h0,w = ER.fit(X_train,y_train,niter_max=2000,l2=0.001,classtype='binary')
        y_pred = ER.predict(X_test,h0,w,classtype='binary')
    else:
        if method == 'LR': # LogisticRegression with cross-validation
            model = LogisticRegressionCV(cv=4,random_state=0,penalty='l2',solver='liblinear')
        else: # 'LRnonCV': plain LogisticRegression, no cross-validation
            model = LogisticRegression(solver='liblinear')

        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)

    return accuracy_score(y_test,y_pred)

In [6]:
def compare_inference(X,y,train_size,npred=100,methods=None):
    """Average the test accuracy of several methods over repeated random splits.

    For each repetition the data are split 80/20 with train_test_split
    (random_state set to the repetition index), a random subset of the
    training part is drawn without replacement, and every method is fitted on
    that subset and scored on the test part via `inference`.

    Parameters
    ----------
    X, y
        Full feature matrix and label vector.
    train_size : float
        Fraction of the *whole* data set (len(y)) to use for training in each
        repetition.
    npred : int, default 100
        Number of repeated splits.
    methods : sequence of str, optional
        Method names understood by `inference`. When omitted, the
        notebook-level `list_methods` is used, as before.

    Returns
    -------
    (mean, std) : tuple of 1-D arrays
        Per-method mean and standard deviation of the accuracy over the
        repetitions, in the order of `methods`.
    """
    if methods is None:
        methods = list_methods

    accuracy = np.zeros((len(methods),npred))
    for ipred in range(npred):
        X_train0,X_test,y_train0,y_test = train_test_split(X,y,test_size=0.2,random_state = ipred)

        # The requested size is computed from the full data set, so rounding
        # can push it past the rows left after the 20% test split; sampling
        # without replacement would then raise. Clamp to what is available.
        n_train = min(int(train_size*len(y)),len(y_train0))
        idx_train = np.random.choice(len(y_train0),size=n_train,replace=False)
        X_train,y_train = X_train0[idx_train],y_train0[idx_train]

        for i,method in enumerate(methods):
            accuracy[i,ipred] = inference(X_train,y_train,X_test,y_test,method)

    return accuracy.mean(axis=1),accuracy.std(axis=1)

In [7]:
# Training fractions to sweep and the methods compared at each fraction.
list_train_size = [0.8, 0.6, 0.4, 0.2]
list_methods = ['ER', 'LRnonCV', 'LR']

# One row per training fraction, one column per method.
acc = np.zeros((len(list_train_size), len(list_methods)))
acc_std = np.zeros_like(acc)

for i, train_size in enumerate(list_train_size):
    mean_by_method, std_by_method = compare_inference(X, y, train_size)
    acc[i, :] = mean_by_method
    acc_std[i, :] = std_by_method
    print(train_size, acc[i, :])

0.8 [0.77034014 0.7629932  0.76857143]
0.6 [0.76761905 0.75666667 0.76816327]
0.4 [0.76340136 0.74870748 0.76510204]
0.2 [0.75891156 0.73285714 0.75857143]


In [9]:
# Write the mean accuracies and their standard deviations as text tables
# in the current working directory.
acc_path = '%s_acc.txt' % data_name
std_path = '%s_std.txt' % data_name
np.savetxt(acc_path, acc, fmt='%f')
np.savetxt(std_path, acc_std, fmt='%f')