## Prediction with Logistic Regression

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,\
recall_score,roc_curve,auc

import expectation_reflection as ER
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from function import split_train_test,make_data_balance

In [2]:
# Seed NumPy's legacy global random generator so repeated runs start from the same state.
# NOTE(review): the scikit-learn calls in this notebook pass their own random_state,
# so this presumably only matters for helpers that draw from np.random
# (possibly make_data_balance) — confirm.
np.random.seed(1)

First, we load the list of dataset names. Each name is the directory that contains that dataset's processed data file (`data_processed.dat`).

In [3]:
# Names of the datasets to evaluate; each name is used as a directory name later on.
# ndmin=1 keeps data_list one-dimensional even when the file lists a single
# dataset, so len(data_list) and data_list[data_id] keep working in that case.
data_list = np.loadtxt('data_list.txt',dtype='str',ndmin=1)

print(data_list)

['1paradox' '2peptide' '3stigma' '4nki' '5mental' '6smoking' '7anemia'
 '8language' '9coag' '10tazamia' '11hepato' '12heat' '13ef' '14cervix'
 '15heart' '16liver' '17nwosu' '18school' '19ibs' '21survival' '101kidney'
 '102breast_cancer' '103diabetes_niddk' '104diabetic_retinopathy']


In [4]:
def read_data(data_id):
    """Load one dataset, balance it, split it in half and min-max scale it.

    Parameters
    ----------
    data_id : int
        Index into the module-level ``data_list``; the selected name is the
        directory that holds ``data_processed.dat``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Scaled feature matrices and the matching label vectors. The last
        column of the file is taken as the label, every other column as a
        feature.
    """
    dataset = data_list[data_id]
    print('data_name:',dataset)

    table = np.loadtxt('%s/data_processed.dat'%dataset)
    features, labels = table[:,:-1], table[:,-1]

    # Class counts before balancing.
    print(np.unique(labels,return_counts=True))

    features, labels = make_data_balance(features,labels)

    # Class counts after balancing.
    print(np.unique(labels,return_counts=True))

    features, labels = shuffle(features,labels,random_state=1)

    X_train,X_test,y_train,y_test = train_test_split(
        features,labels,test_size=0.5,random_state=1)

    # Fit the scaler on the training half only, then apply it to both halves,
    # so no information from the test half enters the scaling.
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    return scaler.transform(X_train),scaler.transform(X_test),y_train,y_test

In [5]:
def measure_performance(X_train,X_test,y_train,y_test):
    """Fit a cross-validated logistic regression and score it on the test split.

    The previous version returned hard-coded zeros for AUC, precision and
    recall; they are now computed from the fitted model so the caller's
    ``roc_auc``/``precision``/``recall`` arrays hold real values.

    Assumes a binary target: precision and recall are computed for a single
    positive class.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    tuple of float
        ``(accuracy, roc_auc, precision, recall)`` on the test split, in the
        order the caller unpacks them.
    """
    model = LogisticRegressionCV(cv=3,random_state=0,solver='liblinear')
    model.fit(X_train,y_train)
    y_test_pred = model.predict(X_test)

    # predict_proba columns follow model.classes_ (sorted), so the last column
    # is the probability of the largest label, which is used as the positive class.
    pos_label = model.classes_[-1]
    p_test_pred = model.predict_proba(X_test)[:,-1]

    fp,tp,_ = roc_curve(y_test,p_test_pred,pos_label=pos_label,
                        drop_intermediate=False)
    roc_auc = auc(fp,tp)

    # Only accuracy is printed, to keep the notebook's printed output unchanged.
    acc = accuracy_score(y_test,y_test_pred)
    print('Accuracy:', acc)

    precision = precision_score(y_test,y_test_pred,pos_label=pos_label)
    recall = recall_score(y_test,y_test_pred,pos_label=pos_label)

    return acc,roc_auc,precision,recall

In [6]:
# Evaluate every dataset in data_list and keep one score per dataset and metric.
n_data = len(data_list)
acc, roc_auc, precision, recall = (np.zeros(n_data) for _ in range(4))

for data_id in range(n_data):
    X_train,X_test,y_train,y_test = read_data(data_id)
    # measure_performance returns (accuracy, roc_auc, precision, recall).
    acc[data_id],roc_auc[data_id],precision[data_id],recall[data_id] = (
        measure_performance(X_train,X_test,y_train,y_test)
    )

data_name: 1paradox
(array([0., 1.]), array([169,  60]))
(array([0., 1.]), array([60, 60]))
Accuracy: 0.8166666666666667
data_name: 2peptide
(array([0., 1.]), array([675,  23]))
(array([0., 1.]), array([23, 23]))
Accuracy: 0.9565217391304348
data_name: 3stigma
(array([0., 1.]), array([2725, 7940]))
(array([0., 1.]), array([2725, 2725]))
Accuracy: 1.0
data_name: 4nki
(array([0., 1.]), array([195,  77]))
(array([0., 1.]), array([77, 77]))
Accuracy: 0.7272727272727273
data_name: 5mental
(array([0., 1.]), array([616, 147]))
(array([0., 1.]), array([147, 147]))
Accuracy: 0.7142857142857143
data_name: 6smoking
(array([0., 1.]), array([852, 722]))
(array([0., 1.]), array([722, 722]))
Accuracy: 1.0
data_name: 7anemia
(array([0., 1.]), array([193,  43]))
(array([0., 1.]), array([43, 43]))
Accuracy: 0.7209302325581395
data_name: 8language
(array([0., 1.]), array([896, 267]))
(array([0., 1.]), array([267, 267]))
Accuracy: 0.6966292134831461
data_name: 9coag
(array([0., 1.]), array([504, 994]))
(a

In [7]:
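# Disabled: uncomment to save the per-dataset scores to disk, one row per metric
# in the order roc_auc, acc, precision, recall.
#np.savetxt('result_LR_CV.dat',(roc_auc,acc,precision,recall),fmt='%f')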

In [8]:
# Average test accuracy over all datasets evaluated above.
acc.mean()

0.8114180452594861