## Expectation Reflection for Diabetes Diagnosis
In this work, we apply our method, Expectation Reflection (ER), to predict diabetes from Pima Indians Diabetes dataset. We compare the performance of ER with other existing methods such as Logistic Regression, Naive Bayes, Dicision Tree, Random Forest, k-nearest neighbors, and Support Vector Machines (SVM).

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from inference import fit

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
# load data
s = pd.read_csv('diabetes_data.csv',sep= ',', header= None)
s.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


The data contains 8 features:<br/>
1) Pregnancies: Number of times pregnant<br/>
2) Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test (GTT)<br/>
3) BloodPressure: Diastolic blood pressure (mmHg)<br/>
4) SkinThickness: Triceps skin fold thickness (mm)<br/>
5) Insulin: 2-Hour serum insulin (mu U/ml)<br/>
6) BMI: Body Mass Index (weight(kg)/(height(m))^2)<br/>
7) DiabetesPedigreeFunction: Diabetes Pedigree Function<br/>
8) Age: Age (years)<br/>

and 1 target: 1 (positive), 0 (negative)

### Impute missing data

In [4]:
# impute missing data
Xy = np.loadtxt('diabetes_data_imputed_knn3.txt').astype(float)
X = Xy[:,:8]
y = Xy[:,8]

### Shuffle data

In [5]:
from sklearn.utils import shuffle
X, y = shuffle(X, y)

In [6]:
# Normalize data
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

# convert 1,0 to 1,-1:
y = 2*y - 1

In [7]:
kf = 5

### Prediction with Expectation Reflection

In [8]:
def expectation_reflection_inference(X,y,kf=5,regu=0.005):    
    #x_train,x_test,y_train,y_test = train_test_split(X1,y,test_size=0.3,random_state = 100)    
    kfold = KFold(n_splits=kf,shuffle=False,random_state=1)
    accuracy = np.zeros(kf)
    
    for i,(train_index,test_index) in enumerate(kfold.split(y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # predict with ER
        h0,w = fit(X_train,y_train,niter_max=100,regu=0.005)
        h_pred = h0 + X_test.dot(w)
        y_pred = np.sign(h_pred)
        accuracy[i] = accuracy_score(y_test,y_pred)
        #print(accuracy[i])
    return accuracy.mean(),accuracy.std()

In [9]:
regu_list = [0.0,0.001,0.002,0.003,0.004,0.005,0.01,0.02,0.1,0.2]
#regu = 0.003
for regu in regu_list:
    accuracy_mean,accuracy_std = expectation_reflection_inference(X,y,kf,regu)
    print('ER:',accuracy_mean,accuracy_std,regu)

('ER:', 0.7708683473389355, 0.01771045977835708, 0.0)
('ER:', 0.7695696460402341, 0.01868986381562802, 0.001)
('ER:', 0.7708683473389355, 0.019085574066772446, 0.002)
('ER:', 0.7695696460402341, 0.01868986381562802, 0.003)
('ER:', 0.7708683473389355, 0.01771045977835708, 0.004)
('ER:', 0.7682624564977505, 0.016319835768358715, 0.005)
('ER:', 0.7708683473389355, 0.01771045977835708, 0.01)
('ER:', 0.7695696460402341, 0.01868986381562802, 0.02)
('ER:', 0.7682624564977505, 0.016319835768358715, 0.1)
('ER:', 0.7721670486376369, 0.01803428023087815, 0.2)


### Compare with other existing machine learning algorithms

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def inference(X,y,kf=5,method='naive_bayes'):     
    kfold = KFold(n_splits=kf,shuffle=False,random_state=1)            
    accuracy = np.zeros(kf)
            
    if method == 'logistic_regression':
        model = LogisticRegression(solver='liblinear')

    if method == 'naive_bayes': 
        model = GaussianNB()
        
    if method == 'random_forest':
        model = RandomForestClassifier(criterion = "gini", random_state = 1,
                               max_depth=3, min_samples_leaf=5,n_estimators=100)        
    if method == 'decision_tree':
        model = DecisionTreeClassifier()
        
    if method == 'knn':    
        model = KNeighborsClassifier()
        
    if method == 'svm':    
        model = SVC(gamma='scale')     
        
    for i,(train_index,test_index) in enumerate(kfold.split(y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit and predict
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy[i] = accuracy_score(y_test,y_pred)
        #print(accuracy[i])
    return accuracy.mean(),accuracy.std()

In [11]:
other_methods=['naive_bayes','logistic_regression','decision_tree','random_forest','knn','svm']

for i,method in enumerate(other_methods):
    accuracy_mean,accuracy_std = inference(X,y,kf,method)
    print('% 20s :'%method,accuracy_mean,accuracy_std)

('         naive_bayes :', 0.7448179271708684, 0.024856551980908588)
(' logistic_regression :', 0.7682539682539682, 0.014790934676447327)
('       decision_tree :', 0.7175027586792293, 0.030419352769936476)
('       random_forest :', 0.7604362957304134, 0.022082123375170664)
('                 knn :', 0.7434682964094728, 0.04019233714738783)
('                 svm :', 0.7604193192428487, 0.037963787747415705)
