## Expectation Reflection for Breast Cancer Diagnosis
In this work, we apply our method, Expectation Reflection (ER), to predict Breast Cancer. We compare the performance of ER with other existing methods such as Logistic Regression, Naive Bayes, Dicision Tree, Random Forest, k-nearest neighbors, and Support Vector Machines (SVM).

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from inference import fit

import matplotlib.pyplot as plt
%matplotlib inline

ImportError: No module named inference

In [None]:
# Seed NumPy's global random number generator so repeated runs of the notebook start from the same state.
np.random.seed(1)

In [None]:
# Load the comma-separated breast cancer table; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv('../input_data/breast_cancer_data.txt',sep= ',')
# Preview the first rows to check that the columns were parsed.
df.head()

We will drop out the first column `id` and move the target `diagnosis` to the last column, just for convenience.

In [None]:
# Drop the identifier column and move the target to the end of the frame, so
# that "all columns but the last" are the features.
df = df.drop('id', axis=1) # remove id column
df1 = df.pop('diagnosis') # remove column diagnosis and store it in df1
df['diagnosis'] = df1 # add df1 to df as a 'new' column

In [None]:
# Encode the diagnosis as 0/1: "B" becomes 0 and every other value (i.e. "M")
# becomes 1. The 0/1 labels are remapped to -1/+1 later, when the target
# array y is extracted.
df.diagnosis = [0 if t == "B" else 1 for t in df.diagnosis]

In [None]:
# select features and target:
# Cast the whole table to a float array; `diagnosis` was re-added last, so it
# is the final column.
ds = np.array(df).astype(float)

# features: every column except the last one
X = ds[:,:-1]
l,n = X.shape  # l = number of rows (samples), n = number of feature columns
print(l,n)

# target: the last column (0/1 encoded diagnosis)
y = ds[:,-1]
# convert 1,0 to 1,-1 (the ER prediction below is compared against np.sign output):
y = 2*y - 1

### Shuffle data

In [None]:
from sklearn.utils import shuffle
# Shuffle X and y in a single call so each feature row keeps its label.
# NOTE(review): no random_state is passed; presumably the permutation then
# depends on the global NumPy seed set earlier in the notebook -- confirm.
X, y = shuffle(X, y)

In [None]:
# Alternative (standardization), currently disabled:
#from sklearn.preprocessing import StandardScaler
#X = StandardScaler().fit_transform(X)

# Rescale the features with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split, so held-out rows influence the scaling -- confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler
X = MinMaxScaler().fit_transform(X)

In [None]:
# Number of cross-validation folds passed to the evaluation helpers below.
kf = 5

### Prediction with Expectation Reflection

In [None]:
def expectation_reflection_inference(X,y,kf=5,regu=0.005):
    """Estimate the accuracy of Expectation Reflection by k-fold cross-validation.

    Parameters
    ----------
    X : ndarray
        Feature matrix, indexed by sample along the first axis.
    y : ndarray
        Target labels, one per row of ``X``. Predictions are produced with
        ``np.sign``, so labels are expected to be -1/+1.
    kf : int
        Number of cross-validation folds.
    regu : float
        Regularization strength forwarded to ``fit``.

    Returns
    -------
    tuple of float
        Mean and standard deviation of the per-fold accuracy.
    """
    # Consecutive, unshuffled folds. `random_state` is only meaningful when
    # shuffle=True; recent scikit-learn releases raise ValueError if it is
    # given together with shuffle=False, so it is not passed here.
    kfold = KFold(n_splits=kf,shuffle=False)
    accuracy = np.zeros(kf)

    for i,(train_index,test_index) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit ER on the training fold; forward the caller's regularization
        # value instead of a hard-coded constant so that scans over `regu`
        # actually evaluate different models.
        h0,w = fit(X_train,y_train,niter_max=100,regu=regu)

        # Linear score (bias + weighted features); its sign is the predicted label.
        h_pred = h0 + X_test.dot(w)
        y_pred = np.sign(h_pred)
        accuracy[i] = accuracy_score(y_test,y_pred)

    return accuracy.mean(),accuracy.std()

In [None]:
# Regularization strengths to scan.
regu_list = [0.0,0.001,0.002,0.003,0.004,0.005,0.01,0.02,0.1,0.2,0.5,0.6,0.8,1.]
# Print the cross-validated ER accuracy (mean, std) next to each regularization value.
for regu in regu_list:
    accuracy_mean,accuracy_std = expectation_reflection_inference(X,y,kf,regu)
    print('ER:',accuracy_mean,accuracy_std,regu)

### Compare with other existing machine learning algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def inference(X,y,kf=5,method='naive_bayes'):
    """Estimate the accuracy of a baseline classifier by k-fold cross-validation.

    Parameters
    ----------
    X : ndarray
        Feature matrix, indexed by sample along the first axis.
    y : ndarray
        Target labels, one per row of ``X``.
    kf : int
        Number of cross-validation folds.
    method : str
        Which classifier to evaluate: 'logistic_regression', 'naive_bayes',
        'random_forest', 'decision_tree', 'knn' or 'svm'.

    Returns
    -------
    tuple of float
        Mean and standard deviation of the per-fold accuracy.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Map each method name to a factory so only the requested model is built.
    model_factories = {
        'logistic_regression': lambda: LogisticRegression(solver='liblinear'),
        'naive_bayes': lambda: GaussianNB(),
        'random_forest': lambda: RandomForestClassifier(
            criterion = "gini", random_state = 1,
            max_depth=3, min_samples_leaf=5,n_estimators=100),
        'decision_tree': lambda: DecisionTreeClassifier(),
        'knn': lambda: KNeighborsClassifier(),
        'svm': lambda: SVC(gamma='scale'),
    }
    # Fail early with a clear message; previously an unknown name left `model`
    # unbound and surfaced later as an obscure error inside the fold loop.
    if method not in model_factories:
        raise ValueError('unknown method %r; expected one of %s'
                         % (method, sorted(model_factories)))
    model = model_factories[method]()

    # Consecutive, unshuffled folds. `random_state` is only meaningful when
    # shuffle=True; recent scikit-learn releases raise ValueError if it is
    # given together with shuffle=False, so it is not passed here.
    kfold = KFold(n_splits=kf,shuffle=False)
    accuracy = np.zeros(kf)

    for i,(train_index,test_index) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit on the training fold, score on the held-out fold
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy[i] = accuracy_score(y_test,y_pred)

    return accuracy.mean(),accuracy.std()

In [None]:
# Names of the baseline classifiers to evaluate; each one is passed as the
# `method` argument of inference().
other_methods=['naive_bayes','logistic_regression','decision_tree','random_forest','knn','svm']

# Print the mean and standard deviation of the k-fold accuracy for every baseline.
# (The enumerate index i is not used in the loop body.)
for i,method in enumerate(other_methods):
    accuracy_mean,accuracy_std = inference(X,y,kf,method)
    print('% 20s :'%method,accuracy_mean,accuracy_std)

## Small sample sizes

In [None]:
# Repeat the ER regularization scan on the first 100 rows only.
# Note: this rebinds the global `l`, which earlier held the total row count.
l = 100
for regu in regu_list:
    accuracy_mean,accuracy_std = expectation_reflection_inference(X[:l,:],y[:l],kf,regu)
    print('ER:',accuracy_mean,accuracy_std,regu)

In [None]:
# Evaluate the baseline classifiers on the same first-l-rows subset
# (l is whatever the preceding cell set it to).
for i,method in enumerate(other_methods):
    accuracy_mean,accuracy_std = inference(X[:l,:],y[:l],kf,method)
    print('% 20s :'%method,accuracy_mean,accuracy_std)

In [None]:
# Repeat the ER regularization scan on the first 50 rows only
# (rebinds the global `l` again).
l = 50
for regu in regu_list:
    accuracy_mean,accuracy_std = expectation_reflection_inference(X[:l,:],y[:l],kf,regu)
    print('ER:',accuracy_mean,accuracy_std,regu)

In [None]:
# Evaluate the baseline classifiers on the same first-l-rows subset
# (l is whatever the preceding cell set it to).
for i,method in enumerate(other_methods):
    accuracy_mean,accuracy_std = inference(X[:l,:],y[:l],kf,method)
    print('% 20s :'%method,accuracy_mean,accuracy_std)