## Expectation Reflection for Breast Cancer Diagnosis
In this work, we apply our method, Expectation Reflection (ER), to predict Breast Cancer. We compare the performance of ER with other existing methods such as Logistic Regression, Naive Bayes, Dicision Tree, Random Forest.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import expectation_reflection as ER

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
# load data
df = pd.read_csv('../input_data/breast_cancer_data.txt',sep= ',')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


We will drop out the first column `id` and move the target `diagnosis` to the last column, just for convenience.

In [4]:
df = df.drop('id', axis=1) # remove id column
df1 = df.pop('diagnosis') # remove column diagnosis and store it in df1
df['diagnosis'] = df1 # add df1 to df as a 'new' column

In [5]:
# convert B to -1, M to 1:
df.diagnosis = [0 if t == "B" else 1 for t in df.diagnosis]

In [6]:
# select features and target:
ds = np.array(df).astype(float)

# features:
X = ds[:,:-1]
l,n = X.shape
print(l,n)

# target:
y = ds[:,-1]
# convert 1,0 to 1,-1:
y = 2*y - 1

(569, 30)


### Shuffle data

In [7]:
from sklearn.utils import shuffle
X, y = shuffle(X, y)

In [8]:
#from sklearn.preprocessing import StandardScaler
#X = StandardScaler().fit_transform(X)

from sklearn.preprocessing import MinMaxScaler
X = MinMaxScaler().fit_transform(X)

In [9]:
kf = 5

### Prediction with Expectation Reflection

In [10]:
def ER_inference(X,y,kf=5,regu=0.005):    
    #x_train,x_test,y_train,y_test = train_test_split(X1,y,test_size=0.3,random_state = 100)    
    kfold = KFold(n_splits=kf,shuffle=False,random_state=1)
    accuracy = np.zeros(kf)
    
    for i,(train_index,test_index) in enumerate(kfold.split(y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # predict with ER
        h0,w = ER.fit(X_train,y_train,niter_max=100,regu=0.005)
        y_pred = ER.predict(X_test,h0,w)
        accuracy[i] = accuracy_score(y_test,y_pred)
        #print(accuracy[i])
    return accuracy.mean(),accuracy.std()

In [11]:
regu_list = [0.0,0.001,0.002,0.003,0.004,0.005,0.01,0.02,0.1,0.2,0.5,0.6,0.8,1.]
for regu in regu_list:
    accuracy_mean,accuracy_std = ER_inference(X,y,kf,regu)
    print('ER:',accuracy_mean,accuracy_std,regu)

('ER:', 0.9701443875174662, 0.008903135490243306, 0.0)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.001)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.002)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.003)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.004)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.005)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.01)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.02)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.1)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.2)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.5)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.6)
('ER:', 0.9701443875174662, 0.008903135490243306, 0.8)
('ER:', 0.9701443875174662, 0.008903135490243306, 1.0)


### Compare with other existing machine learning algorithms

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def ML_inference(X,y,kf=5,method='naive_bayes'):     
    kfold = KFold(n_splits=kf,shuffle=False,random_state=1)            
    accuracy = np.zeros(kf)
            
    if method == 'logistic_regression':
        model = LogisticRegression(solver='liblinear')

    if method == 'naive_bayes': 
        model = GaussianNB()
        
    if method == 'random_forest':
        model = RandomForestClassifier(criterion = "gini", random_state = 1,
                               max_depth=3, min_samples_leaf=5,n_estimators=100)        
    if method == 'decision_tree':
        model = DecisionTreeClassifier()  
        
    for i,(train_index,test_index) in enumerate(kfold.split(y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit and predict
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy[i] = accuracy_score(y_test,y_pred)
        #print(accuracy[i])
    return accuracy.mean(),accuracy.std()

In [13]:
other_methods=['naive_bayes','logistic_regression','decision_tree','random_forest']

for i,method in enumerate(other_methods):
    accuracy_mean,accuracy_std = ML_inference(X,y,kf,method)
    print('% 20s :'%method,accuracy_mean,accuracy_std)

('         naive_bayes :', 0.9297624592454587, 0.02472309453344037)
(' logistic_regression :', 0.9595714951094549, 0.008977387313188354)
('       decision_tree :', 0.9385188635305077, 0.029830901065864925)
('       random_forest :', 0.9473218444340941, 0.019941925467578978)


## Small sample sizes

In [14]:
l = 100
for regu in regu_list:
    accuracy_mean,accuracy_std = ER_inference(X[:l,:],y[:l],kf,regu)
    print('ER:',accuracy_mean,accuracy_std,regu)

('ER:', 0.97, 0.06000000000000001, 0.0)
('ER:', 0.97, 0.06000000000000001, 0.001)
('ER:', 0.97, 0.06000000000000001, 0.002)
('ER:', 0.97, 0.06000000000000001, 0.003)
('ER:', 0.97, 0.06000000000000001, 0.004)
('ER:', 0.97, 0.06000000000000001, 0.005)
('ER:', 0.97, 0.06000000000000001, 0.01)
('ER:', 0.97, 0.06000000000000001, 0.02)
('ER:', 0.97, 0.06000000000000001, 0.1)
('ER:', 0.97, 0.06000000000000001, 0.2)
('ER:', 0.97, 0.06000000000000001, 0.5)
('ER:', 0.97, 0.06000000000000001, 0.6)
('ER:', 0.97, 0.06000000000000001, 0.8)
('ER:', 0.97, 0.06000000000000001, 1.0)


In [15]:
for i,method in enumerate(other_methods):
    accuracy_mean,accuracy_std = ML_inference(X[:l,:],y[:l],kf,method)
    print('% 20s :'%method,accuracy_mean,accuracy_std)

('         naive_bayes :', 0.93, 0.060000000000000005)
(' logistic_regression :', 0.93, 0.05099019513592784)
('       decision_tree :', 0.85, 0.08944271909999157)
('       random_forest :', 0.93, 0.05099019513592784)


In [16]:
l = 50
for regu in regu_list:
    accuracy_mean,accuracy_std = ER_inference(X[:l,:],y[:l],kf,regu)
    print('ER:',accuracy_mean,accuracy_std,regu)

('ER:', 0.9399999999999998, 0.07999999999999999, 0.0)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.001)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.002)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.003)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.004)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.005)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.01)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.02)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.1)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.2)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.5)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.6)
('ER:', 0.9399999999999998, 0.07999999999999999, 0.8)
('ER:', 0.9399999999999998, 0.07999999999999999, 1.0)


In [17]:
for i,method in enumerate(other_methods):
    accuracy_mean,accuracy_std = ML_inference(X[:l,:],y[:l],kf,method)
    print('% 20s :'%method,accuracy_mean,accuracy_std)

('         naive_bayes :', 0.9399999999999998, 0.07999999999999999)
(' logistic_regression :', 0.9, 0.06324555320336757)
('       decision_tree :', 0.9199999999999999, 0.039999999999999994)
('       random_forest :', 0.9399999999999998, 0.07999999999999999)
