In [1]:
import numpy as np
import matplotlib.pyplot as plt
from math import log, pi, sqrt, exp

import warnings
warnings.filterwarnings('ignore')

In [2]:
def NormZscore(data):
    mean = np.mean(data, axis = 0)
    data_int = data - mean
    sdt = np.std(data, axis = 0)
    data_norm = data_int / sdt

    return data_norm, mean, sdt
def DesnZscore(data,m,s):
    return data * s + m
def kfolds(dataset, k):
    shuf = np.random.permutation(dataset)
    n = shuf.shape[0]
    k = 10
    fold_size = n // k

    folds = []

    for i in range(k-1):
        #print(i*fold_size, (i+1)*fold_size)
        folds.append(shuf[i*fold_size:(i+1)*fold_size,:])

    folds.append(shuf[(k-1)*fold_size:,:])

    return folds
def Sigm(x, w):
    return 1 / (1 + np.exp(-x @ w))

In [3]:
def fitLR(x, y, alpha=1, epoch=5):
    onecolumn = np.ones(x.shape[0])
    X = np.c_[onecolumn, x]
    w_pred = np.zeros((X.shape[1], 1))

    err_list = []
    
    for e in range (0, epoch):
        error = y - Sigm(X, w_pred)

        for i in range(X.shape[1]):
            w_pred[i] += alpha * (np.mean(error * X[:,i]))
        
        err = np.mean(- y * np.log(Sigm(X, w_pred)) - (1 - y) * np.log(1 - Sigm(X, w_pred)))

        err_list.append(err)
    
    return w_pred, err_list
def predLR(X, w_pred):
    onecolumn = np.ones(X.shape[0])
    X = np.c_[onecolumn, X]
    
    Y = []
    
    for j in range(0, X.shape[0]):
        y_pred = Sigm(X[i], w_pred)

        if y_pred >= 0.5:
            Y.append(1)

        else:
            Y.append(0)
    
    return Y

In [None]:
#NaiveBayes
def fitNB(x, y):
    classes_nb, counts_nb = np.unique(y, return_counts=True)

    nClasses_nb = len(classes_nb)

    n_nb = y.shape[0]

    nFeatures_nb = x.shape[1]

    probClasses_nb = {classes_nb[i]: counts_nb[i] for i in range(len(counts_nb))}

    for k_nb in probClasses_nb:
            probClasses_nb[k_nb] = round(probClasses_nb[k_nb] / n_nb, 6)

    mean_nb = np.zeros((nFeatures_nb, nClasses_nb))
    covar_nb = np.zeros((nFeatures_nb, nClasses_nb))
    x_c_nb = []

    for c_nb in classes_nb:

        x_i_nb = []

        for i_nb in range(0, x.shape[0]):
            if c_nb == y[i_nb]:
                x_i_nb.append(x[i_nb])

        x_c_nb.append(np.array(x_i_nb))

    x_c_nb = np.array(x_c_nb, dtype=object)

    for p_nb in range(classes_nb.shape[0]):
        mean_nb[:, p_nb] = np.mean(x_c_nb[p_nb], axis=0)
        x_mean_nb = x_c_nb[p_nb] - mean_nb[:, p_nb]

        covar_nb[:,p_nb] = np.sum((x_mean_nb)**2, axis=0) / x_c_nb[p_nb].shape[0]
    
    return nClasses_nb, covar_nb, mean_nb, probClasses_nb, classes_nb
def predNB(x, nClasses_nb, covar_nb, mean_nb, probClasses_nb, classes_nb):
    prob_x_nb = np.zeros((x.shape[0], nClasses_nb))

    for ix_nb in range(x.shape[0]):
        for ic_nb in range(nClasses_nb):
            fat1_nb = - 0.5 * np.sum(np.log(2*pi*covar_nb[:, ic_nb]))
            fat2_nb = - 0.5 * np.sum((x[ix_nb, :] - mean_nb[:, ic_nb])**2 / covar_nb[:,ic_nb])
            fat3_nb = np.log(probClasses_nb[ic_nb])

            prob_x_nb[ix_nb, ic_nb] = fat1_nb + fat2_nb + fat3_nb

    y_pred_nb = []

    for i_nb in range(x.shape[0]):
        y_pred_nb.append(classes_nb[np.argmax(prob_x_nb[i_nb, :])])

    return y_pred_nb

In [4]:
k = 10

bc = np.genfromtxt('./breastcancer.csv', delimiter=',')
bcsize = bc.shape[0]

x_norm, x_mean, x_sdt = NormZscore(bc[:,0:30])
x_des = DesnZscore(x_norm, x_mean, x_sdt)
y = bc[:,[30]]

bc_norm = np.c_[x_norm, y]

folds = kfolds(bc_norm, k)

In [31]:
#ADG
def fitADG(x, y):
    classes, counts = np.unique(y, return_counts=True)

    nClasses = len(classes)

    n = y.shape[0]

    nFeatures = x.shape[1]

    probClasses = {classes[i]: counts[i] for i in range(len(counts))}

    for k in probClasses:
            probClasses[k] = round(probClasses[k] / n, 6)

    mean = np.zeros((nFeatures, nClasses))
    covar = np.zeros((nFeatures, nFeatures, nClasses))
    x_c = []

    for c in classes:

        x_i = []

        for i in range(0, x.shape[0]):
            if c == y[i]:
                x_i.append(x[i])

        x_c.append(np.array(x_i))

    x_c = np.array(x_c, dtype=object)

    for p in range(classes.shape[0]):
        mean[:, p] = np.mean(x_c[p], axis=0)
        x_mean = x_c[p] - mean[:, p]

        covar[:,:,p] = (np.transpose(x_mean) @ x_mean) / counts[p]

    det = np.zeros(nClasses)    
    inv = np.zeros((nFeatures, nFeatures, nClasses))

    for cl in range (nClasses):
        det[cl] = np.linalg.det(covar[:, :, cl])

        inv[:, :, cl] = (np.linalg.inv(covar[:, :, cl]))

    return mean, inv, det, nClasses, probClasses, nFeatures, classes
def predADG(x, mean, inv, det, nClasses, probClasses, nFeatures, classes):
    prob_x = np.zeros((x.shape[0], nClasses))
    
    for ix in range(x.shape[0]):
        for ic in range(nClasses):
            fat1 = (1/ (sqrt(det[ic]) * ((2*pi)**(nFeatures/2))))
            fat2 = np.exp(-(0.5) * np.transpose(x[ix] - mean[:, ic]) @ inv[:, :, ic] @ (x[ix] - mean[:, ic]))
            fat3 = np.log(probClasses[ic])
            prob_x[ix, ic] = fat1 * fat2 * fat3

    y_pred = []

    for i in range(x.shape[0]):
        y_pred.append(classes[np.argmax(prob_x[i, :])])
    
    return y_pred

In [32]:
traintest = []
modelsLR = []
modelsADG = []
modelsNB = []
predsLR = []
predsADG = []
predsNB = []

for i in range(len(folds)):
    test = folds[i]
    train = []
    for f in range(len(folds)):
        if i != f:
            train.extend(folds[f])
    
    #traintest.append((train, test))

    x_train = np.array(train)[:,0:30]
    y_train = np.array(train)[:,30]
    
    x_test = np.array(test)[:,0:30]
    y_test = np.array(test)[:,30]
    
    traintest.append((x_train, y_train, x_test, y_test))
    
    modelsLR.append(fitLR(x_train, y_train))
    modelsADG.append(fitADG(x_train, y_train))
    modelsNB.append(fitNB(x_train, y_train))
    
for i in range(len(folds)):
    x_train, y_train, x_test, y_test  = traintest[i]

    predsLR.append(predLR(x_test, modelsLR[i][0]))
    predsADG.append(predADG(x_test, modelsADG[i][0], modelsADG[i][1], modelsADG[i][2], modelsADG[i][3], modelsADG[i][4], modelsADG[i][5], modelsADG[i][6]))
    predsNB.append(predNB(x_test, modelsNB[i][0], modelsNB[i][1], modelsNB[i][2], modelsNB[i][3], modelsNB[i][4]))

In [33]:
predsADG

[[1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.

In [8]:
def acc(y, y_pred):
    correct = 0
    for i in range(len(y)):
        if y[i] == y_pred[i]:
            TP += 1
    return TP / float(len(y)) * 100.0