## Gaussian Naive Bayes algorithm

In [1]:
import numpy as np
import pandas as pd

In [102]:
# The CSV has no header row, so pandas assigns integer column labels.
dataset = pd.read_csv(
    'pima-indians-diabetes.data.csv',
    header=None,
)

In [103]:
# Peek at the first rows; column 8 is split off as the class label below.
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [104]:
# Number of rows (samples) loaded.
len(dataset)

768

In [105]:
# Split the class label (column 8) off from the feature columns.
# Guarded so the cell can be re-run: once column 8 has been split off,
# running it again leaves `labels` and `dataset` untouched instead of
# raising KeyError. `drop` returns a new frame rather than mutating in place.
LABEL_COLUMN = 8
if LABEL_COLUMN in dataset.columns:
    labels = dataset[LABEL_COLUMN]
    dataset = dataset.drop(LABEL_COLUMN, axis=1)

In [126]:
def calc_stdev(dataset):
    """Return the population standard deviation of ``dataset`` along axis 0.

    Deviations from the axis-0 mean are squared, summed along axis 0,
    divided by ``len(dataset)`` (no Bessel correction) and square-rooted.
    """
    centered = dataset - np.mean(dataset, axis=0)
    sum_of_squares = np.sum(centered ** 2, axis=0)
    variance = sum_of_squares / len(dataset)
    return variance ** 0.5

In [127]:
# Population standard deviation of every remaining column of `dataset`.
calc_stdev(dataset)

0      3.367384
1     31.951796
2     19.343202
3     15.941829
4    115.168949
5      7.879026
6      0.331113
7     11.752573
dtype: float64

In [108]:
def calc_gaussian(x, mean, stdev):
    """Evaluate the normal density with the given ``mean`` and ``stdev`` at ``x``.

    The arithmetic is element-wise, so array arguments broadcast against
    each other. The value returned is a density (likelihood), not a
    probability.
    """
    deviation = x - mean
    exponent = -(deviation ** 2) / (2 * (stdev ** 2))
    normaliser = np.sqrt(2 * np.pi) * stdev
    return np.exp(exponent) / normaliser

In [209]:
# Sanity check: density of ten consecutive integers under their own mean and stdev.
demo_points = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
calc_gaussian(demo_points, np.mean(demo_points), calc_stdev(demo_points))

array([ 0.04070852,  0.06610774,  0.09509936,  0.12118842,  0.13680528,
        0.13680528,  0.12118842,  0.09509936,  0.06610774,  0.04070852])

In [120]:
# Per-feature mean of the rows whose label is 1.
np.mean(np.array(dataset)[np.array(labels) == 1], axis=0)

array([   4.86567164,  141.25746269,   70.82462687,   22.1641791 ,
        100.3358209 ,   35.14253731,    0.5505    ,   37.06716418])

In [134]:
# Per-feature standard deviation (np.std default, i.e. population / ddof=0) of the rows whose label is 1.
np.std(np.array(dataset)[np.array(labels) == 1], axis=0)

array([   3.7342526 ,   31.87997752,   21.45167751,   17.64669604,
        138.43013454,    7.24940427,    0.37165914,   10.94777137])

In [180]:
def bayes_model(dataset, labels):
    """Fit a Gaussian naive Bayes model.

    Parameters
    ----------
    dataset : array-like
        One row per sample, one column per feature.
    labels : sequence
        One class label per row (list, ndarray or pandas Series).

    Returns
    -------
    model : dict
        Maps each distinct label to ``(mean, std, prior)``: the per-feature
        mean and standard deviation (``np.std``) of that class's rows, and
        the fraction of all samples carrying that label.
    p_x : tuple
        ``(mean, std)`` per feature computed over the whole dataset.
    """
    features = np.asarray(dataset)
    # Convert once so the element-wise comparison below also works for plain
    # Python lists: ``[0, 1] == 0`` is a single ``False``, which would make
    # every prior 0.
    label_values = np.asarray(labels)
    p_x = (np.mean(dataset, axis=0), np.std(dataset, axis=0))
    model = {}
    for label in set(labels):
        in_class = label_values == label
        class_rows = features[in_class]
        prior = np.sum(in_class) / len(label_values)
        model[label] = (np.mean(class_rows, axis=0), np.std(class_rows, axis=0), prior)
    return model, p_x

In [184]:
# Fit on the full dataset; `model` maps label -> (per-feature mean, per-feature std, class prior).
model, px_model = bayes_model(dataset, labels)
model

{0: (array([   3.298   ,  109.98    ,   68.184   ,   19.664   ,   68.792   ,
           30.3042  ,    0.429734,   31.19    ]),
  array([  3.01416589,  26.11504547,  18.0450033 ,  14.87504971,
          98.76637452,   7.68216131,   0.29878607,  11.6559813 ]),
  0.65104166666666663),
 1: (array([   4.86567164,  141.25746269,   70.82462687,   22.1641791 ,
          100.3358209 ,   35.14253731,    0.5505    ,   37.06716418]),
  array([   3.7342526 ,   31.87997752,   21.45167751,   17.64669604,
          138.43013454,    7.24940427,    0.37165914,   10.94777137]),
  0.34895833333333331)}

In [185]:
# (mean, std) per feature over the whole dataset, as returned by bayes_model.
px_model

(0      3.845052
 1    120.894531
 2     69.105469
 3     20.536458
 4     79.799479
 5     31.992578
 6      0.471876
 7     33.240885
 dtype: float64, 0      3.367384
 1     31.951796
 2     19.343202
 3     15.941829
 4    115.168949
 5      7.879026
 6      0.331113
 7     11.752573
 dtype: float64)

In [201]:
def bayes_predict(x, model, p_x_model=None):
    """Score one sample against every class of a Gaussian naive Bayes model.

    Parameters
    ----------
    x : array-like
        Feature values of a single sample.
    model : dict
        Maps label -> ``(mean, std, prior)``, as built by ``bayes_model``.
    p_x_model : optional
        Unused. Accepted only so existing three-argument calls keep working.

    Returns
    -------
    dict
        Maps label -> ``p(x | c) * p(c)``. By Bayes' rule
        ``p(c | x) = p(x | c) * p(c) / p(x)``; ``p(x)`` is the same for every
        class given one ``x``, so it is left out because it cannot change
        which label scores highest. The values are therefore unnormalised
        and need not sum to 1.
    """
    scores = {}
    for label, (mean, std, p_c) in model.items():
        # Per-feature Gaussian densities: likelihood values, not exact
        # probabilities. Converted to an ndarray so the product is a plain
        # element-wise product whatever container calc_gaussian returned.
        likelihoods = np.asarray(calc_gaussian(x, mean, std))
        # Naive Bayes: features are treated as independent, so p(x | c) is
        # the product of the per-feature likelihoods.
        scores[label] = np.prod(likelihoods) * p_c
    return scores

In [211]:
# Scores for the first row: p(x|c) * p(c) per label, not divided by p(x), so they need not sum to 1.
bayes_predict(dataset.iloc[0], model, px_model)

{0: 0.30625006209423733, 1: 0.62600361643987723}

In [204]:
# Predict a label for every row by taking the class with the highest score.
# max() always yields one of the row's own labels, so a row whose scores are
# all 0 can neither reuse the previous row's label nor hit an unassigned name.
# Ties resolve to the first label in dict order, as a strict `>` scan would.
pre_label = []
for i in range(len(dataset)):
    scores = bayes_predict(dataset.iloc[i], model, px_model)
    pre_label.append(max(scores, key=scores.get))

In [205]:
# Fraction of correct predictions on the same rows the model was fitted on
# (training accuracy, not a held-out estimate).
np.sum(pre_label == labels)/len(labels)

0.76302083333333337