In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

In [2]:
df = pd.read_csv("Diabetes.csv")

This model will only be used as a classifier because the predicted probabilities aren't reliable for Naive Bayes anyway.

In [3]:
# Split data set into testing and training
train, test = train_test_split(df, test_size = 0.1)

In [4]:
target = "Class"

X = train.ix[:,df.columns != target]
Y = train[target]

In [5]:
# Split data into classes
separated = dict() # dict of dataframes--> target_val : dataframe
for val in np.unique(Y):
    separated[val] = df[df[target] == val]

In [6]:
# Calculate class priors
priors = dict()
for val in np.unique(Y):
    priors[val] = float(separated[val].shape[0]) / train.shape[0]

In [7]:
# Calculate class conditional mean and standard deviation
summaries = dict()
for val in np.unique(Y):
    summaries[val] = [separated[val].apply(lambda x: np.mean(x)),separated[val].apply(lambda x: np.std(x))]

In [8]:
# Calculate class conditional standard deviation for each column
std = dict()
for val in np.unique(Y):
    std[val] = X.apply(lambda x: np.std(x))

In [9]:
# Create a function to return the normal density
import math
def calculate_density(x,mean, std):
    exponential = np.exp(-math.pow(x-mean,2)/(2*math.pow(std,2)))
    return math.pow(2*math.pi*math.pow(std,2),-0.5)*exponential    

In [10]:
def calculate_class_probability(summaries, inputRow):
    columns = inputRow.index
    probabilities = {}
    for val in summaries.keys():
        probabilities[val] = priors[val]
        for col in columns:
            mean = summaries[val][0][col]
            std = summaries[val][1][col]
            probabilities[val] *= calculate_density(inputRow[col], mean, std)
    
    return probabilities
    

In [11]:
# Create a function to predict
def pickWinner(probabilities):
    return max(probabilities, key=probabilities.get)

In [15]:
prob = calculate_class_probability(summaries, X.ix[3])

In [16]:
print prob

{0: 2.5944757542381952e-12, 1: 6.4638163177707377e-14}


In [14]:
print pickWinner(prob)

0


In [38]:
q = X

In [39]:
preds = X.apply(lambda x: pickWinner(calculate_class_probability(summaries, x)), axis = 1)

In [40]:
print np.mean(df['Class'])

0.348958333333


In [41]:
print np.mean(preds)

0.379160636758


In [37]:
print preds

253    0
499    1
429    0
0      1
37     1
751    0
685    0
24     1
626    0
606    1
dtype: int64
