# Part 1A

In [149]:
import pandas as pd
import numpy as np
import random
import math
from functools import reduce
from numpy import genfromtxt

In [150]:
def getData():
    """Read the Pima Indians diabetes CSV and return its parsed rows.

    The file is parsed as comma-separated numeric data by
    ``np.genfromtxt``.

    Returns:
        A 2-D array holding every parsed row except the first, with all
        columns kept.
    """
    csv_path = './data/pima-indians-diabetes.csv'
    parsed = np.genfromtxt(csv_path, delimiter=',')
    # Row 0 is discarded (presumably a header line -- confirm against the file).
    return parsed[1:, :]

In [151]:
def split(dataset, training_percent = .8):
    """Shuffle ``dataset`` in place and split its rows into two parts.

    Args:
        dataset: 2-D NumPy array of rows. NOTE: the rows are shuffled in
            place, so the caller's array is reordered by this call.
        training_percent: Fraction of rows (between 0 and 1, despite the
            name) that goes into the first returned part.

    Returns:
        A ``(training_rows, test_rows)`` tuple. Both are slices of the
        shuffled ``dataset``, not independent copies.

    Raises:
        ValueError: If ``training_percent`` is outside ``[0, 1]``.
    """
    if not 0 <= training_percent <= 1:
        raise ValueError(
            f'training_percent must be a fraction in [0, 1], got {training_percent}'
        )
    np.random.shuffle(dataset)
    num_of_items = len(dataset)
    # Use the requested fraction; it used to be ignored in favour of a
    # hard-coded .8 regardless of the argument.
    training_split = int(training_percent * num_of_items)
    return dataset[:training_split, :], dataset[training_split:, :]

In [152]:
def aggregateByClass(data):
    """Group rows by the value stored in their last position.

    Args:
        data: Iterable of indexable rows; ``row[-1]`` is used as the
            grouping key.

    Returns:
        A dict mapping each distinct last-position value to the list of
        rows carrying it, in the order the rows were encountered.
    """
    grouped = {}
    for row in data:
        # setdefault creates the bucket the first time a key is seen.
        grouped.setdefault(row[-1], []).append(row)
    return grouped

In [153]:
def calculateMeanAndStDv(data):
    """Compute ``(mean, standard deviation)`` per column, excluding the last.

    Args:
        data: Sequence of equally long rows.

    Returns:
        A list with one ``(np.mean, np.std)`` tuple for every column except
        the final one.

    Raises:
        IndexError: If ``data`` yields no columns (e.g. it is empty).
    """
    # zip(*data) transposes the rows so each item is one column.
    summaries = [(np.mean(column), np.std(column)) for column in zip(*data)]
    # The summary of the last column is discarded.
    summaries.pop()
    return summaries

In [154]:
def calculateForClass(dataByClasses):
    """Summarise the rows of every class.

    Args:
        dataByClasses: Mapping of class value to the rows of that class.

    Returns:
        A dict mapping each class value to the result of
        ``calculateMeanAndStDv`` for its rows.
    """
    return {
        label: calculateMeanAndStDv(rows)
        for label, rows in dataByClasses.items()
    }

In [155]:
def calculateProbability(x, mean, std):
    """Evaluate the Gaussian density with the given mean and std at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        ``1 / (sqrt(2*pi) * std) * exp(-(x - mean)**2 / (2 * std**2))``.
    """
    squared_deviation = np.power(x - mean, 2)
    doubled_variance = 2 * np.power(std, 2)
    exponent = np.exp(-(squared_deviation / doubled_variance))
    normaliser = 1 / (np.sqrt(2 * np.pi) * std)
    return normaliser * exponent

In [156]:
def calculateProbabilitiesForClass(dataByClasses, vector):
    """Multiply the per-feature densities of ``vector`` for every class.

    Args:
        dataByClasses: Mapping of class value to a sequence of
            ``(mean, std)`` pairs, one pair per feature.
        vector: Indexable input; ``vector[i]`` is scored against the i-th
            ``(mean, std)`` pair.

    Returns:
        A dict mapping each class value to the product of
        ``calculateProbability`` over all of its ``(mean, std)`` pairs
        (``1`` when a class has no pairs).
    """
    likelihoods = {}
    for label, summaries in dataByClasses.items():
        likelihood = 1
        for position, (mean, std) in enumerate(summaries):
            likelihood *= calculateProbability(vector[position], mean, std)
        likelihoods[label] = likelihood
    return likelihoods

In [157]:
def makePrediction(classSummaries, vector):
    """Return the class label whose computed probability is largest.

    Args:
        classSummaries: Mapping of class value to per-feature
            ``(mean, std)`` pairs.
        vector: Input passed on to ``calculateProbabilitiesForClass``.

    Returns:
        The label with the highest probability; on ties the label seen
        first wins. ``None`` when there are no classes.
    """
    scores = calculateProbabilitiesForClass(classSummaries, vector)
    # max keeps the first maximal key, i.e. a later label only replaces the
    # current best when its score is strictly greater.
    return max(scores, key=scores.get, default=None)

In [158]:
def getPredictionsForClass(classSummaries, testDataset):
    """Predict a label for every row of ``testDataset``.

    Args:
        classSummaries: Mapping of class value to per-feature
            ``(mean, std)`` pairs.
        testDataset: Iterable of rows to classify.

    Returns:
        A list of ``makePrediction`` results, one per row, in row order.
    """
    return [makePrediction(classSummaries, row) for row in testDataset]

In [159]:
def getAccuracy(predictions, testData):
    """Return the percentage of rows whose prediction matches their label.

    Args:
        predictions: Sequence of predicted labels, indexable by row number.
        testData: Sequence of rows; the last element of each row is the
            true label.

    Returns:
        Accuracy as a percentage in ``[0, 100]``. An empty ``testData``
        yields ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``predictions`` has fewer entries than ``testData``.
    """
    rows_in_test_set = len(testData)
    # Nothing to score: avoid dividing by zero below.
    if rows_in_test_set == 0:
        return 0.0
    accurate = sum(
        1
        for index in range(rows_in_test_set)
        if predictions[index] == testData[index][-1]
    )
    return (accurate / rows_in_test_set) * 100

In [160]:
def train(data, splits=10):
    """Run repeated random split / fit / evaluate rounds and average accuracy.

    Each round splits ``data`` into training and test rows via ``split``,
    builds per-class summaries from the training rows, predicts the test
    rows and measures the accuracy of those predictions.

    Args:
        data: 2-D NumPy array of rows whose last column is the class label.
            NOTE: ``split`` shuffles this array in place on every round.
        splits: Number of rounds to run (defaults to 10, the previously
            hard-coded value).

    Returns:
        The mean accuracy percentage over all rounds.

    Raises:
        ValueError: If ``splits`` is smaller than 1.
    """
    if splits < 1:
        raise ValueError(f'splits must be at least 1, got {splits}')
    accuracyAgg = 0
    for step in range(splits):
        print(f'Training step {step}')
        trainingData, testData = split(data)
        groupedByClass = aggregateByClass(trainingData)
        classSummary = calculateForClass(groupedByClass)
        predictions = getPredictionsForClass(classSummary, testData)
        accuracyAgg += getAccuracy(predictions, testData)
    return accuracyAgg / splits

In [161]:
# Load the dataset once; it is passed to train() in the next cell.
data = getData()

In [162]:
# Run the repeated split/evaluate rounds; the returned mean accuracy
# percentage is shown as the cell result.
train(data)

Training step 0
Training step 1
Training step 2
Training step 3
Training step 4
Training step 5
Training step 6
Training step 7
Training step 8
Training step 9


75.0

# Part 1B