In [1]:
import numpy as np
from random import randrange
import math

In [10]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by their class label.

    The class label of a row is its last element (``row[-1]``).

    Parameters
    ----------
    dataset : iterable of sequences
        Rows to be grouped.

    Returns
    -------
    dict
        Maps each distinct label to the list of rows carrying that label,
        with rows kept in the order in which they were encountered.
    """
    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

def summarize(dataset):
    """Compute ``(mean, standard deviation)`` for every feature column.

    Every row is expected to hold the feature values followed by the class
    label in the last position. The label column is dropped *before* any
    statistics are computed, so labels do not have to be numeric and an
    empty dataset yields an empty list instead of raising.

    Parameters
    ----------
    dataset : iterable of sequences
        Rows of equal length; the last element of each row is the label.

    Returns
    -------
    list of tuple
        One ``(np.mean(column), np.std(column))`` pair per feature column,
        in column order. ``np.std`` is called with its defaults.
    """
    # zip(*dataset) transposes rows into columns; the last column is the label.
    feature_columns = list(zip(*dataset))[:-1]
    return [(np.mean(column), np.std(column)) for column in feature_columns]

def summarizeByClass(dataset):
    """Build per-class feature statistics for ``dataset``.

    The rows are first grouped with ``separateByClass`` and each group is
    then reduced with ``summarize``.

    Parameters
    ----------
    dataset : iterable of sequences
        Rows whose last element is the class label.

    Returns
    -------
    dict
        Maps each class label to the value ``summarize`` returns for the
        rows of that class.
    """
    separated = separateByClass(dataset)
    summaries = {}
    # .items() exists on both Python 2 and Python 3 dicts; .iteritems() was
    # removed in Python 3.
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Parameters
    ----------
    x : number
        Point at which the density is evaluated.
    mean : number
        Mean of the distribution.
    stdev : number
        Standard deviation of the distribution.

    Returns
    -------
    float
        ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Raises
    ------
    ZeroDivisionError
        If ``stdev`` is zero.
    """
    squared_deviation = math.pow(x - mean, 2)
    variance = math.pow(stdev, 2)
    exponent = math.exp(-(squared_deviation / (2 * variance)))
    normalizer = math.sqrt(2 * math.pi) * stdev
    return (1 / normalizer) * exponent

def calculateClassProbabilities(summaries, inputVector):
    """Compute an unnormalized likelihood of ``inputVector`` for each class.

    For every class the Gaussian densities of the individual features
    (via ``calculateProbability``) are multiplied together.

    Parameters
    ----------
    summaries : dict
        Maps class label -> sequence of ``(mean, stdev)`` pairs, one pair
        per feature.
    inputVector : sequence
        Feature values, indexed in the same order as the pairs in
        ``summaries``. Extra trailing elements are ignored.

    Returns
    -------
    dict
        Maps class label -> product of the per-feature densities.
    """
    probabilities = {}
    # .items() works on both Python 2 and Python 3; .iteritems() was removed
    # in Python 3.
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        for i, (mean, stdev) in enumerate(classSummaries):
            likelihood *= calculateProbability(inputVector[i], mean, stdev)
        probabilities[classValue] = likelihood
    return probabilities


def predict(summaries, inputVector):
    """Return the class label with the highest likelihood for ``inputVector``.

    Parameters
    ----------
    summaries : dict
        Per-class feature statistics, passed through to
        ``calculateClassProbabilities``.
    inputVector : sequence
        Feature values of the sample to classify.

    Returns
    -------
    object or None
        The label whose likelihood is largest; on ties the first label
        encountered wins. ``None`` if ``summaries`` has no classes.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    # .items() works on both Python 2 and Python 3; .iteritems() was removed
    # in Python 3.
    for classValue, probability in probabilities.items():
        # The first class is always accepted; later ones must be strictly better.
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    """Classify every row of ``testSet``.

    Parameters
    ----------
    summaries : dict
        Per-class feature statistics, forwarded to ``predict``.
    testSet : sequence
        Rows to classify; accessed by integer position.

    Returns
    -------
    list
        The result of ``predict`` for each row, in row order.
    """
    return [predict(summaries, testSet[position])
            for position in range(len(testSet))]

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label matches its prediction.

    Parameters
    ----------
    testSet : sequence
        Rows whose last element (``row[-1]``) is the true class label.
    predictions : sequence
        Predicted label for each row, aligned by position with ``testSet``.

    Returns
    -------
    float
        Accuracy in the range 0.0-100.0. An empty ``testSet`` yields 0.0
        instead of raising ``ZeroDivisionError``.
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score: avoid dividing by zero.
        return 0.0
    correct = sum(1 for idx in range(total)
                  if testSet[idx][-1] == predictions[idx])
    return (correct / float(total)) * 100.0

def splitDataset(data, trainNumberForClass, testNumberForclass, trainNumber, testNumber):
    """Split class-ordered data into train and test arrays.

    ``data`` is treated as consecutive blocks of
    ``trainNumberForClass + testNumberForclass`` rows. Within each block the
    first ``testNumberForclass`` rows go to the test set and the following
    ``trainNumberForClass`` rows go to the training set.

    Parameters
    ----------
    data : array-like, 2-D
        Source rows. The number of columns is taken from ``data`` itself
        rather than being fixed at 5.
    trainNumberForClass : int
        Training rows taken from each block.
    testNumberForclass : int
        Test rows taken from each block.
    trainNumber : int
        Total number of rows allocated for the training array.
    testNumber : int
        Total number of rows allocated for the test array.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train, test)`` float arrays of shape ``(trainNumber, n_columns)``
        and ``(testNumber, n_columns)``.
    """
    data = np.asarray(data)
    columnCount = data.shape[1]

    train = np.zeros(shape=(trainNumber, columnCount))
    test = np.zeros(shape=(testNumber, columnCount))

    rowsPerClass = trainNumberForClass + testNumberForclass
    # The number of blocks is derived from the requested output sizes.
    classCount = (trainNumber + testNumber) // rowsPerClass

    for k in range(classCount):
        blockStart = k * rowsPerClass
        testEnd = blockStart + testNumberForclass
        trainEnd = testEnd + trainNumberForClass

        test[k * testNumberForclass:(k + 1) * testNumberForclass] = data[blockStart:testEnd]
        train[k * trainNumberForClass:(k + 1) * trainNumberForClass] = data[testEnd:trainEnd]

    return train, test

In [5]:
# Load the comma-separated dataset into a NumPy array.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact file exists; consider a path relative to the notebook.
data = np.loadtxt('C:\\Users\\darle\\Desktop\\ex4data1.data', delimiter=',')
# Rows taken from each class block for training and for testing.
trainNumberForClass = 30
# NOTE(review): the name is spelled "testNumer..."; it is only used in the call below.
testNumerForClass = 20
# Total sizes of the test and training arrays
# (presumably 3 classes x 20 and 3 classes x 30 rows; confirm against the data file).
testNumber = 60
trainNumber = 90
train, test = splitDataset(data, trainNumberForClass, testNumerForClass, trainNumber, testNumber)

In [16]:
# Fit the model: per-class (mean, std) for every feature column of the training rows.
trainingsumaries = summarizeByClass(train)

In [35]:
# Predict a label for every test row (despite the name, these are test-set predictions).
trainingPred = getPredictions(trainingsumaries, test)
# 3x3 confusion matrix: the row index is the true label, the column index the predicted label.
conf = [[0,0,0],[0,0,0],[0,0,0]]
# i = predicted label, j = true label read from column 4 of the test array.
# Assumes labels are the values 1, 2, 3 so that (label - 1) is a valid index; confirm against the data.
for i, j in zip(trainingPred, test[:,4]):
    conf[int(j-1)][int(i-1)] += 1
            

In [90]:
# Percentage of test rows whose predicted label equals the true label.
accuracy = getAccuracy(test, trainingPred)

In [92]:
# Format first, then print: calling .format on the result of print() only
# works with the Python 2 print statement and fails on Python 3.
print('Accuracy: {0}%'.format(accuracy))

Accuracy: 95.0%


In [36]:
# Display the confusion matrix (rows: true label, columns: predicted label).
conf

[[20, 0, 0], [0, 19, 1], [0, 2, 18]]