In [195]:
# k-Nearest Neighbors algorithm

import numpy as np
import pandas as pd
import csv
import random

# Read csv file and split the data into training and test data sets
def loadDataset(filename, split, trainingSet=None, testSet=None, numFeatures=4):
    """Read a CSV file and randomly split its rows into training and test sets.

    The first ``numFeatures`` columns of every row are converted to ``float``
    in place; any remaining columns (such as a trailing class label) are kept
    as the strings read from the file. Each row is then appended to
    ``trainingSet`` when ``random.random() < split`` and to ``testSet``
    otherwise, so the split is random and differs between runs unless the
    ``random`` module has been seeded by the caller.

    Args:
        filename: Path of the CSV file to read. The file must not contain a
            header row, because every row's leading columns go through float().
        split: Probability (0.0 - 1.0) that a row lands in the training set.
        trainingSet: Optional list that receives the training rows. It is
            extended in place; a new list is created when omitted.
        testSet: Optional list that receives the test rows. It is extended in
            place; a new list is created when omitted.
        numFeatures: Number of leading columns to convert to float.

    Returns:
        A ``(trainingSet, testSet)`` tuple. These are the same list objects
        that were passed in (or the freshly created ones).

    Raises:
        ValueError: If one of the leading columns cannot be parsed as a float.
        IndexError: If a non-empty row has fewer than ``numFeatures`` columns.
    """
    # Fresh lists per call: a literal [] default would be shared by every call.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is the mode the csv module documents for files given to csv.reader.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Blank lines come back as empty lists; skip them instead of
            # dropping the final row unconditionally.
            if not row:
                continue
            for col in range(numFeatures):
                row[col] = float(row[col])
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)

    return trainingSet, testSet

In [196]:
# Test the function loadDataset
# loadDataset appends rows to the lists it is given, so create them first.
trainingSet=[]
testSet=[]

# Data Source https://www.kaggle.com/uciml/iris#Iris.csv
# Each row goes to the training list when random.random() < 0.66, so the two
# sizes printed below change from run to run (no seed is set in this cell).
loadDataset(r'Data//iris.csv', 0.66, trainingSet, testSet)

print('Size of training dataset = ', len(trainingSet))
print('Size of test dataset = ', len(testSet))

Size of training dataset =  103
Size of test dataset =  46


In [197]:
# Define euclideanDistance calculation function

import math
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two instances.

    Only positions ``0 .. length-1`` of each instance are compared, which lets
    callers pass rows whose trailing entries (e.g. a class label) are not
    numeric.

    Args:
        instance1: Indexable sequence holding at least ``length`` numbers.
        instance2: Indexable sequence holding at least ``length`` numbers.
        length: How many leading positions to include in the distance.

    Returns:
        The distance as a float (0.0 when ``length`` is 0).
    """
    squaredDiffs = (
        np.power(instance1[i] - instance2[i], 2) for i in range(length)
    )
    # Start the sum at integer 0 and add left to right, one term per position.
    return math.sqrt(sum(squaredDiffs, 0))

In [198]:
# Sanity check: length=3 means only the three numeric entries are compared and
# the trailing string labels are never read. Expected value is sqrt(3 * 2**2).
data1 = [2, 2, 2, 'a']
data2 = [4, 4, 4, 'b']
print(euclideanDistance(data1, data2, 3))

3.4641016151377544


In [199]:
import operator
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    Distance is computed with ``euclideanDistance`` over the first
    ``len(testInstance) - 1`` positions; the last element of ``testInstance``
    is treated as a label slot and is not compared.

    Args:
        trainingSet: Sequence of rows (numeric features followed by a label).
        testInstance: Row to classify, laid out like the training rows.
        k: Number of neighbours wanted.

    Returns:
        A list of rows from ``trainingSet`` ordered from nearest to farthest.
        Rows at equal distance keep their ``trainingSet`` order. At most
        ``len(trainingSet)`` rows are returned, and an empty list is returned
        when ``k`` is zero or negative.
    """
    length = len(testInstance) - 1
    distances = [
        (row, euclideanDistance(testInstance, row, length))
        for row in trainingSet
    ]
    # list.sort is stable, so ties keep their original training-set order.
    distances.sort(key=operator.itemgetter(1))
    # Slice instead of indexing range(k): asking for more neighbours than there
    # are training rows yields all rows rather than an IndexError. max() keeps
    # a non-positive k returning [] (a negative slice bound would not).
    return [row for row, _ in distances[:max(k, 0)]]

In [200]:
# Test the function "getNeighbors"
# Training rows carry their label in the last position.
trainSet = [[2, 2, 2, 'a'], [3, 3, 3, 'b'], [5, 5, 5, 'd'], [6, 6, 6, 'e']]
# NOTE(review): getNeighbors compares len(testInstance)-1 positions, so with
# this unlabeled 3-element instance only the first two coordinates are used.
testInstance = [4, 4, 4]
k = 2
neighbors = getNeighbors(trainSet, testInstance, k)
print("The", k,"- nearest neighbors to ", testInstance, " are ", neighbors)

The 2 - nearest neighbors to  [4, 4, 4]  are  [[3, 3, 3, 'b'], [5, 5, 5, 'd']]


In [201]:
import operator
def getResponse(neighbors):
    """Return the majority class label among the given neighbours.

    The label of each neighbour is its last element. When several labels share
    the highest count, the one that appears first in ``neighbors`` is returned.

    Args:
        neighbors: Sequence of rows whose last element is the class label.

    Returns:
        The label with the most votes.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1

    # Stable descending sort: equal counts stay in first-seen order.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner, _ = ranked[0]
    return winner

In [202]:
# Two of the three neighbours are labelled 'c', so the majority vote is 'c'.
neighbors = [[1, 1, 1, 'a'], [2, 2, 2, 'c'], [3, 3, 3, 'c']]
response = getResponse(neighbors)
print(response)

c


In [203]:
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    A prediction counts as correct when it is equal (``==``) to the last
    element of the corresponding test row.

    Args:
        testSet: Sequence of rows whose last element is the true class label.
        predictions: Sequence of predicted labels, one per row of ``testSet``
            and in the same order.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty ``testSet`` gives
        0.0 rather than a division-by-zero error.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    if len(testSet) == 0:
        return 0.0
    # Plain equality: a membership test such as np.isin would also count a hit
    # when the prediction is a collection that merely contains the label.
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(len(testSet))) * 100.0

In [204]:
# Only the third prediction matches its row's label, so 1 of 3 is correct.
# NOTE: this rebinds the notebook-level name testSet that the loadDataset
# test cell filled earlier.
testSet = [[1, 1, 1, 'a'], [2, 2, 2, 'a'], [3, 3, 3, 'b']]
predictions = ['b', 'b', 'b']
accuracy = getAccuracy(testSet, predictions)
print(accuracy)

33.33333333333333


In [219]:
def main():
    """Run the k-nearest-neighbours pipeline on the Iris CSV and print results.

    Loads ``Data//iris.csv`` with a random train/test split, classifies every
    test row from its 3 nearest training rows, prints one line per prediction
    and finally the overall accuracy. Nothing is returned.
    """
    split = 0.67
    k = 3

    # loadDataset fills both lists in place.
    trainingSet, testSet = [], []
    loadDataset('Data//iris.csv', split, trainingSet, testSet)
    print('Main - Train Set = ', len(trainingSet))
    print('Main - Test Set = ', len(testSet))

    # Classify each test row by majority vote among its k nearest neighbours.
    predictions = []
    for testInstance in testSet:
        actual = testInstance[-1]
        result = getResponse(getNeighbors(trainingSet, testInstance, k))
        predictions.append(result)
        print('> predicted = ', result, ' Actual = ', actual, result == actual)

    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy = ', accuracy, '%')


In [220]:
# Run the whole pipeline. The split inside loadDataset uses random.random() and
# no seed is set in the cells shown, so sizes and accuracy differ between runs.
main()

Main - Train Set =  94
Main - Test Set =  55
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Irissetosa False
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-setosa  Actual =  Iris-setosa True
> predicted =  Iris-versicolor  Actual =  Iris-versicolor True
> predicted 