In [1]:
from datascience import *
import numpy as np

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import warnings
warnings.simplefilter('ignore', FutureWarning)

# regression functions
def standard_units(arr):
    return (arr - np.mean(arr)) / np.std(arr)

def correlation(x, y):
    return np.mean(standard_units(x) * standard_units(y))

def slope(x, y):
    r = correlation(x, y)
    return r * np.std(y) / np.std(x)

def intercept(x, y):
    m = slope(x, y)
    return np.mean(y) - m * np.mean(x)

# read table
iris = Table.read_table('iris.csv')
iris

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa
6,5.4,3.9,1.7,0.4,Iris-setosa
7,4.6,3.4,1.4,0.3,Iris-setosa
8,5.0,3.4,1.5,0.2,Iris-setosa
9,4.4,2.9,1.4,0.2,Iris-setosa
10,4.9,3.1,1.5,0.1,Iris-setosa


In [6]:
iris_suffled = iris.sample(with_replacement=False).drop('Id')
train = iris_suffled.take(np.arange(110))
test = iris_suffled.take(np.arange(110, iris_suffled.num_rows))
train

SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
5.2,3.4,1.4,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
6.7,3.1,4.4,1.4,Iris-versicolor
4.9,3.1,1.5,0.1,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
7.7,2.8,6.7,2.0,Iris-virginica
5.9,3.0,4.2,1.5,Iris-versicolor
5.0,3.5,1.6,0.6,Iris-setosa


In [14]:
def distance(t, arr):
    distances = []
    for i in np.arange(t.num_rows):
        squared_dists = 0
        for j in np.arange(4):
            squared_dists += (t.row(i)[j] - arr[j])**2
        distances += [squared_dists**0.5]
    return t.with_column('distance', distances)

def knn(t, arr, k):
    t = distance(t, arr)
    t = t.sort('distance').take(np.arange(k))
    return t.group('Species').sort('count', descending=True).column('Species').item(0)

def test_accuracy(train, test, k):
    classifications = []
    test2 = test.select(np.arange(4))
    for row in test2.rows:
        classification = knn(train, row, k)
        classifications += [classification]
    
    test = test.with_column('class', classifications)
    # print(test)
    return np.count_nonzero(test.column('class') == test.column('Species')) / test.num_rows

In [18]:
accuracy = []
for i in np.arange(1, 100, 2):
    acc = test_accuracy(train, test, i)
    accuracy += [acc]
    
Table().with_columns('k', np.arange(1, 100, 2), 'accuracy', accuracy).show()

k,accuracy
1,0.975
3,0.975
5,0.975
7,0.975
9,0.975
11,0.975
13,0.975
15,0.975
17,0.975
19,0.975
