In [7]:
import pandas as pd
import numpy as np
from collections import Counter
import math

df = pd.read_csv("iris.csv")

In [8]:
class KNN:
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = X
        self.y = y
        
    def predict(self, X: np.ndarray, k=5):
        # calculate the distance between each point of
        # test data and every point in train data
        distances = self.distances(X)
        votes = []
        for i in distances:
            # find the majority vote of neighbors
            votes.append(self.vote(i, k))
            
        return votes
        
    def distances(self, X: np.ndarray):
        distances = np.sqrt(np.sum(np.pow(self.X - X[:, np.newaxis, :], 2), axis=-1))
        return distances

    def vote(self, X: np.ndarray, k):
        # get the k smallest distances indexes 
        idxs = np.argsort(X)[:k]
        y = self.y[idxs]
        
        counts = Counter(y)
        winner, win_count = counts.most_common(1)[0]
        
        candidates = [c for c in counts.values() if c == win_count]
        
        # guarantees no draw, as in:
        # [0, 1, 1, 2, 2]
        if len(candidates) > 1:
            return self.vote(X[:-1], k)
        
        return winner

In [9]:
def train_test_split(X, y, train_size, shuffle=False):
    idx = list(range(X.shape[0]))
    if shuffle:
        np.random.shuffle(idx)

    X = X[idx, :]
    y = y[idx]

    limit = int(train_size*df.shape[0])
    return X[:limit], X[limit:], y[:limit], y[limit:]

X = df.iloc[:, :-1].values

# categorical columns to numerical
y = pd.get_dummies(df["species"], dtype=int).to_numpy().argmax(axis=1)

# split shuffled data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, 0.7, True)

knn = KNN(X_train, y_train)

# running 1000 test to validate the code
accuracies = []
for _ in range(1000):
    accuracies.append(sum(np.array(knn.predict(X_test)) == y_test) / y_test.shape)

sum(accuracies) / len(accuracies)

array([0.93333333])