In [87]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter

In [88]:
# Load the iris dataset and pull out its feature matrix and target labels.
iris = load_iris()
xs, ys = iris.data, iris.target

# Shuffle and split with a fixed seed so the split is reproducible.
# NOTE(review): train_size=0.25 trains on a quarter of the samples and tests on
# the remaining three quarters — confirm that test_size=0.25 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    xs,
    ys,
    train_size=0.25,
    shuffle=True,
    random_state=8,
)

In [89]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance.

    A sample is labelled by a majority vote among the ``k`` training samples
    closest to it. When the vote is tied, the farthest of the consulted
    neighbours is dropped and the vote is repeated until one label wins.
    """

    def __init__(self, k):
        """Store the number of neighbours consulted for each prediction.

        Args:
            k: Number of nearest neighbours that take part in the vote.

        Raises:
            ValueError: If ``k`` is smaller than 1. A negative ``k`` would
                otherwise silently slice off the *last* neighbours instead of
                selecting the first ``k``.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.k = k

    def fit(self, x_trains, y_trains):
        """Memorise the training samples and their labels.

        Args:
            x_trains: Training samples, one per entry along the first axis.
            y_trains: Labels, one per training sample.

        Raises:
            ValueError: If the training set is empty or the number of samples
                and labels differ.
        """
        # Convert up front so that plain lists work too: the labels are later
        # selected with an integer index array, which a list does not support.
        samples = np.asarray(x_trains)
        labels = np.asarray(y_trains)
        if len(samples) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(samples) != len(labels):
            raise ValueError(
                f"got {len(samples)} training samples but {len(labels)} labels"
            )
        self.x_trains = samples
        self.y_trains = labels

    def predict(self, xs):
        """Return a list holding the predicted label of every sample in ``xs``."""
        return [self._predict_single_sample(x) for x in np.asarray(xs)]

    def _predict_single_sample(self, x):
        """Predict the label of one sample from its ``k`` nearest neighbours."""
        # Flatten every sample to a vector so that one broadcasted call yields
        # the distance from ``x`` to each training sample.
        train = self.x_trains.reshape(len(self.x_trains), -1)
        distances = self._calculate_distance(train, np.ravel(x))
        # A stable sort keeps equidistant neighbours in training order, which
        # makes the selected neighbours reproducible.
        nearest_first = np.argsort(distances, kind="stable")
        k_nearest_labels = self.y_trains[nearest_first[: self.k]]
        return self._majority_vote(k_nearest_labels)

    def _calculate_distance(self, a, b):
        """Return the Euclidean distance between ``a`` and ``b``.

        The inputs are broadcast against each other and the squared
        differences are summed over the last axis: two 1-D points give a
        single distance, a 2-D array of points and one point give one
        distance per row.
        """
        return np.sqrt(np.sum((a - b) ** 2, axis=-1))

    def _majority_vote(self, labels):
        """Return the most frequent label, resolving ties by shrinking the vote.

        Args:
            labels: Sequence of labels; when called from
                ``_predict_single_sample`` they are ordered nearest first.

        Returns:
            The label with the strictly highest count. While the top count is
            shared, the last label is dropped and the vote is repeated.

        Raises:
            ValueError: If ``labels`` is empty.
        """
        remaining = len(labels)
        if remaining == 0:
            raise ValueError("cannot take a majority vote over zero labels")
        # Terminates at the latest when a single label is left, because a lone
        # label always wins outright.
        while True:
            ranked = Counter(labels[:remaining]).most_common(2)
            if len(ranked) == 1 or ranked[0][1] > ranked[1][1]:
                return ranked[0][0]
            remaining -= 1

In [90]:
# Fit a 5-nearest-neighbour model on the training split and label the test split.
knn = KNN(k=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Accuracy: the fraction of test samples whose predicted label matches the true one.
acc = np.mean(y_test == y_pred)
acc

0.9026548672566371

# Explanation

## What is `argsort` doing?

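`argsort` returns the indices that would sort an array: indexing the array with them yields its elements in ascending order. For example, `np.argsort([30, 10, 20])` returns `[1, 2, 0]`.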

## How do we find the distance between two points?

$$d(\mathbf{A}, \mathbf{B}) = \sqrt{\sum_{i=1}^{n} (A_i - B_i)^2}$$

In [91]:
# Worked example: (1-0)^2 + (0-2)^2 + (5-4)^2 = 1 + 4 + 1 = 6, so the distance is sqrt(6).
a = np.array([1, 0, 5])
b = np.array([0, 2, 4])

# Compare floating-point results with a tolerance rather than `==`, so the
# check does not depend on both sides rounding to the exact same bits.
np.testing.assert_allclose(np.sqrt(np.sum((a - b) ** 2)), np.sqrt(6))

## How does `_majority_vote` handle ties?

In [92]:
# Five votes in which "a" and "b" are tied at two each and "d" has one.
labels = np.array(["a", "b", "b", "d", "a"])

# The value of k plays no part here: _majority_vote only looks at the labels it is given.
knn = KNN(k=1)

# The tie is broken by dropping the last label ("a") and voting again, so "b" wins 2-1-1.
knn._majority_vote(labels)

'b'