# KNN
Assumptions:
- We are free to implement any distance function

The following code is the implementation of the KNN algorithm.


In [43]:
import numpy as np
from collections import Counter


def euclidean_distance(x1, x2):
    distance = 0.0
    for i in range(len(x1) - 1):
        distance += (x1[i] - x2[i]) ** 2
    return np.sqrt(distance)


class KNNClassifier:
    def __init__(self, k=3, distance_func=euclidean_distance):
        self.X_train = None
        self.y_train = None
        self.k = k
        self.distance_function = distance_func

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        y_pred = [self._predict(x) for x in X_test]
        return np.array(y_pred)

    def _predict(self, x):
        distances = [self.distance_function(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

## Using KNN
The following section uses the knn algorithm that was implemented to predict diabetes based on the pima-indians-diabetes dataset.

In [44]:
import pandas as pd

df = pd.read_csv("./pima-indians-diabetes.csv", header=None)
df = df.apply(pd.to_numeric)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
n = .70
X_train = X.iloc[:int(len(df) * n)]
y_train = y.iloc[:int(len(df) * n)]
clf = KNNClassifier(k=2)
clf.fit(X_train.to_numpy(), y_train.to_numpy())

In [45]:
X_test = X.iloc[:int(len(df) * (1 - n))]
y_test = y.iloc[:int(len(df) * (1 - n))]
predictions = clf.predict(X_test.to_numpy())

predictions

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0])

In [7]:
y_test

0      0
1      1
2      0
3      1
4      0
      ..
225    0
226    1
227    0
228    0
229    1
Name: 1, Length: 230, dtype: int64