# KNN
Assumptions:
- We are free to implement any distance function

The following code is the implementation of the KNN algorithm.


In [72]:
import numpy as np
from collections import Counter


def euclidean_distance(x1, x2):
    distance = 0.0
    for i in range(len(x1) - 1):
        distance += (x1[i] - x2[i]) ** 2
    return np.sqrt(distance)


class KNNClassifier:
    def __init__(self, k=3, distance_func=euclidean_distance):
        self.X_train = None
        self.y_train = None
        self.k = k
        self.distance_function = distance_func

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        y_pred = [self._predict(x) for x in X_test]
        return np.array(y_pred)

    def _predict(self, x):
        distances = [self.distance_function(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

## Using KNN
The following section uses the knn algorithm that was implemented to predict diabetes based on the pima-indians-diabetes dataset.

In [73]:
import pandas as pd

df = pd.read_csv("./pima-indians-diabetes.csv", header=None)
df = df.apply(pd.to_numeric)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
n = .70
X_train = X.iloc[:int(len(df) * n)].to_numpy()
y_train = y.iloc[:int(len(df) * n)].to_numpy()

In [74]:
X_test = X.iloc[:int(len(df) * (1 - n))].to_numpy()
y_test = y.iloc[:int(len(df) * (1 - n))].to_numpy()

In [75]:
from lib import accuracy, recall, precision, f1

def calculate_metrics(y_test, predictions):
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    for i in range(len(y_test)):
        y = y_test[i]
        pred = predictions[i]
    
        if y == 1 and pred == 1:
            TP += 1
    
        if y == 0 and pred == 0:
            TN += 1
    
        if y == 1 and pred == 0:
            FN += 1
    
        if y == 0 and pred == 1:
            FP += 1
            
    return accuracy(TP, TN, FP, FN), recall(TP, FN), precision(TP, FP), f1(TP, FP, FN)

In [76]:
for k in range(1, 10):
    clf = KNNClassifier(k=k)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    metrics = calculate_metrics(y_test, predictions)
    
    print(f"K: {k}")
    print(f"Accuracy: {metrics[0]}")
    print(f"Recall: {metrics[1]}")
    print(f"Precision: {metrics[2]}")
    print(f"F1: {metrics[3]}")
    

K: 1
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
K: 2
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1: 1.0
K: 3
Accuracy: 0.8434782608695652
Recall: 0.6896551724137931
Precision: 0.8695652173913043
F1: 0.7692307692307693
K: 4
Accuracy: 0.9043478260869565
Recall: 0.8160919540229885
Precision: 0.922077922077922
F1: 0.8658536585365854
K: 5
Accuracy: 0.8173913043478261
Recall: 0.6436781609195402
Precision: 0.835820895522388
F1: 0.7272727272727273
K: 6
Accuracy: 0.8652173913043478
Recall: 0.7126436781609196
Precision: 0.9117647058823529
F1: 0.8
K: 7
Accuracy: 0.7869565217391304
Recall: 0.5632183908045977
Precision: 0.8166666666666667
F1: 0.6666666666666666
K: 8
Accuracy: 0.8347826086956521
Recall: 0.6666666666666666
Precision: 0.8656716417910447
F1: 0.7532467532467533
K: 9
Accuracy: 0.7913043478260869
Recall: 0.5747126436781609
Precision: 0.819672131147541
F1: 0.6756756756756757
