# KNN
Assumptions:
- We are free to implement any distance function

The following code implements the k-nearest neighbours (KNN) algorithm.


In [1]:
import numpy as np
from collections import Counter


def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Both inputs are converted to float64 arrays before they are subtracted.
    This lets plain Python sequences be passed in, and it prevents
    unsigned-integer inputs (e.g. ``uint8``) from wrapping around during the
    subtraction, which would silently produce a wrong distance.

    Args:
        x1: Array-like of numbers.
        x2: Array-like of numbers, broadcast-compatible with ``x1``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))


class KNNClassifier:
    """k-nearest-neighbours classifier with a pluggable distance function.

    ``fit`` only memorises the training data. Each prediction computes the
    distance from the query point to every training sample, picks the ``k``
    closest ones and returns the most frequent label among them.
    """

    def __init__(self, k=3, distance_func=euclidean_distance):
        """Configure the classifier.

        Args:
            k: Number of nearest neighbours that vote on each prediction.
            distance_func: Callable taking ``(query_point, training_point)``
                and returning a value that can be sorted; smaller means
                closer.
        """
        # Training data; both stay None until fit() has been called.
        self.X_train = None
        self.y_train = None
        self.k = k
        self.distance_function = distance_func

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Inputs are converted to numpy arrays so that all later indexing is
        positional. Without the conversion a pandas Series with a shuffled
        index would be looked up by label, and a DataFrame would iterate over
        column names instead of rows.

        Args:
            X_train: Array-like with one training sample per row.
            y_train: Array-like with one label per training sample.

        Raises:
            ValueError: If the training set is empty or the number of samples
                and labels differ.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) == 0:
            raise ValueError("Cannot fit KNNClassifier on an empty training set.")
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train has {len(X_train)} samples but y_train has "
                f"{len(y_train)} labels."
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Args:
            X_test: Array-like with one query sample per row.

        Returns:
            numpy array holding one predicted label per query sample.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If ``k`` is smaller than 1.
        """
        self._check_ready()
        y_pred = [self._predict(x) for x in np.asarray(X_test)]
        return np.array(y_pred)

    def _check_ready(self):
        """Fail early with a clear message when prediction cannot work."""
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNNClassifier.predict() called before fit().")
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k!r}.")

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``."""
        distances = [self.distance_function(x, x_train) for x_train in self.X_train]
        # A stable sort keeps equally distant samples in training order, so
        # the selected neighbours are reproducible when distances tie.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Labels are counted in nearest-first order; on a tied vote
        # most_common returns the label that was encountered first.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

## Using KNN
The following section uses the KNN implementation above to predict diabetes on the Pima Indians Diabetes dataset.
First, the CSV file is read; then every column is converted to a numeric data type.

In [40]:
import pandas as pd

# Read the header-less CSV and coerce every column to a numeric dtype in one chain.
df = pd.read_csv("./pima-indians-diabetes.csv", header=None).apply(pd.to_numeric)

# Creating the Train/Test Sets
The train and test sets are created using NumPy and pandas.
* The dataset is first shuffled
* Then split into a train and test set
* Lastly, each set is split into the features `X_<train/test>` and the target `y_<train/test>`. `n` is the fraction of the data (between 0 and 1) that goes into the train set.

In [27]:
# Fraction of the rows that goes into the training set.
n = 0.70

# Draw a reproducible random sample for training; every row that was not
# drawn ends up in the test set.
train = df.sample(frac=n, random_state=42)
test = df.drop(index=train.index)

# The last column is the target; all columns before it are features.
X_train, y_train = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()
X_test, y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()
X_train

array([[6.00e+00, 9.80e+01, 5.80e+01, ..., 3.40e+01, 4.30e-01, 4.30e+01],
       [2.00e+00, 1.12e+02, 7.50e+01, ..., 3.57e+01, 1.48e-01, 2.10e+01],
       [2.00e+00, 1.08e+02, 6.40e+01, ..., 3.08e+01, 1.58e-01, 2.10e+01],
       ...,
       [5.00e+00, 1.62e+02, 1.04e+02, ..., 3.77e+01, 1.51e-01, 5.20e+01],
       [1.00e+00, 9.10e+01, 5.40e+01, ..., 2.52e+01, 2.34e-01, 2.30e+01],
       [1.00e+00, 8.90e+01, 2.40e+01, ..., 2.78e+01, 5.59e-01, 2.10e+01]])

In [29]:
# Display the test-set feature matrix as a quick sanity check of the split.
X_test

array([[1.000e+00, 8.500e+01, 6.600e+01, ..., 2.660e+01, 3.510e-01,
        3.100e+01],
       [0.000e+00, 1.370e+02, 4.000e+01, ..., 4.310e+01, 2.288e+00,
        3.300e+01],
       [2.000e+00, 1.970e+02, 7.000e+01, ..., 3.050e+01, 1.580e-01,
        5.300e+01],
       ...,
       [7.000e+00, 1.370e+02, 9.000e+01, ..., 3.200e+01, 3.910e-01,
        3.900e+01],
       [9.000e+00, 1.700e+02, 7.400e+01, ..., 4.400e+01, 4.030e-01,
        4.300e+01],
       [5.000e+00, 1.210e+02, 7.200e+01, ..., 2.620e+01, 2.450e-01,
        3.000e+01]])

The following code computes accuracy, recall, precision, and F1 for a set of predictions from its confusion-matrix counts.

In [30]:
from lib import accuracy, recall, precision, f1


def calculate_metrics(y_test, predictions):
    """Score binary predictions against the true labels.

    Tallies the four confusion-matrix cells by reading both sequences at the
    same positions, treating ``1`` as the positive class and ``0`` as the
    negative class. Pairs containing any other value are not counted. The
    tallies are then handed to the ``accuracy``, ``recall``, ``precision`` and
    ``f1`` helpers imported from ``lib`` (their formulas are not visible in
    this notebook).

    Args:
        y_test: Indexable sequence of true labels.
        predictions: Indexable sequence of predicted labels, read at the same
            positions as ``y_test``.

    Returns:
        Tuple ``(accuracy, recall, precision, f1)`` holding whatever the
        corresponding helpers return.
    """
    true_pos = true_neg = false_pos = false_neg = 0

    for idx in range(len(y_test)):
        actual, guess = y_test[idx], predictions[idx]

        if actual == 1:
            if guess == 1:
                true_pos += 1
            elif guess == 0:
                false_neg += 1
        elif actual == 0:
            if guess == 0:
                true_neg += 1
            elif guess == 1:
                false_pos += 1

    return (
        accuracy(true_pos, true_neg, false_pos, false_neg),
        recall(true_pos, false_neg),
        precision(true_pos, false_pos),
        f1(true_pos, false_pos, false_neg),
    )

# Evaluating KNN
The following code evaluates the KNN algorithm for different values of `k`,
starting at 1 and going up to a maximum of 19.
The result below indicates that the best `k` is 18.

In [37]:
lower_k = 1
upper_k = 20  # exclusive bound: the largest k evaluated is upper_k - 1
res = []
for k in range(lower_k, upper_k):
    # Build and fit a fresh classifier for every candidate k.
    clf = KNNClassifier(k=k)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # calculate_metrics returns (accuracy, recall, precision, f1) in that
    # order, so the values are paired with their column names positionally.
    metrics = calculate_metrics(y_test, predictions)
    res.append({"k": k, **dict(zip(("Accuracy", "Recall", "Precision", "F1"), metrics))})
results = pd.DataFrame(res)
results

Unnamed: 0,k,Accuracy,Recall,Precision,F1
0,1,0.647826,0.506329,0.487805,0.496894
1,2,0.647826,0.506329,0.487805,0.496894
2,3,0.7,0.506329,0.571429,0.536913
3,4,0.717391,0.518987,0.602941,0.557823
4,5,0.713043,0.531646,0.591549,0.56
5,6,0.730435,0.531646,0.626866,0.575342
6,7,0.730435,0.531646,0.626866,0.575342
7,8,0.717391,0.468354,0.616667,0.532374
8,9,0.743478,0.518987,0.66129,0.58156
9,10,0.730435,0.518987,0.630769,0.569444
