In [None]:
import numpy
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

### Preprocessing
* Set seed for reproducibility
* Load dataset into numpy arrays and shuffle it
* Split dataset into 80% train, 20% test

In [None]:
# Seed for NumPy's global random state, so the shuffle below is repeatable.
SEED = 7
DATA_PATH = "./data/pima-indians-diabetes.csv"

numpy.random.seed(SEED)

# Read the comma-separated file into one array, then shuffle its rows in place.
dataset = numpy.loadtxt(DATA_PATH, delimiter=",")
numpy.random.shuffle(dataset)

In [None]:
split_ratio = 0.8

# Row index where the training portion ends and the test portion begins.
split_index = int(len(dataset) * split_ratio)
train_rows = dataset[:split_index]
test_rows = dataset[split_index:]

# Columns 0-7 are used as the features, column 8 as the label.
X_train, Y_train = train_rows[:, 0:8], train_rows[:, 8]
X_test, Y_test = test_rows[:, 0:8], test_rows[:, 8]

print(X_train)
print()
print(Y_train)

### Functions

In [None]:
def distance(one, two):
    """Return the norm of the difference between two points.

    Uses ``numpy.linalg.norm`` with its default arguments, which for 1-D
    inputs is the Euclidean (L2) distance.

    Args:
        one: First point; must support subtraction with ``two``.
        two: Second point.

    Returns:
        The norm of ``one - two``.
    """
    difference = one - two
    return numpy.linalg.norm(difference)


def shortestDistance(x, x_rest, y_rest):
    """Predict the label of ``x`` from its single nearest sample (1-NN).

    Args:
        x: The point to classify.
        x_rest: Indexable collection of reference points.
        y_rest: Labels, where ``y_rest[i]`` belongs to ``x_rest[i]``.

    Returns:
        A ``(predicted, shortest)`` tuple: the label of the closest reference
        point and the distance to it. When several points are equally close,
        the one with the highest index wins (the comparison is ``<=``).

    Raises:
        IndexError: If ``x_rest`` or ``y_rest`` is empty.
    """
    shortest = distance(x, x_rest[0])
    predicted = y_rest[0]

    # Index 0 already seeds the running minimum, so start at 1.
    for i in range(1, len(x_rest)):
        # Compute the distance once and reuse it for the comparison and the update.
        candidate = distance(x, x_rest[i])
        if candidate <= shortest:
            shortest = candidate
            predicted = y_rest[i]

    return predicted, shortest


def shortestDistanceWithK(x, x_rest, y_rest, n):
    """Predict the label of ``x`` from its ``n`` nearest samples (k-NN).

    The prediction is the median of the neighbours' labels truncated to an
    ``int``. For 0/1 labels this is a majority vote in which an even split
    (median 0.5) resolves to 0.

    Args:
        x: The point to classify.
        x_rest: Collection of reference points.
        y_rest: Labels, where ``y_rest[i]`` belongs to ``x_rest[i]``.
        n: Number of neighbours to use. Values larger than ``len(x_rest)``
            are clamped, so every reference point votes.

    Returns:
        A ``(prediction, neighbour_indices)`` tuple, where
        ``neighbour_indices`` lists the positions in ``x_rest`` of the
        neighbours used, nearest first.

    Raises:
        ValueError: If ``n`` is smaller than 1 or ``x_rest`` is empty.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    if len(x_rest) == 0:
        raise ValueError("x_rest must contain at least one sample")

    # There are never more than len(x_rest) neighbours available to vote.
    k = min(n, len(x_rest))

    distances = [(index, distance(x, rest)) for index, rest in enumerate(x_rest)]
    # list.sort is stable, so equally distant samples keep their index order.
    distances.sort(key=lambda z: z[1])
    nearest = [index for index, _ in distances[:k]]

    pred = [y_rest[index] for index in nearest]
    return int(numpy.median(pred)), nearest
    

### Test data
|       | Positive | Negative |
|-------|----------|----------|
| True  | TP       | TN       |
| False | FP       | FN       |

* Calculate shortest distance and predict whether the sample has diabetes or not
* Add results to correct counter
* Print results
    * Accuracy
    * Recall
    * Precision
    * F1

In [None]:
# Sweep K and record the test-set accuracy for every value.
# K is capped at the training-set size because there are never more than
# len(X_train) neighbours available to vote.
max_k = min(999, len(X_train))

accuracies = []
for j in tqdm_notebook(range(1, max_k + 1)):
    # Confusion-matrix counters for this K; reset on every sweep step.
    # After the loop they hold the counts of the last K evaluated, which is
    # what the metrics cell further down reads.
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    for x, y in zip(X_test, Y_test):
        pred, neighbour_indices = shortestDistanceWithK(x, X_train, Y_train, j)
        pred = float(pred)

        # The four outcomes are mutually exclusive.
        if y == 1 and pred == 1:
            TP += 1
        elif y == 0 and pred == 0:
            TN += 1
        elif y == 1 and pred == 0:
            FN += 1
        elif y == 0 and pred == 1:
            FP += 1

    # Stored as [accuracy, K]; the plotting cell indexes the pairs in this order.
    accuracies.append([(TP + TN) / (TP + TN + FP + FN), j])

In [None]:
# Each entry of `accuracies` is an [accuracy, K] pair; split the pairs into axes.
k_values = [k for acc, k in accuracies]
accuracy_values = [acc for acc, k in accuracies]

# Draw accuracy against K, save the figure to disk, then display it.
fig, ax = plt.subplots()
ax.plot(k_values, accuracy_values)
ax.set_xlabel("Number of K's")
ax.set_ylabel("Accuracy")
fig.savefig("./images/hyperparameter_10_k")
plt.show()

In [None]:
def _percentage(numerator, denominator):
    """Return ``numerator / denominator`` as a percentage, or 0.0 if the denominator is 0."""
    if denominator == 0:
        return 0.0
    return (numerator / denominator) * 100


# TP/TN/FP/FN are the counters left behind by the last K evaluated in the
# sweep cell. The denominators are guarded because e.g. TP + FP is 0 whenever
# no sample was predicted positive, which would otherwise raise
# ZeroDivisionError.
accuracy = _percentage(TP + TN, TP + TN + FP + FN)
recall = _percentage(TP, TP + FN)
precision = _percentage(TP, TP + FP)
f1 = _percentage(2 * TP, 2 * TP + FP + FN)

print(f"Accuracy: {accuracy:.4g}%")
print(f"Recall: {recall:.4g}%")
print(f"Precision: {precision:.4g}%")
print(f"F1: {f1:.4g}%")

| K       | Accuracy   | Recall   | Precision   | F1       |
|:-------:|:----------:|:--------:|:-----------:|:--------:|
| K = 1   | 69.48%     | 58.33%   | 50.91%      | 54.37%   |
| K = 3   | 69.48%     | 54.17%   | 50.98%      | 52.53%   |
| K = 6   | 77.92%     | 54.17%   | 68.42%      | 60.47%   |
