In [None]:
from l2_distance import l2_distance
from utils import *

import matplotlib.pyplot as plt
import numpy as np


def knn(k, train_data, train_labels, valid_data):
    """ Uses the supplied training inputs and labels to make
    predictions for validation data using the K-nearest neighbours
    algorithm.

    Note: N_TRAIN is the number of training examples,
          N_VALID is the number of validation examples,
          M is the number of features per example.

    :param k: The number of neighbours to use for classification
    of a validation example (must be at least 1).
    :param train_data: N_TRAIN x M array of training data.
    :param train_labels: N_TRAIN x 1 vector of training labels
    corresponding to the examples in train_data (must be binary).
    :param valid_data: N_VALID x M array of data to
    predict classes for validation data.
    :return: N_VALID x 1 integer vector of predicted labels for
    the validation data.
    :raises ValueError: if k is smaller than 1.
    """
    if k < 1:
        # With k < 1 no neighbour would be selected and the vote below
        # would silently degrade to an all-zero prediction.
        raise ValueError("k must be at least 1, got {}".format(k))

    # Pairwise distances between validation and training points.
    # l2_distance receives the transposed (M x N) arrays; the result is
    # indexed below as [validation example, training example].
    dist = l2_distance(valid_data.T, train_data.T)

    # For every validation example, the indices of the k training
    # examples with the smallest distance (rows sorted ascending).
    nearest = np.argsort(dist, axis=1)[:, :k]

    # Flatten the labels to 1-D so that indexing with `nearest` yields
    # one row of k neighbour labels per validation example.
    flat_labels = train_labels.reshape(-1)
    neighbour_labels = flat_labels[nearest]

    # Majority vote over binary labels: predict 1 when at least half of
    # the neighbours are labelled 1 (an exact tie resolves to 1).
    # The builtin `int` replaces `np.int`, which was only an alias for it
    # and has been removed from recent NumPy releases.
    valid_labels = (np.mean(neighbour_labels, axis=1) >= 0.5).astype(int)

    return valid_labels.reshape(-1, 1)

In [None]:


def _classification_accuracy(targets, predictions):
    """Return the fraction of predictions that match their targets.

    Both arrays are flattened before being compared, so a 1-D target
    vector and an N x 1 prediction vector are matched element by element
    instead of being broadcast into an N x N difference matrix.
    """
    errors = np.reshape(targets, -1) - np.reshape(predictions, -1)
    return (len(errors) - np.count_nonzero(errors)) / len(errors)


def run_knn():
    """Evaluate the k-NN classifier on the validation and test sets.

    Loads the train/validation/test splits, computes and plots the
    validation accuracy for several values of k, picks the k with the
    highest validation accuracy (the smallest such k on ties) and then
    reports the test accuracy for that k and its neighbouring values
    (k - 2 where applicable, and k + 2).

    :return: None. Results are printed and shown as a matplotlib figure.
    """
    train_inputs, train_targets = load_train()
    valid_inputs, valid_targets = load_valid()
    test_inputs, test_targets = load_test()

    # Validation set performance
    k_values = [1, 3, 5, 7, 9]
    val_accuracies = [
        _classification_accuracy(
            valid_targets,
            knn(k, train_inputs, train_targets, valid_inputs),
        )
        for k in k_values
    ]

    print("K Values = ",k_values)
    print("Validation Accuracy = ",val_accuracies)

    # Plotting validation set performance
    plt.scatter(k_values, val_accuracies)
    plt.xlabel("K")
    plt.ylabel("Validation Accuracy")
    plt.title("Validation Accuracy vs. K")
    plt.show()

    # Test set performance
    best_val_accuracy = max(val_accuracies)
    # list.index returns the first match, i.e. the smallest k on ties.
    k_star = k_values[val_accuracies.index(best_val_accuracy)]
    print("k_star =", k_star)
    print("k_star val accuracy = ", best_val_accuracy)

    # Set of k values for testing: k_star and its neighbours, dropping
    # k_star - 2 when it would not be a positive neighbour count.
    if k_star <= 2:
        k_star_values = [k_star, k_star + 2]
    else:
        k_star_values = [k_star - 2, k_star, k_star + 2]

    test_accuracies = [
        _classification_accuracy(
            test_targets,
            knn(k, train_inputs, train_targets, test_inputs),
        )
        for k in k_star_values
    ]

    print("Test Accuracies = ", test_accuracies)
    print("k_star test accuracy =", test_accuracies[k_star_values.index(k_star)])

In [None]:
# Entry point: run the full k-NN validation/test experiment when this code
# is executed directly (as a script or as a notebook cell) rather than
# imported as a module.
if __name__ == "__main__":
    run_knn()