# Functions for nearest-neighbour (NN) and k-nearest-neighbour (k-NN) classification

In [1]:
#!/usr/bin/env python3
from scipy.io import loadmat
from random import sample
import numpy as np
import time
import sys

def calculate_distances_matrix(test_vectors, train_vectors):
    """Return the Euclidean distance between every test/train vector pair.

    Uses the expansion ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b`` so the
    whole matrix comes out of a single matrix product instead of a Python
    loop over pairs.

    Args:
        test_vectors: array-like of shape (n_test, n_features).
        train_vectors: array-like of shape (n_train, n_features).

    Returns:
        Floating-point ndarray of shape (n_test, n_train); entry ``[i, j]``
        is the distance between ``test_vectors[i]`` and ``train_vectors[j]``.
    """
    test_vectors = np.asarray(test_vectors)
    train_vectors = np.asarray(train_vectors)
    # Integer inputs (e.g. uint8 pixel data) would silently overflow when
    # squared, so promote anything that is not already floating point.
    if not np.issubdtype(test_vectors.dtype, np.floating):
        test_vectors = test_vectors.astype(float)
    if not np.issubdtype(train_vectors.dtype, np.floating):
        train_vectors = train_vectors.astype(float)

    test_sq_norms = np.sum(np.square(test_vectors), axis=1)
    train_sq_norms = np.sum(np.square(train_vectors), axis=1)
    cross_terms = np.dot(test_vectors, train_vectors.T)

    # Broadcasting: column of test norms + row of train norms -> full matrix.
    squared = test_sq_norms[:, np.newaxis] + train_sq_norms - 2 * cross_terms
    # Rounding can push the squared distance of (near-)identical vectors a
    # hair below zero, which the square root would turn into NaN.
    np.maximum(squared, 0, out=squared)
    return np.sqrt(squared)

def get_knn_label(dist_vector, label_vector, k):
    """Return the majority label among the ``k`` nearest training points.

    Args:
        dist_vector: 1-D sequence of distances from one query point to every
            training point.
        label_vector: labels, indexed as ``label_vector[i][0]`` (for example
            an ``(n, 1)`` column or a list of one-element lists).
        k: number of neighbours that vote; must satisfy
            ``1 <= k <= len(dist_vector)``.

    Returns:
        One-element ndarray holding the winning label. When several labels
        share the highest vote count, the one whose closest voter is nearest
        to the query wins.

    Raises:
        ValueError: if ``k`` is outside ``1..len(dist_vector)``.
    """
    distances = np.asarray(dist_vector)
    n_points = distances.shape[0]
    if not 1 <= k <= n_points:
        raise ValueError(
            "k must be between 1 and " + str(n_points) + ", got " + str(k)
        )

    # argpartition's kth is a 0-based position, so k - 1 places the k
    # smallest distances in the first k slots (and keeps k == n valid).
    nearest = np.argpartition(distances, k - 1)[:k]
    # Order the voters from closest to farthest so that ties below are
    # resolved deterministically in favour of the nearest neighbour.
    nearest = nearest[np.argsort(distances[nearest], kind="stable")]

    votes = {}
    for idx in nearest:
        label = label_vector[idx][0]
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first maximal key in insertion order, i.e. the tied
    # label that was seen first while walking outwards from the query.
    final_label = max(votes, key=votes.get)
    return np.array([final_label])

def nn(X, Y, test, k):
    """Label each test point with the label of its single closest training point.

    Args:
        X: training vectors, one per row.
        Y: training labels, indexed in step with the rows of ``X``.
        test: vectors to classify, one per row.
        k: not used by this function.

    Returns:
        A list with one entry of ``Y`` per row of ``test``. The elapsed
        wall-clock time is printed as a side effect.
    """
    started_at = time.time()

    # One row of distances per test point, one column per training point.
    all_distances = calculate_distances_matrix(test, X)
    preds = [Y[np.argmin(distances)] for distances in all_distances]

    elapsed = time.time() - started_at
    print(str(elapsed) + " seconds")
    return preds

def knn(X, Y, test, k):
    """Classify each test point by majority vote among its k nearest training points.

    Args:
        X: training vectors, one per row.
        Y: training labels, indexed in step with the rows of ``X``.
        test: vectors to classify, one per row.
        k: number of neighbours passed on to ``get_knn_label``.

    Returns:
        A list with one ``get_knn_label`` result per row of ``test``. The
        elapsed wall-clock time is printed as a side effect.
    """
    started_at = time.time()

    # One row of distances per test point, one column per training point.
    all_distances = calculate_distances_matrix(test, X)
    # Each prediction is appended as a one-element array, not a bare integer.
    preds = [get_knn_label(distances, Y, k) for distances in all_distances]

    elapsed = time.time() - started_at
    print(str(elapsed) + " seconds")
    return preds

In [2]:
# Smoke check: the three smallest distances belong to labels 0, 1 and 0,
# so the majority vote is 0.
get_knn_label([1,2,3,4,5], [[0],[1],[0],[1],[1]], 3)

array([0])

# Functions for visualization

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt

# def plot_learning_curve(title, train_sizes, mean_errors, std_errors):
#     plt.figure()
#     plt.title(title)
#     plt.xlabel("Training examples")
#     plt.ylabel("Average test error rates")
#     plt.grid()
#     plt.errorbar(train_sizes, mean_errors, yerr=std_errors, color="g", ecolor="r")

#     plt.legend(loc="best")
#     return plt

In [3]:
from sklearn.model_selection import KFold

if __name__ == '__main__':
    # 10-fold cross-validation of knn() on the OCR data for k = 1..10.
    # The fold count and the k range are named once here so the error buffer
    # and the reporting loop cannot drift out of step with them.
    N_SPLITS = 10
    K_VALUES = range(1, 11)

    ocr = loadmat('ocr.mat')

    train_data = ocr['data'].astype('float')
    labels = ocr['labels']

    # Materialise every (train, validation) split once so that all values of
    # k are evaluated on exactly the same folds.
    splitted_data = []
    splitted_labels = []
    kf = KFold(n_splits=N_SPLITS)
    for train, validation in kf.split(train_data):
        splitted_data.append((train_data[train], train_data[validation]))
        splitted_labels.append((labels[train], labels[validation]))

    mean_errors = []
    for k in K_VALUES:
        print('Executing for k = ' + str(k))

        # One error rate per fold; sized from the folds actually produced.
        test_err = np.zeros(len(splitted_data))
        folds = zip(splitted_data, splitted_labels)
        for i, ((fold_train, fold_val), (train_labels, val_labels)) in enumerate(folds):
            preds = knn(fold_train, train_labels, fold_val, k)
            # knn returns a list of one-element arrays; convert explicitly and
            # align the shape with the labels so the comparison is elementwise
            # rather than accidentally broadcast.
            pred_array = np.asarray(preds).reshape(val_labels.shape)
            test_err[i] = np.mean(pred_array != val_labels)

        mean_errors.append(np.mean(test_err))

        print('--------------------')

    for k, mean_error in zip(K_VALUES, mean_errors):
        print("k = " + str(k) + ", mean error = " + str(mean_error))

Executing for k = 1
71.21881175041199 seconds
81.81418323516846 seconds
70.03643918037415 seconds
62.410974979400635 seconds
66.3675708770752 seconds
70.35510015487671 seconds
77.63730001449585 seconds
79.40611505508423 seconds
77.64462304115295 seconds
85.67376494407654 seconds
--------------------
Executing for k = 2
104.59900307655334 seconds
93.1554548740387 seconds
91.12786602973938 seconds
96.81510829925537 seconds
96.91727495193481 seconds
92.39309501647949 seconds
88.51109099388123 seconds
84.24285793304443 seconds
78.69085884094238 seconds
76.2126190662384 seconds
--------------------
Executing for k = 3
78.58119773864746 seconds
78.15860486030579 seconds
72.02861595153809 seconds
77.49914908409119 seconds
75.26180672645569 seconds
77.0115921497345 seconds
72.51455783843994 seconds
73.34319019317627 seconds
73.61482691764832 seconds
68.40324997901917 seconds
--------------------
Executing for k = 4
72.35821485519409 seconds
74.1621081829071 seconds
98.59932708740234 seconds
90

In [None]:
# plt = plot_learning_curve("Nearest Neighbor Learning Curve", train_sizes, mean_errors, std_errors)
# plt.savefig('coba-coba.png', bbox_inches='tight')

In [None]:
from sklearn.model_selection import KFold

# Demo: print which elements KFold puts in the training part and in the
# held-out part for each of the 10 splits of a 10-element array.
X = np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])
kf = KFold(n_splits=10)
for train, test in kf.split(X):
    print("%s %s" % (X[train], X[test]))