# Functions for nearest-neighbour (NN) classification

In [27]:
#!/usr/bin/env python3
from scipy.io import loadmat
from random import sample
import numpy as np
import time
import sys

def calculate_distances_matrix(test_vectors, train_vectors):
    """Return the pairwise Euclidean distances between two sets of vectors.

    The matrix is computed with the expansion
    ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b`` so that the bulk of the
    work is a single matrix product.

    Args:
        test_vectors: 2-D array-like with one vector per row.
        train_vectors: 2-D array-like with one vector per row and the same
            number of columns as ``test_vectors``.

    Returns:
        A float array of shape ``(n_test, n_train)`` whose entry ``[i, j]``
        is the distance between ``test_vectors[i]`` and ``train_vectors[j]``.
    """
    # Work in floating point: squaring small integer dtypes (e.g. uint8)
    # silently overflows, and plain lists have no ``.T`` attribute.
    test_vectors = np.asarray(test_vectors, dtype=float)
    train_vectors = np.asarray(train_vectors, dtype=float)

    sum_squared_of_test_vectors = np.sum(np.square(test_vectors), axis=1)
    sum_squared_of_train_vectors = np.sum(np.square(train_vectors), axis=1)
    matrix_multiplication = np.dot(test_vectors, train_vectors.T)

    squared_distances = (
        sum_squared_of_test_vectors[:, np.newaxis]
        + sum_squared_of_train_vectors[np.newaxis, :]
        - 2.0 * matrix_multiplication
    )
    # Rounding error can leave tiny negative values for (near-)identical
    # vectors; clamp them so the square root never yields NaN.
    np.maximum(squared_distances, 0.0, out=squared_distances)
    return np.sqrt(squared_distances)

def get_knn_label(dist_vector, label_vector, k):
    """Return the majority label among the ``k`` nearest neighbours.

    Args:
        dist_vector: 1-D array-like of distances from one query point to
            every training sample.
        label_vector: Sequence indexed like ``dist_vector`` whose items are
            themselves indexable; the label is read from position ``[0]``
            of each item (e.g. an ``(n, 1)`` label array).
        k: Number of neighbours that vote; must satisfy
            ``1 <= k <= len(dist_vector)``.

    Returns:
        A one-element NumPy array holding the winning label.

    Raises:
        ValueError: If ``k`` is outside ``1 .. len(dist_vector)``.
    """
    distances = np.asarray(dist_vector)
    sample_count = len(distances)
    if not 1 <= k <= sample_count:
        raise ValueError(
            "k must be between 1 and the number of samples ("
            + str(sample_count) + "), got " + str(k)
        )

    # Partition around position k - 1 (not k): the first k entries are then
    # the k smallest distances, and k == len(dist_vector) stays in bounds.
    k_smallest_indexes = np.argpartition(distances, k - 1)[:k]

    count_dict = {}
    for idx in k_smallest_indexes:
        label = label_vector[idx][0]
        count_dict[label] = count_dict.get(label, 0) + 1

    # Note that ties are broken arbitrarily: when several labels share the
    # highest count, whichever was encountered first wins.
    final_label = max(count_dict, key=count_dict.get)
    return np.array([final_label])

def nn(X,Y,test,k):
    """Label every test vector with the label of its single nearest neighbour.

    Args:
        X: Training vectors, one per row.
        Y: Training labels, indexed like the rows of ``X``.
        test: Vectors to classify, one per row.
        k: Unused; kept so the signature matches ``knn``.

    Returns:
        A list with one entry of ``Y`` per test vector. The elapsed
        wall-clock time is printed as a side effect.
    """
    started_at = time.time()

    distances_matrix = calculate_distances_matrix(test, X)
    # For each test row, pick the training sample at the smallest distance.
    preds = [Y[np.argmin(dists_to_train)] for dists_to_train in distances_matrix]

    elapsed = time.time() - started_at
    print(str(elapsed) + " seconds")
    return preds

def knn(X,Y,test,k):
    """Label every test vector by majority vote among its k nearest neighbours.

    Args:
        X: Training vectors, one per row.
        Y: Training labels, indexed like the rows of ``X``.
        test: Vectors to classify, one per row.
        k: Number of neighbours that take part in the vote.

    Returns:
        A list with one ``get_knn_label`` result per test vector. The
        elapsed wall-clock time is printed as a side effect.
    """
    started_at = time.time()

    distances_matrix = calculate_distances_matrix(test, X)
    # Each prediction is appended as an array, not as a bare integer.
    preds = [
        get_knn_label(dists_to_train, Y, k)
        for dists_to_train in distances_matrix
    ]

    elapsed = time.time() - started_at
    print(str(elapsed) + " seconds")
    return preds

In [3]:
# Sanity check: the three smallest distances (1, 2, 3) carry the labels
# 0, 1, 0, so the majority label among the 3 nearest neighbours is 0.
get_knn_label([1,2,3,4,5], [[0],[1],[0],[1],[1]], 3)

0

# Functions for visualization

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt

# def plot_learning_curve(title, train_sizes, mean_errors, std_errors):
#     plt.figure()
#     plt.title(title)
#     plt.xlabel("Training examples")
#     plt.ylabel("Average test error rates")
#     plt.grid()
#     plt.errorbar(train_sizes, mean_errors, yerr=std_errors, color="g", ecolor="r")

#     plt.legend(loc="best")
#     return plt

In [29]:
from sklearn.model_selection import KFold

if __name__ == '__main__':
    # Load the OCR dataset: feature rows under 'data', labels under 'labels'.
    ocr = loadmat('ocr.mat')
    train_data = ocr['data'].astype('float')
    labels = ocr['labels']

    # Build the 10 cross-validation folds once so every k sees the same splits.
    # Each entry is a (training part, validation part) pair.
    kf = KFold(n_splits=10)
    splitted_data = []
    splitted_labels = []
    for train, validation in kf.split(train_data):
        splitted_data.append((train_data[train], train_data[validation]))
        splitted_labels.append((labels[train], labels[validation]))

    # For each k in 1..10, average the validation error rate over all folds.
    mean_errors = []
    for k in range(1, 11):
        print('Executing for k = ' + str(k))

        test_err = np.zeros(10)
        fold_pairs = zip(splitted_data, splitted_labels)
        for i, (data_pair, label_pair) in enumerate(fold_pairs):
            fold_train_x, fold_val_x = data_pair
            fold_train_y, fold_val_y = label_pair
            preds = knn(fold_train_x, fold_train_y, fold_val_x, k)
            # Fraction of validation samples whose prediction differs from
            # the true label.
            test_err[i] = np.mean(preds != fold_val_y)

        mean_errors.append(np.mean(test_err))

        print('--------------------')

    # Summary: one line per k with its mean cross-validation error.
    for k, mean_error in enumerate(mean_errors, start=1):
        print("k = " + str(k) + ", mean error = " + str(mean_error))

Executing for k = 1
0.38361096382141113 seconds
0.3035609722137451 seconds
0.33700084686279297 seconds
0.34471607208251953 seconds
0.2901289463043213 seconds
0.2873101234436035 seconds
0.28937721252441406 seconds
0.2828998565673828 seconds
0.2939901351928711 seconds
0.2891242504119873 seconds
--------------------
Executing for k = 2
0.29347991943359375 seconds
0.29628968238830566 seconds
0.29941511154174805 seconds
0.2924158573150635 seconds
0.29862189292907715 seconds
0.3108518123626709 seconds
0.29622817039489746 seconds
0.2918362617492676 seconds
0.3048229217529297 seconds
0.29541707038879395 seconds
--------------------
Executing for k = 3
0.34310221672058105 seconds
0.3571460247039795 seconds
0.3526458740234375 seconds
0.35244202613830566 seconds
0.35442519187927246 seconds
0.35259294509887695 seconds
0.3621530532836914 seconds
0.34786033630371094 seconds
0.3615231513977051 seconds
0.35621094703674316 seconds
--------------------
Executing for k = 4
0.350632905960083 seconds
0.350

In [None]:
# plt = plot_learning_curve("Nearest Neighbor Learning Curve", train_sizes, mean_errors, std_errors)
# plt.savefig('coba-coba.png', bbox_inches='tight')

In [None]:
from sklearn.model_selection import KFold

# Demo of how KFold partitions ten samples into ten folds: each iteration
# prints the training values followed by the held-out values.
X = np.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19])
kf = KFold(n_splits=10)
for train, test in kf.split(X):
    print("%s %s" % (X[train], X[test]))