In [8]:
import numpy as np
import scipy.io as spio
import scipy.stats as stats
import math
import time

def EuclideanDistance(x,y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Both inputs are converted to float64 before they are subtracted.
    Without that conversion, unsigned integer inputs (for example uint8
    pixel values) wrap around whenever ``y[i] < x[i]``, which silently
    produces a wrong, asymmetric distance.

    Args:
        x: array-like of numbers.
        y: array-like of numbers with a shape compatible with ``x``.

    Returns:
        The L2 norm of ``y - x`` as a NumPy float.
    """
    diff = np.asarray(y, dtype=np.float64) - np.asarray(x, dtype=np.float64)
    return np.linalg.norm(diff)

def L1Distance(x,y):
    """Return the L1 (Manhattan) distance between ``x`` and ``y``.

    Both inputs are converted to float64 before they are subtracted so
    that unsigned integer inputs (for example uint8 pixel values) cannot
    wrap around when ``y[i] < x[i]``.

    Args:
        x: array-like of numbers.
        y: array-like of numbers with a shape compatible with ``x``.

    Returns:
        The sum of the absolute element-wise differences as a NumPy float.
    """
    diff = np.asarray(y, dtype=np.float64) - np.asarray(x, dtype=np.float64)
    return np.sum(np.abs(diff))

def LinfDistance(x,y):
    """Return the L-infinity (Chebyshev) distance between ``x`` and ``y``.

    Both inputs are converted to float64 before they are subtracted so
    that unsigned integer inputs (for example uint8 pixel values) cannot
    wrap around when ``y[i] < x[i]``. The maximum is taken with
    ``np.max`` over every element, so inputs with more than one
    dimension are handled as well as flat vectors.

    Args:
        x: array-like of numbers.
        y: array-like of numbers with a shape compatible with ``x``.

    Returns:
        The largest absolute element-wise difference as a NumPy float.

    Raises:
        ValueError: if the inputs are empty (there is no maximum).
    """
    diff = np.asarray(y, dtype=np.float64) - np.asarray(x, dtype=np.float64)
    return np.max(np.abs(diff))

class kNNModel:
    """k-nearest-neighbour classifier with a pluggable distance function.

    The number of neighbours ``k`` is chosen in ``train_model`` as the
    natural log of the training-set size rounded to the nearest integer,
    but never less than 1.
    """

    def __init__(self, dist_function):
        """Create an untrained model.

        Args:
            dist_function: callable ``(x, y) -> number`` giving the distance
                between two vectors; smaller means closer.
        """
        self.dist_function = dist_function

    def __dist(self, x, lst, lst_labels):
        """Pair every vector in ``lst`` with its label and distance to ``x``.

        Returns:
            A list of ``(vector, label, distance)`` tuples in the same
            order as ``lst``.
        """
        dists = [self.dist_function(x, e) for e in lst]
        return list(zip(lst, lst_labels, dists))

    def train_model(self,training_data,labels):
        """Store the training set and choose ``k``.

        Args:
            training_data: array-like with one training vector per row.
            labels: sequence holding one label per training vector.

        Raises:
            ValueError: if ``training_data`` is empty or its length differs
                from the length of ``labels``.
        """
        # np.asarray lets plain lists be passed as well as ndarrays.
        data = np.asarray(training_data).tolist()
        if not data:
            raise ValueError("training_data must contain at least one sample")
        if len(labels) != len(data):
            raise ValueError("training_data and labels must have the same length")

        # k is log(# of data points) rounded to the nearest integer. It is
        # clamped to 1 because log(1) == 0 would leave predict() with no
        # neighbours to vote.
        self.k = max(1, int(np.log(len(data)) + 0.5))
        self.data = data
        self.labels = labels
        self.train_size = len(data)

    def predict(self,x):
        """Return the majority label among the ``k`` nearest training vectors.

        Ties between labels go to the label whose first occurrence is
        closest to ``x``.
        """
        neighbours = self.__dist(x, self.data, self.labels)
        # list.sort is stable, so equidistant vectors keep training order.
        neighbours.sort(key=lambda entry: entry[2])

        votes = {}
        for _, label, _ in neighbours[:self.k]:
            votes[label] = votes.get(label, 0) + 1

        # max() returns the first key with the highest count, i.e. the one
        # inserted earliest (closest neighbour) when counts are tied.
        return max(votes, key=votes.get)

    def test_model(self,test_data,labels):
        """Classify every test vector and summarise the outcome.

        Args:
            test_data: sequence of test vectors.
            labels: sequence with the true label of each test vector,
                indexed in the same order as ``test_data``.

        Returns:
            A dict with the keys ``'Test Error'`` (fraction misclassified),
            ``'Training Size'``, ``'Testing Size'``, ``'Predicted'`` and
            ``'Actual'`` (the last two are lists in test order).

        Raises:
            ZeroDivisionError: if ``test_data`` is empty.
        """
        self.test_size = len(test_data)
        predicted = []
        actual = []
        misses = 0

        for i, sample in enumerate(test_data):
            pred = self.predict(sample)
            truth = labels[i]
            predicted.append(pred)
            actual.append(truth)
            if pred != truth:
                misses += 1

        return {
            'Test Error': float(misses) / self.test_size,
            'Training Size': self.train_size,
            'Testing Size': self.test_size,
            'Predicted': predicted,
            'Actual': actual,
        }
    
# Load the dataset: 'X' holds one sample per row, 'Y' the matching labels.
mat = spio.loadmat('hw1data.mat', squeeze_me=True)
image_matrix = mat['X']
label_array = mat['Y']

# Fixed split shared by all three experiments: rows 0-999 for training and
# rows 1001-1100 for testing. NOTE: row 1000 falls in neither slice.
train_images, train_labels = image_matrix[:1000, :], label_array[:1000]
test_images, test_labels = image_matrix[1001:1101, :], label_array[1001:1101]

# Euclidean (L2) metric -- the only run that is timed.
tstart = time.time()
kNN_model_L2 = kNNModel(EuclideanDistance)
kNN_model_L2.train_model(train_images, train_labels)
print(kNN_model_L2.test_model(test_images, test_labels))
tend = time.time()
print("Total time taken: " + str(tend - tstart))

# Manhattan (L1) metric.
kNN_model_L1 = kNNModel(L1Distance)
kNN_model_L1.train_model(train_images, train_labels)
print(kNN_model_L1.test_model(test_images, test_labels))

# Chebyshev (L-infinity) metric.
kNN_model_Linf = kNNModel(LinfDistance)
kNN_model_Linf.train_model(train_images, train_labels)
print(kNN_model_Linf.test_model(test_images, test_labels))

{'Test Error': 0.11, 'Training Size': 1000, 'Testing Size': 100, 'Predicted': [2, 7, 9, 7, 7, 6, 9, 9, 0, 1, 4, 8, 2, 6, 1, 9, 7, 3, 8, 9, 6, 3, 7, 7, 1, 1, 1, 3, 9, 1, 1, 4, 6, 8, 2, 8, 0, 3, 5, 0, 5, 0, 2, 7, 1, 7, 5, 1, 2, 2, 2, 0, 3, 9, 5, 2, 1, 9, 8, 7, 2, 5, 4, 0, 2, 2, 9, 7, 6, 0, 1, 9, 1, 2, 5, 7, 1, 4, 1, 2, 7, 5, 2, 7, 5, 1, 8, 2, 5, 0, 8, 5, 6, 5, 8, 0, 9, 1, 0, 1], 'Actual': [2, 2, 9, 5, 7, 6, 4, 9, 0, 1, 4, 8, 2, 6, 1, 9, 7, 3, 8, 9, 6, 3, 7, 7, 1, 1, 7, 3, 9, 1, 1, 4, 6, 8, 2, 8, 0, 3, 3, 0, 5, 0, 2, 7, 1, 7, 5, 1, 2, 2, 2, 2, 3, 9, 5, 2, 1, 9, 8, 7, 2, 5, 4, 0, 2, 2, 4, 7, 6, 0, 1, 9, 2, 2, 5, 7, 1, 4, 1, 2, 7, 5, 2, 7, 5, 1, 8, 2, 5, 0, 8, 5, 6, 3, 3, 0, 9, 1, 0, 7]}
Total time taken: 26.616002559661865
{'Test Error': 0.14, 'Training Size': 1000, 'Testing Size': 100, 'Predicted': [2, 7, 9, 7, 7, 6, 9, 9, 0, 1, 4, 8, 2, 6, 1, 9, 7, 3, 8, 9, 6, 3, 7, 7, 1, 1, 1, 3, 9, 1, 1, 4, 6, 8, 2, 3, 0, 3, 5, 0, 5, 0, 2, 7, 1, 7, 5, 1, 2, 2, 2, 2, 3, 9, 5, 7, 1, 4, 8, 7, 2, 5, 4, 0, 