In [1]:
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [2]:
# function to load the data and preprocess the data
def read_preprocess(filename):
    f = open(filename)
    raw_data = f.readlines()
    
    point_lst = []
    for ele in raw_data:
        point = []
        for value in ele.split():
            point.append(float(value))
        point_lst.append(point)
        
    return np.array(point_lst)

# load the train set and test set

In [3]:
filename = './zip.train'
training_array = read_preprocess(filename)

In [4]:
filename = './zip.test'
testing_array = read_preprocess(filename)

In [5]:
X_train = training_array[:,1:]
Y_train = training_array[:,0]

X_test = testing_array[:,1:]
Y_test = testing_array[:, 0]

In [23]:
# test to see how many data points in train set and test set and see the shape of the data
print X_train.shape
print Y_train.shape
print X_test.shape
print Y_test.shape

(7291, 256)
(7291,)
(2007, 256)
(2007,)


# functions for calculating the distances and k-NN implementation

dist() function will calculate the distance of each test point to all the train points and store the values in dist_lst.

knn( ) function is the implementation of kNN.

score() function will calculate the accuracy of each kNN model.

In [7]:
def dist(X_test , X_train):
    dist_lst = []
    for point in X_test:
        dist_lst.append(np.sum(np.power(point - X_train, 2), axis = 1))
    return np.array(dist_lst)

In [24]:
def knn(k, dist_lst):
    
    prediction = []
    
    index = np.argsort(dist_lst, axis = 1)
    
    knn_points =  index[:, :k]
    
    for row in Y_train[knn_points]:
        prediction.append(Counter(row).most_common()[0][0])
    
    return np.array(prediction)

In [9]:
def score(predictions, Y_test):
    return np.sum([predictions == Y_test])/float(Y_test.shape[0])

In [10]:
dist_lst = dist(X_test, X_train)

For each row, there are 7291 values. 

For example, we can know that in row 0, these 7291 values mean the distances of the first test point to all data points in train set.

In [11]:
dist_lst.shape

(2007, 7291)

In [25]:
kone_predictions = knn(1, dist_lst)
ktwo_predictions = knn(2, dist_lst)
kthree_predictions = knn(3, dist_lst)

We use the sklearn library to build the confusion matrix. 

# 1-NN model

In [26]:
# check how many predctions are with label '0'
kone_predictions[kone_predictions==0].shape

(371,)

In [27]:
kone_confusion_matrix= confusion_matrix(Y_test, kone_predictions, labels=[0,1,2,3,4,5,6,7,8,9])
print kone_confusion_matrix

[[355   0   2   0   0   0   0   1   0   1]
 [  0 255   0   0   6   0   2   1   0   0]
 [  6   1 183   2   1   0   0   2   3   0]
 [  3   0   2 154   0   5   0   0   0   2]
 [  0   3   1   0 182   1   2   2   1   8]
 [  2   1   2   4   0 145   2   0   3   1]
 [  0   0   1   0   2   3 164   0   0   0]
 [  0   1   1   1   4   0   0 139   0   1]
 [  5   0   1   6   1   1   0   1 148   3]
 [  0   0   1   0   2   0   0   4   1 169]]


From the above code, we can see that the number of predictions with label '0' is 371. The 1-NN model predicted correctly that 355 are label '0', but 16 (6+3+2+5) points in test set are incorrectly predicted as label '0'.
So the accuracy of label '0' is:$\frac{355}{371}$

In [28]:
print np.around(kone_confusion_matrix/np.sum(kone_confusion_matrix, axis = 0, dtype='float'), decimals=3)

[[ 0.957  0.     0.01   0.     0.     0.     0.     0.007  0.     0.005]
 [ 0.     0.977  0.     0.     0.03   0.     0.012  0.007  0.     0.   ]
 [ 0.016  0.004  0.943  0.012  0.005  0.     0.     0.013  0.019  0.   ]
 [ 0.008  0.     0.01   0.922  0.     0.032  0.     0.     0.     0.011]
 [ 0.     0.011  0.005  0.     0.919  0.006  0.012  0.013  0.006  0.043]
 [ 0.005  0.004  0.01   0.024  0.     0.935  0.012  0.     0.019  0.005]
 [ 0.     0.     0.005  0.     0.01   0.019  0.965  0.     0.     0.   ]
 [ 0.     0.004  0.005  0.006  0.02   0.     0.     0.927  0.     0.005]
 [ 0.013  0.     0.005  0.036  0.005  0.006  0.     0.007  0.949  0.016]
 [ 0.     0.     0.005  0.     0.01   0.     0.     0.027  0.006  0.914]]


In [29]:
score(kone_predictions, Y_test)

0.94369706028898859

# 2-NN model

In [30]:
ktwo_confusion_matrix= confusion_matrix(Y_test, ktwo_predictions)
print ktwo_confusion_matrix

[[355   0   3   0   0   0   0   0   0   1]
 [  0 259   0   0   3   0   1   1   0   0]
 [ 10   1 178   1   1   0   0   2   5   0]
 [  3   0   2 154   0   3   0   0   2   2]
 [  0   3   4   0 173   1   2   2   1  14]
 [  4   1   2   8   0 137   0   0   4   4]
 [  4   0   1   0   2   2 160   0   1   0]
 [  0   2   1   1   4   0   0 133   1   5]
 [  5   0   3   1   0   1   0   1 153   2]
 [  1   0   1   0   2   0   0   4   1 168]]


In [31]:
print np.around(ktwo_confusion_matrix/np.sum(ktwo_confusion_matrix, axis = 0, dtype='float'), decimals= 3)

[[ 0.929  0.     0.015  0.     0.     0.     0.     0.     0.     0.005]
 [ 0.     0.974  0.     0.     0.016  0.     0.006  0.007  0.     0.   ]
 [ 0.026  0.004  0.913  0.006  0.005  0.     0.     0.014  0.03   0.   ]
 [ 0.008  0.     0.01   0.933  0.     0.021  0.     0.     0.012  0.01 ]
 [ 0.     0.011  0.021  0.     0.935  0.007  0.012  0.014  0.006  0.071]
 [ 0.01   0.004  0.01   0.048  0.     0.951  0.     0.     0.024  0.02 ]
 [ 0.01   0.     0.005  0.     0.011  0.014  0.982  0.     0.006  0.   ]
 [ 0.     0.008  0.005  0.006  0.022  0.     0.     0.93   0.006  0.026]
 [ 0.013  0.     0.015  0.006  0.     0.007  0.     0.007  0.911  0.01 ]
 [ 0.003  0.     0.005  0.     0.011  0.     0.     0.028  0.006  0.857]]


In [32]:
score(ktwo_predictions, Y_test)

0.93173891380169405

# 3-NN model

In [33]:
kthree_confusion_matrix= confusion_matrix(Y_test, kthree_predictions)
print kthree_confusion_matrix

[[355   0   2   0   0   0   0   0   1   1]
 [  0 258   0   0   3   0   2   1   0   0]
 [  8   0 183   1   1   0   0   2   3   0]
 [  3   0   2 153   0   6   0   1   0   1]
 [  0   2   0   0 182   2   2   2   1   9]
 [  5   0   3   3   0 144   0   0   1   4]
 [  3   1   1   0   2   0 163   0   0   0]
 [  0   1   1   1   4   0   0 138   1   1]
 [  4   0   1   4   0   1   0   1 152   3]
 [  1   0   0   0   3   0   0   4   1 168]]


In [34]:
print np.around(kthree_confusion_matrix/np.sum(kthree_confusion_matrix, axis = 0, dtype='float'), decimals= 3)

[[ 0.937  0.     0.01   0.     0.     0.     0.     0.     0.006  0.005]
 [ 0.     0.985  0.     0.     0.015  0.     0.012  0.007  0.     0.   ]
 [ 0.021  0.     0.948  0.006  0.005  0.     0.     0.013  0.019  0.   ]
 [ 0.008  0.     0.01   0.944  0.     0.039  0.     0.007  0.     0.005]
 [ 0.     0.008  0.     0.     0.933  0.013  0.012  0.013  0.006  0.048]
 [ 0.013  0.     0.016  0.019  0.     0.941  0.     0.     0.006  0.021]
 [ 0.008  0.004  0.005  0.     0.01   0.     0.976  0.     0.     0.   ]
 [ 0.     0.004  0.005  0.006  0.021  0.     0.     0.926  0.006  0.005]
 [ 0.011  0.     0.005  0.025  0.     0.007  0.     0.007  0.95   0.016]
 [ 0.003  0.     0.     0.     0.015  0.     0.     0.027  0.006  0.898]]


In [35]:
score(kthree_predictions, Y_test)

0.94469357249626307