# K-Nearest Neighbours (KNN)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training CSV into a DataFrame.
data_train = pd.read_csv('train_data/Classification_train.csv')
# Bare expression: as the last statement of a notebook cell it renders the frame.
data_train

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29996,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29997,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29998,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
data = data_train.to_numpy()

# Rows before this index train the model; the remaining rows are held out
# to measure accuracy.
split_at = 25000
train_rows, held_out_rows = data[:split_at], data[split_at:]

# Column 0 is used as the label and every other column as a feature.
# Features are divided by 255 (basic normalisation; presumably 8-bit pixel
# intensities -- confirm against the data source).
x_train = train_rows[:, 1:] / 255
y_train = train_rows[:, 0]
x_test = held_out_rows[:, 1:] / 255
# Held-out labels are kept as a column vector, matching the shape knn returns.
y_test = held_out_rows[:, 0].reshape(-1, 1)

In [4]:
def bincount(x, minlength=25001):
    """Count occurrences of each non-negative integer in ``x``.

    Thin wrapper around ``np.bincount`` that guarantees a fixed output length,
    so it can be mapped over rows with ``np.apply_along_axis`` (which requires
    every call to return an array of the same shape).

    Args:
        x: 1-D array of non-negative integers.
        minlength: minimum number of bins in the result. Defaults to 25001 to
            stay compatible with existing callers; pass ``max_label + 1`` to
            avoid allocating thousands of unused bins per row.

    Returns:
        1-D integer array whose entry ``i`` is the number of times ``i``
        appears in ``x``.
    """
    return np.bincount(x, minlength=minlength)

In [5]:
def knn(x_train,x_test,k,y_train):
    """Predict a label for each row of ``x_test`` by k-nearest-neighbour voting.

    Nearness is the squared Euclidean distance, evaluated for every
    (test row, training row) pair at once through the expansion
    ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b``.

    Args:
        x_train: 2-D array of shape (n_train, n_features).
        x_test: 2-D array of shape (n_test, n_features).
        k: number of neighbours that vote; must be at least 1. A value larger
            than n_train simply lets every training row vote.
        y_train: non-negative integer labels, one per training row, given
            either as a 1-D array or as a column vector.

    Returns:
        Integer array of shape (n_test, 1) with the most common label among
        the k nearest training rows. Ties go to the smallest label.

    Raises:
        ValueError: if ``k`` is smaller than 1 or ``y_train`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    labels = np.asarray(y_train).ravel()
    if labels.size == 0:
        raise ValueError("y_train must contain at least one label")

    # Pairwise squared distances, shape (n_test, n_train).
    train_sq = np.sum(np.square(x_train), axis=1)
    test_sq = np.sum(np.square(x_test), axis=1, keepdims=True)
    distance = train_sq + test_sq - 2 * (x_test @ x_train.T)

    # Indices of the k closest training rows for every test row.
    nearest = np.argsort(distance, axis=1)[:, :k]
    neighbour_labels = labels[nearest]

    # One vote bin per possible label. Sizing the histogram from the data
    # keeps the vote matrix at (n_test, n_classes) instead of allocating a
    # large fixed number of mostly empty bins for every test row.
    n_classes = int(labels.max()) + 1
    votes = np.apply_along_axis(
        np.bincount, 1, neighbour_labels, minlength=n_classes
    )

    # argmax returns the first maximum, so ties resolve to the lowest label.
    y_pred = np.argmax(votes, axis=1).reshape(-1, 1)
    return y_pred

In [6]:
def calculate_accuracy(predictions, true_labels):
    """Return the percentage of samples whose prediction equals the true label.

    Args:
        predictions: array with one prediction per sample along axis 0, either
            1-D or shaped (m, 1).
        true_labels: array of the ground-truth labels for the same samples,
            either 1-D or shaped (m, 1).

    Returns:
        Accuracy as a float between 0 and 100.

    Raises:
        ValueError: if the two inputs do not describe the same number of
            samples or have different per-sample widths.
        ZeroDivisionError: if there are no samples.
    """
    predictions = np.asarray(predictions)
    true_labels = np.asarray(true_labels)
    m = predictions.shape[0]
    if true_labels.shape[0] != m:
        raise ValueError(
            f"got {m} predictions but {true_labels.shape[0]} true labels"
        )
    if m == 0:
        raise ZeroDivisionError("cannot compute the accuracy of zero predictions")

    # Bring both inputs to (m, width) so a flat vector and a column vector
    # compare row by row instead of broadcasting into an (m, m) matrix.
    predicted_rows = predictions.reshape(m, -1)
    expected_rows = true_labels.reshape(m, -1)
    if predicted_rows.shape != expected_rows.shape:
        raise ValueError(
            f"shape mismatch: {predictions.shape} vs {true_labels.shape}"
        )

    # A sample counts as correct only when its whole row matches.
    correct = int(np.count_nonzero(np.all(predicted_rows == expected_rows, axis=1)))
    return correct / m * 100

In [None]:
import time

# Evaluate the classifier on the held-out split for several neighbour counts.
K_list = [1,3,5,7]
accuracy = []

# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
s = time.perf_counter()
for k in K_list:
    # y_pred is deliberately left bound after the loop: it then holds the
    # predictions for the last k in K_list.
    y_pred = knn(x_train,x_test,k,y_train)
    accuracy.append(calculate_accuracy(y_pred,y_test))
elapsed = time.perf_counter() - s
print(f"Time Taken by model to run = {elapsed} sec")

Time Taken by model to run = 137.60401940345764 sec


In [None]:
# Line chart of held-out accuracy (in percent) for each k that was tried.
fig, ax = plt.subplots()
ax.plot(K_list, accuracy, marker='o')
ax.set_title('accuracy vs k')
ax.set_xlabel('value of k')
ax.set_ylabel('accuracy')
plt.show()

## Visualising predictions on the validation split

In [None]:
# Show ten random held-out samples with their true and predicted labels.
# NOTE: run this before any cell that rebinds x_test, otherwise the images
# no longer correspond to y_test / y_pred.
fig,axs = plt.subplots(1,10)
n_samples = y_test.shape[0]  # draw indices from the actual held-out size
for ax in axs:
    i = np.random.randint(0,n_samples)
    # The image must come from the same split as the labels in the title.
    a = x_test[i,:]
    ax.imshow(a.reshape(28,28))
    ax.set_title(f"{y_test[i,:]},{y_pred[i,:]}")
    ax.axis('off')
print("Left side actual value , Right Side predicted value")

### Making Predictions on Test Data and Saving Results

In [None]:
# Read the test CSV and keep both forms: the DataFrame (tes_data) for display
# and a NumPy array (test_data) for the numeric work below.
tes_data = pd.read_csv('test_data/Classification_test.csv')
test_data = tes_data.to_numpy()

In [None]:
# Render the raw test DataFrame for a visual check.
tes_data

In [None]:
# Scale the test features the same way as the training features (divide by 255).
# Column 0 of test_data is skipped here; it is written out later as the ID.
# NOTE(review): this rebinds x_test, replacing the held-out validation split
# created earlier -- re-running the validation cells afterwards would mix
# these rows with the old y_test.
x_test2 = test_data[:, 1:]
x_test = x_test2 / 255

# For the final model every labelled row is used for training.
x_training = data[:, 1:] / 255

In [None]:
# Predict labels for the test set with k=1, the value the author found to give
# the highest accuracy in the k sweep. All labelled rows are used as the
# training set, with column 0 of `data` as their labels.
y_predicted = knn(x_training,x_test,1,data[:,0])

In [None]:
# Assemble the result table: column 0 carries the test-row ID (first column of
# the test file), column 1 the predicted label.
# The row count is taken from the test data rather than hard-coded, so the
# cell keeps working when the test set size changes.
n_rows = test_data.shape[0]
# NOTE(review): np.zeros yields a float array, so IDs and labels are stored
# (and later written) as floats -- confirm that is the intended output format.
final_data = np.zeros((n_rows, 2))
final_data[:, 0] = test_data[:, 0]
final_data[:, 1] = y_predicted[:,0]

In [None]:
# Wrap the result array in a DataFrame with named columns.
df2 = pd.DataFrame(final_data, columns=["ID's", 'Predictions'])
# Bare expression so the notebook renders the table.
df2

In [None]:
# Saving predictions to CSV
from pathlib import Path

output_path = Path('predicted_data/knn_predicted_final.csv')
# to_csv does not create missing directories; make sure the target folder
# exists so the save cannot fail after the long prediction run.
output_path.parent.mkdir(parents=True, exist_ok=True)
df2.to_csv(output_path, index=False)