In [13]:
import numpy as np
import editdistance
import keras
import random
from keras.datasets import mnist
from sklearn.model_selection import train_test_split

(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

#reshaping images
train_images = np.reshape(train_images, (-1, 784))
test_images = np.reshape(test_images, (-1, 784))

# normalize
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

random_sample_indices = random.sample(range(train_images.shape[0]), 20000)
train_images_25 = train_images[random_sample_indices]
train_labels_25 = train_labels[random_sample_indices]


train_images_final_80, validation_images_final_10, train_labels_final_80,validation_labels_final_10 = train_test_split(train_images_25, train_labels_25, test_size=0.1, random_state=42)

print("Final train dataset size: ", train_images_final_80.shape)
print("Final validation dataset size: ", validation_images_final_10.shape)


Final train dataset size:  (18000, 784)
Final validation dataset size:  (2000, 784)


In [180]:
euclidean_distance_train_mat = np.zeros((len(train_images_final_80), len(train_images_final_80)))

for i in range(len(train_images_final_80)):
  for j in range(len(train_images_final_80)):
    euclidean_distance = np.linalg.norm(train_images_final_80[i] - train_images_final_80[j])
    euclidean_distance_train_mat[i][j] = euclidean_distance

In [181]:
euclidean_distance_validation_mat = np.zeros((len(validation_images_final_10), len(validation_images_final_10)))

for i in range(len(validation_images_final_10)):
  for j in range(len(validation_images_final_10)):
    euclidean_distance = np.linalg.norm(validation_images_final_10[i] - validation_images_final_10[j])
    euclidean_distance_validation_mat[i][j] = euclidean_distance

In [2]:
euclidean_distance_test_mat = np.zeros((len(test_images), len(test_images)))
for i in range(len(test_images)):
  for j in range(len(test_images)):
    euclidean_distance = np.linalg.norm(test_images[i] - test_images[j])
    euclidean_distance_test_mat[i][j] = euclidean_distance

(10000, 784)


In [None]:
cosine_similarity_mat = np.zeros((len(train_images_25), len(train_images_25)))
for i in range(len(train_images_25)):
  for j in range(len(train_images_25)):
    cosine_similarity = np.dot(train_images_25[i], train_images_25[j]) / (np.linalg.norm(train_images_25[i]) * np.linalg.norm(train_images_25[j]))
    cosine_similarity_mat[i][j] = cosine_similarity

#### Calculating edit distance between 2 vectors, taking 0.5 as threshold value

In [20]:
def calculate_edit_distance(image1, image2):
    differing_pixels = np.sum(np.abs(image1 - image2) >= 0.5)
    edit_distance = differing_pixels / 784  # 28x28 = 784 (total number of pixels in MNIST images)
    return edit_distance

edit_distance_mat = np.zeros((len(train_images_25), len(train_images_25)))
for i in range(len(train_images_25)):
  for j in range(len(train_images_25)):
    distance = calculate_edit_distance(train_images_25[i], train_images_25[j])
    # print("Edit distance:", distance)

In [184]:
from collections import Counter

class KNN:
  def __init__(self, k=20):
    self.k = k

  def fit(self, X, y): #X=train_image, y=train_label
    self.X_train = X
    self.y_train = y


  def predict(self, euclidian_dist_mat):
    predictions_list = []

    for i in range(len(euclidian_dist_mat)):
      k_indices = np.argsort(euclidian_dist_mat[i])[:self.k]
      k_nearest_labels = [self.y_train[i] for i in k_indices]
      mode = Counter(k_nearest_labels).most_common()
      predictions_list.append(mode[0][0])
    return predictions_list

In [213]:
from sklearn.metrics import accuracy_score
clf = KNN(k=20)
clf.fit(train_images_final_80, train_labels_final_80)
prediction = clf.predict(euclidean_distance_train_mat)
#hyper-parameter tuning - validation dataset
#grid_serach_cv - sklearn


In [214]:
train_accuracy = accuracy_score(train_labels_final_80, prediction)
print("Train accuracy: ", train_accuracy)

Train accuracy:  0.9532222222222222


In [220]:
#Performing hyper parameter tuning here to get an optimum k value..
optimum_k = -1
max_accuracy = 0
for i in range(10, 50):
    clf2 = KNN(k=i)
    clf2.fit(validation_images_final_10, validation_labels_final_10)
    prediction2 = clf2.predict(euclidean_distance_validation_mat)
    validation_accuracy = accuracy_score(validation_labels_final_10, prediction2)
    if validation_accuracy > max_accuracy:
        optimum_k = i
        max_accuracy = validation_accuracy

print("Optimum K: ", optimum_k)
print("Validation accuracy: ", max_accuracy)

Optimum K:  10
Validation accuracy:  0.927


In [222]:
clf2 = KNN(k=10)
clf2.fit(test_images, test_labels)
prediction = clf2.predict(euclidean_distance_test_mat)
test_accuracy = accuracy_score(test_labels, prediction)

print("Test accuracy: ", test_accuracy)

Test accuracy:  0.9643
