In [2]:
!pip install scipy



# KNN with MNIST dataset

In [11]:
## Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import math
import scipy.io

## Data Gathering

In [13]:
# Load MATLAB file
mat_data = scipy.io.loadmat('mnistData.mat')

# Convert to NumPy array
array_data = mat_data['mnist']

## storing as numpy arrays

X_t = array_data['train_images'][0][0]
train_y_arr = array_data['train_labels'][0][0]

X_ts = array_data['test_images'][0][0]
test_y_arr = array_data['test_labels'][0][0]

## flattening (28*28 to 784)

a = []
for i in range (X_t.shape[2]):
    a.append(X_t[:,:,i].flatten())
    
train_X_arr = np.array(a)   
    
c = []
for i in range (X_ts.shape[2]):
    c.append(X_ts[:,:,i].flatten())
test_X_arr = np.array(c)

## concatenating the training and testing data to form training and testing dataset

training_data = np.concatenate((train_X_arr, train_y_arr), axis=1)


## Normalising the given dataset

In [14]:
## Normalising the given data

def normalize_data(data):
    min_val = np.min(data) 
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)  ## MIN-MAX normalisation
    return normalized_data

# normalized datasets
train_X_normalized = normalize_data(train_X_arr)
test_X_normalized = normalize_data(test_X_arr)

## Function for prediction with KNN

In [28]:
def evaluate_class(training_X_data, training_y_data, testing_X_data,testing_y_data, k):
    final_evaluations = []
    
    correct = 0.0

    for test_row,y in zip(testing_X_data,testing_y_data):
        distances = np.linalg.norm(training_X_data - test_row, axis=1)  # calculating the root mean square norm
        sorted_indices = np.argsort(distances)[:k]   # sorting first given indices
        k_nearest_neighbors = [(training_X_data[i], distances[i], training_y_data[i]) for i in sorted_indices]

        class_vals = [neighbor[2] for neighbor in k_nearest_neighbors]
        predicted_class = scipy.stats.mode(class_vals)[0][0]  # calculate mode to find out the majority
        
        if(predicted_class == y[0]):   # check if prediction is right
            correct = correct + 1

        final_evaluations.append([test_row, predicted_class])

    accuracy = (correct/len(testing_y_data))*100   # calculate accuracy

    return final_evaluations,accuracy

## Prediction with Normalized dataset

### With k == 1

In [72]:
c1,accuracy_normalized_1 = evaluate_class(train_X_normalized,train_y_arr,test_X_normalized,test_y_arr,1)
print("accuracy with normalized dataset with k = 1")
print(accuracy_normalized_1)
print("misclassification_rate with normalized dataset with k = 1")
print(1-(accuracy_normalized_1/100))

accuracy with normalized dataset with k = 1
96.91
misclassification_rate with normalized dataset with k = 1
0.03090000000000004


### With k == 3

In [74]:
c3,accuracy_normalized_3 = evaluate_class(train_X_normalized,train_y_arr,test_X_normalized,test_y_arr,3)
print("accuracy with normalized dataset with k = 3")
print(accuracy_normalized_3)
print("misclassification_rate with normalized dataset with k = 3")
print(1-(accuracy_normalized_3/100))

accuracy with normalized dataset with k = 3
97.05
misclassification_rate with normalized dataset with k = 3
0.02950000000000008


## Prediction with original dataset

### With k == 1

In [76]:
c11,accuracy_1 = evaluate_class(train_X_arr,train_y_arr,test_X_arr,test_y_arr,1)
print("accuracy with original dataset with k = 1")
print(accuracy_1)
print("misclassification_rate with original dataset with k = 1")
print(1-(accuracy_1/100))

accuracy with original dataset with k = 1
50.519999999999996
misclassification_rate with original dataset with k = 1
0.4948


### with k == 3

In [78]:
c13,accuracy_3 = evaluate_class(train_X_arr,train_y_arr,test_X_arr,test_y_arr,3)
print("accuracy with original dataset with k = 3")
print(accuracy_3)
print("misclassification_rate with original dataset with k = 3")
print(1-(accuracy_3/100))

accuracy with original dataset with k = 3
43.72
misclassification_rate with original dataset with k = 3
0.5628


## Permuting the columns

In [25]:
def permute_columns(train_matrix,test_matrix):
    num_columns = train_matrix.shape[1]
    permutation_indices = np.random.permutation(num_columns)
    return train_matrix[:, permutation_indices],test_matrix[:, permutation_indices]

X_train_permuted,X_test_permuted = permute_columns(train_X_arr,test_X_arr)

## Prediction with permuted matrices

In [29]:
cp,accuracy_permutation_1 = evaluate_class(X_train_permuted,train_y_arr,X_test_permuted,test_y_arr,1)
print("accuracy with normalized dataset with k = 1")
print(accuracy_permutation_1)
print("misclassification_rate with normalized dataset with k = 1")
print(1-(accuracy_permutation_1/100))

accuracy with normalized dataset with k = 1
50.519999999999996
misclassification_rate with normalized dataset with k = 1
0.4948


In [None]:
## We can see that after permuting the dataset our misclassification rate is not changing