# Image Classification using K-Nearest Neighbours

KNN is seldom (if ever) used to classify images! But here we will build a small KNN image classifier simply to give it a shot.

In [79]:
# Importing packages we need
import matplotlib.pyplot as plt
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

### The process in a nutshell
- (1) Create train, valid and test data out of image folders
- (2) Flatten our data! Perhaps also use PCA to perform dimension reduction
- (3) Fit our model, both for finding the K nearest neighbors and for predicting the classification labels
- (4) Count the accuracy rate or create a crosstab (confusion) matrix!

#### (1)

In [128]:
# STEP 1: create train, validation and test data from image folders using
# Keras' ImageDataGenerator (adapted from the project
# "Natural Images Classification CNNs").

# Root folder that contains the `train`, `valid` and `test` sub-folders.
# This is the only line to edit when the dataset lives somewhere else; a
# relative or an absolute path both work.
data_root = 'C:/Users/Darshil/gitly/Deep-Learning/My Projects/Classification_Natural Images'

# Paths the Keras image generator walks into, one per split.
train_path = data_root + '/train'
valid_path = data_root + '/valid'
test_path = data_root + '/test'

# Class names, one per sub-folder of each split. The same list (in the same
# order) is passed for all three splits so that a given class always maps to
# the same label index.
all_classes = ['airplane', 'car','cat', 'dog', 'flower', 'fruit', 'motorbike', 'person']


def _make_batches(directory, batch_size):
    """Return a Keras directory iterator over the images below `directory`.

    flow_from_directory() goes into `directory`, finds the images inside the
    class sub-folders named in `all_classes`, resizes each image and labels it
    with its folder's class.

    Parameters
    ----------
    directory : str
        Folder containing one sub-folder per class.
    batch_size : int
        Number of images returned by each `next()` call on the iterator.

    Returns
    -------
    The iterator produced by `ImageDataGenerator().flow_from_directory`.
    """
    return ImageDataGenerator().flow_from_directory(
        directory,
        target_size=(224, 224),  # every image is resized to this common size
        classes=all_classes,     # note: make sure the classes are named correctly!
        batch_size=batch_size,
        # The generator shuffles by default, which makes image order (and so
        # every index-based result further down) change from run to run.
        shuffle=False,
    )


# Each batch size equals the number of images in that split, so one `next()`
# call per generator returns the complete split.
train_batches = _make_batches(train_path, batch_size=808)
valid_batches = _make_batches(valid_path, batch_size=160)
test_batches = _make_batches(test_path, batch_size=88)

Found 808 images belonging to 8 classes.
Found 160 images belonging to 8 classes.
Found 88 images belonging to 8 classes.


In [129]:
# Materialise the training, validation and test data: one batch is pulled from
# each generator as an (images, one-hot labels) pair.
training_data_3d, training_data_labels = next(train_batches)
valid_data_3d, valid_data_labels = next(valid_batches)
test_data_3d, test_data_labels = next(test_batches)

# A single batch only covers the whole split when the generator's batch size
# is at least the number of images found on disk. Fail loudly instead of
# silently continuing with a truncated split.
for split_name, split_images, split_batches in (
        ('train', training_data_3d, train_batches),
        ('valid', valid_data_3d, valid_batches),
        ('test', test_data_3d, test_batches)):
    assert len(split_images) == split_batches.samples, (
        '%s: loaded %d of %d images - increase batch_size'
        % (split_name, len(split_images), split_batches.samples))

#checking dimensions!
print ('Training data dimensions: ', training_data_3d.shape, ' Training labels dimensions: ',training_data_labels.shape, '\n',
      'Validation data dimensions: ', valid_data_3d.shape, ' Valid labels dimensions: ',valid_data_labels.shape, '\n',
      'Test data dimensions: ', test_data_3d.shape, ' Test labels dimensions: ',test_data_labels.shape)

Training data dimensions:  (808, 224, 224, 3)  Training labels dimensions:  (808, 8) 
 Validation data dimensions:  (160, 224, 224, 3)  Valid labels dimensions:  (160, 8) 
 Test data dimensions:  (88, 224, 224, 3)  Test labels dimensions:  (88, 8)


#### (2) Flatten data
It is vital to reshape our data so that the KNN algorithm can calculate distances between different datapoints. <br> 
- Here each datapoint is an image flattened to dimensions 1x150528 (224 x 224 pixels x 3 colour channels)
- In order to save time and make our model less computationally expensive, it may be wise to also incorporate dimension reduction using Principal Component Analysis

In [134]:
# First we flatten our data: every image becomes a single row vector.
# The row count is taken from the array itself and -1 lets NumPy infer the
# flattened length, so nothing here has to be edited when the number of
# images or the target image size changes.
training_data = training_data_3d.reshape(len(training_data_3d), -1)
valid_data = valid_data_3d.reshape(len(valid_data_3d), -1)
test_data = test_data_3d.reshape(len(test_data_3d), -1)

print ('Flattened dimensions','\n',
       '----------------------- \n',
      'training: ', training_data.shape, '\n',
       'valid', valid_data.shape, '\n',
      'test: ',test_data.shape )

Flattened dimensions 
 ----------------------- 
 training:  (808, 150528) 
 valid (160, 150528) 
 test:  (88, 150528)


#### (3) Now we do the following:
- (i) fit model to find nearest neighbors
- (ii) use KNN to classify test data
- (iii) creating a confusion matrix!

In [135]:
# (i) Fit a model to find nearest neighbors
#
# Build an unsupervised 7-nearest-neighbour index over the flattened training
# images (fit() returns the fitted estimator, so both steps are chained).
nearest_imgs = NearestNeighbors(n_neighbors=7).fit(training_data)

# Find the K nearest neighbors of one test image (row 4 of test_data).
# kneighbors() expects a 2-D array, hence the reshape to a single row.
# It returns a pair of arrays:
#   - first array : distances from the query image to each of its K nearest images
#   - second array: row indexes of those images. The model was fitted on
#     training_data, so the indexes point into training_data.
# NOTE(review): a distance of exactly 0 for the closest neighbour would mean the
# query image is pixel-identical to a training image - worth checking for
# duplicates between the train and test folders.
prediction_test = nearest_imgs.kneighbors(test_data[4].reshape(1, -1))
prediction_test

(array([[    0.        , 21030.71113871, 22439.20513298, 22472.96197656,
         22699.82270856, 22805.07717593, 23461.12429105]]),
 array([[331, 517, 677, 662, 746, 329, 180]], dtype=int64))

In [136]:
# These are the closest training images (each row is one flattened image).
# prediction_test is a (distances, indexes) pair; [1] picks the indexes and
# [0] picks the row belonging to our single query image. The index array is
# used directly: wrapping it in an extra list only worked through NumPy's old
# "list treated as tuple" indexing, and on current NumPy it adds a spurious
# leading axis to the result.
training_data[prediction_test[1][0]]

array([[  6.,   6.,   6., ...,   6.,   6.,   6.],
       [ 26.,  23.,  42., ...,  56.,  57.,  51.],
       [ 27.,  29.,  24., ..., 137., 147., 136.],
       ...,
       [ 33.,  37.,  40., ...,  58.,  37.,  32.],
       [ 53.,  45.,  43., ...,  49.,  40.,  33.],
       [ 25.,  35.,  27., ...,  27.,  34.,  40.]], dtype=float32)

In [137]:
# (ii) Use KNN to classify test data
#
# - Important to note that we need to convert our one-hot coded labels to integers
#   (https://stackoverflow.com/questions/42497340/how-to-convert-one-hot-encodings-into-integers)

# Initialize our KNN model with 7 neighbours and the euclidean distance
knn = KNeighborsClassifier(n_neighbors=7,
                           p=2)  # p=2 for euclidean distance

# Converting our one-hot coded labels to integers: the position of the single 1
# in each row is the class index, which is exactly what argmax along axis 1
# returns (one vectorised call instead of a Python loop per row).
knn_training_labels = np.argmax(training_data_labels, axis=1)
knn_valid_labels = np.argmax(valid_data_labels, axis=1)
knn_test_labels = np.argmax(test_data_labels, axis=1)


# Fitting our model (last expression, so the notebook displays the estimator)
knn.fit(training_data, knn_training_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [138]:
# Let's try predicting the class of a single image from test_data (row 1).
# predict() expects a 2-D array, so the image is reshaped into a one-row matrix.
# The result is the integer class index learned from knn_training_labels.
knn.predict(test_data[1].reshape(1, -1))

array([2], dtype=int64)

In [139]:
# Predicting on the entire validation dataset.
valid_preds = knn.predict(valid_data)

# Despite its name, `test_preds` has always held the *validation* predictions;
# the old name is kept as an alias so later cells that use it keep working.
test_preds = valid_preds

In [145]:
# Creating a confusion matrix for the validation split: true labels against
# the predictions held in test_preds, with the eight class indexes 0..7 listed
# explicitly so every class gets a row and a column.
confusion_matrix(y_true=knn_valid_labels,
                 y_pred=test_preds,
                 labels=list(range(8)))

array([[18,  0,  2,  0,  0,  0,  0,  0],
       [ 3,  8,  5,  3,  0,  1,  0,  0],
       [ 0,  0, 20,  0,  0,  0,  0,  0],
       [ 2,  0,  9,  6,  0,  0,  0,  3],
       [ 4,  1,  6,  1,  6,  0,  0,  2],
       [ 0,  0,  0,  0,  0, 20,  0,  0],
       [ 4,  0, 11,  2,  0,  0,  3,  0],
       [ 0,  0,  1,  2,  0,  0,  0, 17]], dtype=int64)