# Lab 2, Nearest Neighbour (NN)
by Domrachev Ivan, B20-RO-01

In [1]:
import torch
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt

import torch
import torchvision
import torchvision.transforms as transforms
from tqdm.notebook import tqdm 

from collections.abc import Iterable, Callable

## Part 1. Finishing lab task

Here, the task is to continue considering a toy example from the lab and test the second test data and consider another metric.

In [2]:
# Generating train data
train_data = np.array([
    np.eye(4) + np.eye(4)[::-1],
    np.ones((4,4))
])
train_data[1, 1:3, 1:3] = 0
print(train_data[0])
print(train_data[1])

train_labels = np.array([0, 1])

[[1. 0. 0. 1.]
 [0. 1. 1. 0.]
 [0. 1. 1. 0.]
 [1. 0. 0. 1.]]
[[1. 1. 1. 1.]
 [1. 0. 0. 1.]
 [1. 0. 0. 1.]
 [1. 1. 1. 1.]]


In [3]:
# Generating test data
test_data = np.copy(train_data)
test_data[0, ((1,2)), ((2,1)),] = 0
test_data[1, ((1, 3)), ((3, 2)),] = 0
test_data

array([[[1., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 1.]],

       [[1., 1., 1., 1.],
        [1., 0., 0., 0.],
        [1., 0., 0., 1.],
        [1., 1., 0., 1.]]])

In [4]:
def KNN(
        train_ds: Iterable[np.array, np.array], 
        test_data: np.array, 
        dist_func: Callable[[np.array, np.array], np.array],
        K: int = 1
) -> int:
    """Predicts class of test_data given train_data
    
        Keyword arguments:
        train_ds -- (train_data, train_labels), the actual dataset
        test_data -- the data to be tested
        dist_func -- function to measure the distance
        
        Returns:
        Class prediction, according to the train_labels"""
    train_data, train_labels = train_ds
    distance = dist_func(train_data, test_data)
    if K == 1:
        return train_labels[distance.argmin()]
    else:
        # Get K indices of K least elements
        closest_classes = train_labels[np.argpartition(-distance, K)[:K]]
        # Return the most frequent inxed. If there are several,
        # return the first one (a.k.a. the smallest)
        return train_labels[
            closest_classes[closest_classes.argmax()]
        ]

In [5]:
def abs_dist(train_data: np.array, test_data: np.array) -> np.array:
    """ Measures distance as absolute distance between entries 
        
        Keyword arguments:
        train_data -- the actual dataset
        test_data -- the data to be tested
            
        Returns:
        Absolute distance for each train_data entry"""
    return np.abs(
        train_data - test_data
    ).sum(
        axis = tuple(
            i for i in range(1, train_data.ndim)
        )
    )


def mult_dist(train_data: np.array, test_data: np.array) -> np.array:
    """ Measures distance as matrix multiplication 
        
        Keyword arguments:
        train_data -- the actual dataset
        test_data -- the data to be tested
            
        Returns:
        Multiplication distance for each train_data entry"""
    # Apparently, one single multiplication is longer to compute
    # than many small ones (this becomes important for CIFAR10)
    sum = 0
    for train_entry in train_data:
        sum += (train_entry.T @ test_data.T).sum()
    
    return sum

In [6]:
# Making predictions for the data
pred1 = [
    KNN(
        (train_data, train_labels), 
        test_data_i, 
        abs_dist
    ) for test_data_i in test_data
]
pred2 = [
    KNN(
        (train_data, train_labels), 
        test_data_i, 
        mult_dist
    ) for test_data_i in test_data
]

pred1_labeled = ["cross" if pred1_i else "circle" for pred1_i in pred1]
pred2_labeled = ["cross" if pred2_i else "circle" for pred2_i in pred2]

pred1_labeled, pred2_labeled

(['circle', 'cross'], ['circle', 'circle'])

Overall, one could say that absolute distance metric (`abs_dist`) works better for this task, because it shows better performance for the second test entry. It is correct for the first metric, but incorrect for the second

## Part 2. Testing 2 KNN implementation

### 2.1. Implementing 2-KNN manually

First, one need to load CIFAR10 dataset:

In [7]:
transform = transforms.Compose([
    transforms.ToTensor(),
    # This normalization values are precomputed for CIFAR-10 dataset,
    # refer to: https://github.com/kuangliu/pytorch-cifar/issues/19
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2471, 0.2435, 0.2616)),
])

# Train
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, 
                                          shuffle=True, num_workers=2)

# Test 
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset,
                                         shuffle=False, num_workers=2)

# List of classes
classes = trainset.classes

Files already downloaded and verified
Files already downloaded and verified


Let's convert the trainset and testset to the numpy arrays and test it's performance with our CNN implementation

In [8]:
train_data_np = trainset.data
train_labels_np = np.array(trainset.targets)
test_data_np = testset.data
test_labels_np = np.array(testset.targets)

In [9]:
# ~16 seconds
N_pred1 = 1000
pred1 = np.zeros(N_pred1)
test_loop_1 = tqdm(
    enumerate(test_data_np[:N_pred1], 0), total=N_pred1, desc="Test absolute value"
)

for i, input in test_loop_1:
    pred1[i] = KNN (
        (train_data_np, train_labels_np), 
        input, 
        abs_dist
    )
    

Test absolute value:   0%|          | 0/1000 [00:00<?, ?it/s]

In [11]:
# ~7.5 minutes
N_pred2 = 100
pred2 = np.zeros(N_pred2)
test_loop = tqdm(
    enumerate(test_data_np[:N_pred2], 0), total=N_pred2, desc="Test multiplication"
)

for i, input in test_loop:
    pred2[i]= KNN (
        (train_data_np, train_labels_np), 
        input, 
        mult_dist
    )
    

Test multiplication:   0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
acc1 = (pred1 == test_labels_np[:N_pred1]).sum() / N_pred1
acc2 = (pred2 == test_labels_np[:N_pred2]).sum() / N_pred2

print(f"Accuracy of absolute distance: {acc1}")
print(f"Accuracy of product distance: {acc2}")

Accuracy of absolute distance: 0.253
Accuracy of product distance: 0.16


As one could see, the accuracy is quite low without encoding

### 2.2. Testing encoded data 

For a prepared encoder, I've decided to borrow a RESNET model, trained on CIFAR-10 from [here](https://github.com/huyvnphan/PyTorch_CIFAR10/tree/master).

In [13]:
# !git submodule add https://github.com/huyvnphan/PyTorch_CIFAR10.git

In [14]:
from PyTorch_CIFAR10.cifar10_models.resnet import resnet50

model = resnet50()
ckpt = torch.load("data/resnet50.pt")
model.load_state_dict(ckpt)
encoding_model = torch.nn.Sequential(*(list(model.children())[:-1]));
# from torchvision.models import resnet50, ResNet50_Weights

# model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
# encoding_model = torch.nn.Sequential(*(list(model.children())[:-1]))

In [15]:
trainset_encoded = []
with torch.no_grad():
    encoding_model.eval()  # evaluation mode
    test_loop = tqdm(enumerate(trainloader, 0), total=len(trainloader), desc="Train encoding")
    for i, inputs in test_loop:
        trainset_encoded.append(encoding_model(inputs[0]))

Train encoding:   0%|          | 0/50000 [00:00<?, ?it/s]

In [16]:
testset_encoded = []
with torch.no_grad():
    encoding_model.eval()  # evaluation mode
    test_loop = tqdm(enumerate(testloader, 0), total=len(testloader), desc="Test encoding")
    for i, inputs in test_loop:
        testset_encoded.append(encoding_model(inputs[0]))

Test encoding:   0%|          | 0/10000 [00:00<?, ?it/s]

In [19]:
trainset_encoded = np.array([entry.numpy() for entry in trainset_encoded]).reshape(50000, -1)
testset_encoded = np.array([entry.numpy() for entry in testset_encoded]).reshape(10000, -1)

trainset_encoded.shape, testset_encoded.shape

((50000, 2048), (10000, 2048))

Let's first try to test the encoded values with previous metric -- absolute distance:

In [20]:
N_pred_encoded = 1000
pred_encoded = np.zeros(N_pred_encoded)
test_loop = tqdm(
    enumerate(testset_encoded[:N_pred_encoded], 0), total=N_pred_encoded, desc="Test encoded values, absolute distance"
)
for i, input in test_loop:
    pred_encoded[i]= KNN (
        (trainset_encoded, train_labels_np), 
        input, 
        abs_dist
    )
    

Test encoded values, absolute distance:   0%|          | 0/1000 [00:00<?, ?it/s]

In [21]:
acc_encoded = (pred_encoded == test_labels_np[:N_pred_encoded]).sum() / N_pred_encoded
print(f"Accuracy of encoded values, absolute distance: {acc_encoded}")

Accuracy of encoded values, absolute distance: 0.123


In [22]:
def cosine_similarity(train_data: np.array, test_data: np.array) -> np.array:
    """ Measures cosine similarity between inputs
        
        Keyword arguments:
        train_data -- the actual dataset
        test_data -- the data to be tested
            
        Returns:
        Cosine similarity for each train_data entry"""
    
    return train_data.dot(test_data) / (
        np.linalg.norm(train_data) * np.linalg.norm(test_data)
    )

In [23]:
N_pred_encoded = 1000
pred_encoded = np.zeros(N_pred_encoded)
test_loop = tqdm(
    enumerate(testset_encoded[:N_pred_encoded], 0), total=N_pred_encoded, desc="Test encoded values, cosine similarity"
)
for i, input in test_loop:
    pred_encoded[i]= KNN (
        (trainset_encoded, train_labels_np), 
        input, 
        cosine_similarity
    )

Test encoded values, cosine similarity:   0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
acc_encoded = (pred_encoded == test_labels_np[:N_pred_encoded]).sum() / N_pred_encoded
print(f"Accuracy of encoded values, cosine similarity: {acc_encoded}")

Accuracy of encoded values, cosine similarity: 0.127


Results show that the cosine similarity metric show worse result, little more accurate than random prediction.

Here are the steps one might take to improve the accuracy of the result:
1. *Try another model*. I've considered two examples, but both of them failed to perform well. Hovewer, there exist some other options to test.
2. *Remove some convolutional layers*. The KNN algorithm generally works poorly for sparse data. Hence, one could try to remove more sensors from the initial NN and get more meaningful and dence feature list. *I failed to try this because it broke my OS*