# Just a Practice Notebook for Random Data Science Stuff

In [1]:
import torch

In [2]:
class Metrics():
    def euclidean_distance(self, X, Y):
        Z = X - Y
        return torch.sqrt(torch.dot(Z, Z))
         
    def manhattan_distance(self, X, Y):
        return torch.sum(torch.abs(X - Y))

    def cosine_similarity(self, X, Y):
        return torch.dot(X, Y) / (self.two_norm(X) * self.two_norm(Y))
        # or even simpler with: 
        # return torch.cosine_similarity(X, Y, dim=0) 

    def jaccard_similarity(self, X, Y):                
        XandY = set([e for e in X + Y])
        XorY = set([e for e in X if e in Y])
        return len(XorY) / len(XandY)
        
    def two_norm(self, X):
        return sum(xi ** 2 for xi in X) ** 0.5

X = torch.tensor([1, 3, 4, 5], dtype=torch.float32)
Y = torch.tensor([7, 6, 3, 1], dtype=torch.float32)

print("X={}".format(X))
print("Y={}".format(Y))

metrics = Metrics()
print("2-Norm of X: {}".format(metrics.two_norm(X)))
print("2-Norm of Y: {}".format(metrics.two_norm(Y)))
print()
print("Euclidean Distance: {}".format(metrics.euclidean_distance(X,Y)))
print("Manhattan Distance: {}".format(metrics.manhattan_distance(X,Y)))
print("Cosine Similarity: {}".format(metrics.cosine_similarity(X,Y)))
print("Jaccard Similarity: {}".format(metrics.jaccard_similarity(X,Y)))

X=tensor([1., 3., 4., 5.])
Y=tensor([7., 6., 3., 1.])
2-Norm of X: 7.141428470611572
2-Norm of Y: 9.746794700622559

Euclidean Distance: 7.874007701873779
Manhattan Distance: 14.0
Cosine Similarity: 0.6033958792686462
Jaccard Similarity: 0.5


## Implementing KNN search and predict a binary label using KNN search

In [3]:
from heapq import heappush, heappop

class KnnSearch:
    
    def predict_label(self, samples, query_vector, k, label_key="label"):
        knn = self.find_k_nearest_neighbors(samples, query_vector, k)
        # Compute the average (binary) label of the KNN 
        return round(sum(samples[entry][label_key] for entry in knn) / len(knn))
    
    def find_k_nearest_neighbors(self, samples, query_vector, k):
        # O(n log k) time and O(k) space, n = len(samples)
        k_nearest = []
        for sample_id, sample in samples.items():
            d = self.l2_norm(sample['vector'], query_vector)
            e = [-d, sample_id]
            heappush(k_nearest, e)
            while len(k_nearest) > k:
                heappop(k_nearest)
        result = []
        while k_nearest:
            _, sample = heappop(k_nearest)
            result.append(sample)
        result.reverse()
        return result
    
    def l2_norm(self, vector_u , vector_v):
        return torch.linalg.norm(vector_u - vector_v).item() # use .item() to get the value out of a 1-element + 1d-tensor
    
knn = KnnSearch()
samples = {
    'id-1': {'vector': torch.tensor([1, 3, 4, 5], dtype=torch.float32), 'label': 1},
    'id-2': {'vector': torch.tensor([7, 6, 3, 1], dtype=torch.float32), 'label': 0},
    'id-3': {'vector': torch.tensor([2, 3, 4, 5], dtype=torch.float32), 'label': 1},
    'id-4': {'vector': torch.tensor([1, 3, 9, 5], dtype=torch.float32), 'label': 0},
    'id-5': {'vector': torch.tensor([7, 2, 3, 1], dtype=torch.float32), 'label': 1},
    'id-6': {'vector': torch.tensor([2, 3, 4, 8], dtype=torch.float32), 'label': 0},    
}

k, query = 3, torch.tensor([1, 3, 4, 5], dtype=torch.float32)
print("The {}-NN of {} are:".format(k, query), knn.find_k_nearest_neighbors(samples, query, k))

k, query = 1, samples['id-1']['vector']
print("The {}-NN of {} is:".format(k, query), knn.find_k_nearest_neighbors(samples, query, k))

k, query = 2, samples['id-2']['vector']
print("The {}-NN of {} are:".format(k, query), knn.find_k_nearest_neighbors(samples, query, k))

print()

k, query = 1, samples['id-1']['vector']
print("The predicted label of {} (with k={}) is:".format(query, k), knn.predict_label(samples, query, k))

k, query = 4, samples['id-1']['vector']
print("The predicted label of {} (with k={}) is:".format(query, k), knn.predict_label(samples, query, k))

k, query = 1, samples['id-2']['vector']
print("The predicted label of {} (with k={}) is:".format(query, k), knn.predict_label(samples, query, k))

k, query = 4, samples['id-2']['vector']
print("The predicted label of {} (with k={}) is:".format(query, k), knn.predict_label(samples, query, k))


The 3-NN of tensor([1., 3., 4., 5.]) are: ['id-1', 'id-3', 'id-6']
The 1-NN of tensor([1., 3., 4., 5.]) is: ['id-1']
The 2-NN of tensor([7., 6., 3., 1.]) are: ['id-2', 'id-5']

The predicted label of tensor([1., 3., 4., 5.]) (with k=1) is: 1
The predicted label of tensor([1., 3., 4., 5.]) (with k=4) is: 0
The predicted label of tensor([7., 6., 3., 1.]) (with k=1) is: 0
The predicted label of tensor([7., 6., 3., 1.]) (with k=4) is: 1
