## ML Algorithms in Numpy and PyTorch

---
title: ML Algorithms in Numpy and PyTorch
description: ML Algorithms in Numpy and PyTorch
date: 2024-10
categories: [PyTorch]
---

In [118]:
import numpy as np
import torch
from sklearn import datasets
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap
import collections

## KNN

- Predict the class based on the most common class among the k nearest neighbors
- fit: doesn't "train" the model in the traditional sense; it just stores the training data.
- predict: for each point, find k nearest neighbors, then find most common class among these.

In [114]:
class KNN:
    """k-nearest-neighbours classifier implemented with NumPy.

    A query point is assigned the most common label among its ``k`` closest
    training points, measured by Euclidean distance.
    """

    def __init__(self, k=3):
        """Store ``k``, the number of neighbours that vote on a prediction."""
        self.k = k

    def _dist(self, x1, x2):
        """Return the Euclidean distance between ``x1`` and ``x2``."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def fit(self, X, y):
        """Memorise the training data; no parameters are learned.

        Args:
            X: Training samples; iterating over ``X`` must yield one sample
                at a time.
            y: Labels, where ``y[i]`` is the label of the i-th sample.
        """
        self.X_train = X
        self.y_train = y

    def _predict(self, x, debug=False):
        """Return the majority label among the ``k`` nearest neighbours of ``x``.

        When several labels are tied, ``Counter.most_common`` returns the one
        encountered first, i.e. the tied label whose neighbour is closest.
        """
        # Distance from the query point to every stored training sample.
        dists = np.array([self._dist(x, x_train) for x_train in self.X_train])
        # Indices of the k smallest distances, nearest first.
        nearest = np.argsort(dists)[:self.k]
        labels = [self.y_train[i] for i in nearest]
        most_common_label = collections.Counter(labels).most_common(1)[0][0]
        if debug:
            print("ORIGINAL: ", dists)
            print("LABELS: ", labels)
            print("MOST COMMON LABEL: ", most_common_label)
        return most_common_label

    def predict(self, X, debug=False):
        """Predict a label for every sample in ``X``.

        Args:
            X: Query samples; iterating over ``X`` must yield one sample at a
                time.
            debug: When true, print the distances, neighbour labels and the
                chosen label for each sample, then the full prediction array.
                Defaults to ``False`` to match ``_predict``.

        Returns:
            A NumPy array with one predicted label per query sample.
        """
        out = np.array([self._predict(x, debug) for x in X])
        if debug:
            print(out)
        return out


In [140]:

# Load the Iris dataset and separate the feature matrix from the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``."""
    # Element-wise comparison yields booleans; summing them counts the matches.
    n_correct = np.sum(y_true == y_pred)
    n_total = len(y_true)
    return n_correct / n_total


# Fit the NumPy classifier on the training split and score it on the
# held-out samples.
k = 3
clf = KNN(k=k)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test, debug=False)
print("Accuracy: ", accuracy(y_test, predictions))


Accuracy:  1.0


- First, write a naive PyTorch version by directly converting the NumPy arrays to `torch.Tensor`s and replacing the NumPy functions with their PyTorch equivalents.

In [141]:
class KNNTorchNaive:
    """Direct PyTorch port of the NumPy k-nearest-neighbours classifier.

    Distances are still computed one training point at a time; only the
    NumPy calls have been swapped for their PyTorch counterparts.
    """

    def __init__(self, k=3):
        """Store ``k``, the number of neighbours that vote on a prediction."""
        self.k = k

    def _dist(self, x1, x2):
        """Return the Euclidean distance between ``x1`` and ``x2`` as a 0-dim tensor."""
        return torch.sqrt(torch.sum((x1 - x2) ** 2))

    @staticmethod
    def _as_scalar(label):
        """Convert a 0-dim tensor label to a plain Python scalar.

        Tensors hash by object identity, so two tensors holding the same value
        are distinct keys for ``collections.Counter``. Non-tensor labels are
        returned unchanged.
        """
        return label.item() if isinstance(label, torch.Tensor) else label

    def fit(self, X, y):
        """Memorise the training data; no parameters are learned.

        Args:
            X: Training samples; iterating over ``X`` must yield one sample
                tensor at a time.
            y: Labels, where ``y[i]`` is the label of the i-th sample.
        """
        self.X_train = X
        self.y_train = y

    def _predict(self, x, debug=False):
        """Return the majority label among the ``k`` nearest neighbours of ``x``.

        The label is returned as a plain Python scalar. When several labels
        are tied, the one whose neighbour is closest wins.
        """
        # Distance from the query point to every stored training sample.
        dists = torch.tensor([self._dist(x, x_train) for x_train in self.X_train])
        # Indices of the k smallest distances, nearest first.
        nearest = torch.argsort(dists)[:self.k]
        # Count plain scalars, not tensors: counting the tensors themselves
        # gives every neighbour a count of 1, so the vote would always pick
        # the single nearest neighbour regardless of k.
        labels = [self._as_scalar(self.y_train[i]) for i in nearest]
        most_common_label = collections.Counter(labels).most_common(1)[0][0]
        if debug:
            print("ORIGINAL: ", dists)
            print("LABELS: ", labels)
            print("MOST COMMON LABEL: ", most_common_label)
        return most_common_label

    def predict(self, X, debug=False):
        """Predict a label for every sample in ``X``.

        Args:
            X: Query samples; iterating over ``X`` must yield one sample
                tensor at a time.
            debug: When true, print the distances, neighbour labels and the
                chosen label for each sample, then the full prediction tensor.
                Defaults to ``False`` to match ``_predict``.

        Returns:
            A tensor with one predicted label per query sample.
        """
        out = torch.tensor([self._predict(x, debug) for x in X])
        if debug:
            print(out)
        return out

In [142]:
k = 3

# Convert the NumPy arrays to tensors: float32 features, int64 labels.
# NOTE: X and y are rebound here, and torch.from_numpy only accepts ndarrays,
# so this cell cannot be run a second time without reloading the data.
X = torch.from_numpy(X).to(torch.float32)
y = torch.from_numpy(y).to(torch.int64)

# Same 80/20 split and seed as the NumPy run above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``."""
    # .item() turns the 0-dim count tensor into a Python number before dividing.
    n_correct = torch.sum(y_true == y_pred).item()
    return n_correct / len(y_true)
# Fit the naive PyTorch classifier and score it on the held-out samples.
clf = KNNTorchNaive(k=k)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test, debug=False)
print("Accuracy: ", accuracy(y_test, predictions))

Accuracy:  1.0


In [126]:
# Inspect the training split's dimensions (the output below shows 120 rows by 4 columns).
X_train.shape

torch.Size([120, 4])

In [156]:
class KNNTorchBroadcast:
    """k-nearest-neighbours classifier that uses PyTorch broadcasting.

    The distance from a query point to all training points is computed in a
    single tensor expression instead of a Python loop over training rows.
    """

    def __init__(self, k=3):
        """Store ``k``, the number of neighbours that vote on a prediction."""
        self.k = k

    @staticmethod
    def _as_scalar(label):
        """Convert a 0-dim tensor label to a plain Python scalar.

        Tensors hash by object identity, so two tensors holding the same value
        are distinct keys for ``collections.Counter``. Non-tensor labels are
        returned unchanged.
        """
        return label.item() if isinstance(label, torch.Tensor) else label

    def fit(self, X, y):
        """Memorise the training data; no parameters are learned.

        Args:
            X: Training samples as a tensor with one sample per row
                (the distance computation reduces over ``dim=1``).
            y: Labels, where ``y[i]`` is the label of the i-th row of ``X``.
        """
        self.X_train = X
        self.y_train = y

    def _predict(self, x, debug=False):
        """Return the majority label among the ``k`` nearest neighbours of ``x``.

        The label is returned as a plain Python scalar. When several labels
        are tied, the one whose neighbour is closest wins.
        """
        # x is broadcast against every row of X_train, so one expression
        # yields the Euclidean distance to each training sample.
        dists = torch.sqrt(torch.sum((self.X_train - x) ** 2, dim=1))
        # Indices of the k smallest distances, nearest first.
        nearest = torch.argsort(dists)[:self.k]
        # Count plain scalars, not tensors: counting the tensors themselves
        # gives every neighbour a count of 1, so the vote would always pick
        # the single nearest neighbour regardless of k.
        labels = [self._as_scalar(self.y_train[i]) for i in nearest]
        most_common_label = collections.Counter(labels).most_common(1)[0][0]
        if debug:
            print("ORIGINAL: ", dists)
            print("LABELS: ", labels)
            print("MOST COMMON LABEL: ", most_common_label)
        return most_common_label

    def predict(self, X, debug=False):
        """Predict a label for every sample in ``X``.

        Args:
            X: Query samples; iterating over ``X`` must yield one sample
                tensor at a time.
            debug: When true, print the distances, neighbour labels and the
                chosen label for each sample, then the full prediction tensor.
                Defaults to ``False`` to match ``_predict``.

        Returns:
            A tensor with one predicted label per query sample.
        """
        out = torch.tensor([self._predict(x, debug) for x in X])
        if debug:
            print(out)
        return out

- Broadcasting in PyTorch follows these rules:

1. If the two tensors differ in the number of dimensions, the shape of the tensor with fewer dimensions is padded with ones on its leading (left) side.

2. If the shape of the two tensors does not match in any dimension, the tensor with shape equal to 1 in that dimension is stretched to match the other shape.

3. If in any dimension the sizes disagree and neither is equal to 1, an error is raised.

- When we subtract a tensor `x` of shape `[4]` from the tensor `self.X_train` of shape `[120, 4]`, PyTorch automatically broadcasts `x` to the shape of `self.X_train` by repeating it along the 0th dimension.
    - In the *dists* calculation, the tensors differ in number of dimensions, so `x` is padded on the left with 1, becoming `[1,4]` according to Rule 1.  Then by Rule 2, this resulting tensor is "stretched out" along 0th dimension from shape `[1,4]` to `[120,4]`.

In [157]:
def accuracy(y_true, y_pred):
    """Return the share of predictions in ``y_pred`` that match ``y_true``."""
    hits = (y_true == y_pred).sum().item()
    total = len(y_true)
    return hits / total
# Fit the broadcasting classifier and score it on the held-out samples.
clf = KNNTorchBroadcast(k=k)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test, debug=False)
print("Accuracy: ", accuracy(y_test, predictions))

Accuracy:  1.0
