# Neural Network from scratch

In [136]:
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F
import torchvision.datasets as datasets
from torchvision import transforms
from torchvision.transforms import Lambda

from typing import Optional

np.random.seed(42)

In [None]:
from typing import List


class Network:
    """
    Basic Neural Network, totally unoptimized
    Uses Stochastic Gradient Descent as the optimizer
    """

    def __init__(self, sizes: list[int]):
        self.num_layers = len(sizes)
        self.sizes = sizes

        # Where x is the size of the previous layer and y the size of the next layer
        self.w = [
            np.random.randn(y, x) for x, y in zip(self.sizes[:-1], self.sizes[1:])
        ]
        self.b = [np.random.randn(y, 1) for y in self.sizes[1:]]

    def sigmoid(self, z: np.ndarray) -> np.ndarray:
        """Sigmoid Activation Function"""
        return 1.0 / (1.0 + np.exp(-z))

    def sigmoid_deriv(self, z: np.ndarray):
        """first derivative of sigmoid evaluated at z"""
        return self.sigmoid(z) * (1 - self.sigmoid(z))

    def forward(self, x: np.ndarray):
        """Forward Pass through the Network"""
        for w, b in zip(self.w, self.b):
            z = np.matmul(w, x) + b
            x = self.sigmoid(z)
        return x

    def cost_deriv(self):
        raise NotImplementedError()

    def evaluate(self, test_data: Dataset):
        """
        Nr of correctly classified test-samples
        """
        test_results = [(np.argmax(self.forward(x)), y) for x, y in test_data]
        return sum(int(x == y) for x, y in test_results)

    def backward(self):
        """Backpropagation"""
        raise NotImplementedError("shit is not implemented yet")

    def train(
        self,
        train_data: Dataset,
        epochs: int = 20,
        batch_size: int = 30,
        lr: float = 0.01,
        test_data: Optional[Dataset] = None,
    ):
        """
        Training using Stochastic Gradient Descent.

        If test_data is passed, then the network is evaluated on the test_data after each epoch
        """
        if not train_data:
            raise ValueError("train_data can not be none")
        n_train = len(train_data)
        n_test = len(test_data) if test_data else None
        n_batches = n_train // batch_size

        test_results = []
        for epoch in range(epochs):
            for step in range(n_batches):
                batch = [
                    train_data[idx]
                    for idx in np.random.randint(low=0, high=n_train, size=batch_size)
                ]
                self.minibatch_update(batch, lr)

            # for x, y in batch:
            #     y_pred = self.forward(x)

            if test_data:
                """Compute the cost on the test set"""
                test_result = self.evaluate(test_data)
                test_results.append(test_result)
                print(
                    f"Epoch: {epoch} / {epochs},\t Nr of correctly classified samples: {test_result}/{n_test}\t accuracy: {test_result / n_test:.5f}"
                )

    def minibatch_update(
        self, batch: List[tuple[torch.Tensor, torch.Tensor]], lr: float
    ):
        """
        Runs one minibatch update

        Key Equations of backpropagation

        BP​1: δ^L =  ∇_a C ⊙ σ'(z^L)                        Get Error δ in last layer of network ∇_a C for quadratic cost 0.5(y(x) - a^L(x))² -> (a^L - y)
        BP2: δ^l = ((w^{l+1})^T δ^{l+1}) ⊙ σ'(z^L)          Propagate Errors from last layer (BP1) through rest of network to all layers
        BP3: ∂C/∂b^l_j = δ^l_j -> ∂C/∂b = δ                 Rate of change of cost wrp. to any bias
        BP4: ∂C/∂w^l_jk = a^{l-1}_k δ^l_j -> a_in δ_out     Rate of change of cost wrp. to any weight

        For all samples in minibatch
            1. Get the errors and activations for all nodes
            2. Calculate the gradients at the nodes and save them (accumulate gradients)

        Calculate average gradient at each node accumulated gradients/n_samples
        3. Update weights and biases according to update rule
            w_k' = w_k - lr * dC/dw_k
        """
        grad_w = [np.zeros(w.shape) for w in self.w]
        grad_b = [np.zeros(b.shape) for b in self.b]
        activations = [np.zeros_like(self.b[layer]) for layer in range(len(self.b))]

        for x, y in batch:
            # Backprop

            grad_w = [np.zeros(w.shape) for w in self.w]
            grad_b = [np.zeros(b.shape) for b in self.b]
            activation = x
            activations = [activation]
            zs = []

            # forward pass
            for w, b in zip(self.w, self.b):
                z = np.matmul(w, activation) + b
                zs.append(z)
                activation = self.sigmoid(z)
                activations.append(activation)

            print(len(activations))
            print(activations[-1].shape)
            print(y.shape)
            delta_L = activations[-1] - y

In [193]:
trns = transforms.Compose([transforms.ToTensor(), torch.nn.Flatten(), torch.squeeze])
train_data = datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=trns,
    target_transform=Lambda(lambda y: F.one_hot(torch.tensor(y), num_classes=10)),
)

test_data = datasets.MNIST(
    root="./data",
    train=False,
    download=True,
    transform=trns,
    target_transform=Lambda(lambda y: F.one_hot(torch.tensor(y), num_classes=10)),
)

In [194]:
net = Network([784, 20, 10])
net.train(train_data, epochs=20, batch_size=30, lr=0.01, test_data=test_data)

3
torch.Size([10, 20])
torch.Size([10])


  z = np.matmul(w, activation) + b
  return 1.0 / (1.0 + np.exp(-z))


RuntimeError: The size of tensor a (20) must match the size of tensor b (10) at non-singleton dimension 1

## Test

Data Preparation

Network