# Makemore - Manual Backprop

This will take the existing network, but build it into a low level class where we run every step manually (like in `micrograd` notebooks).

We will also use the set of debug functions created previously to debug and check the network.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
from sklearn.decomposition import PCA
import torch
from torch import nn
from torch.nn import functional as F
from torchviz import make_dot
from typing import List, Callable, Dict, Any, Union, Optional, Tuple
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


**HParams**

In [2]:
heads = 5
emb_size = 64
hidden_layer = 512

**Data Loading & Prep**

In [3]:
# load the name data
with open("data/names.txt") as f:
    names = f.read().splitlines()

# print stats
print("Number of names: ", len(names))

# split names into bigrams
bigrams = {}
for name in names:
    ls = ["<T>"] + list(name.lower()) + ["<T>"]
    tpl = list(zip(ls, ls[1:]))
    for bigram in tpl:
        bigrams[bigram] = bigrams.get(bigram, 0) + 1
items = sorted(list(set([b for a, b in bigrams.keys()])))
pos_map = {v: k for k, v in enumerate(items)}
num_items = len(items)
t_bigrams = torch.zeros((num_items, num_items))


# split names into 3 datasets (based on percentages)
train_perc, dev_perc, eval_perc = 0.8, 0.1, 0.1

# shuffle and split names
random.shuffle(names)
train_names = names[: int(len(names) * train_perc)]
dev_names = names[
    int(len(names) * train_perc) : int(len(names) * (train_perc + dev_perc))
]
eval_names = names[int(len(names) * (train_perc + dev_perc)) :]

# print stats
print(f"Train: {len(train_names)}")
print(f"Dev: {len(dev_names)}")
print(f"Eval: {len(eval_names)}")


def gen_dataset(items: List[str], encode: Callable[[str], List[int]], heads: int):
    train = []
    label = []
    for name in items:
        ls = [0] * heads + encode(name) + [0]
        tpl = list(zip(ls, *[ls[i + 1 :] for i in range(heads)]))
        for bigram in tpl:
            train.append(bigram[:-1])
            label.append(bigram[-1])

    # convert to tensors and expand as one-hots
    train = torch.tensor(train)
    label = F.one_hot(torch.tensor(label), num_items)
    return train, label


def char_encoding(name: str) -> List[int]:
    return [pos_map[i] for i in list(name.lower())]


print(f"Vocab Size: {num_items}")

num_mlp_items = len(pos_map)

train_X, train_y = gen_dataset(train_names, char_encoding, heads=heads)
dev_X, dev_y = gen_dataset(dev_names, char_encoding, heads=heads)
eval_X, eval_y = gen_dataset(eval_names, char_encoding, heads=heads)
print(train_X.shape)
print(dev_X.shape)
print(eval_X.shape)

Number of names:  32033
Train: 25626
Dev: 3203
Eval: 3204
Vocab Size: 27
torch.Size([182513, 5])
torch.Size([22720, 5])
torch.Size([22913, 5])


**Debug Helpers**

In [4]:
# plots the given model
def plot_mlp(model, X=train_X):
    return make_dot(model.forward(X), params=dict(model.named_parameters()))


def compute_nll(model, X, y):
    # compute the loss
    probs = model.predict_proba(X)
    lhood = (probs * y).sum(dim=1)
    # take mean to make usre large data is still handlable
    return -torch.log(lhood).mean()


# function to compute cross entropy loss
ce_loss = lambda p, y: F.cross_entropy(p, torch.argmax(y, dim=1))


def count_parameters(model):
    # sum number of elements in all model parameters that take a gradient
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def grid_space_lr(exp_start, exp_end, num) -> Tuple[torch.Tensor, torch.Tensor]:
    # NOTE: this can be used by single training steps (random data) then check when the loss explodes
    # generate a grid of learning rates
    space = torch.linspace(exp_start, exp_end, num)
    return 10**space, space


def grid_search_lr(
    model,
    exp_start: int,
    exp_end: int,
    steps: int,
    train_X: torch.Tensor,
    train_y: torch.Tensor,
    loss_fct: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    grad_compute: Callable[[torch.Tensor], None],
    batch_size: int = 1000,
):
    # generate grid space
    lrs, lri = grid_space_lr(exp_start, exp_end, steps)

    # train model for each learning rate
    losses = []
    for i in range(len(lrs)):
        # sample from trainX and trainY
        idx = torch.randint(0, train_X.shape[0], (batch_size,))
        batch_X = train_X[idx]
        batch_y = train_y[idx]

        # compute loss
        probs = model(batch_X)[0]
        loss = loss_fct(probs, batch_y)
        grad_compute(loss)

        # optimize
        for p in model.parameters():
            p.data -= lrs[i] * p.grad.data
            p.grad.data.zero_()

        # compute loss
        losses.append(loss.sum().item())

    # plot data
    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(111)
    ax.plot(lri, losses)
    ax.set_xlabel("Learning Rate Exp")
    ax.set_ylabel("Loss")
    ax.set_title("Loss vs Learning Rate Exp")
    plt.show()

    return losses, lri

def clean_stats():
    return {
        "batch_loss": [],
        "train_loss": [],
        "test_loss": [],
        "epoch_steps": [],
    }

def plot_stats(stats: dict):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.plot(
        np.arange(len(stats["batch_loss"])), stats["batch_loss"], label="Batch Loss"
    )
    ax.plot(stats["epoch_steps"], stats["train_loss"], label="Train Loss")
    ax.plot(stats["epoch_steps"], stats["test_loss"], label="Test Loss")
    ax.set_xlabel("Batch Steps")
    ax.set_ylabel("Loss")
    ax.legend()
    plt.show()

# init the weights and verify how
def init_weights(model: nn.Module, init_func):
    for p in model.parameters():
        init_func(p)

def uniform_init(p: nn.Parameter):
    torch.nn.init.uniform_(p.data, -1, 1)

def fixed_init(p: nn.Parameter):
    p.data.fill_(0.01)

def he_init(p: nn.Parameter):
    # NOTE: to compute this dynamically we would need have access to the lower layers
    if p.dim() > 1:
        torch.nn.init.kaiming_uniform_(p.data, a=0, mode="fan_in", nonlinearity="tanh")
    else:
        torch.nn.init.uniform_(p.data, -1, 1)

def weight_stats(model: nn.Module):
    for p in model.parameters():
        print(f"Weight Mean: {p.data.mean()} - Weight Std: {p.data.std()}")

def visualize_activation(model: nn.Module, train_X: torch.Tensor, bins: int=100):
    # perform a forward pass
    probs = model.forward(train_X)
    # model out should be list of tuples
    layer_data = model.out[-2][1]

    # compute the results
    act_list = layer_data.view(-1).detach().numpy()
    plt.hist(act_list, bins=bins)
    plt.show()

    # also do advanced visualization
    fig = plt.figure(figsize=(20, 10))
    ax = fig.add_subplot(111)
    ax.imshow(layer_data.abs() > 0.99, cmap="gray")
    plt.show()

def visualize_forward(model, X, y, loss_fct, grad_compute, ud):
    logits = model.forward(X)
    data = dict(model.out)

    # create two stacked plots
    fig = plt.subplots(figsize=(20, 20))
    ax = plt.subplot(4, 1, 1)

    # plot the logits
    ax.set_title("Activations")
    for name in data:
        tdata = torch.histogram(data[name], density=True)
        ax.plot(tdata[1][:-1].detach(), tdata[0].detach(), label=name)
        data[name].retain_grad()
    # set limits on x axis
    ax.set_xbound(-2, 2)
    ax.legend()

    # go back
    loss = loss_fct(logits, y)
    grad_compute(loss)

    # get gradient data
    ax = plt.subplot(4, 1, 2)
    ax.set_title("Data Gradients")
    for name in data:
        tdata = torch.histogram(data[name].grad, density=True)
        ax.plot(tdata[1][:-1].detach(), tdata[0].detach(), label=name)
    ax.set_xbound(-.01, .01)
    ax.legend()

    # iterate model parameters
    ax = plt.subplot(4, 1, 3)
    ax.set_title("Model Gradients")
    for p in model.parameters():
        t = p.grad
        if t is None or t.ndim < 2 or t.shape[0] == 1:
            continue
        print(f"{t.shape} grad to data {t.std() / p.std()}")
        tdata = torch.histogram(t, density=True)
        ax.plot(tdata[1][:-1].detach(), tdata[0].detach(), label=f"{tuple(t.shape)}")
    ax.set_xbound(-.01, .01)
    ax.legend()

    # iterate model parameters
    ax = plt.subplot(4, 1, 4)
    ax.set_title("LR dep Weight Updates")
    for i, p in enumerate(model.parameters()):
        if p.ndim == 2:
            ax.plot([model.ud[j][i] for j in range(len(model.ud))], label=f"param {i}")
    ax.plot([0, len(model.ud)], [-3, -3], 'k', label='baseline')
    ax.legend()
    
    plt.show()


## Re-create our basic model

We want to build our model from scratch by reimplementing all the different layers and operations.

In [199]:
class Module():
    def __init__(self):
        self.out = []

    def forward(self, x):
        # applies a forward step, computing the output of the layer
        raise NotImplementedError

    def backward(self, grad):
        # applies a backward step, computing gradients based on gradient of previous layers
        raise NotImplementedError
    
    def learn(self, lr):
        # applies a gradient step to the internal gradients and zeros them again
        raise NotImplementedError

class Tanh(Module):
    def __init__(self):
        super().__init__()
        self._in = None
        self._fw = None

    def forward(self, x):
        self.out = []
        self._in = x
        self._fw = torch.tanh(x)
        self.out.append(("tanh", x))
        return self._fw

    def backward(self, grad):
        fw_grad = grad * (1 - torch.tanh(self._in) ** 2)
        return fw_grad
    
    def learn(self, lr):
        pass


class Linear(Module):
    def __init__(self, in_size: int, out_size: int, bias: bool=True, scale: float = 1.0, bias_scale: float = 1.0, g: torch.Generator = None):
        super().__init__()
        self._in = None
        self._mul_fw = None
        self._bias = bias
        self._w = torch.randn((in_size, out_size), generator=g) * scale
        if bias:
            self._bias_fw = None
            self._b = torch.randn((out_size,), generator=g) * bias_scale
    
    def forward(self, x):
        self.out = []
        self._in = x
        # execute mat_mul operation
        self._mul_fw = x @ self._w
        out = self._mul_fw
        if self._bias:
            self._bias_fw = out + self._b
            out = self._bias_fw
        self.out.append(("linear", out))
        return out
    
    def backward(self, grad):
        # compute gradient of bias first (as it modulates remainder)
        if self._bias:
            # note: input data here is the output of self._mat_mul - since this is just addition, gradient gets distributed equally
            self._b.grad = grad.sum(dim=0)
        # overall function here is Lin(X) = X @ W
        # compute gradient of the weights (dLin / dW) (this then modulates the input by the gradient)
        self._w.grad = self._in.t() @ grad
        # compute gradient of the input (dLin / dX)
        fw_grad = grad @ self._w.t()
        return fw_grad
    
    def learn(self, lr):
        # apply gradient step
        self._w -= lr * self._w.grad
        self._w.grad = None

        # check for bias
        if self._bias:
            self._b -= lr * self._b.grad
            self._b.grad = None
    
class CrossEntropy(Module):
    def __init__(self):
        super().__init__()
        self._fw = None

    def forward(self, x, y):
        self.out = []
        self.out.append(("ce_in", x))
        self._in = x
        # NOTE: at its core cross-entropy compares two distributions (in this case the one-hot encoded label and the softmax output)
        # softmax is just a normalization function, so we can compute the cross-entropy by comparing the two distributions
        # formula for softmax: softmax(x) = exp(x) / sum(exp(x))
        self._xe_fw = torch.exp(x)
        self._xsum_fw = torch.sum(self._xe_fw, dim=1, keepdim=True)
        self._sm_fw = self._xe_fw * (self._xsum_fw**-1)
        self._y_fw = y
        # now compute the cross-entropy
        self._loss_fw = -torch.sum(self._y_fw * torch.log(self._sm_fw), dim=1)
        # take the mean, as we want to boil down to single loss
        #self._fw = torch.sum(self._loss_fw * (self._loss_fw.numel()**-1))
        self._fw = torch.mean(self._loss_fw)
        # self._fw = torch.nn.functional.cross_entropy(x, y)
        self.out.append(("ce_out", x))
        return self._fw

    def backward(self, grad):
        # compute gradient of the mean (and expand to number of batches) [] to [n]
        grad = grad / self._loss_fw.numel()
        # NOTE: sum distributes the gradient, through the number of classes [n] to [n, c]
        # both can be done through broadcasting from point loss
        sm_grad = self._sm_fw
        sm_grad[self._y_fw == 1] -= 1

        # grad = grad * (self._y_fw / self._sm_fw)
        # compute gradient through the softmax layer
        #sm_grad = ((self._xe_fw * (self._xsum_fw - self._xe_fw)) / (self._xsum_fw**2.0))
        return grad * sm_grad
    
    def learn(self, lr):
        pass

class BatchNorm(Module):
    def __init__(self, in_size: int, momentum: float=0.9, eps: float=1e-5):
        super().__init__()
        self._in = None
        self._fw = None
        self._eps = eps
        self._mom = momentum
        self._mean = torch.zeros((1, in_size,))
        self._var = torch.ones((1, in_size,))
        self._gamma = torch.ones((1, in_size,))
        self._beta = torch.zeros((1, in_size,))
        self._n = 0
    
    def forward(self, x: torch.Tensor, train: bool=True):
        self.out = []
        self._in = x
        # compute mean and variance
        mean = torch.mean(x, dim=0, keepdim=True) if train else self._mean
        var = torch.var(x, dim=0, keepdim=True, unbiased=False) if train else self._var
        # update running mean and variance
        if train:
            self._mean = self._mom * self._mean + (1 - self._mom) * torch.mean(x, dim=1, keepdim=True)
            self._var = self._mom * self._var + (1 - self._mom) * torch.var(x, dim=1, keepdim=True)
        # apply batch norm
        self._sqrt_fw = torch.sqrt(var + self._eps)
        self._mean_fw = mean
        self._std_fw = (x - mean)
        self._fw = ((self._gamma * self._std_fw) / self._sqrt_fw) + self._beta
        self.out.append(("bn", self._fw))
        return self._fw
    
    def backward(self, grad):
        # apply gradient directly to the beta
        self._beta.grad = grad.sum(dim=0, keepdim=True)

        # compute gradient of the gamma
        self._gamma.grad = (grad * (self._std_fw / self._sqrt_fw)).sum(dim=0, keepdim=True)

        # FIXME: this is incorrect (does not contain all branches of data)
        # compute gradient of the input
        #fw_grad = ((self._gamma - (self._gamma/self._mean_fw)) * (self._std_fw) - (self._gamma * self._std_fw) * (2/self._in.numel()) * self._std_fw * (1/(2*self._sqrt_fw))) / (self._sqrt_fw**2)
        N = self._in.numel()
        fw_grad = self._gamma * (1/self._sqrt_fw - 1/N + (2*self._std_fw**2)/N)
        return fw_grad * grad
    
    def learn(self, lr):
        # apply gradient step
        self._gamma -= lr * self._gamma.grad
        self._gamma.grad = None
        self._beta -= lr * self._beta.grad
        self._beta.grad = None

class MLPModel(Module):
    def __init__(self, num_items: int, heads: int, emb_size: int, hidden_size: int, bn_momentum: float=0.9, seed: int=42):
        # generate to make predictable
        g = torch.Generator()
        if seed is not None:
            g = g.manual_seed(seed)

        # embedding weights
        self._embs = torch.randn((num_items, emb_size), generator=g)
        # layer 1 weights
        self._hidden1 = Linear(emb_size * heads, hidden_size, scale=(5/3)/((emb_size * heads)**0.5), g=g) # he init
        self._bn1 = BatchNorm(hidden_size, momentum=bn_momentum)
        self._hidden2 = Linear(hidden_size, hidden_size, scale=(5/3)/((hidden_size)**0.5), g=g) # he init
        self._bn2 = BatchNorm(hidden_size, momentum=bn_momentum)
        self._predict = Linear(hidden_size, num_items, bias=True, scale=0.1, bias_scale=0.1, g=g)

        # previous data
        # self._hidden1 = torch.randn((emb_size * heads, hidden_size), generator=g) * (5/3)/((emb_size * heads)**0.5) # he init
        # self._bngains1 = torch.ones((1, hidden_size))
        # self._bnbias1 = torch.zeros((1, hidden_size))
        # self._bnrmean1 = torch.zeros(hidden_size)
        # self._bnrvar1 = torch.ones(hidden_size)
        # # layer 2 weights
        # self._hidden2 = torch.randn((hidden_size, hidden_size), generator=g) * (5/3)/((hidden_size)**0.5) # he init
        # self._bngains2 = torch.ones((1, hidden_size))
        # self._bnbias2 = torch.zeros((1, hidden_size))
        # self._bnrmean2 = torch.zeros(hidden_size)
        # self._bnrvar2 = torch.ones(hidden_size)
        # # layer 3 weights
        # self._predict = torch.randn((hidden_size, num_items), generator=g) * 0.1
        # self._predict_bias = torch.randn(num_items, generator=g) * 0.1

        # create some helper vars
        self.out = []
        # self.bn_momentum = bn_momentum
        self._train = True

    def __call__(self, X):
        return self.forward(X)
    
    @property
    def train(self):
        return self._train

    @train.setter
    def train(self, value):
        self._train = value
    
    def forward(self, X: torch.Tensor):
        # check if X needs to be expanded
        X = X.unsqueeze(0) if X.ndim == 1 else X
        self.out = [("X", X)]

        # retrieve and combine embeddings
        embs = self._embs[X]
        embs = embs.view(embs.shape[0], -1)
        self.out.append(("embs", embs))

        # compute hidden layer and apply batchnorm
        hidden = embs @ self._hidden1
        self.out.append(("hidden1_mul", hidden))
        bn_mean = hidden.mean(0, keepdim=True) if self._train else self._bnrmean1
        bn_var = hidden.var(0, keepdim=True) if self._train else self._bnrvar1
        hidden = self._bngains1 * ((hidden - bn_mean) / torch.sqrt(bn_var + 1e-5)) + self._bnbias1
        self.out.append(("hidden1", hidden))
        if self._train:
            self._bnrmean1 = self.bn_momentum * bn_mean + (1 - self.bn_momentum) * self._bnrmean1
            self._bnrvar1 = self.bn_momentum * bn_var + (1 - self.bn_momentum) * self._bnrvar1
        hidden = torch.tanh(hidden)
        self.out.append(("hidden1_act", hidden))
        
        # compute hidden layer and apply batchnorm
        hidden = hidden @ self._hidden2
        self.out.append(("hidden2_mul", hidden))
        bn_mean = hidden.mean(0, keepdim=True) if self._train else self._bnrmean2
        bn_var = hidden.var(0, keepdim=True) if self._train else self._bnrvar2
        hidden = self._bngains2 * ((hidden - bn_mean) / torch.sqrt(bn_var + 1e-5)) + self._bnbias2
        self.out.append(("hidden2", hidden))
        if self._train:
            self._bnrmean2 = self.bn_momentum * bn_mean + (1 - self.bn_momentum) * self._bnrmean2
            self._bnrvar2 = self.bn_momentum * bn_var + (1 - self.bn_momentum) * self._bnrvar2
        hidden = torch.tanh(hidden)
        self.out.append(("hidden2_act", hidden))

        # compute logits
        logits = hidden @ self._predict + self._predict_bias
        self.out.append(("logits", logits))
        return logits
    
    def backward(self, grad):
        # compute the gradients for each part of the network and return the input gradient
        # NOTE: in general we do not really care about that input gradient
        pass

    def predict(self, X: torch.Tensor):
        probs = self.predict_proba(X)
        return torch.argmax(probs, dim=1)
    
    def predict_proba(self, X: torch.Tensor):
        logits = self.forward(X)
        return F.softmax(logits, dim=1)
    
    def parameters(self):
        return [self._embs, self._hidden1, self._bngains1, self._bnbias1, self._hidden2, self._bngains2, self._bnbias2, self._predict]
    
    def named_parameters(self):
        return dict(embs=self._embs, hidden1=self._hidden1, bngains1=self._bngains1, bnbias1=self._bnbias1, hidden2=self._hidden2, bngains2=self._bngains2, bnbias2=self._bnbias2, predict=self._predict)
    
    def reset_grads(self):
        for p in self.parameters():
            p.grad = None

# create model
model = MLPModel(num_items, heads, emb_size, hidden_layer, bn_momentum=0.1)
logs = model.forward(train_X[:10])
print(logs.shape)
print(f"Model Parameters: {sum([p.numel() for p in model.parameters()])}")

# plot activations
visualize_activation(model, train_X[:100])

TypeError: unsupported operand type(s) for @: 'Tensor' and 'Linear'

Next we need to build an optimize and a backward function.

Keep in mind that chain rule for a chain of functions $f(x) = f_2(f_1(x))$ is:

$$\frac{\partial f}{\partial x} = \frac{\partial f}{\partial f_2} \frac{\partial f_2}{\partial f_1} \frac{\partial f_1}{\partial x}$$

A simpler case of $L(x) = y(x)$ is:

$$\frac{\partial L}{\partial x} = \frac{\partial L}{\partial y} \frac{\partial y}{\partial x}$$

Where $\frac{\partial L}{\partial x}$ is the gradient of the loss with respect to $x$, i.e. how much does x influence the value of the loss (since we want to minimize loss we want to move into the negative direction here). For the starting point $\frac{\partial L}{\partial y}$ has a value of 1, since we want to start with the gradient of the loss with respect to the output of the layer.

In [None]:
def compute_gradients(pred: torch.Tensor, y: torch.Tensor, loss_fct: Callable):
    # compute the loss
    loss: torch.Tensor = loss_fct(pred, y)

    # start with the gradients of the loss, Function is L(x, y) = f_1(f_net(x), y)
    # as chain rule the derivative of L/dx = dL/df_1 * df_1/df_net * df_net/dx
    # we have the dL/df_1 = 1, df_1/df_net is the derivative of the loss function
    loss.grad = torch.ones_like(loss)
    pred.grad = loss.grad * grad()
    
    # compute the gradients manually
    # chain rule: dL/dW = dL/dY * dY/dW

def optimize(model: MLPModel, stats) -> List:
    # TODO: integrate stats and build the ud list
    pass

## Appendix

Some additonal gradient computations

In [17]:
X = torch.arange(6, requires_grad=True, dtype=torch.float32).view(2, 3) + 1
X.retain_grad()
W = torch.arange(9, requires_grad=True, dtype=torch.float32).view(3, 3) + 1
W.retain_grad()

print(X)
print(W)

O = X @ W
print(O)
O.backward(torch.ones_like(O))
print(W.grad)
print(X.t() @ torch.ones_like(O))
print(X.grad)
print(torch.ones_like(O) @ W.t())

jjjO.numel()

tensor([[1., 2., 3.],
        [4., 5., 6.]], grad_fn=<AddBackward0>)
tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]], grad_fn=<AddBackward0>)
tensor([[30., 36., 42.],
        [66., 81., 96.]], grad_fn=<MmBackward0>)
tensor([[5., 5., 5.],
        [7., 7., 7.],
        [9., 9., 9.]])
tensor([[5., 5., 5.],
        [7., 7., 7.],
        [9., 9., 9.]], grad_fn=<MmBackward0>)
tensor([[ 6., 15., 24.],
        [ 6., 15., 24.]])
tensor([[ 6., 15., 24.],
        [ 6., 15., 24.]], grad_fn=<MmBackward0>)


6

In [200]:
def compare(mod: Callable[[int], Module], fct: Callable[[torch.Tensor], torch.Tensor]):
    th = mod(3)
    print(f"Test {th.__class__.__name__}")
    # Test the layers
    X = torch.arange(6, requires_grad=True, dtype=torch.float32).view(2, 3) + 1
    X.retain_grad()
    print(X)
    y = th.forward(X)
    print("> Manually")
    print(y)
    grad = th.backward(torch.ones_like(y))
    print(grad)

    # compute with autograd
    print("> Autograd")
    X = torch.arange(6, requires_grad=True, dtype=torch.float32).view(2, 3) + 1
    X.retain_grad()
    y = fct(X)
    print(y)
    y.backward(torch.ones_like(y))
    print(X.grad)

#compare(lambda s: Tanh(), lambda X: torch.tanh(X))
g1 = torch.Generator()
g1.manual_seed(42)
g2 = torch.Generator()
g2.manual_seed(42)
#compare(lambda s: Linear(s, s, g=g1), lambda X: X @ torch.randn((3, 3), dtype=torch.float32, generator=g2) + 1)
bn = nn.BatchNorm1d(3, momentum=0.1)
compare(lambda s: BatchNorm(s, momentum=0.9, eps=0.00001), lambda X: bn(X))

# check cross entropy
print(f"Test Entropy")
# Test the layers
X = torch.arange(6, requires_grad=True, dtype=torch.float32).view(2, 3) + 1
Y = torch.tensor([0, 1], dtype=torch.long)
Y = torch.nn.functional.one_hot(Y, num_classes=3).float()
X.retain_grad()
print(X)
th = CrossEntropy()
y = th.forward(X, Y)
print("> Manually")
print(y)
grad = th.backward(torch.ones_like(y))
print(grad)

# compute with autograd
print("> Autograd")
X = torch.arange(6, requires_grad=True, dtype=torch.float32).view(2, 3) + 1
Y = torch.tensor([0, 1], dtype=torch.long)
Y = torch.nn.functional.one_hot(Y, num_classes=3).float()
X.retain_grad()
th = CrossEntropy()
y = th.forward(X, Y)
# y = F.cross_entropy(X, Y)
print(y)
y.backward(torch.ones_like(y))
print(X.grad)

Test BatchNorm
tensor([[1., 2., 3.],
        [4., 5., 6.]], grad_fn=<AddBackward0>)
> Manually
tensor([[-1.0000, -1.0000, -1.0000],
        [ 1.0000,  1.0000,  1.0000]], grad_fn=<AddBackward0>)
tensor([[1.2500, 1.2500, 1.2500],
        [1.2500, 1.2500, 1.2500]], grad_fn=<MulBackward0>)
> Autograd
tensor([[-1.0000, -1.0000, -1.0000],
        [ 1.0000,  1.0000,  1.0000]], grad_fn=<NativeBatchNormBackward0>)
tensor([[0., 0., 0.],
        [0., 0., 0.]])
Test Entropy
tensor([[1., 2., 3.],
        [4., 5., 6.]], grad_fn=<AddBackward0>)
> Manually
tensor(1.9076, grad_fn=<MeanBackward0>)
tensor([[-0.4550,  0.1224,  0.3326],
        [ 0.0450, -0.3776,  0.3326]], grad_fn=<MulBackward0>)
> Autograd
tensor(1.9076, grad_fn=<MeanBackward0>)
tensor([[-0.4550,  0.1224,  0.3326],
        [ 0.0450, -0.3776,  0.3326]])


In [176]:
X

tensor([[1., 2., 3.],
        [4., 5., 6.]], grad_fn=<AddBackward0>)