## Gradients for Cross-Entropy Loss

### Manual Gradients

In [12]:
import numpy as np
import torch

# ----- Problem setup -----
np.random.seed(0)                                # fixed seed: every run draws the same data
m, n, k = 5, 4, 3                                # m examples, n features, k classes

# The draw order (X, W, b, labels) is kept fixed so the seeded values never change.
X = np.random.randn(n, m)                        # inputs, one example per column: (n, m)
W = np.random.randn(k, n)                        # weights: (k, n)
b = np.random.randn(k, 1)                        # biases, one per class: (k, 1)
class_ids = np.random.choice(k, m)               # one random class index per example: (m,)
Y = np.eye(k)[:, class_ids]                      # identity columns picked by class -> one-hot (k, m)

# ----- Logits (forward pass) -----
Z = np.matmul(W, X) + b                          # b broadcasts across the m columns: (k, m)

def softmax(Z, axis=0):
    """Return the softmax of ``Z`` taken along ``axis``.

    Parameters
    ----------
    Z : numpy.ndarray
        Array of scores (logits).
    axis : int, optional
        Axis that indexes the classes. Defaults to 0, i.e. one example per
        column, which is the ``(k, m)`` layout used in this file. Pass
        ``axis=1`` for a row-per-example ``(m, k)`` layout.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``Z`` whose entries are non-negative
        and sum to 1 along ``axis``.
    """
    # Subtracting the per-slice maximum leaves the result unchanged (the
    # factor cancels in the ratio) but keeps every exponent <= 0, so np.exp
    # cannot overflow.
    shifted = Z - np.max(Z, axis=axis, keepdims=True)
    eZ = np.exp(shifted)
    return eZ / np.sum(eZ, axis=axis, keepdims=True)

# ----- Softmax probabilities and mean cross-entropy -----
A = softmax(Z)                                   # class probabilities: (k, m)
# NOTE: np.log returns -inf for any probability that has underflowed to exactly 0.
log_A = np.log(A)
loss = -np.sum(Y * log_A) / m                    # Y masks the true class; averaged over m examples

# ----- Manual Gradients -----
dZ = A - Y                                       # d(loss)/dZ before the 1/m factor: (k, m)
ones_col = np.ones((m, 1))
dW = np.matmul(dZ, X.T) / m                      # (k, n)
db = np.matmul(dZ, ones_col) / m                 # row sums of dZ scaled by 1/m: (k, 1)


### PyTorch (Autograd)

In [13]:
# Inputs converted to PyTorch's row-per-example layout
class_idx_np = np.argmax(Y, axis=0)                       # one-hot columns -> integer class ids
X_t = torch.tensor(X.T, dtype=torch.float32)              # (m, n)
Y_idx = torch.tensor(class_idx_np, dtype=torch.long)      # (m,)

# Parameters start from the same NumPy values and are tracked by autograd
W_t = torch.tensor(W, dtype=torch.float32).requires_grad_()           # (k, n)
b_t = torch.tensor(b.squeeze(), dtype=torch.float32).requires_grad_()  # (k,)

# Forward pass on raw logits; CrossEntropyLoss is used with its default settings
Z_t = torch.matmul(X_t, W_t.T) + b_t                      # (m, k)
loss_fn = torch.nn.CrossEntropyLoss()
loss_t = loss_fn(Z_t, Y_idx)
loss_t.backward()                                         # fills W_t.grad and b_t.grad

# Autograd gradients, brought back to NumPy in the manual section's shapes
dW_torch = W_t.grad.detach().numpy()                      # (k, n)
db_torch = b_t.grad.detach().numpy()[:, None]             # (k,) -> (k, 1)


### Comparison

In [14]:
# Frobenius norm of the difference between the manual and autograd gradients
dW_gap = np.linalg.norm(dW - dW_torch)
db_gap = np.linalg.norm(db - db_torch)
print("||dW_manual - dW_torch|| =", dW_gap)
print("||db_manual - db_torch|| =", db_gap)


||dW_manual - dW_torch|| = 6.086321339476888e-08
||db_manual - db_torch|| = 4.8447627164651547e-08
