### Gradients in Binary Cross Entropy Loss

In [8]:
import torch

# Sanity check: compare autograd's gradients of a sigmoid + binary
# cross-entropy model against the closed-form expressions
#   dL/dz = a - y,   dL/dw = x (a - y)^T / n,   dL/db = mean(a - y).

# Inputs laid out as (features, samples) = (2, 3); each column is one sample.
x = torch.tensor(
    [
        [1.0, 3.0, 5.0],
        [2.0, 4.0, 6.0],
    ],
    requires_grad=False,
)

# Binary targets, one per sample: shape (1, 3).
y = torch.tensor([[1.0, 0.0, 1.0]])

# Trainable parameters: weight column vector (2, 1) and scalar bias (1,).
w = torch.tensor([[0.5], [-0.5]], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

# Forward pass: logits z = w^T x + b, shape (1, 3), then sigmoid activations.
z = torch.matmul(w.t(), x) + b
a = torch.sigmoid(z)

# Binary cross-entropy, averaged over the samples.
positive_term = y * torch.log(a)
negative_term = (1 - y) * torch.log(1 - a)
loss = -(positive_term + negative_term).mean()

# Let autograd populate w.grad and b.grad.
loss.backward()

# Closed-form gradients; no_grad keeps these ops out of the autograd graph.
with torch.no_grad():
    n_samples = x.shape[1]
    grad_z = a - y  # per-sample dL/dz (before the 1/n averaging), shape (1, 3)
    grad_w_manual = torch.matmul(x, grad_z.T) / n_samples  # (2, 3) @ (3, 1) -> (2, 1)
    grad_b_manual = grad_z.mean()  # 0-dim tensor

# Print autograd and manual results side by side for visual comparison.
for label, value in (
    ("PyTorch grad w:\n", w.grad),
    ("Manual grad w:\n", grad_w_manual),
    ("PyTorch grad b:\n", b.grad),
    ("Manual grad b:\n", grad_b_manual),
):
    print(label, value)


PyTorch grad w:
 tensor([[-0.8674],
        [-1.1565]])
Manual grad w:
 tensor([[-0.8674],
        [-1.1565]])
PyTorch grad b:
 tensor([-0.2891])
Manual grad b:
 tensor(-0.2891)
