In [63]:
# -----------------------------------------------------------
# one_neuron_grad.py  –  per-neuron Kronecker view
# -----------------------------------------------------------
import torch, math
torch.manual_seed(0)
dtype = torch.float64

m, n = 2, 3                     # outputs, inputs  (tiny toy layer)

# ----- tiny random tensors -----
W = torch.randn(m, n, requires_grad=True)
x = torch.randn(n)              # input vector (n,)
g = torch.randn(m)              # gain vector
b = torch.randn(m)              # bias vector

# ----- forward pass -----
a = W @ x                       # (m,)
u = torch.sqrt((a ** 2).mean()) # RMS scale (scalar)
v = g * (a / u) + b             # (m,)
loss = v.pow(2).sum()           # simple scalar loss

# ----- Autograd reference -----
loss.backward()
grad_auto = W.grad.detach()   # (m × n)
print("\nAutograd  ∂L/∂W\n", grad_auto, grad_auto.shape)


Autograd  ∂L/∂W
 tensor([[-0.0538, -0.1118,  0.0960],
        [ 0.3203,  0.6655, -0.5712]]) torch.Size([2, 3])


In [64]:
x.unsqueeze(1).transpose(0,1)

tensor([[ 0.4033,  0.8380, -0.7193]])

In [92]:
# ------------ analytic gradient ----------------------------
d = 2 * v.detach()                                  # ∂L/∂v  (because L = Σ v_i²)
I = torch.eye(m, dtype=dtype)                       # Identity matrix
R = (1/u) * (I - torch.outer(a, a) / (m * u**2))    # (m × m)
print("R", R, R.shape)
# P = torch.diag(g * d) @ R                           # (m x m) diag(g ⊙ d) · R
# print("P", P, P.shape)
# print("x", x, x.shape)
# Calculate grad_an using the Kronecker product method
row      = (d * g) @ R                        # (m,) - this is your P @ (d*g) 
# The Kronecker product with the identity matrix is implementing the Kronecker delta 
print("x", x, x.shape)
J        = torch.kron(I, x.unsqueeze(0))      # (m, m*n) - Jacobian matrix
# I[0,0] = 1  →  1 * [a, b, c] = [a, b, c]
# I[0,1] = 0  →  0 * [a, b, c] = [0, 0, 0]
# I[1,0] = 0  →  0 * [a, b, c] = [0, 0, 0]
# I[1,1] = 1  →  1 * [a, b, c] = [a, b, c]
# J = [[a, b, c, 0, 0, 0],
#      [0, 0, 0, a, b, c]]
# The identity matrix I provides the Kronecker delta structure
# The Kronecker product spreads this pattern across the x values.
print("J shape:", J.shape)
print("\nJ matrix structure:")
for i in range(J.shape[0]):
    print(f"Row {i}:", J[i])
grad_vec = row.unsqueeze(0) @ J               # (1, m*n) - apply chain rule
print("grad_vec", grad_vec, grad_vec.shape)
grad_an  = grad_vec.view(m, n) 

print("grad_an shape:", grad_an.shape)
print("grad_an:\n", grad_an)
# grad_an   = P @ x.unsqueeze(0).transpose(0, 1)              # outer product with xᵀ  → (m × n)
print("\nAnalytic ∂L/∂W\n", grad_an, grad_an.shape)
print("\nAutograd  ∂L/∂W\n", grad_auto, grad_auto.shape)

R tensor([[ 0.0197, -0.1173],
        [-0.1173,  0.6982]], grad_fn=<MulBackward0>) torch.Size([2, 2])
x tensor([ 0.4033,  0.8380, -0.7193]) torch.Size([3])
J shape: torch.Size([2, 6])

J matrix structure:
Row 0: tensor([ 0.4033,  0.8380, -0.7193,  0.0000,  0.0000, -0.0000])
Row 1: tensor([ 0.0000,  0.0000, -0.0000,  0.4033,  0.8380, -0.7193])
grad_vec tensor([[-0.0538, -0.1118,  0.0960,  0.3203,  0.6655, -0.5712]],
       grad_fn=<MmBackward0>) torch.Size([1, 6])
grad_an shape: torch.Size([2, 3])
grad_an:
 tensor([[-0.0538, -0.1118,  0.0960],
        [ 0.3203,  0.6655, -0.5712]], grad_fn=<ViewBackward0>)

Analytic ∂L/∂W
 tensor([[-0.0538, -0.1118,  0.0960],
        [ 0.3203,  0.6655, -0.5712]], grad_fn=<ViewBackward0>) torch.Size([2, 3])

Autograd  ∂L/∂W
 tensor([[-0.0538, -0.1118,  0.0960],
        [ 0.3203,  0.6655, -0.5712]]) torch.Size([2, 3])


In [84]:
# ------------ numeric check --------------------------------
print("\nMax |Δ|  :", (grad_an - grad_auto).abs().max().item())


Max |Δ|  : 0.0
