In [33]:
# row-major (C / PyTorch default)	
#“weights are stored row-by-row, so route the row-gradient (Iₘ) first, then scale each entire row by the inputs (xᵀ).”

import torch, itertools
m, n = 2, 3  # 2 outputs, 3 inputs
x = torch.tensor([1.3, 0.9, -0.7])  # 3D input vector
W = torch.tensor([[0.5, -0.3, 0.2], 
                  [0.2, 0.8, -0.1]], requires_grad=True)  # 2x3 weight matrix
I = torch.eye(m)

# Col-major Kronecker product: Iₘ ⊗ xᵀ
J = torch.kron(x.unsqueeze(0), I)  # (m × m n)

print("Partial derivative of Wx with respect to W (col-major)")
print("Kronecker Jacobian Iₘ ⊗ xᵀ:\n", J, "\n")
print("Weight matrix W:\n", W, "\n")

# Show the structure of the row-major Jacobian
print("Col-major structure:")
print("[[x1  0  x2   0   x3   0 ]")
print(" [ 0   x1   0  x2  0  x3]]\n")

# Row in W corresponds to a neuron
# Column in W corresponds to a weight that will multiply with feature in x (column in x)
# Result is the neuron's activation
# Ji,j = x_t * I_m = δ_{ip} * x_q

# Verify the Jacobian entries
for i, p, q in itertools.product(range(m), repeat=3):
    j = q*m + p
    if q < n:   # keep q in 0..n-1
        lhs = J[i, j].item()
        rhs = (1.0 if i==p else 0.0)*x[q].item()
        print(f"i={i}, p={p}, q={q}:  J = {lhs:.2f}  δ·x = {rhs:.2f}")

# Compute the output and compare manual vs autograd gradients
y = W @ x
print("\nWeighted output y = Wx:\n", y)

# Manual gradient computation
manual_grad = torch.kron(x.unsqueeze(0), I)  # Iₘ ⊗ xᵀ
print("\nManual gradient (xᵀ ⊗ Iₘ):\n", manual_grad)

# Compute gradients using autograd by backpropagating ones through the network
y.backward(torch.ones_like(y))  # Compute gradients
autograd_grad = W.grad  # Get raw gradients in shape [2, 3]

print("\nOriginal autograd gradient:\n", autograd_grad)


# Reshape autograd gradients to match manual gradient shape [2, 6]
# Initialize a tensor of zeros with shape [m, m*n] to store the reshaped gradients
# This creates a 2x6 tensor since m=2 and n=3
autograd_grad_reshaped = torch.zeros(m, m*n)  

# Iterate through each output neuron (m=2)
for i in range(m):
    # Iterate through each input feature (n=3)
    for j in range(n):
        # Place each gradient in column-major order
        # j*m + i calculates the correct column index in the reshaped tensor
        # This formula maps from the original [m,n] shape to the column-major [m,m*n] shape
        # For example, with m=2, n=3:
        # - When j=0, i=0: 0*2 + 0 = 0  (first column)
        # - When j=0, i=1: 0*2 + 1 = 1  (second column)
        # - When j=1, i=0: 1*2 + 0 = 2  (third column)
        # - When j=1, i=1: 1*2 + 1 = 3  (fourth column)
        # And so on, creating a column-major arrangement of the gradients
        # This ensures gradients are arranged in blocks by input feature
        autograd_grad_reshaped[i, j*m + i] = autograd_grad[i, j]  

print("\nAutograd gradient (reshaped):\n", autograd_grad_reshaped)

# Verify they match
print("\nGradients match:", torch.allclose(manual_grad, autograd_grad_reshaped))

# i is which activation we are taking the derivative of (e.g. a_0)
# p is the weight you perturb (i.e. which neuron the weight belongs to) -- row in W
# q is the column index of the weight (which input feature it multiplies) -- e.g. x_1

# The weight matrix W has rows that correspond one-to-one to the output neurons.
# Row 0 are all the weights feeding neuron 0

# a_i is the pre-activation (the weighted sum before any non-linearity) of neuron i
# The activation vector a collects those neurons: a_0, a_1, ..., a_{m-1}
# So a_0 is literally the output of "neuron 0": a_0 = W0,q @ x_q

# This is why the ∂a_i / ∂W_pq = δ_ip * x_q
# If you perturb a weight in same row p, the change in activation i is "input * weight change"
# If you perturb a weight in different row p, the change in activation i is 0


Partial derivative of Wx with respect to W (col-major)
Kronecker Jacobian Iₘ ⊗ xᵀ:
 tensor([[ 1.3000,  0.0000,  0.9000,  0.0000, -0.7000, -0.0000],
        [ 0.0000,  1.3000,  0.0000,  0.9000, -0.0000, -0.7000]]) 

Weight matrix W:
 tensor([[ 0.5000, -0.3000,  0.2000],
        [ 0.2000,  0.8000, -0.1000]], requires_grad=True) 

Col-major structure:
[[x1  0  x2   0   x3   0 ]
 [ 0   x1   0  x2  0  x3]]

i=0, p=0, q=0:  J = 1.30  δ·x = 1.30
i=0, p=0, q=1:  J = 0.90  δ·x = 0.90
i=0, p=1, q=0:  J = 0.00  δ·x = 0.00
i=0, p=1, q=1:  J = 0.00  δ·x = 0.00
i=1, p=0, q=0:  J = 0.00  δ·x = 0.00
i=1, p=0, q=1:  J = 0.00  δ·x = 0.00
i=1, p=1, q=0:  J = 1.30  δ·x = 1.30
i=1, p=1, q=1:  J = 0.90  δ·x = 0.90

Weighted output y = Wx:
 tensor([0.2400, 1.0500], grad_fn=<MvBackward0>)

Manual gradient (xᵀ ⊗ Iₘ):
 tensor([[ 1.3000,  0.0000,  0.9000,  0.0000, -0.7000, -0.0000],
        [ 0.0000,  1.3000,  0.0000,  0.9000, -0.0000, -0.7000]])

Original autograd gradient:
 tensor([[ 1.3000,  0.9000, -0.7000],

In [24]:
# column-major (Fortran / MATLAB default)
#“weights are stored column-by-column, so route the column-gradient (xᵀ) first, then scale each entire column by the inputs (Iₘ).

import torch, itertools
m, n = 2, 3  # 2 outputs, 3 inputs
x = torch.tensor([1.3, 0.9, -0.7])  # 3D input vector
W = torch.tensor([[0.5, -0.3, 0.2], 
                  [0.2, 0.8, -0.1]], requires_grad=True)  # 2x3 weight matrix
I = torch.eye(m)

# Row-major Kronecker product: Iₘ ⊗ xᵀ
J = torch.kron(I, x.unsqueeze(0))  # (m × m n)

print("Partial derivative of Wx with respect to W (row-major)")
print("Kronecker Jacobian Iₘ ⊗ xᵀ:\n", J, "\n")
print("Weight matrix W:\n", W, "\n")

# Show the structure of the row-major Jacobian
print("Row-major structure:")
print("[[x1  x2  x3   0   0   0 ]")
print(" [ 0   0   0  x1  x2  x3]]\n")

# Verify the Jacobian entries
for i, p, q in itertools.product(range(m), repeat=3):
    j = q*m + p
    if q < n:   # keep q in 0..n-1
        lhs = J[i, j].item()
        rhs = (1.0 if i==p else 0.0)*x[q].item()
        print(f"i={i}, p={p}, q={q}:  J = {lhs:.2f}  δ·x = {rhs:.2f}")

# Compute the output and compare manual vs autograd gradients
y = W @ x
print("\nWeighted output y = Wx:\n", y)

# Manual gradient computation
manual_grad = torch.kron(I, x.unsqueeze(0))  # Iₘ ⊗ xᵀ
print("\nManual gradient (Iₘ ⊗ xᵀ):\n", manual_grad)

# Compute gradients using autograd by backpropagating ones through the network
# This line computes gradients by backpropagating a tensor of ones through the network
# torch.ones_like(y) creates a tensor of ones with the same shape as y
# This is a common pattern in PyTorch when you want to compute gradients with respect to all outputs
y.backward(torch.ones_like(y))  # Compute gradients
autograd_grad = W.grad  # Get raw gradients in shape [2, 3]

# Reshape gradients into Kronecker-like structure [2, 6] where each row contains
# the input vector x in the appropriate position, with zeros elsewhere
autograd_grad = torch.zeros(m, m*n)  # Initialize [2, 6] tensor
for i in range(m):
    autograd_grad[i, i*n:(i+1)*n] = x  # Place x in the correct positions

print("\nAutograd gradient:\n", autograd_grad)

# Verify they match
print("\nGradients match:", torch.allclose(manual_grad, autograd_grad))


Partial derivative of Wx with respect to W (row-major)
Kronecker Jacobian Iₘ ⊗ xᵀ:
 tensor([[ 1.3000,  0.9000, -0.7000,  0.0000,  0.0000, -0.0000],
        [ 0.0000,  0.0000, -0.0000,  1.3000,  0.9000, -0.7000]]) 

Weight matrix W:
 tensor([[ 0.5000, -0.3000,  0.2000],
        [ 0.2000,  0.8000, -0.1000]], requires_grad=True) 

Row-major structure:
[[x1  x2  x3   0   0   0 ]
 [ 0   0   0  x1  x2  x3]]

i=0, p=0, q=0:  J = 1.30  δ·x = 1.30
i=0, p=0, q=1:  J = -0.70  δ·x = 0.90
i=0, p=1, q=0:  J = 0.90  δ·x = 0.00
i=0, p=1, q=1:  J = 0.00  δ·x = 0.00
i=1, p=0, q=0:  J = 0.00  δ·x = 0.00
i=1, p=0, q=1:  J = -0.00  δ·x = 0.00
i=1, p=1, q=0:  J = 0.00  δ·x = 1.30
i=1, p=1, q=1:  J = 1.30  δ·x = 0.90

Weighted output y = Wx:
 tensor([0.2400, 1.0500], grad_fn=<MvBackward0>)

Manual gradient (Iₘ ⊗ xᵀ):
 tensor([[ 1.3000,  0.9000, -0.7000,  0.0000,  0.0000, -0.0000],
        [ 0.0000,  0.0000, -0.0000,  1.3000,  0.9000, -0.7000]])

Autograd gradient:
 tensor([[ 1.3000,  0.9000, -0.7000,  0.0000

In [34]:
import torch
A = torch.tensor([[0, 1, 2],
                  [3, 4, 5]])          # 2 × 3

print("row-major flatten :", A.reshape(-1).tolist())
print("column-major via .t() :", A.t().contiguous().reshape(-1).tolist())


# Numerics Identical; SGD sees the same update values.
# Code Frameworks (PyTorch, JAX, TF) store tensors row-major, so if
#  you ever call flatten()/reshape(-1) you’re in row-major space and
#  should use Iₘ ⊗ xᵀ.
# Paper algebra Most math texts use column-major because BLAS does,
#  so they write xᵀ ⊗ Iₘ. Just swap factors when translating to code.
# Once you reshape back to an m x n matrix, everything collapses to a simple outer product
#  aL/aW = [diag(g * aL/av)R]x_t

row-major flatten : [0, 1, 2, 3, 4, 5]
column-major via .t() : [0, 3, 1, 4, 2, 5]
