In [None]:
import torch

# Leaf tensors. All three track gradients, so backward() populates .grad on each.
weights1 = torch.tensor([[0.1, 0.2], [0.3, 0.4]], dtype=torch.float32, requires_grad=True)
inputs = torch.tensor([[0.05, 0.15], [0.25, 0.35]], dtype=torch.float32, requires_grad=True)
weights2 = torch.tensor([[0.01, 0.02], [0.03, 0.04]], dtype=torch.float32, requires_grad=True)

# Layer 1: weights1 times the transposed inputs, then ReLU.
layer1_linear = torch.matmul(weights1, inputs.t())
layer1_activated = layer1_linear.relu()

# Layer 2: multiply by weights2 and reduce every row to one value.
layer2_linear = torch.matmul(layer1_activated, weights2)
layer2_sum = layer2_linear.sum(dim=1, keepdim=True)  # keepdim gives shape [2, 1] rather than [2]
layer2_output = layer2_sum

# A [2, 1] column times a [1, 2] row of ones copies the column into both
# columns, producing a [2, 2] tensor without relying on broadcasting.
expand_tensor = torch.ones((1, 2), dtype=torch.float32)
layer2_output_expanded = torch.matmul(layer2_output, expand_tensor)

# Layer 3: elementwise product with weights1, exponentiate, then add the
# expanded layer-2 output back in.
layer3_linear = layer2_output_expanded * weights1
layer3_nonlinear = layer3_linear.exp()
layer3_output = layer3_nonlinear + layer2_output_expanded

logits = torch.matmul(layer3_output, weights2.t())

# Logistic function spelled out from primitive operations.
probabilities = 1.0 / (1.0 + torch.exp(-logits))

# epsilon is added to the probabilities before taking the log.
epsilon = 1e-10
log_probs = (probabilities + epsilon).log()
neg_log_probs = -log_probs + 0.1

# Reduce along dim 0 twice to end up with a scalar.
loss_sum = neg_log_probs.sum(dim=0).sum(dim=0)

loss_sum.backward()

print("Gradient of weights1:", weights1.grad, sep="\n")
print("\nGradient of inputs:", inputs.grad, sep="\n")
print("\nGradient of weights2:", weights2.grad, sep="\n")

In [None]:
import torch

# Two 2x2 leaf tensors that record gradients.
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
b = torch.tensor([[0.1, 0.2], [0.3, 0.4]], requires_grad=True)

# --- Test 1: Sum ---
# Reduce along dim 0 and keep that axis, so the result has shape [1, 2].
sum_result = torch.sum(a, dim=0, keepdim=True)
print(sum_result)
# The result is not a scalar, so backward() is given an explicit upstream
# gradient with the same shape.
sum_result.backward(gradient=torch.ones_like(sum_result))
print("Sum gradient:", a.grad, sep="\n")
# Zero a's gradient in place so the next test starts from a clean slate.
a.grad.zero_()

# --- Test 2: Matmul ---
matmul_result = torch.matmul(a, b)  # [2, 2] product
matmul_result.backward(gradient=torch.ones_like(matmul_result))
print("\nMatmul gradient for a:", a.grad, sep="\n")
print("\nMatmul gradient for b:", b.grad, sep="\n")
a.grad.zero_()
b.grad.zero_()

# --- Test 3: Transpose ---
# Swap the two axes of 'a'; the shape stays 2x2.
transpose_result = torch.transpose(a, 0, 1)
transpose_result.backward(gradient=torch.ones_like(transpose_result))
print("\nTranspose gradient:", a.grad, sep="\n")
a.grad.zero_()

# --- Test 4: Exponential ---
exp_result = torch.exp(a)  # applied element by element
exp_result.backward(gradient=torch.ones_like(exp_result))
print("\nExp gradient:", a.grad, sep="\n")


In [None]:
import torch

# Gradient-tracking leaf tensors for this run.
weights1 = torch.tensor([[0.1, 0.3], [0.2, 0.4]], dtype=torch.float32, requires_grad=True)
inputs = torch.tensor([[0.05, 0.25], [0.15, 0.35]], dtype=torch.float32, requires_grad=True)
weights2 = torch.tensor([[0.01, 0.03], [0.02, 0.04]], dtype=torch.float32, requires_grad=True)

# First layer: matrix product with the transposed inputs, then ReLU.
layer1_linear = weights1 @ inputs.t()
layer1_activated = layer1_linear.relu()

# Second layer: matrix product with weights2, then a per-row sum.
layer2_linear = layer1_activated @ weights2
layer2_sum = layer2_linear.sum(dim=1, keepdim=True)  # stays 2-D: [2, 1], not [2]
layer2_output = layer2_sum

# Widen the [2, 1] column to [2, 2] via a product with a [1, 2] row of ones.
expand_tensor = torch.ones((1, 2), dtype=torch.float32)
layer2_output_expanded = layer2_output @ expand_tensor

# Third layer: scale elementwise by weights1, apply exp, add the expanded
# second-layer output.
layer3_linear = layer2_output_expanded * weights1
layer3_nonlinear = layer3_linear.exp()
layer3_output = layer3_nonlinear + layer2_output_expanded

logits = layer3_output @ weights2.t()

# 1 / (1 + exp(-x)) written out explicitly.
probabilities = 1.0 / (1.0 + torch.exp(-logits))

# Small constant added to the probabilities ahead of the log.
epsilon = 1e-10
log_probs = (probabilities + epsilon).log()
neg_log_probs = -log_probs + 0.1

# Two successive dim-0 reductions collapse the [2, 2] tensor to a scalar.
loss_sum = neg_log_probs.sum(dim=0).sum(dim=0)

loss_sum.backward()

print("Gradient of weights1:", weights1.grad, sep="\n")
print("\nGradient of inputs:", inputs.grad, sep="\n")
print("\nGradient of weights2:", weights2.grad, sep="\n")

In [None]:
import torch

# Only weights1 is a gradient-tracking leaf here; inputs and weights2 are
# plain constant tensors.
weights1 = torch.tensor([[0.1, 0.3], [0.2, 0.4]], dtype=torch.float32, requires_grad=True)
inputs = torch.tensor([[0.05, 0.25], [0.15, 0.35]], dtype=torch.float32)
weights2 = torch.tensor([[0.01, 0.03], [0.02, 0.04]], dtype=torch.float32)

# retain_grad() is called on every intermediate so that its .grad is still
# available after backward() (non-leaf gradients are otherwise not kept).
layer1 = torch.matmul(weights1, inputs.t())
layer1.retain_grad()
layer1_activated = layer1.relu()
layer1_activated.retain_grad()

# print(layer1_activated)
# print(weights2)

layer2 = torch.matmul(layer1_activated, weights2)  # HERE
# print(layer2)
layer2.retain_grad()

# Row-wise sum to a [2, 1] column, then widen it back to [2, 2] by
# multiplying with a [1, 2] row of ones.
layer2_sum = layer2.sum(dim=1, keepdim=True)
layer2_sum.retain_grad()
layer2_expanded = torch.matmul(layer2_sum, torch.ones((1, 2), dtype=torch.float32))
layer2_expanded.retain_grad()

layer3 = layer2_expanded * weights1  # elementwise product, not a matmul
layer3.retain_grad()
loss = layer3.sum()


# Forward values, printed before the backward pass.
print("of weights1:", weights1, sep="\n")

print("\nof layer1:", layer1, sep="\n")

print("\nof layer1_activated:", layer1_activated, sep="\n")

print("\nof layer2:", layer2, sep="\n")

print("\nof layer2_sum:", layer2_sum, sep="\n")

print("\nof layer2_expanded:", layer2_expanded, sep="\n")

print("\nof layer3:", layer3, sep="\n")


loss.backward()

# Gradients of the leaf and of every retained intermediate.
print("Gradient of weights1:", weights1.grad, sep="\n")

print("\nGradient of layer1:", layer1.grad, sep="\n")

print("\nGradient of layer1_activated:", layer1_activated.grad, sep="\n")

print("\nGradient of layer2:", layer2.grad, sep="\n")

print("\nGradient of layer2_sum:", layer2_sum.grad, sep="\n")

print("\nGradient of layer2_expanded:", layer2_expanded.grad, sep="\n")

print("\nGradient of layer3:", layer3.grad, sep="\n")


# Bare string literal holding pasted gradient values. The Tensor(data=...,
# shape=...) format is not PyTorch's own repr, so this is presumably output
# from another implementation being compared against this cell; TODO confirm.
# NOTE(review): the "HERE!" markers look like the author's flags on entries
# under investigation; verify.
'''
Gradient of weights1: 
Tensor(data=[0.0146, 0.0214, 0.0226, 0.0334], shape=[2, 2])

Gradient of layer1: HERE!
Tensor(data=[0.012, 0.018, 0.028, 0.042], shape=[2, 2])

Gradient of layer1_activated:
Tensor(data=[0.012, 0.018, 0.028, 0.042], shape=[2, 2])

Gradient of layer2: HERE!
Tensor(data=[0.4, 0.6, 0.4, 0.6], shape=[2, 2])

Gradient of layer2_sum: 
Tensor(data=[0.4, 0.6], shape=[2, 1])

Gradient of layer2_expanded:
Tensor(data=[0.1, 0.2, 0.3, 0.4], shape=[2, 2])

Gradient of layer3:
Tensor(data=[1, 1, 1, 1], shape=[2, 2])
'''

In [None]:

# weights2 = torch.tensor([[0.01, 0.02], [0.03, 0.04]], requires_grad=True)
# inputs = torch.tensor([[0.05, 0.15], [0.25, 0.35]], requires_grad=True)
# weights1 = torch.tensor([[0.1, 0.2], [0.3, 0.4]], requires_grad=True)

# weights1 = torch.tensor([[0.1, 0.3], [0.2, 0.4]], requires_grad=True)
# inputs = torch.tensor([[0.05, 0.25], [0.15, 0.35]], requires_grad=True)
# weights2 = torch.tensor([[0.01, 0.03], [0.02, 0.04]], requires_grad=True)

# print(weights1)
# print(inputs.T)
# layer1 = weights1.matmul(inputs.T)
# print(layer1)
# print("---")

# print(layer1)
# print(layer2)
# print(layer1.matmul(weights2))

In [None]:
# WEIGHTS1: Variable(requires_grad=1, data=Tensor(data=[0.1, 0.2, 0.3, 0.4], shape=[2, 2]), grad=Tensor(data=[0, 0, 0, 0], shape=[2, 2]), grad_fn=0)
# INPUT: Variable(requires_grad=1, data=Tensor(data=[0.05, 0.25, 0.15, 0.35], shape=[2, 2]), grad=Tensor(data=[0, 0, 0, 0], shape=[2, 2]), grad_fn=0x565f3ddb9c60)
# LAYER1: Variable(requires_grad=1, data=Tensor(data=[0.035, 0.075, 0.095, 0.215], shape=[2, 2]), grad=Tensor(data=[0, 0, 0, 0], shape=[2, 2]), grad_fn=0x565f3ddb9e10)
# ---
# LAYER1: Variable(requires_grad=1, data=Tensor(data=[0.035, 0.075, 0.095, 0.215], shape=[2, 2]), grad=Tensor(data=[0, 0, 0, 0], shape=[2, 2]), grad_fn=0x565f3ddb9e10)
# WEIGHTS2: Variable(requires_grad=1, data=Tensor(data=[0.01, 0.02, 0.03, 0.04], shape=[2, 2]), grad=Tensor(data=[0, 0, 0, 0], shape=[2, 2]), grad_fn=0)
# LAYER2: Variable(requires_grad=1, data=Tensor(data=[0.0026, 0.0074, 0.0037, 0.0105], shape=[2, 2]), grad=Tensor(data=[0, 0, 0, 0], shape=[2, 2]), grad_fn=0x565f3ddb9fc0)
