In [97]:
import torch
import torch.nn as nn
import torch.optim as optim
from pdb import set_trace

# Initialization

In [98]:
def relu(x):
    """Rectified linear unit: every negative entry of ``x`` becomes zero.

    Returns a new tensor; ``x`` itself is not modified.
    """
    return x.clamp(min=0.)

In [99]:
def calc_grad():
    """Print the gradient currently stored on the global ``model``'s ``fc1.weight``.

    Despite the name, nothing is computed here: the function only reports
    the parameter's ``.grad`` attribute as autograd left it.
    """
    for param_name, tensor in model.named_parameters():
        if param_name != "fc1.weight":
            continue
        print("Gradient of weight:", tensor.grad)

In [100]:
class SimpleNet(nn.Module):
    """Minimal network: a single ``Linear(2, 1)`` layer followed by a ReLU."""

    def __init__(self):
        super().__init__()
        # The attribute names matter: other cells read ``model.fc1.weight``
        # and ``model.fc1.bias`` directly.
        self.fc1 = nn.Linear(2, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(fc1(x))``; the last dimension of ``x`` must be 2."""
        return self.relu(self.fc1(x))

In [101]:
# Toy regression set: three samples with two features each.
x = torch.tensor([[1.0, 3.0], [4.0, 1.2], [3.0, 2.0]])
# One scalar target per row of x.
y = torch.tensor([1.0, 0.0, 1.0])

In [102]:
x.shape

torch.Size([3, 2])

In [103]:
def train(inputs=x, targets=y):
    """Run one SGD step on the global ``model`` and report the loss around it.

    Relies on the notebook-level ``model``, ``criterion`` and ``optimizer``.
    Prints the loss measured before the step (as a tensor) and the loss
    re-evaluated on the same batch after the step (as a float).

    Args:
        inputs: batch passed to ``model``. The default is whatever ``x`` was
            bound to when this cell was executed.
        targets: targets passed to ``criterion``. The default is whatever
            ``y`` was bound to when this cell was executed.
    """
    optimizer.zero_grad()  # discard gradients left over from earlier calls
    loss_before = criterion(model(inputs).squeeze(1), targets)
    loss_before.backward()
    optimizer.step()
    print('loss is', loss_before)

    # Forward pass again, now with the updated parameters.
    loss_after = criterion(model(inputs).squeeze(1), targets)
    print("New Loss after one step:", loss_after.item())

In [104]:
# Network, loss and optimizer shared by train(). The learning rate equals
# the `lr` used by the hand-written update further down the notebook.
model = SimpleNet()
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [105]:
w1 = torch.tensor([[1.0, 2.0]])
b1 = torch.tensor([1.0])

In [106]:
# Load the hand-picked parameters into fc1 so the autograd run and the manual
# computation start from identical weights.
# copy_ under no_grad replaces the legacy `.data = ...` assignment: values are
# copied (so w1/b1 are not aliased by the model), and a shape mismatch (e.g.
# w1 already transposed by a later cell) raises instead of silently
# replacing the parameter's shape.
with torch.no_grad():
    model.fc1.weight.copy_(w1)
    model.fc1.bias.copy_(b1)

In [107]:
model.fc1.weight.grad

In [108]:
model.fc1.weight.data

tensor([[1., 2.]])

In [109]:
first_pred = model(x)
first_pred

tensor([[8.0000],
        [7.4000],
        [8.0000]], grad_fn=<ReluBackward0>)

In [110]:
def calc_loss(pred, targ=None):
    """Mean squared error between ``pred`` and ``targ``, accumulated per sample.

    Args:
        pred: predictions, indexed along dim 0 (one entry per sample).
        targ: targets, one entry per sample. When omitted, the notebook's
            global ``y`` is used, so existing ``calc_loss(pred)`` calls keep
            working unchanged.

    Returns:
        ``sum_i (pred[i] - targ[i]) ** 2 / len(pred)``. The result has the
        shape of a single row of ``pred`` (``[1]`` for an ``(N, 1)`` input).

    Raises:
        ValueError: if ``pred`` and ``targ`` have different lengths.
    """
    if targ is None:
        targ = y  # notebook-level targets
    if len(pred) != len(targ):
        raise ValueError(
            f"pred has {len(pred)} samples but targ has {len(targ)}"
        )
    # sum() starts from the int 0, so the accumulated value takes the shape
    # of one squared-error row.
    return sum((p - t) ** 2 for p, t in zip(pred, targ)) / len(pred)

In [111]:
calc_loss(first_pred)    

tensor([50.9200], grad_fn=<DivBackward0>)

In [112]:
first_pred.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [113]:
train()

loss is tensor(50.9200, grad_fn=<MseLossBackward0>)
New Loss after one step: 28.7898006439209


In [114]:
second_pred = model(x)
second_pred

tensor([[6.5957],
        [5.3703],
        [6.1203]], grad_fn=<ReluBackward0>)

In [115]:
calc_loss(second_pred)

tensor([28.7898], grad_fn=<DivBackward0>)

In [116]:
model.fc1.weight.data

tensor([[0.6160, 1.7075]])

# First Pass

In [117]:
x.shape, w1.shape, b1.shape

(torch.Size([3, 2]), torch.Size([1, 2]), torch.Size([1]))

In [118]:
# Turn the (1, 2) nn.Linear-style weight row into the (2, 1) column that
# `x @ w` expects. For this single-output layer reshape(-1, 1) yields the same
# values as a transpose, and unlike `w1.t()` it is a no-op when the cell is
# re-run on an already-converted w1.
w1 = w1.reshape(-1, 1)

In [119]:
def lin(x, w, b):
    """Affine map ``x @ w + b`` (the forward pass of a linear layer).

    ``w`` is multiplied from the right, so its first dimension must match
    the last dimension of ``x``; ``b`` is broadcast onto the product.
    """
    projected = x.matmul(w)
    return projected + b

In [120]:
fc_res = lin(x, w1, b1)
fc_res

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [121]:
relu_pass_1 = relu(fc_res)

In [122]:
fc_res.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [123]:
calc_loss(relu_pass_1)

tensor([50.9200])

# Backpropagation

In [124]:
def mse_grad(inp, targ):
    """Attach the MSE gradient w.r.t. ``inp`` as ``inp.gradient``.

    Computes ``2 * (inp - targ) / N`` with ``N = inp.shape[0]``. ``inp`` is
    squeezed before the subtraction and the result gets a trailing unit
    dimension back, so an ``(N, 1)`` prediction yields an ``(N, 1)`` gradient.
    Nothing is returned.
    """
    batch_size = inp.shape[0]
    residual = inp.squeeze() - targ
    inp.gradient = 2. * residual.unsqueeze(-1) / batch_size

In [125]:
relu_pass_1.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [126]:
mse_grad(relu_pass_1, y)

In [127]:
relu_pass_1

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [128]:
relu_pass_1.shape

torch.Size([3, 1])

In [129]:
relu_pass_1.squeeze()

tensor([8.0000, 7.4000, 8.0000])

In [130]:
y.shape

torch.Size([3])

In [131]:
y

tensor([1., 0., 1.])

In [132]:
relu_pass_1.squeeze() - y

tensor([7.0000, 7.4000, 7.0000])

In [133]:
relu_pass_1 

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [134]:
relu_pass_1.shape[0]

3

In [135]:
new_ten = torch.tensor([7.0, 7.4, 7.0])

In [138]:
new_ten.shape

torch.Size([3])

In [139]:
new_ten.unsqueeze(1) / relu_pass_1.shape[0]

tensor([[2.3333],
        [2.4667],
        [2.3333]])

In [141]:
(new_ten.unsqueeze(1) / relu_pass_1.shape[0]) * 2

tensor([[4.6667],
        [4.9333],
        [4.6667]])

In [94]:
relu_pass_1.gradient.shape

torch.Size([3, 1])

In [95]:
relu_pass_1.gradient

tensor([[4.6667],
        [4.9333],
        [4.6667]])

In [32]:
model.fc1.weight.grad

tensor([[38.4000, 29.2533]])

In [33]:
def relu_grad(inp, out):
    """Backward pass of ReLU: store the result on ``inp.gradient``.

    The local derivative is 1 where ``inp`` is strictly positive and 0
    elsewhere; it is multiplied element-wise by the gradient already attached
    to the ReLU output (``out.gradient``).
    """
    positive_mask = (inp > 0).float()
    inp.gradient = positive_mask * out.gradient

In [34]:
relu_grad(fc_res, relu_pass_1)

AttributeError: 'Tensor' object has no attribute 'gradient'

In [35]:
fc_res.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [36]:
model.fc1.weight.grad

tensor([[38.4000, 29.2533]])

In [37]:
def lin_grad(inp, out, w, b):
    """Backward pass of ``lin``: attach ``.gradient`` to ``inp``, ``w`` and ``b``.

    Reads ``out.gradient`` (the gradient already attached to the layer output)
    and stores:

    * ``inp.gradient`` = ``out.gradient @ w.t()``
    * ``w.gradient``   = ``(inp * out.gradient).sum(0)``
    * ``b.gradient``   = ``out.gradient.sum(0)``

    Nothing is returned.
    """
    upstream = out.gradient

    # Gradient of a matrix product w.r.t. its left operand: the upstream
    # gradient times the transposed weight.
    inp.gradient = upstream @ w.t()

    # The element-wise product broadcasts the upstream column over the input
    # features, then the batch dimension is summed away. The multi-output
    # form would be
    # (inp.unsqueeze(-1) * out.gradient.unsqueeze(1)).sum(0).
    w.gradient = (inp * upstream).sum(dim=0)

    b.gradient = upstream.sum(dim=0)

In [38]:
fc_res.gradient.shape, w1.shape

AttributeError: 'Tensor' object has no attribute 'gradient'

In [39]:
lin_grad(x, fc_res, w1, b1)

AttributeError: 'Tensor' object has no attribute 'gradient'

In [40]:
x.shape, fc_res.gradient.shape

AttributeError: 'Tensor' object has no attribute 'gradient'

In [41]:
w1.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [42]:
model.fc1.weight.grad

tensor([[38.4000, 29.2533]])

# Recalculate Weights

In [43]:
lr = 0.01

In [44]:
# Manual SGD step, using the same learning rate as the optimizer.
# Needs w1.gradient / b1.gradient, i.e. lin_grad must already have run on the
# current forward pass; otherwise this raises AttributeError.
# w1.gradient is 1-D (lin_grad sums over the batch), so unsqueeze(1) turns it
# into a column matching w1's (2, 1) shape.
w1 = w1 - lr * w1.gradient.unsqueeze(1)
# In-place update: b1 stays the same tensor object and keeps its .gradient
# attribute, whereas w1 above is rebound to a fresh tensor without one.
b1 -= lr * b1.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [45]:
model.fc1.weight.data

tensor([[0.6160, 1.7075]])

In [46]:
w1

tensor([[1.],
        [2.]])

# Recalculate Loss

In [47]:
fc_res_2 = lin(x, w1.squeeze(0), b1)
fc_res_2

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [48]:
relu_pass_2 = relu(fc_res_2)
relu_pass_2

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [49]:
calc_loss(relu_pass_2)

tensor([50.9200])

In [50]:
w1, b1

(tensor([[1.],
         [2.]]),
 tensor([1.]))

In [51]:
model.fc1.bias.data

tensor([0.8573])

# Recalculate Weights a Second Time

In [52]:
train()

loss is tensor(28.7898, grad_fn=<MseLossBackward0>)
New Loss after one step: 16.442419052124023


In [53]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [54]:
model.fc1.weight.grad

tensor([[28.2918, 22.3147]])

In [55]:
fc_res_2 = lin(x, w1, b1)

In [56]:
relu_res_2 = relu(fc_res_2)

In [57]:
relu_res_2

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [58]:
mse_grad(relu_res_2, y)

In [59]:
relu_grad(fc_res_2, relu_res_2)

In [60]:
lin_grad(x, fc_res_2, w1, b1)

In [61]:
# Manual SGD step using the gradients that lin_grad just attached.
# w1.gradient is 1-D (summed over the batch), so unsqueeze(1) makes it a
# column that lines up with w1's (2, 1) shape.
w1 = w1 - lr * w1.gradient.unsqueeze(1)
# In-place on purpose: b1 remains the same tensor object, so b1.gradient is
# still readable afterwards; the rebound w1 carries no .gradient attribute.
b1 -= lr * b1.gradient

In [62]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [63]:
w1

tensor([[0.6160],
        [1.7075]])

In [64]:
res_after_second_update_change = lin(x, w1, b1)

In [65]:
calc_loss(res_after_second_update_change)

tensor([28.7898])

In [66]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [67]:
w1

tensor([[0.6160],
        [1.7075]])

In [68]:
model.fc1.bias.data

tensor([0.7501])

In [69]:
b1

tensor([0.8573])

In [70]:
b1.gradient

tensor([14.2667])

In [71]:
model.fc1.bias.grad

tensor([10.7242])