In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from pdb import set_trace

# Initialization

In [2]:
def relu(x):
    """Rectified linear unit: every element below zero is raised to zero.

    Returns a new tensor; ``x`` itself is not modified.
    """
    floor = 0.
    return x.clamp_min(floor)

In [3]:
def calc_grad():
    """Print the gradient currently stored on the global model's ``fc1.weight``.

    The parameter is located by name in ``model.named_parameters()``; nothing
    is printed if no parameter carries that name.
    """
    wanted = "fc1.weight"
    for pname, tensor in model.named_parameters():
        if pname != wanted:
            continue
        print("Gradient of weight:", tensor.grad)

In [4]:
class SimpleNet(nn.Module):
    """Smallest possible network: one ``Linear(2, 1)`` layer followed by a ReLU."""

    def __init__(self):
        super().__init__()
        # Registered in the order fc1, relu so module listings stay the same.
        self.fc1 = nn.Linear(in_features=2, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(fc1(x))``."""
        return self.relu(self.fc1(x))

In [5]:
# Toy dataset: three samples with two features each, and one scalar target per sample.
x = torch.tensor([[1.0, 3.0], [4.0, 1.2], [3.0, 2.0]])
y = torch.tensor([1.0, 0.0, 1.0])

In [6]:
x.shape

torch.Size([3, 2])

In [7]:
def train(inputs=x, targets=y):
    """Run one optimizer step on the global ``model`` and report the loss before and after.

    Uses the notebook-level ``model``, ``criterion`` and ``optimizer``. The
    default ``inputs``/``targets`` are the ``x``/``y`` tensors that existed when
    this function was defined.
    """
    def _current_loss():
        # squeeze(1) turns the (N, 1) predictions into (N,) to match the targets.
        return criterion(model(inputs).squeeze(1), targets)

    optimizer.zero_grad()
    loss = _current_loss()
    loss.backward()
    optimizer.step()
    print('loss is', loss)

    # Second forward pass with the freshly updated parameters.
    new_loss = _current_loss()
    print("New Loss after one step:", new_loss.item())

In [8]:
# Notebook-level objects that train() reads: the loss, the network, and a plain SGD optimizer.
criterion = nn.MSELoss()
model = SimpleNet()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [9]:
# Hand-picked starting parameters: bias of shape (1,), weight of shape (1, 2).
b1 = torch.tensor([1.0])
w1 = torch.tensor([[1.0, 2.0]])

In [10]:
# Load the hand-picked values into the layer. copy_ under no_grad writes into the
# existing parameters and raises if the shapes are incompatible, whereas assigning
# to `.data` swaps in whatever tensor it is given without any shape check.
with torch.no_grad():
    model.fc1.weight.copy_(w1)
    model.fc1.bias.copy_(b1)

In [11]:
model.fc1.weight.grad

In [12]:
model.fc1.weight.data

tensor([[1., 2.]])

In [13]:
# Forward pass through the nn.Module with the hand-set parameters, before any training step.
first_pred = model(x)
first_pred

tensor([[8.0000],
        [7.4000],
        [8.0000]], grad_fn=<ReluBackward0>)

In [14]:
def calc_loss(pred, targets=None):
    """Mean squared error, accumulated one sample at a time.

    Args:
        pred: predictions indexed by sample along dim 0 (e.g. shape ``(N, 1)``).
        targets: one target per sample. Defaults to the notebook-level ``y`` so
            existing ``calc_loss(pred)`` calls behave exactly as before.

    Returns:
        The sum of per-sample squared errors divided by ``len(pred)``. Each
        ``pred[idx]`` keeps its trailing dimensions, so ``(N, 1)`` predictions
        give a result of shape ``(1,)``.
    """
    if targets is None:
        targets = y
    total = 0
    # Index sample by sample: subtracting an (N,) target from an (N, 1)
    # prediction in one go would broadcast to (N, N).
    for idx in range(len(pred)):
        total = total + (pred[idx] - targets[idx]) ** 2
    return total / len(pred)

In [15]:
calc_loss(first_pred)    

tensor([50.9200], grad_fn=<DivBackward0>)

In [16]:
first_pred.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [17]:
train()

loss is tensor(50.9200, grad_fn=<MseLossBackward0>)
New Loss after one step: 28.7898006439209


In [18]:
# Forward pass again, now with the parameters produced by the first train() step.
second_pred = model(x)
second_pred

tensor([[6.5957],
        [5.3703],
        [6.1203]], grad_fn=<ReluBackward0>)

In [19]:
calc_loss(second_pred)

tensor([28.7898], grad_fn=<DivBackward0>)

In [None]:
model.fc1.weight.data

# First Pass

In [20]:
x.shape, w1.shape, b1.shape

(torch.Size([3, 2]), torch.Size([1, 2]), torch.Size([1]))

In [21]:
# Put the weight into (in_features, out_features) layout for `x @ w`. For this
# single-output layer reshape(-1, 1) yields the same tensor as .t(), but unlike
# .t() it does not flip the layout back if the cell is run a second time.
w1 = w1.reshape(-1, 1)

In [22]:
def lin(x, w, b):
    """Affine map: matrix-multiply ``x`` by ``w`` and add the bias ``b``."""
    projected = x.matmul(w)
    return projected + b

In [23]:
# Manual forward pass through the linear layer, using the hand-set w1 and b1.
fc_res = lin(x, w1, b1)
fc_res

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [24]:
# Manual ReLU on the linear output; this is the tensor the loss is computed on.
relu_pass_1 = relu(fc_res)

In [26]:
fc_res.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [27]:
calc_loss(relu_pass_1)

tensor([50.9200])

# Back Propagation

In [30]:
def mse_grad(inp, targ):
    """Store the gradient of the mean squared error w.r.t. ``inp`` on ``inp.gradient``.

    The stored value is ``2 * (inp - targ) / inp.shape[0]``, with a trailing
    dimension of size 1 appended after ``inp`` has been squeezed to line up
    with ``targ``. Nothing is returned.
    """
    residual = inp.squeeze() - targ
    batch_size = inp.shape[0]
    inp.gradient = residual.unsqueeze(-1).mul(2.) / batch_size

In [28]:
# mse_grad has not been called yet, so the tensor carries no `.gradient`
# attribute; check for it without raising so the notebook still runs top to bottom.
hasattr(relu_pass_1, "gradient")

False

In [31]:
# Start of the backward pass: stores the loss gradient on relu_pass_1.gradient.
mse_grad(relu_pass_1, y)

In [32]:
relu_pass_1.gradient.shape

torch.Size([3, 1])

In [33]:
relu_pass_1.gradient

tensor([[4.6667],
        [4.9333],
        [4.6667]])

In [34]:
model.fc1.weight.grad

tensor([[38.4000, 29.2533]])

In [35]:
def relu_grad(inp, out):
    """Backpropagate through a ReLU and store the result on ``inp.gradient``.

    The local derivative is 1 where ``inp`` is strictly positive and 0
    elsewhere; it is multiplied by the gradient already stored on ``out``.
    """
    gate = (inp > 0).float()
    upstream = out.gradient
    inp.gradient = gate * upstream

In [36]:
# Push the gradient back through the ReLU; the result is stored on fc_res.gradient.
relu_grad(fc_res, relu_pass_1)

In [38]:
fc_res.gradient

tensor([[4.6667],
        [4.9333],
        [4.6667]])

In [39]:
model.fc1.weight.grad

tensor([[38.4000, 29.2533]])

In [40]:
def lin_grad(inp, out, w, b):
    """Backward pass of ``out = inp @ w + b``; results are stored as ``.gradient`` attributes.

    Reads ``out.gradient`` and sets:
      * ``inp.gradient`` - upstream gradient times the transposed weight,
      * ``w.gradient``   - ``(inp * out.gradient)`` summed over the batch dim,
      * ``b.gradient``   - upstream gradient summed over the batch dim.

    The weight formula relies on an ``(N, 1)`` upstream gradient broadcasting
    across the feature columns of ``inp``, so it yields a flat
    ``(in_features,)`` tensor rather than one shaped like ``w``.
    """
    upstream = out.gradient

    # The gradient of a matrix product is a matrix product with a transpose.
    inp.gradient = upstream.matmul(w.t())

    w.gradient = (inp * upstream).sum(dim=0)
    b.gradient = upstream.sum(dim=0)

In [41]:
fc_res.gradient.shape, w1.shape

(torch.Size([3, 1]), torch.Size([2, 1]))

In [42]:
# Backprop through the linear layer: sets x.gradient, w1.gradient and b1.gradient.
lin_grad(x, fc_res, w1, b1)

In [43]:
x.shape, fc_res.gradient.shape

(torch.Size([3, 2]), torch.Size([3, 1]))

In [44]:
w1.gradient

tensor([38.4000, 29.2533])

In [45]:
model.fc1.weight.grad

tensor([[38.4000, 29.2533]])

# Recalculate Weights

In [46]:
# Step size for the manual update; the same value that was passed to optim.SGD.
lr = 0.01

In [47]:
# Manual SGD step. w1.gradient is flat (shape (2,)), so unsqueeze(1) lines it up
# with the (2, 1) weight. Note the asymmetry: w1 is rebound to a brand-new tensor
# (which has no .gradient attribute), while b1 is updated in place and keeps its own.
w1 = w1 - lr * w1.gradient.unsqueeze(1)
b1 -= lr * b1.gradient

In [48]:
model.fc1.weight.data

tensor([[0.6160, 1.7075]])

In [49]:
w1

tensor([[0.6160],
        [1.7075]])

# Recalculate Loss

In [50]:
# w1 is already in the (2, 1) layout that lin expects, so it is passed as is
# (squeezing dim 0 would do nothing, because that dim has size 2).
fc_res_2 = lin(x, w1, b1)
fc_res_2

tensor([[6.5957],
        [5.3703],
        [6.1203]])

In [51]:
# ReLU on the linear output computed with the once-updated parameters.
relu_pass_2 = relu(fc_res_2)
relu_pass_2

tensor([[6.5957],
        [5.3703],
        [6.1203]])

In [52]:
calc_loss(relu_pass_2)

tensor([28.7898])

In [53]:
w1, b1

(tensor([[0.6160],
         [1.7075]]),
 tensor([0.8573]))

In [54]:
model.fc1.bias.data

tensor([0.8573])

# Recalculate Weights a Second Time

In [55]:
train()

loss is tensor(28.7898, grad_fn=<MseLossBackward0>)
New Loss after one step: 16.442419052124023


In [56]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [57]:
model.fc1.weight.grad

tensor([[28.2918, 22.3147]])

In [58]:
# Forward pass with the once-updated w1/b1; this tensor receives .gradient in the backward pass below.
fc_res_2 = lin(x, w1, b1)

In [60]:
# ReLU output for the second manual pass.
relu_res_2 = relu(fc_res_2)

In [61]:
relu_res_2

tensor([[6.5957],
        [5.3703],
        [6.1203]])

In [62]:
# Second backward pass, step 1: loss gradient stored on relu_res_2.gradient.
mse_grad(relu_res_2, y)

In [63]:
# Step 2: back through the ReLU; result stored on fc_res_2.gradient.
relu_grad(fc_res_2, relu_res_2)

In [64]:
# Step 3: back through the linear layer; refreshes x.gradient, w1.gradient and b1.gradient.
lin_grad(x, fc_res_2, w1, b1)

In [65]:
# Second manual SGD step, same form as the first: w1 is rebound to a new tensor
# (losing its .gradient attribute), b1 is updated in place and keeps b1.gradient.
w1 = w1 - lr * w1.gradient.unsqueeze(1)
b1 -= lr * b1.gradient

In [66]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [67]:
w1

tensor([[0.3331],
        [1.4843]])

In [68]:
# Apply the ReLU as well, so the loss below is measured on the same
# linear -> ReLU output that the nn.Module produces.
res_after_second_update_change = relu(lin(x, w1, b1))

In [69]:
calc_loss(res_after_second_update_change)

tensor([16.4424])

In [70]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [71]:
w1

tensor([[0.3331],
        [1.4843]])

In [72]:
model.fc1.bias.data

tensor([0.7501])

In [73]:
b1

tensor([0.7501])

In [74]:
b1.gradient

tensor([10.7242])

In [75]:
model.fc1.bias.grad

tensor([10.7242])