In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from pdb import set_trace

# Initialization

In [2]:
def calc_grad():
    for name, param in model.named_parameters():
        if name == "fc1.weight":
            print("Gradient of weight:", param.grad)

In [3]:
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(2, 1) 
#         self.relu = nn.ReLU()      
#         self.fc2 = nn.Linear(1, 1) 

    def forward(self, x):
        x = self.fc1(x)
#         x = self.relu(x)
#         x = self.fc2(x)
        return x

In [4]:
x = torch.tensor([
    [1.0, 3.0],
    [4.0, 1.2],
    [3.0, 2.0]
])
y = torch.tensor([1.0, 0, 1.0])

In [5]:
x.shape

torch.Size([3, 2])

In [6]:
def train(inputs=x, targets=y):
    optimizer.zero_grad()
    outputs = model(inputs) # forward pass
    loss = criterion(outputs.squeeze(1), targets) # loss calculation
    loss.backward() # gradient calculation 
    optimizer.step() # weights update
    print('loss is', loss) 
    outputs = model(inputs) # forward pass
    new_loss = criterion(outputs, targets) # new loss calculation
    print("New Loss after one step:", new_loss.item())

In [7]:
model = SimpleNet()
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [8]:
w1 = torch.tensor([[1.0, 2.0]])
b1 = torch.tensor([1.0])

In [9]:
model.fc1.weight.data

tensor([[-0.4002, -0.6457]])

In [10]:
model.fc1.bias.data

tensor([0.1163])

In [11]:
model.fc1.weight.data = w1.clone()
model.fc1.bias.data = b1.clone()

In [12]:
model.fc1.weight.data, model.fc1.bias.data

(tensor([[1., 2.]]), tensor([1.]))

In [13]:
first_pred = model(x)
first_pred

tensor([[8.0000],
        [7.4000],
        [8.0000]], grad_fn=<AddmmBackward0>)

In [14]:
x.shape

torch.Size([3, 2])

In [15]:
model.fc1.weight.shape

torch.Size([1, 2])

In [16]:
def calc_loss(pred):
    res = 0
    for idx, num in enumerate(pred):
        res += (pred[idx] - y[idx])**2
    return res / len(pred)

In [17]:
calc_loss(first_pred)    

tensor([50.9200], grad_fn=<DivBackward0>)

In [18]:
first_pred.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [19]:
model.fc1.weight.grad

In [20]:
train()

loss is tensor(50.9200, grad_fn=<MseLossBackward0>)
New Loss after one step: 29.22878074645996


  return F.mse_loss(input, target, reduction=self.reduction)


In [21]:
model.fc1.weight.grad, model.fc1.bias.grad

(tensor([[38.4000, 29.2533]]), tensor([14.2667]))

In [22]:
second_pred = model(x)
second_pred

tensor([[6.5957],
        [5.3703],
        [6.1203]], grad_fn=<AddmmBackward0>)

In [23]:
calc_loss(second_pred)

tensor([28.7898], grad_fn=<DivBackward0>)

In [24]:
model.fc1.weight.data, model.fc1.bias.data

(tensor([[0.6160, 1.7075]]), tensor([0.8573]))

In [25]:
model.fc1.weight.grad, model.fc1.bias.grad

(tensor([[38.4000, 29.2533]]), tensor([14.2667]))

# First Pass

In [26]:
x.shape, w1.shape, b1.shape

(torch.Size([3, 2]), torch.Size([1, 2]), torch.Size([1]))

In [27]:
w1 = w1.t()

In [28]:
def lin(x, w, b):
    return x @ w + b

In [29]:
fc_res = lin(x, w1, b1)
fc_res

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [30]:
fc_res.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [None]:
calc_loss(fc_res)

# Back Propogation

In [31]:
w1, b1

(tensor([[1.],
         [2.]]),
 tensor([1.]))

In [32]:
def mse_grad(inp, targ):
    inp.gradient = 2. * (inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0]

In [33]:
fc_res.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [34]:
mse_grad(fc_res, y)

In [35]:
fc_res.gradient

tensor([[4.6667],
        [4.9333],
        [4.6667]])

In [36]:
fc_res

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [37]:
fc_res.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [38]:
fc_res.gradient

tensor([[4.6667],
        [4.9333],
        [4.6667]])

In [39]:
fc_res.gradient.shape

torch.Size([3, 1])

In [40]:
def lin_grad(inp, out, w, b):    
    # the gradient of a matrix product, is the matrix product with a transpose
#     inp.gradient = out.gradient @ w.t()
        
    w.gradient = (inp.unsqueeze(-1) * out.gradient.unsqueeze(1)).sum(0)

    b.gradient  = out.gradient.sum(0)

In [41]:
w1.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [42]:
lin_grad(x, fc_res, w1, b1)

In [43]:
w1.gradient, b1.shape

(tensor([[38.4000],
         [29.2533]]),
 torch.Size([1]))

In [44]:
model.fc1.weight.grad

tensor([[38.4000, 29.2533]])

In [45]:
fc_res.gradient

tensor([[4.6667],
        [4.9333],
        [4.6667]])

# Re Calculate Weights

In [46]:
lr = 0.01

In [47]:
w1

tensor([[1.],
        [2.]])

In [48]:
model.fc1.weight.data

tensor([[0.6160, 1.7075]])

In [49]:
w1.gradient.shape

torch.Size([2, 1])

In [50]:
w1 = w1 - lr * w1.gradient
b1 -= lr * b1.gradient

In [51]:
model.fc1.weight.data

tensor([[0.6160, 1.7075]])

In [52]:
w1

tensor([[0.6160],
        [1.7075]])

# Re Calculate Loss

In [53]:
new_fc_res = lin(x, w1.squeeze(0), b1)
new_fc_res

tensor([[6.5957],
        [5.3703],
        [6.1203]])

In [54]:
calc_loss(new_fc_res)

tensor([28.7898])

In [55]:
w1, b1

(tensor([[0.6160],
         [1.7075]]),
 tensor([0.8573]))

In [56]:
model.fc1.bias.data

tensor([0.8573])

# Recalculate Weights Second time

In [57]:
train()

loss is tensor(28.7898, grad_fn=<MseLossBackward0>)
New Loss after one step: 17.00395393371582


In [58]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [59]:
model.fc1.weight.grad

tensor([[28.2918, 22.3147]])

In [60]:
fc_res_2 = lin(x, w1, b1)

In [61]:
fc_res_2.gradient

AttributeError: 'Tensor' object has no attribute 'gradient'

In [62]:
mse_grad(fc_res_2, y)

In [63]:
fc_res_2.gradient

tensor([[3.7305],
        [3.5802],
        [3.4135]])

In [69]:
lin_grad(x, fc_res_2, w1, b1)

In [70]:
model.fc1.weight.grad

tensor([[28.2918, 22.3147]])

In [72]:
fc_res_2.gradient

tensor([[3.7305],
        [3.5802],
        [3.4135]])

In [73]:
w1 = w1 - lr * w1.gradient
b1 -= lr * b1.gradient

In [74]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [75]:
w1

tensor([[0.3331],
        [1.4843]])

In [76]:
second_update_weights = lin(x, w1, b1)

In [78]:
calc_loss(second_update_weights)

tensor([16.4424])

In [79]:
model.fc1.weight.data

tensor([[0.3331, 1.4843]])

In [80]:
w1

tensor([[0.3331],
        [1.4843]])

In [81]:
model.fc1.bias.data

tensor([0.7501])

In [82]:
b1

tensor([0.7501])

In [83]:
b1.gradient

tensor([10.7242])

In [84]:
model.fc1.bias.grad

tensor([10.7242])