In [77]:
import torch
import torch.nn as nn
import torch.optim as optim
from pdb import set_trace

In [88]:
def pow_(x: torch.Tensor, power: int) -> torch.Tensor:
    """Raise every element of ``x`` to ``power``.

    Args:
        x: Input tensor.
        power: Exponent applied element-wise.

    Returns:
        The tensor ``x ** power``. When ``x`` requires grad the result stays
        attached to the autograd graph, so ``backward()`` can be called on it.
    """
    return x ** power

In [89]:
# Leaf tensor tracked by autograd. Leaf tensors get ``.grad`` populated by
# ``backward()`` on their own, so no ``retain_grad()`` call is needed.
t = torch.tensor([2.0], requires_grad=True)

In [90]:
res = pow_(t, 2)
res

tensor([4.], grad_fn=<PowBackward0>)

In [91]:
res.backward()
t.grad

tensor([4.])

In [6]:
# Fresh leaf tensor for the second example. Leaf tensors get ``.grad``
# populated by ``backward()`` on their own, so ``retain_grad()`` is not needed.
t = torch.tensor([3.0], requires_grad=True)

In [7]:
res = pow_(t, 4)
res

tensor([81.], grad_fn=<PowBackward0>)

In [8]:
res.backward()

In [9]:
t.grad

tensor([108.])

# Initialization

In [10]:
def relu(x):
    """Rectified linear unit: entries below zero become zero, the rest pass through.

    Returns a new tensor; ``x`` itself is not modified.
    """
    return x.clamp(min=0.)

In [11]:
def calc_grad():
    """Print the gradient currently stored on the global ``model``'s ``fc1.weight``.

    Nothing is computed here: the function only reads ``param.grad``, so the
    printed value is whatever the most recent ``backward()`` left behind.
    """
    for param_name, tensor in model.named_parameters():
        if param_name != "fc1.weight":
            continue
        print("Gradient of weight:", tensor.grad)

In [12]:
class SimpleNet(nn.Module):
    """Two-layer regression network: Linear(2 -> 1), ReLU, Linear(1 -> 1)."""

    def __init__(self):
        super().__init__()
        # Layers are created in forward order: fc1, relu, fc2.
        self.fc1 = nn.Linear(in_features=2, out_features=1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Apply fc1, then ReLU, then fc2 to ``x`` and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [13]:
# Toy dataset: three samples with two features each, one scalar target per sample.
x = torch.tensor([[1.0, 3.0], [4.0, 1.2], [3.0, 2.0]])
y = torch.tensor([1.0, 0.0, 1.0])

In [14]:
x.shape

torch.Size([3, 2])

In [15]:
def train(inputs=x, targets=y):
    """Run one optimiser step on the global ``model`` and print the loss before and after.

    Relies on the notebook-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        inputs: Batch fed to ``model``; defaults to the ``x`` that was bound
            when this cell was executed.
        targets: Expected values, compared with the model output after its
            axis 1 is squeezed away; defaults to the ``y`` that was bound when
            this cell was executed.
    """
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs.squeeze(1), targets)
    loss.backward()
    optimizer.step()
    print('loss is', loss)

    # The post-step loss is only reported, never back-propagated, so there is
    # no reason to build an autograd graph for it.
    with torch.no_grad():
        new_loss = criterion(model(inputs).squeeze(1), targets)
    print("New Loss after one step:", new_loss.item())

In [16]:
# Network, loss and optimiser that ``train()`` picks up as globals.
model = SimpleNet()
# Mean-squared-error loss.
criterion = nn.MSELoss()
# SGD over every model parameter; 0.01 is the same step size the manual update
# uses later in the notebook (``lr``).
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [17]:
# Hand-picked starting parameters for the first layer: weight (1, 2), bias (1,).
w1 = torch.tensor([[1.0, 2.0]])
b1 = torch.tensor([1.0])

# Second layer: weight (1, 1).
# NOTE(review): b2 is written as a 2-D (1, 1) tensor whereas b1 is 1-D (1,).
# Broadcasting makes both work in this notebook, but the shapes are inconsistent.
w2 = torch.tensor([[1.0]])
b2 = torch.tensor([[1.0]])

In [18]:
# Replace the model's parameters with the hand-picked values. Clones are used
# so that the later in-place updates of w1/b1/w2/b2 do not touch the model.
model.fc1.weight.data = w1.clone()
model.fc1.bias.data = b1.clone()

# Assigning ``.data`` takes the new tensor as-is, shape included: fc2.bias ends
# up with b2's (1, 1) shape.
model.fc2.weight.data = w2.clone()
model.fc2.bias.data = b2.clone()

In [19]:
model.fc1.weight.grad

In [20]:
model.fc1.weight.data

tensor([[1., 2.]])

In [21]:
model.fc2.weight.data, model.fc2.bias.data

(tensor([[1.]]), tensor([[1.]]))

In [22]:
first_pred = model(x)
first_pred

tensor([[9.0000],
        [8.4000],
        [9.0000]], grad_fn=<AddmmBackward0>)

In [23]:
first_pred.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [24]:
def calc_loss(pred, targ=None):
    """Mean squared error between ``pred`` and ``targ``, averaged over the first axis.

    Rows are paired up by position and each pair is subtracted with normal
    broadcasting, so a ``pred`` of shape (N, 1) against a ``targ`` of shape
    (N,) yields a result of shape (1,).

    Args:
        pred: Predictions; the first axis indexes samples.
        targ: Targets with the same length as ``pred``. When omitted, the
            notebook-level ``y`` is used.

    Returns:
        Sum of squared per-sample differences divided by ``len(pred)``.

    Raises:
        ValueError: If ``pred`` and ``targ`` differ in length; a longer target
            would otherwise be silently truncated.
    """
    if targ is None:
        targ = y
    if len(pred) != len(targ):
        raise ValueError(
            f"pred and targ must have the same length, got {len(pred)} and {len(targ)}"
        )
    squared_errors = ((p - t) ** 2 for p, t in zip(pred, targ))
    return sum(squared_errors) / len(pred)

In [25]:
calc_loss(first_pred)    

tensor([66.1867], grad_fn=<DivBackward0>)

In [26]:
first_pred.shape, y.shape

(torch.Size([3, 1]), torch.Size([3]))

In [27]:
train()

loss is tensor(66.1867, grad_fn=<MseLossBackward0>)
New Loss after one step: 2.2632858753204346


In [28]:
second_pred = model(x)
second_pred

tensor([[-0.8757],
        [-0.5247],
        [-0.7309]], grad_fn=<AddmmBackward0>)

In [29]:
calc_loss(second_pred)

tensor([2.2633], grad_fn=<DivBackward0>)

In [30]:
model.fc1.weight.data

tensor([[0.5627, 1.6661]])

# First Pass

In [31]:
x.shape, w1.shape, b1.shape

(torch.Size([3, 2]), torch.Size([1, 2]), torch.Size([1]))

In [32]:
# Transpose w1 from (1, 2) to (2, 1) so that ``x @ w1`` is defined for x of shape (3, 2).
# NOTE(review): this rebinding is not idempotent; running the cell a second
# time transposes w1 back to (1, 2).
w1 = w1.t()

In [33]:
def lin(x, w, b):
    """Affine layer: matrix-multiply ``x`` by ``w``, then add the bias ``b``."""
    projected = torch.matmul(x, w)
    return projected + b

In [34]:
fc1_first_pass = lin(x, w1, b1)
fc1_first_pass

tensor([[8.0000],
        [7.4000],
        [8.0000]])

In [35]:
relu_first_pass = relu(fc1_first_pass)

In [36]:
fc2_first_pass = lin(relu_first_pass, w2, b2)

In [37]:
calc_loss(fc2_first_pass)

tensor([66.1867])

In [58]:
def forward_with_loss_calc(w1, b1, w2, b2):
    """Run the manual forward pass on the global ``x`` and return its loss.

    The pass is linear(w1, b1) -> relu -> linear(w2, b2); the result is scored
    with ``calc_loss``.
    """
    hidden = relu(lin(x, w1, b1))
    prediction = lin(hidden, w2, b2)
    return calc_loss(prediction)

# Back Propagation

    mse_grad(out, target)
    lin_grad(l2, out, w2, b2)
    relu_grad(l1, l2)    
    lin_grad(inp, l1, w1, b1)

In [38]:
def mse_grad(inp, targ):
    """Store the gradient of the mean squared error w.r.t. ``inp`` on ``inp.gradient``.

    The stored value is ``2 * (inp - targ) / N`` with ``N = inp.shape[0]``. All
    size-1 axes of ``inp`` are squeezed away before ``targ`` is subtracted, and
    a trailing axis is added back afterwards.
    """
    n_samples = inp.shape[0]
    residual = inp.squeeze() - targ
    inp.gradient = (2. * residual).unsqueeze(-1) / n_samples

In [39]:
mse_grad(fc2_first_pass, y)

In [40]:
def lin_grad(inp, out, w, b):
    """Back-propagate through ``out = inp @ w + b``.

    Reads ``out.gradient`` and stores the results as ``.gradient`` attributes
    on ``inp``, ``w`` and ``b``.

    NOTE(review): the weight gradient is an element-wise product summed over
    the batch axis, which relies on ``out.gradient`` broadcasting against
    ``inp`` (a single output column, as everywhere in this notebook). The
    result is 1-D, one entry per input feature.
    """
    upstream = out.gradient

    # Gradient of a matrix product w.r.t. its input: upstream times w transposed.
    inp.gradient = torch.matmul(upstream, w.t())

    # Each sample contributes its input scaled by the upstream gradient; sum over samples.
    w.gradient = torch.sum(inp * upstream, dim=0)

    # The bias is added to every sample, so its gradient is the batch sum.
    b.gradient = torch.sum(upstream, dim=0)

In [41]:
lin_grad(relu_first_pass, fc2_first_pass, w2, b2)

In [42]:
def relu_grad(inp, out):
    """Back-propagate through ReLU and store the result on ``inp.gradient``.

    The upstream gradient (``out.gradient``) is passed through where ``inp`` is
    strictly positive and zeroed everywhere else.
    """
    passthrough = inp.gt(0).float()
    inp.gradient = passthrough * out.gradient

In [43]:
relu_grad(fc1_first_pass, relu_first_pass)

In [44]:
lin_grad(x, fc1_first_pass, w1, b1)

In [49]:
w2.gradient

tensor([126.7733])

In [50]:
model.fc2.weight.grad

tensor([[126.7733]])

# Recalculate Weights

In [51]:
lr = 0.01

In [52]:
# Manual SGD step, applied in place: parameter -= lr * gradient.
# lin_grad stores weight gradients as 1-D tensors, so unsqueeze(1) turns them
# into columns that line up with the 2-D weight tensors.
w1 -= lr * w1.gradient.unsqueeze(1)
b1 -= lr * b1.gradient

w2 -= lr * w2.gradient.unsqueeze(1)
b2 -= lr * b2.gradient

In [53]:
model.fc1.weight.data, model.fc1.bias.data

(tensor([[0.5627, 1.6661]]), tensor([0.8373]))

In [54]:
w1, b1

(tensor([[0.5627],
         [1.6661]]),
 tensor([0.8373]))

In [55]:
model.fc2.weight.data, model.fc2.bias.data

(tensor([[-0.2677]]), tensor([[0.8373]]))

In [56]:
w2, b2

(tensor([[-0.2677]]), tensor([[0.8373]]))

# Recalculate Loss

In [59]:
forward_with_loss_calc(w1, b1, w2, b2)

tensor([2.2633])

In [60]:
w1, b1

(tensor([[0.5627],
         [1.6661]]),
 tensor([0.8373]))

In [61]:
model.fc1.bias.data

tensor([0.8373])

# Recalculate Weights a Second Time

In [62]:
train()

loss is tensor(2.2633, grad_fn=<MseLossBackward0>)
New Loss after one step: 0.42095747590065


In [63]:
fc1_second_pass = lin(x, w1, b1)
relu_second_pass = relu(fc1_second_pass)
fc2_second_pass = lin(relu_second_pass, w2, b2)

In [64]:
# Backward pass, in reverse order of the forward pass. Each call reads the
# ``.gradient`` attribute that the previous call stored on its output tensor.
mse_grad(fc2_second_pass, y)
lin_grad(relu_second_pass, fc2_second_pass, w2, b2)
relu_grad(fc1_second_pass, relu_second_pass)
lin_grad(x, fc1_second_pass, w1, b1)

In [65]:
# Second manual SGD step, applied in place: parameter -= lr * gradient.
# The 1-D weight gradients are unsqueezed into columns to match the 2-D weights.
w1 -= lr * w1.gradient.unsqueeze(1)
b1 -= lr * b1.gradient

w2 -= lr * w2.gradient.unsqueeze(1)
b2 -= lr * b2.gradient

In [67]:
forward_with_loss_calc(w1, b1, w2, b2)

tensor([0.4210])

In [68]:
model.fc1.weight.data, model.fc1.bias.data, 

(tensor([[0.5463, 1.6488]]), tensor([0.8300]))

In [69]:
w1, b1

(tensor([[0.5463],
         [1.6488]]),
 tensor([0.8300]))

In [70]:
model.fc2.weight.data, model.fc2.bias.data, 

(tensor([[-0.1023]]), tensor([[0.8649]]))

In [71]:
w2, b2

(tensor([[-0.1023]]), tensor([[0.8649]]))

In [72]:
fc1_second_pass.gradient

tensor([[0.3348],
        [0.0937],
        [0.3090]])

In [73]:
w1.gradient

tensor([1.6363, 1.7347])

In [74]:
fc2_second_pass.gradient * relu_second_pass.gradient

tensor([[-0.4187],
        [-0.0328],
        [-0.3565]])

In [75]:
fc1_first_pass.gradient

tensor([[5.3333],
        [5.6000],
        [5.3333]])

In [76]:
fc2_second_pass

tensor([[-0.8757],
        [-0.5247],
        [-0.7309]])