In [41]:
# use torch.autograd to implement automatic differentiation
# example of a one-layer neural network
# input: x
# output: z
# expected output: y
# parameters: weight matrix w (5x3) and bias vector b (3)

import torch

# Single linear layer: z = x @ w + b, scored against the expected output y.
x = torch.ones(5)    # input vector: five ones
y = torch.zeros(3)   # expected output: three zeros

# Trainable parameters, sampled from the standard normal distribution
# (the "n" in randn: mean 0, std 1). requires_grad=True asks autograd to
# record the operations applied to them, so that the gradients of the loss
# with respect to w and b can be computed afterwards.
w = torch.randn(5, 3, requires_grad=True)   # weight matrix
b = torch.randn(3, requires_grad=True)      # bias vector

z = torch.matmul(x, w) + b   # equivalent to x @ w + b

# z holds raw logits; the loss applies the sigmoid internally and then takes
# the binary cross-entropy against y. Minimising it drives sigmoid(z) toward y.
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

# A tensor produced by a tracked operation keeps a reference to the backward
# function of that operation in .grad_fn.
print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")

Gradient function for z = <AddBackward0 object at 0x126a04580>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x126a05b70>


In [None]:
# Compute the gradients.
# The gradients of the parameters are what an optimizer uses to update them.
# They are obtained by differentiating the loss function: here we need
# d(loss)/dw and d(loss)/db, evaluated at the fixed values of x and y.
#
# Note: by default the graph is freed after backward(), so this call can
# only be made once per forward pass. Passing retain_graph=True, i.e.
# loss.backward(retain_graph=True), keeps the graph so backward() can be
# called again; each call adds to .grad rather than overwriting it.
loss.backward()

# The results are stored in the .grad attribute of the leaf tensors.
print(w.grad, b.grad, sep="\n")

tensor([[0.4922, 0.2885, 0.7575],
        [0.4922, 0.2885, 0.7575],
        [0.4922, 0.2885, 0.7575],
        [0.4922, 0.2885, 0.7575],
        [0.4922, 0.2885, 0.7575]])
tensor([0.4922, 0.2885, 0.7575])


In [43]:
# Disabling gradient tracking
#
# When is this useful?
#   1. The layer is already trained and only needs to be applied to input data.
#   2. Some parameters should be frozen.
#   3. Only the forward pass is needed and the tracking overhead
#      (extra computation) should be avoided.

# Baseline: w and b require gradients, so the result is tracked.
z = torch.matmul(x, w) + b
print(z.requires_grad)

# Option 1: run the computation inside a torch.no_grad() block.
with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)

# Option 2: compute with tracking, then detach the result. detach() returns
# a tensor that is cut off from the graph and does not require gradients.
z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)

True
False
False


In [44]:
# Tensor gradients and Jacobian products
#
# `out` is not a scalar, so backward() needs an explicit gradient argument
# with the same shape as `out`. Using a tensor of ones yields, for every
# element of `inp`, the sum of the derivatives of all outputs with respect
# to that element.

inp = torch.eye(4, 5, requires_grad=True)
print(f"inp\n{inp}")

shifted = inp + 1
out = torch.pow(shifted, 2).t()
print(f"out\n{out}")

# The same upstream gradient is used for every backward call below.
upstream = torch.ones_like(out)

# retain_graph=True keeps the graph alive so backward() can be repeated.
out.backward(upstream, retain_graph=True)
print(f"First call\n{inp.grad}")

# Gradients accumulate: the second call adds to inp.grad, doubling it.
out.backward(upstream, retain_graph=True)
print(f"\nSecond call\n{inp.grad}")

# Zeroing inp.grad in place first restores the single-call result.
inp.grad.zero_()
out.backward(upstream, retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

inp
tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.]], requires_grad=True)
out
tensor([[4., 1., 1., 1.],
        [1., 4., 1., 1.],
        [1., 1., 4., 1.],
        [1., 1., 1., 4.],
        [1., 1., 1., 1.]], grad_fn=<TBackward0>)
First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
