In [None]:
import torch
# The autograd package provides automatic differentiation 
# for all operations on Tensors

Setting `requires_grad=True` on a tensor tells autograd to track all operations on it.

In [None]:
# Leaf tensor created by the user; requires_grad=True switches on tracking.
x = torch.randn((3,), requires_grad=True)
# y is produced by an operation on x, so autograd attaches a grad_fn to it.
y = torch.add(x, 2)

`y` was created as the result of an operation, so it has a `grad_fn` attribute.<br>
`grad_fn` references the Function that created the Tensor.

In [None]:
# x was created by the user, so its grad_fn is None;
# y is the result of an operation and therefore carries one.
print(x, y, sep="\n")
print(y.grad_fn)

Do more operations on y

In [None]:
# Element-wise y * y * 3: the result has the same shape as y.
z = torch.mul(y * y, 3)
print(z)
# Reduce to a single value so that backward() can be called without arguments.
z = torch.mean(z)
print(z)

Let's compute the gradients with backpropagation.<br>
When we finish our computation, we can call .backward() and have all the gradients computed automatically.<br>
The gradient for this tensor will be accumulated into its .grad attribute.<br>
It is the partial derivative of the function w.r.t. the tensor.

In [None]:
# z was reduced to a single value with mean() in the previous cell, so
# backward() is called without a gradient argument here.
z.backward()
print(x.grad) # dz/dx

Generally speaking, torch.autograd is an engine for computing vector-Jacobian products.<br>
It computes partial derivatives while applying the chain rule.

-------------<br>
Model with non-scalar output:<br>
If a Tensor is non-scalar (has more than one element), we need to pass an argument to backward():<br>
a gradient argument, i.e. a tensor of matching shape.<br>
It is needed for the vector-Jacobian product.

In [None]:
# Fresh leaf tensor for the non-scalar example; its .grad starts out empty.
x = torch.randn((3,), requires_grad=True)

In [None]:
# Double x eleven times in total (once here, ten more times in the loop),
# i.e. y = x * 2**11. y stays a vector with the same shape as x.
y = torch.mul(x, 2)
for _ in range(10):
    y = torch.mul(y, 2)

In [None]:
# Show the values of y and confirm that it is not a scalar.
print(y, y.shape, sep="\n")

In [None]:
# y is not a scalar, so backward() needs a tensor with the same shape as y.
# It is the vector v of the vector-Jacobian product that ends up in x.grad.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(gradient=v)
print(x.grad)

-------------<br>
Stop a tensor from tracking history:<br>
For example, during our training loop we want to update our weights,<br>
and this update operation should not be part of the gradient computation.<br>
- x.requires_grad_(False)<br>
- x.detach()<br>
- wrap in 'with torch.no_grad():'

.requires_grad_(...) changes an existing flag in-place.

In [None]:
# A plain tensor does not track history by default.
a = torch.randn((2, 2))
print(a.requires_grad)
# Built from an untracked tensor, so there is no grad_fn to report.
b = (a * 3) / (a - 1)
print(b.grad_fn)
# The trailing underscore marks an in-place change of the flag (default: True).
a.requires_grad_()
print(a.requires_grad)
# From here on operations on a are recorded and the result carries a grad_fn.
b = torch.sum(a * a)
print(b.grad_fn)

.detach(): get a new Tensor with the same content but no gradient computation:

In [None]:
# Start from a tensor that is tracked by autograd.
a = torch.randn((2, 2), requires_grad=True)
print(a.requires_grad)
# detach() hands back a tensor with the same content that is cut off from
# the gradient computation, so it does not require gradients.
b = a.detach()
print(b.requires_grad)

wrap in 'with torch.no_grad():'

In [None]:
# Tensor that is tracked by autograd.
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
# Inside the no_grad() context results are not tracked, even though their
# input requires gradients.
with torch.no_grad():
    # Operate on the tensor created in this cell (not on a leftover `x` from
    # an earlier cell), so the cell also runs on its own.
    print((a ** 2).requires_grad)

-------------<br>
backward() accumulates the gradient for this tensor into .grad attribute.<br>
!!! We need to be careful during optimization !!!<br>
Call .grad.zero_() on the tensor to reset its gradients before a new optimization step!

In [None]:
# Four weights, all initialised to 1 and tracked by autograd.
weights = torch.ones((4,), requires_grad=True)

In [None]:
for epoch in range(3):
    # Dummy forward pass: d(model_output)/d(weights) is 3 for every weight.
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)

    # Gradient-descent step. It runs under no_grad() so that the in-place
    # update of the weights is not part of the gradient computation.
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    # Important: reset the accumulated gradient. Otherwise the next
    # backward() adds to it, which affects the final weights & output.
    weights.grad.zero_()

In [None]:
# Final weights after the updates, and the output of the last forward pass.
print(weights, model_output, sep="\n")

An optimizer provides a zero_grad() method that resets the gradients of its parameters:<br>
optimizer = torch.optim.SGD([weights], lr=0.1)<br>
During training:<br>
optimizer.step()<br>
optimizer.zero_grad()