# Autograd: automatic differentiation

`autograd` is PyTorch's automatic differentiation package. It is define-by-run: the computational graph is determined by how the code is run, not declared up front as a static graph as in TensorFlow 1.x (though Eager Mode in TF2 changes things).

In [1]:
import torch
import itertools as it

In [2]:
# A tensor built directly from data does not track gradients unless asked to:
# requires_grad defaults to False.
# NOTE: these integer literals produce an integer tensor, and PyTorch only
# allows floating-point (or complex) tensors to require gradients at all.
x = torch.tensor([2, 2])
print(x)
# The bare last expression is displayed as the cell's output
x.requires_grad

tensor([2, 2])


False

In [3]:
# Ask autograd to track this tensor from the moment it is created
x = torch.ones((2, 2), requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)


In [4]:
# y is the result of an operation on a tracked tensor, so autograd records
# how it was produced (shown as grad_fn when the tensor is printed)
y = x + 2
print(y)

tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)


In [5]:
# y was produced by an operation on a tensor whose gradient is being tracked,
# so it carries a grad_fn: the function autograd uses to backpropagate
# through that operation

print(y.grad_fn)
# x was created directly by us rather than by an operation (it is a "leaf"
# tensor), so its grad_fn is None
print(x.grad_fn)

<AddBackward0 object at 0x7ff7786ee810>
None


In [6]:
# Chain some more operations: z = 3 * y^2 element-wise, and out is the mean
# of z, i.e. a scalar. Each result records the operation that produced it.
z = y * y * 3
out = z.mean()

print(z, out, sep='\n')

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>)
tensor(27., grad_fn=<MeanBackward0>)


In [7]:
# Tensor methods ending in '_' modify the tensor in place; .requires_grad_()
# switches on gradient tracking for an existing tensor.
# Prefer it over assigning to the .requires_grad attribute by hand.

a = torch.randn(2, 2)
a = a.mul(3).div(a.sub(1))
print(a.requires_grad)  # not tracked yet: built only from untracked tensors
a.requires_grad_()      # the flag argument defaults to True
print(a.requires_grad)
b = a.mul(a).sum()
print(b.grad_fn)        # operations on `a` are recorded from here on

False
True
<SumBackward0 object at 0x7ff7e84a1d90>


## Gradients

In [8]:
# Backpropagate from the scalar `out` with the .backward() method. No
# argument is needed because `out` holds a single element.
print(out)
out.backward()

tensor(27., grad_fn=<MeanBackward0>)


In [9]:
# The backward pass computed the gradient of `out` with respect to the leaf
# tensors of the computational graph that require gradients (here: x) and
# stored it in their .grad attribute
# Remember y = x + 2, z = y * y * 3, and out = z.mean(), so
# d(out)/dx_i = (1 / 4) * 3 * 2 * (x_i + 2) = 4.5 at x_i = 1
# Since out is a scalar and x is a (2, 2) tensor, the gradient d(out)/dx is a
# (2, 2) tensor as well

print(x)
print(x.grad)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


In [10]:
# The derivative of a non-scalar tensor with respect to another tensor is a
# Jacobian: for a (3,) tensor y (aka a vector) and a (3,) tensor x it is a
# (3, 3) tensor with entries J[i, j] = dy_i / dx_j

# x is random and no seed is set, so the printed values differ between runs
x = torch.randn(3, requires_grad=True)
y = 2 * x
# Keep doubling y until its norm reaches 1000. The norm is only needed for the
# loop test, so compute it on a detached tensor; .detach() is the supported
# replacement for the legacy .data attribute.
while y.detach().norm() < 1000:
    y = y * 2
print(y, end='\n\n')

# .backward() without arguments only works on scalars, so build the Jacobian
# one row at a time by backpropagating from each element of y
J = []
for i in range(y.shape[0]):
    # retain_graph=True keeps the graph alive for the following iterations
    y[i].backward(retain_graph=True)
    # Store a clone, because x.grad itself is zeroed in place right below
    J.append(x.grad.clone())
    # Need to zero out the gradient each time because it is accumulated by
    # default
    x.grad.zero_()
print(torch.stack(J))

tensor([1111.8179,  434.1864, 1381.8116], grad_fn=<MulBackward0>)

tensor([[1024.,    0.,    0.],
        [   0., 1024.,    0.],
        [   0.,    0., 1024.]])


We can see that the above Jacobian is proportional to the identity matrix. This makes sense because $y_i = 2^n x_i$ depends only on $x_i$ and not on any of the other elements of $\vec{x}$, so $\partial y_i/\partial x_j = 0$ for $i \neq j$.

In [11]:
# Vector-Jacobian product: passing a vector v to .backward() on the
# non-scalar y stores v^T J in x.grad, where J is the Jacobian dy/dx
# First zero the gradient left over from any earlier backward pass, since
# gradients accumulate in x.grad
x.grad.zero_()
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(v, retain_graph=True)

print(x.grad)

tensor([1.0240e+02, 1.0240e+03, 1.0240e-01])


In [12]:
# You can stop autograd from tracking history with the torch.no_grad()
# context manager: results of operations performed inside the block do not
# require gradients, even when their inputs do
print(x.requires_grad)
print((x ** 2).requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)

True
True
False


In [13]:
# Alternatively, .detach() returns a new tensor with the same values that is
# cut off from the graph and does not require gradients. It is not an
# independent copy: it shares storage with x (use .detach().clone() for one)
print(x.requires_grad)
y = x.detach()
print(y.requires_grad)
# Element-wise comparison confirms that the values are identical
print(x.eq(y).all())

True
False
tensor(True)
