In [1]:
import torch
import tensorflow as tf


def pr(*args):
    """Print ``args`` on one line, exactly as the built-in ``print`` would.

    Used in cells whose value would otherwise not be echoed by the notebook
    (for example ``None``).
    """
    print(*args)

## A Simple Example

The gradient of the function $y=2\mathbf{x}^\top\mathbf{x}$ with respect to $\mathbf{x}$ should be $4\mathbf{x}$.

In [2]:
x = torch.arange(4, dtype=torch.float32)
x

tensor([0., 1., 2., 3.])

In [3]:
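# Calling backward() at this point raises an error: x was created without
# requires_grad, so no computation graph is recorded for y.
# y = 2 * torch.dot(x,x)
# y.backward() # Error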

In [4]:
x.requires_grad

False

In [5]:
x.requires_grad_(True)

tensor([0., 1., 2., 3.], requires_grad=True)

In [6]:
pr(x.grad)

None


In [7]:
y = 2 * torch.dot(x, x)
y.backward()

In [8]:
x.grad

tensor([ 0.,  4.,  8., 12.])

In [9]:
x.grad == 4 * x

tensor([True, True, True, True])

Let us try another function of $\mathbf{x}$. 

In [10]:
# Gradients accumulate in x.grad, so clear the previous values first
x.grad.zero_()

tensor([0., 0., 0., 0.])

In [11]:
y = x.sum()
y.backward()
x.grad

tensor([1., 1., 1., 1.])

In TensorFlow, things are a little different: the forward computation has to be recorded on a `tf.GradientTape`.

In [12]:
x = tf.range(4, dtype=tf.float32)

In [13]:
x = tf.Variable(x)

In [14]:
with tf.GradientTape() as t:
    y = 2 * tf.tensordot(x, x, axes=1)

In [15]:
y

<tf.Tensor: shape=(), dtype=float32, numpy=28.0>

In [16]:
x_grad = t.gradient(y,x)
x_grad

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.,  4.,  8., 12.], dtype=float32)>

In [17]:
with tf.GradientTape() as t:
    y = tf.reduce_sum(x)
y

<tf.Tensor: shape=(), dtype=float32, numpy=6.0>

In [18]:
x_grad = t.gradient(y,x)
x_grad

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([1., 1., 1., 1.], dtype=float32)>

## Backward for Non-Scalar Variables

In [27]:
x = torch.arange(4, dtype=torch.float32, requires_grad=True)
y = x * x  # now y is not a scalar
y

tensor([0., 1., 4., 9.], grad_fn=<MulBackward0>)

In [25]:
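# y.backward() # Error: y is not a scalar, so backward() needs an explicit gradient argument
# Reducing y to a scalar first avoids the problem: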
y.sum().backward()

In [26]:
x.grad

tensor([0., 2., 4., 6.])

In [28]:
# Gradients accumulate in x.grad across backward passes, so drop the result of
# the previous pass; otherwise this cell would report 4 * x instead of 2 * x.
x.grad = None
y = x * x
# For a non-scalar y, backward() needs a vector of the same shape as y.
# Passing ones gives the same gradient as y.sum().backward().
y.backward(torch.ones(len(x)))
x.grad

tensor([0., 2., 4., 6.])

In [29]:
x = tf.range(4, dtype=tf.float32)
x = tf.Variable(x)

In [30]:
with tf.GradientTape() as t:
    y = x * x 
t.gradient(y,x)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0., 2., 4., 6.], dtype=float32)>

## Detaching Computation

In [33]:
x = torch.arange(4, dtype=torch.float32, requires_grad=True)

In [34]:
y = x * x 

In [35]:
u = y.detach()

In [36]:
z = u * x 

In [37]:
z

tensor([ 0.,  1.,  8., 27.], grad_fn=<MulBackward0>)

In [38]:
z.sum().backward()

In [39]:
x.grad

tensor([0., 1., 4., 9.])

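Here $y = x * x$ and $z = y * x$. Detaching $y$ gives $u$, which holds the same values as $y$ but is treated as a constant during backpropagation. The gradient of $z = u * x$ with respect to $x$ is therefore $u$, rather than $3 * x * x$.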

In [40]:
x.grad == u

tensor([True, True, True, True])

Since the computation of $y$ was still recorded, we can also invoke backpropagation on $y$ to get the derivative of $y = x * x$ with respect to $x$, which is $2 * x$.

In [41]:
x.grad.zero_()
y.sum().backward()
x.grad == 2 * x

tensor([True, True, True, True])

In TensorFlow

In [42]:
x = tf.range(4, dtype=tf.float32)
x = tf.Variable(x)

In [43]:
with tf.GradientTape(persistent=True) as t:
    y = x * x 
    u = tf.stop_gradient(y)
    z = u * x 
x_grad = t.gradient(z,x)
x_grad == u

<tf.Tensor: shape=(4,), dtype=bool, numpy=array([ True,  True,  True,  True])>

In [44]:
t.gradient(y,x) == 2 * x

<tf.Tensor: shape=(4,), dtype=bool, numpy=array([ True,  True,  True,  True])>

## Compute the Gradient of Python Control Flow 

In [45]:
def f(a):
    """Scale ``a`` by a factor that depends on its value (PyTorch version).

    ``a`` is doubled until its norm reaches 1000. The result is returned as is
    when its elements sum to a positive number and multiplied by 100
    otherwise. For a fixed input the output is ``k * a`` for some constant
    ``k``, so the gradient with respect to ``a`` equals ``f(a) / a``.

    Args:
        a: A torch tensor (scalar or higher-dimensional).

    Returns:
        A tensor with the same shape as ``a``.
    """
    b = a * 2
    # Doubling an all-zero tensor never raises its norm, so skip the loop in
    # that case instead of spinning forever.
    while 0 < b.norm() < 1000:
        b = b * 2
    return b if b.sum() > 0 else 100 * b

In [46]:
a = torch.randn(size=(), requires_grad=True)
a

tensor(-1.5694, requires_grad=True)

In [47]:
d = f(a)
d.backward()

In [48]:
d

tensor(-160708.2188, grad_fn=<MulBackward0>)

In [49]:
# f(a) is a constant multiple of a, so d / a recovers the slope. Compare with
# a tolerance: the 100x branch of f rounds, so exact equality can fail for
# some random draws of a.
torch.isclose(d / a, a.grad)

tensor(True)

In [50]:
def f(a):
    """Scale ``a`` by a factor that depends on its value (TensorFlow version).

    ``a`` is doubled until its norm reaches 1000. The result is returned as is
    when its elements sum to a positive number and multiplied by 100
    otherwise. For a fixed input the output is ``k * a`` for some constant
    ``k``, so the gradient with respect to ``a`` equals ``f(a) / a``.

    Args:
        a: A TensorFlow tensor or variable.

    Returns:
        A tensor with the same shape as ``a``.
    """
    b = a * 2
    # Doubling an all-zero tensor never raises its norm, so skip the loop in
    # that case instead of spinning forever.
    while 0 < tf.norm(b) < 1000:
        b = b * 2
    return b if tf.reduce_sum(b) > 0 else 100 * b

In [53]:
a = tf.Variable(tf.random.normal(shape=()))
# Only one gradient is requested from this tape, so the default non-persistent
# tape is enough; it releases its resources once gradient() has been called.
with tf.GradientTape() as t:
    d_tf = f(a)
d_grad = t.gradient(d_tf, a)
d_grad

<tf.Tensor: shape=(), dtype=float32, numpy=409600.0>

In [54]:
d_grad == d_tf / a

<tf.Tensor: shape=(), dtype=bool, numpy=True>

## Exercise

1. Why is the second derivative much more expensive to compute than the first derivative?

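    For a scalar function of $n$ variables, the first derivative is the gradient, which has $n$ entries. The second derivative is the Hessian, an $n \times n$ matrix, so computing and storing it costs $O(n^2)$.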
    
2. After running the function for backpropagation, immediately run it again and see what happens.



In [64]:
x = torch.arange(4, dtype=torch.float32, requires_grad=True)
y = x * x

In [70]:
y

tensor([0., 1., 4., 9.], grad_fn=<MulBackward0>)

In [71]:
# retain_graph=True keeps the tensors saved for the backward pass, so the same
# graph can be traversed again; every successful pass adds 2 * x to x.grad.
y.sum().backward(retain_graph=True)
y.sum().backward(retain_graph=True)
# With the default retain_graph=False those saved tensors are freed here ...
y.sum().backward()
# ... so running backward once more fails. The error is the point of the
# exercise; catch it so the notebook still runs from top to bottom.
try:
    y.sum().backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling .backward() or autograd.grad() the first time.

In [72]:
x.grad

tensor([ 0.,  6., 12., 18.])

Exercise 3. In the control flow example where we calculate the derivative of `d` with respect to `a`, what would happen if we changed the variable `a` to a random vector or matrix? At this point, the result of the calculation `f(a)` is no longer a scalar. What happens to the result? How do we analyze this?

In [73]:
def f(a):
    """Scale ``a`` by a factor that depends on its value (PyTorch version).

    Redefined here because the name ``f`` was rebound to the TensorFlow
    variant earlier in the notebook.

    ``a`` is doubled until its norm reaches 1000. The result is returned as is
    when its elements sum to a positive number and multiplied by 100
    otherwise. Every element is scaled by the same constant, so for a vector
    or matrix input the gradient of ``f(a).sum()`` is that constant in every
    position.

    Args:
        a: A torch tensor (scalar, vector or matrix).

    Returns:
        A tensor with the same shape as ``a``.
    """
    b = a * 2
    # Doubling an all-zero tensor never raises its norm, so skip the loop in
    # that case instead of spinning forever.
    while 0 < b.norm() < 1000:
        b = b * 2
    return b if b.sum() > 0 else 100 * b

In [74]:
a = torch.randn(size=(2,2), requires_grad=True)
d = f(a)

In [75]:
d

tensor([[ -248.2693,  1364.9205],
        [-1133.4603,   396.8123]], grad_fn=<MulBackward0>)

In [76]:
d.sum().backward()

In [77]:
a.grad

tensor([[2048., 2048.],
        [2048., 2048.]])

In [78]:
d / a

tensor([[2048., 2048.],
        [2048., 2048.]], grad_fn=<DivBackward0>)