In [74]:
import torch

In [75]:
# Input vector [0., 1., 2., 3.]; the later cells differentiate with respect to it.
x = torch.arange(4.0)

In [76]:
x

tensor([0., 1., 2., 3.])

Scalar-valued function with respect to a vector $\mathbf{x}$:
$$f: \mathbb{R}^n \to \mathbb{R}$$

위 Scalar-valued function의 Gradient는 $\mathbf{x}$와 동일한 모양을 가지는 Vector임. 즉,

$$
\nabla f(\mathbf{x}) = \left( \frac{\partial f}{\partial x_1}, \frac{\partial f}{\partial x_2}, \dots, \frac{\partial f}{\partial x_n} \right)^T
$$

$$
\nabla f(\mathbf{x}) \in \mathbb{R}^n
$$

In [77]:
# Enable gradient tracking on x in place; the echoed tensor now reports
# requires_grad=True.
x.requires_grad_(True)

tensor([0., 1., 2., 3.], requires_grad=True)

In [78]:
# Nothing is displayed for this cell: backward() has not been called yet, so
# no gradient is stored on x at this point (presumably x.grad is None).
x.grad

In [79]:
# Scalar function y = 2 * (x . x); for x = [0, 1, 2, 3] this evaluates to 28.
y = 2 * torch.dot(x, x)

In [80]:
y

tensor(28., grad_fn=<MulBackward0>)

In [81]:
# Back-propagate from the scalar y; this is what populates x.grad with dy/dx.
y.backward()

In [82]:
x.grad

tensor([ 0.,  4.,  8., 12.])

In [83]:
# Element-wise check against the analytic gradient: d(2 * x.x)/dx = 4x.
x.grad == 4 * x

tensor([True, True, True, True])

In [84]:
# Clear the stored gradient in place before the next example. backward() adds
# to x.grad rather than overwriting it, so skipping this would mix results.
x.grad.zero_()

tensor([0., 0., 0., 0.])

In [88]:
x

tensor([0., 1., 2., 3.], requires_grad=True)

$$y = x_1 + x_2 + \dots + x_n$$

> $y$를 각 $x_i$에 대해 편미분하면, 각 $x_i$는 계수 1로 한 번씩만 더해지므로 $\partial y / \partial x_i = 1$임.
>
> 그러므로, Gradient는 모든 편미분 값이 1인 Vector가 되고, 다음과 같음.

$$\nabla_{\mathbf{x}}\, y = \begin{bmatrix} 1 \\ 1 \\ \vdots \\ 1\end{bmatrix} \in \mathbb{R}^n$$

In [85]:
# y = x_1 + x_2 + ... + x_n, i.e. a scalar that depends linearly on every entry.
y = x.sum()

In [86]:
# Back-propagate through the sum; every partial derivative of a plain sum is 1.
y.backward()

In [87]:
x.grad

tensor([1., 1., 1., 1.])

# Backward for Non-Scalar Variables

In [93]:
x

tensor([0., 1., 2., 3.], requires_grad=True)

In [89]:
# Reset the accumulated gradient so the non-scalar example starts from zeros.
x.grad.zero_()

tensor([0., 0., 0., 0.])

In [90]:
# Element-wise square: y is a vector here, not a scalar, so calling backward()
# on it requires an explicit upstream gradient.
y = x * x

In [91]:
# y is a vector, so backward() needs an explicit upstream gradient. A tensor of
# ones makes this equivalent to y.sum().backward(). ones_like matches y's
# shape, dtype and device instead of assuming a 1-D default-dtype CPU tensor.
y.backward(gradient=torch.ones_like(y))

In [92]:
x.grad

tensor([0., 2., 4., 6.])

# Detaching Computation

In [94]:
# Reset the accumulated gradient before the detach example.
x.grad.zero_()

tensor([0., 0., 0., 0.])

In [95]:
# Rebuild y = x * x so a fresh graph from x to y exists for this section.
y = x * x

In [96]:
# u carries the same values as y but is cut off from the autograd graph, so
# later operations treat it as a constant with respect to x.
u = y.detach()

In [97]:
# Numerically z = x^3, but only the explicit factor x is differentiable here.
z = u * x

In [98]:
# Reduce to a scalar and back-propagate. Because u is detached, dz/dx = u
# (that is, x^2), not the 3x^2 a fully tracked x^3 would give.
z.sum().backward()

In [100]:
x.grad

tensor([0., 1., 4., 9.])

In [99]:
# Confirms the gradient stopped at u: x.grad equals u element by element.
x.grad == u

tensor([True, True, True, True])

In [102]:
# Reset the accumulated gradient before differentiating y itself.
x.grad.zero_()

tensor([0., 0., 0., 0.])

In [103]:
# detach() returned a new tensor and left y attached to the graph, so
# back-propagating through y still works and yields d(x * x)/dx = 2x.
y.sum().backward()

In [105]:
x.grad

tensor([0., 2., 4., 6.])

In [104]:
# Element-wise check against the analytic gradient of x * x, which is 2x.
x.grad == 2 * x

tensor([True, True, True, True])

# Gradients and Python Control Flow

In [106]:
def f(a):
    """Scale ``a`` by a constant that is chosen through Python control flow.

    ``a`` is doubled until the norm of the result is at least 1000. The result
    is returned unchanged when its sum is positive and multiplied by 100
    otherwise. For any fixed input the output is therefore ``k * a`` for some
    constant ``k``, which makes the gradient easy to check.

    Args:
        a: Input tensor (the notebook passes a 0-dimensional tensor).

    Returns:
        A tensor with the same shape as ``a``, produced only by
        multiplications, so it stays attached to ``a``'s autograd graph.

    Raises:
        ValueError: If ``a`` has no non-zero element. Doubling zero never
            changes the norm, so the loop below would otherwise never end.
    """
    b = a * 2

    # Guard against the one kind of input that can never reach the threshold.
    if not bool((b != 0).any()):
        raise ValueError(
            "f(a) requires at least one non-zero element: "
            "doubling zero never reaches norm 1000"
        )

    # The number of iterations depends on the value of a, not just its shape.
    while b.norm() < 1000:
        b = b * 2

    # The branch taken also depends on the value, so the recorded graph can
    # differ from one call to the next.
    if b.sum() > 0:
        c = b
    else:
        c = 100 * b

    return c

In [107]:
# 0-dimensional (scalar) random input with gradient tracking enabled.
# NOTE(review): no seed is set in the visible cells, so the values recorded
# below for a, d and a.grad will differ on every run.
a = torch.randn(size=(), requires_grad=True)

In [108]:
# Run the control-flow function; autograd records whichever path was taken
# for this particular value of a.
d = f(a)

In [109]:
# d is a scalar, so backward() needs no explicit gradient argument.
d.backward()

In [112]:
d

tensor(-112743.8594, grad_fn=<MulBackward0>)

In [113]:
a

tensor(-0.2753, requires_grad=True)

In [111]:
a.grad

tensor(409600.)

In [110]:
# For a fixed input, f only multiplies a by constants (factors of 2 and
# possibly 100), so d = k * a and the gradient k should equal d / a.
# NOTE(review): this is an exact floating-point comparison; it holds for the
# recorded run but a tolerance-based check would be safer in general.
a.grad == d / a

tensor(True)