In [1]:
from IPython.display import Image

In [2]:
import torch
from torch import nn
from torch.autograd import Variable

In [3]:
torch.manual_seed(42)

<torch._C.Generator at 0x749bd40c6cf0>

- 反向传播，链式法则，内部非叶子节点（non-leaf node，哪怕 requires_grad 为 true，且其存在 grad_fn）也是会算梯度的，只是用完就置空了，
    - 因此如果想查看内部非叶子节点的 grad，需要在反向传播之前对该节点调用 `retain_grad()`（注意不是 `retain_graph`：后者的作用是保留计算图，以便对同一张图再次 backward）;
- 深度神经网络中的中间层 layer 的参数（weights & bias）它们是内部节点呢，还是叶子节点呢？
    - 是叶子节点；
- 不要轻易地关闭 warnings，有助于排查/定位问题；
    - warnings 不会导致程序崩溃（dump），但不推荐忽略它们，因为 warning 往往意味着程序的运行可能不符合预期；
    - 对于自己写的代码，出于健壮性或者可快速定位问题的考虑，也可以尝试多写 warnings

## multi head (output/branch) architecture

- https://www.bilibili.com/video/BV1o24y1b7tk

In [4]:
Image(url='../imgs/multi_loss.PNG', width=100)

In [5]:
# Two scalar outputs (d and e) are computed from the same graph a -> b -> c.
# `requires_grad=True` on the factory call replaces the deprecated Variable wrapper.
a = torch.rand(1, 4, requires_grad=True)
b = a**2
c = b*2

d = c.mean()
e = c.sum()

# The first backward pass frees the saved intermediate values of the shared graph.
d.backward()

# The second pass through the already-freed graph raises
# "RuntimeError: Trying to backward through the graph a second time".
# The error is the point of this cell, so catch and display it rather than
# letting it abort a Restart & Run All.
try:
    e.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [6]:
# Same two-output graph as above, built on a fresh leaf tensor.
# `requires_grad=True` on the factory call replaces the deprecated Variable wrapper.
a = torch.rand(1, 4, requires_grad=True)
b = a**2
c = b*2

d = c.mean()
e = c.sum()

# Keep the saved intermediate values alive so the shared graph can be
# traversed a second time.
d.backward(retain_graph=True)

# Second pass over the same graph; its gradient is accumulated into a.grad
# on top of the one produced by d.backward().
e.backward()

$$
\begin{split}
&b_i=a_i^2\\
&c_i=2b_i=2a_i^2\\
&d=\frac{\sum_ic_i}4=\frac{\sum_i 2a_i^2}4\\
&e=\sum_i c_i=\sum_i 2a_i^2
\end{split}
$$

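$$
\begin{split}
&\frac{\partial d}{\partial a_i}=a_i\\
&\frac{\partial e}{\partial a_i}=4a_i
\end{split}
$$

- 两次 `backward()` 的梯度会累加到 `a.grad` 上，因此 $a.\mathrm{grad}_i=\frac{\partial d}{\partial a_i}+\frac{\partial e}{\partial a_i}=a_i+4a_i=5a_i$（下面用 `5*a` 验证）。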

In [7]:
a

tensor([[0.3904, 0.6009, 0.2566, 0.7936]], requires_grad=True)

In [8]:
a.grad

tensor([[1.9522, 3.0045, 1.2829, 3.9682]])

In [9]:
5*a

tensor([[1.9522, 3.0045, 1.2829, 3.9682]], grad_fn=<MulBackward0>)

- suppose you first back-propagate loss1, then loss2 (you can also do the reverse)

```
l1.backward(retain_graph=True)
l2.backward() # now the graph is freed, and next process of batch gradient descent is ready

optimizer.step() # update the network parameters

```

## non-leaf node

In [10]:
# `a` is the only leaf of this graph; b, c and d are produced by operations
# on it and are therefore non-leaf tensors.
# `requires_grad=True` on the factory call replaces the deprecated Variable wrapper.
a = torch.rand(1, 4, requires_grad=True)
b = a**2
c = b*2

d = c.mean()

In [11]:
d.backward()

In [12]:
b.grad

  b.grad


In [13]:
b

tensor([[0.8851, 0.0177, 0.8735, 0.3523]], grad_fn=<PowBackward0>)

In [14]:
b.is_leaf

False

In [15]:
# `requires_grad=True` on the factory call replaces the deprecated Variable wrapper.
a = torch.rand(1, 4, requires_grad=True)
b = a**2
# Ask autograd to keep the gradient of this non-leaf tensor; this must be
# requested before backward() runs.
b.retain_grad()
c = b*2

d = c.mean()
d.backward()
# d = sum(2 * b_i) / 4, so every entry of b.grad is 0.5.
b.grad

tensor([[0.5000, 0.5000, 0.5000, 0.5000]])

$$
\begin{split}
&d = \frac{\sum_i c_i}{4}=\frac{\sum_i 2b_i}{4}=\frac{\sum_i b_i}2\\
&\frac{\partial d}{\partial b_i}=\frac12
\end{split}
$$

### nn 中间层的weights 其实也是 leaf node

In [16]:
class MLP(nn.Module):
    """Fully connected network with layer sizes 28*28 -> 512 -> 512 -> 10.

    The learnable parameters live in ``fc1``, ``fc2`` and ``fc3``. The
    flatten and ReLU modules hold no parameters, so the ``state_dict`` keys
    are exactly those of the three linear layers.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 10)
        # Instantiate the stateless layers once. The original code called
        # nn.Flatten(x) / nn.ReLU(x) inside forward(), which constructs a new
        # module instead of applying the operation to the tensor.
        self.flatten = nn.Flatten()
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of inputs to 10 output scores per sample.

        Args:
            x: tensor whose first dimension is the batch and whose remaining
                dimensions hold 28*28 elements per sample
                (assumed, e.g. (batch, 1, 28, 28) -- confirm against caller).

        Returns:
            Tensor of shape (batch, 10).
        """
        # nn.Flatten keeps dim 0 and collapses the rest into one dimension.
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [17]:
mlp = MLP()

In [18]:
mlp.fc1.weight.is_leaf

True

In [19]:
mlp.fc2.weight.is_leaf

True

In [20]:
mlp.fc3.weight.is_leaf

True