Define the Model
---
Also register the forward and backward hook  
via https://zhuanlan.zhihu.com/p/75054200

In [1]:
import torch
from torch import nn

# 首先我们定义一个模型
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(3, 4)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(4, 1)
        self.initialize()
    
    # 为了方便验证，我们将指定特殊的weight和bias
    def initialize(self):
        self.fc1.weight = torch.nn.Parameter(torch.Tensor([[1., 2., 3.],
                                                           [-4., -5., -6.],
                                                           [7., 8., 9.],
                                                           [-10., -11., -12.]]))
        self.fc1.bias = torch.nn.Parameter(torch.Tensor([1.0, 2.0, 3.0, 4.0]))
        self.fc2.weight = torch.nn.Parameter(torch.Tensor([[1.0, 2.0, 3.0, 4.0]]))
        self.fc2.bias = torch.nn.Parameter(torch.Tensor([2.0]))

    def forward(self, x):
        o = self.fc1(x)
        o = self.relu1(o)
        o = self.fc2(o)
        return o

# 全局变量，用于存储中间层的 feature
total_feat_out = []
total_feat_in = []
total_grad_in = []
total_grad_out = []

# 定义 forward hook function
def hook_fn_forward(module, input, output):
    print(module) # 用于区分layer
    print('input', input) # 首先打印出来
    print('output', output)
    print()
    total_feat_out.append(output) # 然后分别存入全局 list 中
    total_feat_in.append(input)
    
def hook_fn_backward(module, grad_input, grad_output):
    print(module) # 为了区分layer
    print('grad_input', grad_output)  # 为了符合反向传播的顺序，我们先打印 grad_output
    print('grad_output', grad_input)  # 再打印 grad_input
    print()
    # 保存到全局变量
    total_grad_in.append(grad_input)
    total_grad_out.append(grad_output)

model = Model()
modules = model.named_children()

# register forward and backward hook for all the layers.
for name, module in modules:
    module.register_forward_hook(hook_fn_forward)
    module.register_backward_hook(hook_fn_backward)

Forward/Backward Hook
---

In [2]:
# 注意下面代码中 x 的维度，对于linear module，输入一定是大于等于二维的
# （第一维是 batch size）。在 forward hook 中看不出来，但是 backward hook 中，
# 得到的梯度完全不对。
# 有一篇 hook 的教程就是这里出了错，作者还强行解释
x = torch.Tensor([[1.0, 1.0, 1.0]]).requires_grad_() 
o = model(x)
o.backward()

Linear(in_features=3, out_features=4, bias=True)
input (tensor([[1., 1., 1.]], requires_grad=True),)
output tensor([[  7., -13.,  27., -29.]], grad_fn=<AddmmBackward>)

ReLU()
input (tensor([[  7., -13.,  27., -29.]], grad_fn=<AddmmBackward>),)
output tensor([[ 7.,  0., 27.,  0.]], grad_fn=<ReluBackward0>)

Linear(in_features=4, out_features=1, bias=True)
input (tensor([[ 7.,  0., 27.,  0.]], grad_fn=<ReluBackward0>),)
output tensor([[90.]], grad_fn=<AddmmBackward>)

Linear(in_features=4, out_features=1, bias=True)
grad_input (tensor([[1.]]),)
grad_output (tensor([1.]), tensor([[1., 2., 3., 4.]]), tensor([[ 7.],
        [ 0.],
        [27.],
        [ 0.]]))

ReLU()
grad_input (tensor([[1., 2., 3., 4.]]),)
grad_output (tensor([[1., 0., 3., 0.]]),)

Linear(in_features=3, out_features=4, bias=True)
grad_input (tensor([[1., 0., 3., 0.]]),)
grad_output (tensor([1., 0., 3., 0.]), tensor([[22., 26., 30.]]), tensor([[1., 0., 3., 0.],
        [1., 0., 3., 0.],
        [1., 0., 3., 0.]]))

