### 层和块

回顾多层感知机，以下是简洁实现，`nn.Sequential()` 定义了一种特殊的 `Module`

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

# Two fully connected layers (20 -> 256 -> 10) with a ReLU in between.
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
# Random input: 2 samples with 20 features each.
X = torch.rand(2, 20)
net(X)

tensor([[ 0.0041, -0.0812,  0.0519, -0.0582,  0.0513, -0.1538, -0.0394, -0.0293,
          0.0650, -0.1670],
        [ 0.0208, -0.0116,  0.0035, -0.2275, -0.0389, -0.3061,  0.1036, -0.0503,
          0.0294, -0.1793]], grad_fn=<AddmmBackward0>)

自定义多层感知机，`MLP` 类继承 `nn.Module`

In [12]:
class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer: 20 -> 256 -> 10."""

    def __init__(self):
        # Run nn.Module's constructor first so that the layers assigned below
        # are registered as submodules of this block.
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Apply the hidden layer, a ReLU, and then the output layer to ``X``."""
        hidden_activation = F.relu(self.hidden(X))
        return self.out(hidden_activation)

In [15]:
net = MLP()
net(X) # calling the module runs the forward() defined above; the call syntax itself comes from nn.Module

tensor([[ 0.0816,  0.1456, -0.0858,  0.2108,  0.1235, -0.0143,  0.1002, -0.0445,
         -0.1516, -0.0812],
        [ 0.0262,  0.0969, -0.1211,  0.1634,  0.2465,  0.0334,  0.1324, -0.0690,
         -0.2328, -0.1449]], grad_fn=<AddmmBackward0>)

通过继承 `nn.Module` 并重写 `__init__()`、`forward()` 函数，可以做很多事情

In [19]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: Modules to chain, in the order they should be applied.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, block in enumerate(args):
            # Submodule names must be strings: nn.Module builds dotted
            # parameter names ("0.weight", ...) from them. Keying the registry
            # by the module object itself makes parameters()/state_dict()
            # fail and collapses a layer that is passed twice into one entry.
            self.add_module(str(idx), block)

    def forward(self, X):
        """Feed ``X`` through every registered module, in insertion order."""
        for block in self._modules.values():
            # The output of one block becomes the input of the next one.
            X = block(X)
        return X

# Same 20 -> 256 -> 10 architecture as the nn.Sequential example, built with
# the custom container instead.
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[-0.2109,  0.1698, -0.1485,  0.0298,  0.0496,  0.0377, -0.1035, -0.2001,
         -0.0377,  0.1846],
        [-0.1345,  0.1248, -0.0747,  0.1307,  0.0918, -0.0191, -0.0886, -0.2544,
          0.1496,  0.0375]], grad_fn=<AddmmBackward0>)

下面也是继承并重写 `nn.Module` 中函数的案例，并没有什么意义，只是说可以继承并做很多事

In [22]:
class MyFunction(nn.Module):
    """Toy block showing that ``forward`` may contain arbitrary Python logic.

    It mixes a trainable linear layer (used twice, so its parameters are
    shared), a constant random weight matrix and data-dependent control flow.
    The computation has no practical meaning.
    """

    def __init__(self):
        super().__init__()
        # Constant random weight that is never trained. Registering it as a
        # buffer (instead of a plain tensor attribute) lets it follow the
        # module through .to()/.cuda()/.double(); persistent=False keeps the
        # state_dict keys exactly as they were before.
        self.register_buffer('weight', torch.rand((20, 20)), persistent=False)
        self.liner = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar tensor computed from ``X`` (expects 20 features per row)."""
        X = self.liner(X)
        # Multiply by the constant weight, shift by 1 and apply ReLU.
        X = F.relu(torch.mm(X, self.weight) + 1)
        # Reuse the same linear layer a second time.
        X = self.liner(X)
        # Halve the values until the sum of absolute values is at most 1.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

net = MyFunction()
net(X)  # yields a scalar tensor: forward() ends with X.sum()

tensor(0.1029, grad_fn=<SumBackward0>)

可以各种嵌套使用，只需要保证层次之间的矩阵乘法能顺利进行

In [26]:
class NestNLP(nn.Module):
    """Block that nests an ``nn.Sequential`` inside a custom module.

    The inner network maps 20 -> 128 -> 10 features with a ReLU after each
    linear layer; a final linear layer maps the 10 features back to 20.
    """

    def __init__(self):
        super().__init__()
        inner_layers = [
            nn.Linear(20, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*inner_layers)
        self.linear = nn.Linear(10, 20)

    def forward(self, X):
        """Run ``X`` through the inner network, then through the final linear layer."""
        features = self.net(X)
        return self.linear(features)

# Custom blocks and built-in layers can be mixed freely: NestNLP outputs 20
# features, which both the Linear(20, 20) and the MyFunction stage accept.
my_net = nn.Sequential(NestNLP(), nn.Linear(20, 20), MyFunction())
my_net(X)

tensor(-0.1178, grad_fn=<SumBackward0>)

### 参数管理

#### 获取参数

先看单隐藏层的多层感知机

In [28]:
# MLP with a single hidden layer: 4 inputs -> 5 hidden units -> 1 output.
net = nn.Sequential(nn.Linear(4, 5), nn.ReLU(), nn.Linear(5, 1))
X = torch.rand(2, 4)  # 2 samples with 4 features each
net(X)

tensor([[-0.3717],
        [-0.3441]], grad_fn=<AddmmBackward0>)

In [45]:
print(net[2].state_dict()) # state_dict holds the layer's weight and bias
print(type(net[2].weight))
print(net[2].weight)
print(net[2].weight.data) # .data accesses the values, .grad accesses the gradient
print(net[2].weight.grad) # None in the recorded output, presumably because no backward pass has been run yet

OrderedDict([('weight', tensor([[ 0.1132, -0.1537, -0.4112, -0.3942, -0.0061]])), ('bias', tensor([-0.2289]))])
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([[ 0.1132, -0.1537, -0.4112, -0.3942, -0.0061]], requires_grad=True)
tensor([[ 0.1132, -0.1537, -0.4112, -0.3942, -0.0061]])
None


用 `named_parameters` 拿出层次参数

In [101]:
print(*[(name, param.shape) for name, param in net[0].named_parameters()]) # * unpacks the list so that each (name, shape) tuple is passed to print as its own argument
print(*[(name, param.shape) for name, param in net.named_parameters()]) # on the whole net the names carry the layer index as a prefix, e.g. '0.weight'

('weight', torch.Size([5, 4])) ('bias', torch.Size([5]))
('0.weight', torch.Size([5, 4])) ('0.bias', torch.Size([5])) ('2.weight', torch.Size([1, 5])) ('2.bias', torch.Size([1]))


In [52]:
# Parameters can also be looked up by name through the state dict.
net.state_dict()['0.bias'].data

tensor([ 0.2192, -0.2636, -0.1713,  0.0926, -0.2413])

从嵌套块收集参数

In [93]:
def block1():
    """Build a small three-layer block: Linear(2, 6) -> ReLU -> Linear(6, 2)."""
    return nn.Sequential(
        nn.Linear(2, 6),
        nn.ReLU(),
        nn.Linear(6, 2),
    )

def block2():
    """Stack four independent ``block1`` instances in one ``nn.Sequential``.

    The sub-blocks are registered under the names ``block-0`` .. ``block-3``.
    Each one contains three layers, so the result holds 12 layers in total.
    """
    stacked = nn.Sequential()
    names = [f'block-{idx}' for idx in range(4)]
    for name in names:
        # A fresh block1() per iteration, so no parameters are shared.
        stacked.add_module(name, block1())
    return stacked

# Outer network: the nested block2 followed by a Linear(2, 4) output layer.
rgnet = nn.Sequential(block2(), nn.Linear(2, 4))
X = torch.rand(4, 2)  # 4 samples with 2 features each
rgnet(X)

tensor([[ 0.2898, -0.3428, -0.7995,  0.6197],
        [ 0.2898, -0.3428, -0.7995,  0.6197],
        [ 0.2898, -0.3428, -0.7995,  0.6197],
        [ 0.2898, -0.3428, -0.7995,  0.6197]], grad_fn=<AddmmBackward0>)

`rgnet` 有两层，第一层 `block2` 有 4 层 `block1`，`block1` 自身有 3 层，`rgnet` 第二层就是一个全连接层

In [73]:
# Printing a module shows its nested structure and the name of every submodule.
print(rgnet)

Sequential(
  (0): Sequential(
    (block-0): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
    (block-1): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
    (block-2): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
    (block-3): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
  )
  (1): Linear(in_features=2, out_features=4, bias=True)
)


#### 初始化参数

`nn.init` 就是自带的初始化方法

In [87]:
def init_normal(m):
    """Initialize a linear layer with N(0, 0.01^2) weights and a zero bias.

    Meant to be passed to ``net.apply``. Modules that are not ``nn.Linear``
    (or a subclass of it) are left untouched.

    Args:
        m: The module to initialize.
    """
    if not isinstance(m, nn.Linear):
        return
    # nn.init functions with a trailing underscore modify the tensor in place.
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # nn.Linear(..., bias=False) has bias set to None; skip it instead of crashing.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

net.apply(init_normal) # apply runs init_normal on every layer of net
net[2].weight.data[0], net[2].bias.data[0]

(tensor([ 0.0008, -0.0077,  0.0060,  0.0084, -0.0048]), tensor(0.))

In [91]:
def init_constant(m):
    """Initialize a linear layer with all weights set to 1 and a zero bias.

    Meant to be passed to ``net.apply``. Modules that are not ``nn.Linear``
    (or a subclass of it) are left untouched.

    Args:
        m: The module to initialize.
    """
    if not isinstance(m, nn.Linear):
        return
    # constant_ fills the tensor in place with one fixed value.
    nn.init.constant_(m.weight, 1)
    # nn.Linear(..., bias=False) has bias set to None; skip it instead of crashing.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

net.apply(init_constant)  # re-initialize every layer of net with init_constant
net.state_dict()

OrderedDict([('0.weight',
              tensor([[1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.]])),
             ('0.bias', tensor([0., 0., 0., 0., 0.])),
             ('2.weight', tensor([[1., 1., 1., 1., 1.]])),
             ('2.bias', tensor([0.]))])

`xavier` 初始化

In [92]:
def xavier(m):
    """Apply Xavier (Glorot) uniform initialization to a linear layer's weight.

    Meant to be passed to ``net.apply``. Only the weight is touched; the bias
    keeps its current values. Modules that are not ``nn.Linear`` (or a
    subclass of it) are left untouched.

    Args:
        m: The module to initialize.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

net.apply(xavier)  # re-initialize the weight of every linear layer in net
net.state_dict()

OrderedDict([('0.weight',
              tensor([[-0.6852, -0.5216, -0.4039, -0.2840],
                      [ 0.4072, -0.4805, -0.2878, -0.5593],
                      [ 0.5416,  0.2844,  0.1964, -0.7874],
                      [ 0.1614, -0.5096,  0.1630, -0.8154],
                      [-0.6718, -0.0660,  0.5819, -0.6630]])),
             ('0.bias', tensor([0., 0., 0., 0., 0.])),
             ('2.weight',
              tensor([[ 0.0696, -0.5404, -0.4006, -0.8577,  0.6255]])),
             ('2.bias', tensor([0.]))])

初始化参数也能自定义

In [105]:
def my_init(m):
    """Custom initializer: U(-10, 10) weights with small entries zeroed out.

    Weights are drawn uniformly from [-10, 10]; every entry whose absolute
    value is below 5 is then set to 0. Meant to be passed to ``net.apply``.
    Modules that are not ``nn.Linear`` (or a subclass of it) are left
    untouched.

    Args:
        m: The module to initialize.
    """
    if not isinstance(m, nn.Linear):
        return
    # Report the first registered parameter (the weight) and its shape.
    name, param = next(iter(m.named_parameters()))
    print("Init", name, param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Keep only the weights with |w| >= 5. The in-place edit runs under
    # no_grad so that autograd does not record it.
    with torch.no_grad():
        m.weight.mul_(m.weight.abs() >= 5)

net.apply(my_init)  # prints one "Init ..." line per linear layer (see the output)
net[0].weight

Init weight torch.Size([5, 4])
Init weight torch.Size([1, 5])


Parameter containing:
tensor([[-0.0000,  5.6333, -5.5193, -0.0000],
        [-0.0000, -9.2136,  9.0162,  0.0000],
        [-0.0000,  5.2770, -7.2623,  7.9502],
        [-0.0000,  7.6082,  0.0000, -5.1386],
        [-0.0000, -5.6072,  7.0920, -9.6170]], requires_grad=True)

也可以直接更改指定层次的信息

In [110]:
# Parameter values can be edited directly through .data.
# NOTE: `+= 1` is not idempotent; every re-run of this cell adds another 1.
# The recorded output looks like it came from more than one run (entries that
# were 0 after my_init show up as 3) - TODO confirm.
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 85
net[0].weight.data

tensor([[85.0000,  8.6333, -2.5193,  3.0000],
        [ 3.0000, -6.2136, 12.0162,  3.0000],
        [ 3.0000,  8.2770, -4.2623, 10.9502],
        [ 3.0000, 10.6082,  3.0000, -2.1386],
        [ 3.0000, -2.6072, 10.0920, -6.6170]])

参数绑定，就是在初始化的时候指定一个共享层

In [113]:
# Parameter tying: the very same Linear(8, 8) instance is placed at
# positions 2 and 4 of the network.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(2, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1))
print(net[2].weight.data == net[4].weight.data)  # all True: both positions hold the same layer object
X = torch.rand(8, 2)  # 8 samples with 2 features each
net(X)

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])


tensor([[-0.0281],
        [-0.0270],
        [-0.0375],
        [-0.0435],
        [-0.0469],
        [-0.0329],
        [-0.0568],
        [-0.0479]], grad_fn=<AddmmBackward0>)

### 自定义层

自定义一个没有任何参数的层。实际上不管是层还是 `Sequential`，都继承自 `nn.Module`，通常需要重写 `__init__` 和 `forward` 两个函数

比如 `MyLayer` 可以作为一个层次加入 `Sequential` 中

In [116]:
class MyLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of its input."""

    # No __init__ override is needed here: the layer defines no parameters or
    # sublayers of its own, so nn.Module's constructor is sufficient.

    def forward(self, X):
        """Return ``X`` shifted so that its overall mean becomes zero."""
        mean = X.mean()
        return X - mean

net = MyLayer()
X = torch.FloatTensor([1, 2, 3, 4, 5])  # the mean of these values is 3
net(X)

tensor([-2., -1.,  0.,  1.,  2.])

In [132]:
class MyLinear(nn.Module):
    """Hand-written fully connected layer computing ``X @ weight + bias``.

    Args:
        in_unit: Number of input features.
        out_unit: Number of output features.
    """

    def __init__(self, in_unit, out_unit):
        super().__init__()
        # randn samples from the standard normal distribution (rand would
        # sample uniformly from [0, 1)).
        self.weight = nn.Parameter(torch.randn(in_unit, out_unit))
        # Shape (out_unit,) makes this a vector rather than a scalar.
        self.bias = nn.Parameter(torch.randn(out_unit,))

    def forward(self, X):
        """Return the affine transform of ``X`` (last dimension = ``in_unit``)."""
        # Use the Parameters themselves rather than ``.data``: ``.data`` is
        # detached from autograd, so no gradient would ever reach weight or
        # bias and the layer could not be trained. matmul also accepts inputs
        # with extra leading dimensions, which mm rejects.
        return torch.matmul(X, self.weight) + self.bias

# Custom layers can be combined with built-in ones inside nn.Sequential.
net = nn.Sequential(MyLinear(2, 4), nn.ReLU(), nn.Linear(4, 1))
net[0].weight.data

tensor([[ 0.0033, -0.2587, -1.1727, -0.1707],
        [-1.6400,  3.1341,  1.6217, -1.6836]])