### 层和块

回顾多层感知机，以下是简洁实现，`nn.Sequential()` 定义了一种特殊的 `Module`

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

# Two-layer perceptron built with nn.Sequential: 20 -> 256 -> ReLU -> 10.
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
# Random batch: 2 samples with 20 features each.
X = torch.rand(2, 20)
net(X)

tensor([[ 0.0748,  0.2424, -0.0090, -0.1285,  0.2409, -0.0181,  0.0524, -0.0808,
          0.0484, -0.2322],
        [ 0.0220,  0.3030, -0.0059, -0.0274,  0.0983,  0.0382,  0.1165, -0.0497,
         -0.1206, -0.1737]], grad_fn=<AddmmBackward0>)

自定义多层感知机，`MLP` 类继承 `nn.Module`

In [2]:
class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer: 20 -> 256 -> ReLU -> 10."""

    def __init__(self):
        # Run nn.Module's own initialiser first so that the layers assigned
        # below are registered as sub-modules of this block.
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X):
        """Return the output-layer result for the input batch ``X``."""
        hidden_act = F.relu(self.hidden(X))
        return self.out(hidden_act)

In [3]:
net = MLP()
net(X)  # calling the module goes through nn.Module.__call__, which runs forward()

tensor([[ 0.0450, -0.3381,  0.0402, -0.1064,  0.2222, -0.2932, -0.0245, -0.0086,
         -0.2098, -0.2456],
        [ 0.0506, -0.3827, -0.0396, -0.2066, -0.0025, -0.1894,  0.0189, -0.0350,
         -0.0803, -0.1245]], grad_fn=<AddmmBackward0>)

通过继承 `nn.Module` 并重写 `__init__()`、`forward()` 两个方法，可以实现各种自定义的网络结构

In [4]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    The blocks passed to the constructor are registered as sub-modules under
    the string names ``"0"``, ``"1"``, ... and are applied in that order by
    ``forward``.

    Args:
        *args: the ``nn.Module`` instances to chain, in execution order.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, block in enumerate(args):
            # Sub-module names must be strings: state_dict(),
            # named_parameters() and repr() build dotted names such as
            # "0.weight" from them. Positional names also keep a block that
            # is passed twice (a shared layer) as two separate entries, so it
            # is applied twice instead of being collapsed into one dict key.
            self.add_module(str(idx), block)

    def forward(self, X):
        """Feed ``X`` through every registered block in insertion order."""
        for block in self._modules.values():
            # The output of one block becomes the input of the next.
            X = block(X)
        return X

# Same architecture as the nn.Sequential example, built with MySequential.
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[ 0.1542,  0.0193,  0.0948, -0.2359, -0.1331, -0.2414,  0.1139,  0.2610,
          0.0996,  0.0805],
        [ 0.0840, -0.0317,  0.0824, -0.1600, -0.1869, -0.1273,  0.0793,  0.1774,
          0.0734,  0.0447]], grad_fn=<AddmmBackward0>)

下面也是继承并重写 `nn.Module` 中函数的案例，并没有什么意义，只是说可以继承并做很多事

In [5]:
class MyFunction(nn.Module):
    """Toy block showing that ``forward`` may contain arbitrary tensor code."""

    def __init__(self):
        super().__init__()
        # Fixed random 20x20 matrix. It is a plain tensor created with
        # requires_grad=False, not an nn.Parameter.
        self.weight = torch.rand((20, 20), requires_grad=False)
        self.liner = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar computed from ``X`` (last dimension must be 20)."""
        hidden = self.liner(X)
        hidden = F.relu(torch.mm(hidden, self.weight) + 1)
        # The same linear layer is applied a second time.
        out = self.liner(hidden)
        # Halve in place until the sum of absolute values is at most 1.
        while out.abs().sum() > 1:
            out /= 2
        return out.sum()

# X is still the (2, 20) batch created earlier; the result is a scalar tensor.
net = MyFunction()
net(X)

tensor(0.3490, grad_fn=<SumBackward0>)

可以各种嵌套使用，只需要保证层次之间的矩阵乘法能顺利进行

In [6]:
class NestNLP(nn.Module):
    """Block that nests an ``nn.Sequential`` inside a custom module.

    Shape flow along the last dimension: 20 -> 128 -> 10 -> 20.
    """

    def __init__(self):
        super().__init__()
        inner_layers = [
            nn.Linear(20, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*inner_layers)
        self.linear = nn.Linear(10, 20)

    def forward(self, X):
        """Run the nested Sequential, then the final linear layer."""
        features = self.net(X)
        return self.linear(features)

# Custom blocks and built-in layers can be chained freely as long as each
# block's output width matches the next block's input width (20 throughout).
my_net = nn.Sequential(NestNLP(), nn.Linear(20, 20), MyFunction())
my_net(X)

tensor(-0.1973, grad_fn=<SumBackward0>)

### 参数管理

#### 获取参数

先看单隐藏层的多层感知机

In [7]:
# Perceptron with a single hidden layer: 4 -> 5 -> ReLU -> 1.
net = nn.Sequential(nn.Linear(4, 5), nn.ReLU(), nn.Linear(5, 1))
X = torch.rand(2, 4)
net(X)

tensor([[0.2057],
        [0.1305]], grad_fn=<AddmmBackward0>)

In [8]:
print(net[2].state_dict())  # state_dict holds this layer's weight and bias
print(type(net[2].weight))
print(net[2].weight)
print(net[2].weight.data)  # .data gives the underlying values, .grad the gradient
print(net[2].weight.grad)  # prints None: no backward pass has been run at this point

OrderedDict([('weight', tensor([[ 0.0894,  0.2311,  0.4206,  0.1787, -0.3332]])), ('bias', tensor([0.2057]))])
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([[ 0.0894,  0.2311,  0.4206,  0.1787, -0.3332]], requires_grad=True)
tensor([[ 0.0894,  0.2311,  0.4206,  0.1787, -0.3332]])
None


用 `named_parameters` 拿出层次参数

In [9]:
# The leading * unpacks the list so that every (name, shape) tuple is passed
# to print as a separate argument.
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
# On the whole network the names are prefixed with the layer index.
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([5, 4])) ('bias', torch.Size([5]))
('0.weight', torch.Size([5, 4])) ('0.bias', torch.Size([5])) ('2.weight', torch.Size([1, 5])) ('2.bias', torch.Size([1]))


In [10]:
# Parameters can also be looked up by their dotted name in the state_dict.
net.state_dict()['0.bias'].data

tensor([-0.4250,  0.1294, -0.3036, -0.3581,  0.0195])

从嵌套块收集参数

In [11]:
def block1():
    """Build a small perceptron: 2 -> 6 -> ReLU -> 2."""
    return nn.Sequential(nn.Linear(2, 6), nn.ReLU(), nn.Linear(6, 2))

def block2():
    """Stack four separate ``block1`` networks, named block-0 to block-3."""
    stacked = nn.Sequential()
    for idx in range(4):
        # Every call to block1() builds a new 3-layer network, so the stack
        # holds 12 layers in total; the layers inside each block reuse the
        # same inner names (0, 1, 2).
        stacked.add_module(f'block-{idx}', block1())
    return stacked

# Nested network: the stacked blocks followed by one linear layer (2 -> 4).
rgnet = nn.Sequential(block2(), nn.Linear(2, 4))
X = torch.rand(4, 2)
rgnet(X)

tensor([[ 0.4634, -0.7390, -0.4538,  0.2143],
        [ 0.4635, -0.7392, -0.4538,  0.2143],
        [ 0.4637, -0.7393, -0.4539,  0.2144],
        [ 0.4636, -0.7392, -0.4538,  0.2144]], grad_fn=<AddmmBackward0>)

`rgnet` 有两层，第一层 `block2` 有 4 层 `block1`，`block1` 自身有 3 层，`rgnet` 第二层就是一个全连接层

In [12]:
# Printing a module shows its nested structure together with sub-module names.
print(rgnet)

Sequential(
  (0): Sequential(
    (block-0): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
    (block-1): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
    (block-2): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
    (block-3): Sequential(
      (0): Linear(in_features=2, out_features=6, bias=True)
      (1): ReLU()
      (2): Linear(in_features=6, out_features=2, bias=True)
    )
  )
  (1): Linear(in_features=2, out_features=4, bias=True)
)


#### 初始化参数

`nn.init` 就是自带的初始化方法

In [13]:
def init_normal(m):
    """Initialise an ``nn.Linear`` layer with N(0, 0.01^2) weights and zero bias.

    Meant to be passed to ``net.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: the module to (possibly) initialise.
    """
    if type(m) is not nn.Linear:
        return
    # Functions in nn.init that end with an underscore modify the tensor
    # in place.
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # A layer built with bias=False has m.bias set to None; skip it instead
    # of crashing inside nn.init.zeros_.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

net.apply(init_normal)  # apply() calls init_normal on every sub-module of net and on net itself
net[2].weight.data[0], net[2].bias.data[0]

(tensor([-0.0017, -0.0126,  0.0022,  0.0103, -0.0021]), tensor(0.))

In [14]:
def init_constant(m):
    """Set every weight of an ``nn.Linear`` layer to 1 and its bias to 0.

    Meant to be passed to ``net.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: the module to (possibly) initialise.
    """
    if type(m) is not nn.Linear:
        return
    # constant_ fills the tensor in place with the given fixed value.
    nn.init.constant_(m.weight, 1)
    # A layer built with bias=False has m.bias set to None; skip it instead
    # of crashing inside nn.init.zeros_.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# Re-initialise every linear layer, then show all parameters.
net.apply(init_constant)
net.state_dict()

OrderedDict([('0.weight',
              tensor([[1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.]])),
             ('0.bias', tensor([0., 0., 0., 0., 0.])),
             ('2.weight', tensor([[1., 1., 1., 1., 1.]])),
             ('2.bias', tensor([0.]))])

`xavier` 初始化

In [15]:
def xavier(m):
    """Apply Xavier uniform initialisation to the weight of an ``nn.Linear``.

    The bias is not modified, and modules whose type is not exactly
    ``nn.Linear`` are skipped.
    """
    if type(m) is not nn.Linear:
        return
    layer_weight = m.weight
    nn.init.xavier_uniform_(layer_weight)

# Only the weights are re-drawn; the biases keep their previous values.
net.apply(xavier)
net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.2314, -0.1278, -0.0762, -0.4161],
                      [ 0.5370, -0.4357,  0.7246,  0.3739],
                      [ 0.7294, -0.4062,  0.0834,  0.0025],
                      [-0.0445, -0.5716, -0.0810, -0.6010],
                      [ 0.2431,  0.5741,  0.7020,  0.7073]])),
             ('0.bias', tensor([0., 0., 0., 0., 0.])),
             ('2.weight',
              tensor([[ 0.5427, -0.7971, -0.5501, -0.5127, -0.2577]])),
             ('2.bias', tensor([0.]))])

初始化参数也能自定义

In [16]:
def my_init(m):
    """Custom initialiser for ``nn.Linear`` layers, for use with ``net.apply``.

    Prints the name and shape of the layer's first parameter (the weight),
    draws the weights uniformly from [-10, 10], then zeroes every weight
    whose absolute value is below 5. Other module types are skipped.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the weight is kept, False where it is zeroed.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask

# my_init prints one "Init ..." line per linear layer it touches.
net.apply(my_init)
net[0].weight

Init weight torch.Size([5, 4])
Init weight torch.Size([1, 5])


Parameter containing:
tensor([[ 0.0000,  5.4795, -8.3915, -0.0000],
        [ 0.0000, -0.0000,  0.0000, -9.6023],
        [-5.4950,  7.8027,  7.3714,  9.9293],
        [ 0.0000,  0.0000,  0.0000, -8.9807],
        [ 8.5699, -6.0642, -7.7512, -0.0000]], requires_grad=True)

也可以直接更改指定层次的信息

In [17]:
net[0].weight.data[:] += 1  # add 1 to every entry of the first layer's weight, in place
net[0].weight.data[0, 0] = 85  # overwrite one single entry
net[0].weight.data

tensor([[85.0000,  6.4795, -7.3915,  1.0000],
        [ 1.0000,  1.0000,  1.0000, -8.6023],
        [-4.4950,  8.8027,  8.3714, 10.9293],
        [ 1.0000,  1.0000,  1.0000, -7.9807],
        [ 9.5699, -5.0642, -6.7512,  1.0000]])

参数绑定，就是在初始化的时候指定一个共享层

In [18]:
# One layer object is placed at two positions, so both positions use the
# same parameters.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(2, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1))
# net[2] and net[4] are the same `shared` object, so every entry compares equal.
print(net[2].weight.data == net[4].weight.data)
X = torch.rand(8, 2)
net(X)

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])


tensor([[0.0585],
        [0.0930],
        [0.0923],
        [0.0575],
        [0.0610],
        [0.0961],
        [0.0745],
        [0.0693]], grad_fn=<AddmmBackward0>)

### 自定义层

自定义一个没有任何参数的层，实际上不管是层次还是 `Sequential` 都是继承的 `nn.Module`，基本都需要重写 `__init__`、`forward` 两个方法

比如 `MyLayer` 可以作为一个层次加入 `Sequential` 中

In [19]:
class MyLayer(nn.Module):
    """Parameter-free layer that subtracts the overall mean from its input."""

    # No __init__ override is needed: the layer defines no parameters or
    # sub-modules, so nn.Module's default constructor is sufficient.

    def forward(self, X):
        """Return ``X`` shifted so that its mean over all elements is zero."""
        mean_value = X.mean()
        return X - mean_value

net = MyLayer()
X = torch.FloatTensor([1, 2, 3, 4, 5])
net(X)  # the mean of X is 3, so the result is centred on zero

tensor([-2., -1.,  0.,  1.,  2.])

In [20]:
class MyLinear(nn.Module):
    """Custom fully connected layer computing ``X @ weight + bias``.

    Args:
        in_unit: size of the last dimension of the input.
        out_unit: size of the last dimension of the output.
    """

    def __init__(self, in_unit, out_unit):
        super().__init__()
        # randn samples a standard normal distribution (rand would sample
        # uniformly from [0, 1)).
        self.weight = nn.Parameter(torch.randn(in_unit, out_unit))
        # 1-D bias vector of length out_unit (not a scalar).
        self.bias = nn.Parameter(torch.randn(out_unit,))

    def forward(self, X):
        """Return ``X @ weight + bias``; ``X`` has shape ``(..., in_unit)``."""
        # Use the Parameters themselves rather than `.data`: `.data` detaches
        # the tensors from autograd, so no gradient would ever reach weight
        # or bias and the layer could not be trained. matmul gives the same
        # result as mm for 2-D input and also accepts extra leading
        # dimensions.
        return torch.matmul(X, self.weight) + self.bias

# A custom layer can be combined with built-in layers inside nn.Sequential.
net = nn.Sequential(MyLinear(2, 4), nn.ReLU(), nn.Linear(4, 1))
net[0].weight.data

tensor([[-0.1706, -0.4433, -0.1845,  0.1711],
        [ 0.1412, -1.5087, -1.5302, -0.9381]])

### 读写文件

加载和保存张量

In [24]:
import os

x = torch.arange(4)
# Write the tensor to a file named 'x-file' (relative path).
torch.save(x, 'x-file')

# Read the tensor back from the same file.
x2 = torch.load("x-file")
x2

tensor([0, 1, 2, 3])

存储张量列表

In [25]:
y = torch.ones(4)
# A list of tensors can be saved in one call; this overwrites 'x-file'.
torch.save([x, y], 'x-file')
# The loaded list is unpacked back into two tensors.
x2, y2 = torch.load("x-file")
x2, y2

(tensor([0, 1, 2, 3]), tensor([1., 1., 1., 1.]))

也可以用字典等数据结构组织多个张量，再一并保存

In [32]:
# A dictionary mapping names to tensors can be saved and restored as well.
tensor_dict = {'x' : x, 'y' : y}
torch.save(tensor_dict, 'x-file')
# Load the file once and keep the result.
tensor_dict_out = torch.load("x-file")
tensor_dict_out

{'x': tensor([0, 1, 2, 3]), 'y': tensor([1., 1., 1., 1.])}

加载和保存模型参数，`pytorch` 一般是存储参数，如权重和偏差，以字典形式存储，调用 `net.state_dict()`

In [40]:
class MLP(nn.Module):
    """Small perceptron used for the save/load example: 2 -> 4 -> ReLU -> 1."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(2, 4)
        self.output = nn.Linear(4, 1)

    def forward(self, X):
        """Return the output-layer result for the input batch ``X``."""
        hidden_act = F.relu(self.hidden(X))
        return self.output(hidden_act)

net = MLP()
X = torch.randn(2, 2)  # batch of 2 samples with 2 features each
net(X)

tensor([[-0.3066],
        [-0.2063]], grad_fn=<AddmmBackward0>)

命名方式就是以 `模型名称.params` 命名

In [50]:
# Save only the parameters (the state_dict), not the module object itself.
torch.save(net.state_dict(), 'mlp.params')

加载的时候要先实例化一个结构相同的模型，再用 `load_state_dict` 把保存的参数读入（相当于得到原模型的一份副本）

In [57]:
# Create a new MLP with the same architecture, then replace its freshly
# initialised parameters with the saved ones.
net_clone = MLP()
net_clone.load_state_dict(torch.load("mlp.params"))
net_clone

MLP(
  (hidden): Linear(in_features=2, out_features=4, bias=True)
  (output): Linear(in_features=4, out_features=1, bias=True)
)

In [59]:
# Same parameters and same input, so both models produce identical outputs.
net_clone(X) == net(X)

tensor([[True],
        [True]])