In [54]:
import torch
from torch import nn
from torch.nn import init

### 4.1 模型构造

In [2]:
# Build a model by subclassing nn.Module.
class MLP(nn.Module):
    """Multilayer perceptron: Linear(784, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Assigning a module to an attribute registers it as a child,
        # in assignment order (hidden, act, output).
        self.hidden = nn.Linear(784, 256)
        self.act = nn.ReLU()
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        """Run ``x`` through the hidden layer, the activation and the output layer."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

In [3]:
# Random batch of 2 rows with 784 features each, matching the in_features
# of MLP's first Linear layer.
X = torch.rand(2, 784)
net = MLP()
# Printing a module lists its registered children.
print(net)
print(X)

MLP(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)
tensor([[0.7072, 0.9415, 0.3905,  ..., 0.7163, 0.2155, 0.7359],
        [0.9706, 0.0301, 0.2840,  ..., 0.9668, 0.3638, 0.3071]])


##### Module的子类
* Sequential类
* ModuleList类
* ModuleDict类

In [13]:
# A MySequential class that provides the same functionality as nn.Sequential.
# Like nn.Sequential, it accepts either a single ordered dictionary
# (OrderedDict) of named submodules or a series of submodules as arguments.
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Args:
        *args: either one ``OrderedDict`` mapping names to modules, or any
            number of modules, which are then named "0", "1", ... by position.
    """

    def __init__(self, *args):
        # Imported inside the method: a name bound in the class body is a
        # class attribute and is NOT visible as a bare name inside methods.
        from collections import OrderedDict

        super(MySequential, self).__init__()
        # isinstance() needs the OrderedDict *type*; passing an instance
        # (``OrderedDict()``) raises TypeError.
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                # add_module registers the child in self._modules under `key`.
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def forward(self, input):
        """Feed ``input`` through every registered submodule in insertion order."""
        for module in self._modules.values():
            input = module(input)
        return input

In [14]:
# Build a 784 -> 256 -> ReLU -> 10 stack from positional submodules;
# MySequential names them "0", "1", "2".
net = MySequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)
print(net)
# Forward pass on the batch X defined in an earlier cell.
net(X)

MySequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[ 0.1700,  0.0827,  0.0531,  0.1276, -0.1362, -0.0222,  0.2020, -0.0220,
          0.0121, -0.0503],
        [ 0.1188,  0.2035, -0.0003, -0.0631,  0.0170, -0.1028,  0.0793,  0.0211,
         -0.0691, -0.0412]], grad_fn=<AddmmBackward>)

In [15]:
# nn.ModuleList takes a list of submodules as input
# and supports append and extend, like a Python list.
net = nn.ModuleList([
    nn.Linear(784, 256),
    nn.ReLU()
])
net.append(nn.Linear(256, 10))
# List-style indexing works; -1 is the module appended last.
print(net[-1])
print(net)

Linear(in_features=256, out_features=10, bias=True)
ModuleList(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [16]:
# nn.ModuleDict takes a dictionary of submodules as input;
# entries can be added and accessed much like a regular dict.
net = nn.ModuleDict({
    'linear': nn.Linear(784, 256),
    'act': nn.ReLU(),
})
net['output'] = nn.Linear(256, 10)
# Entries are reachable both by key and as attributes.
print(net['linear'])
print(net.output)
print(net)

Linear(in_features=784, out_features=256, bias=True)
Linear(in_features=256, out_features=10, bias=True)
ModuleDict(
  (act): ReLU()
  (linear): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)


##### 构造复杂模型

In [25]:
class FancyMLP(nn.Module):
    """Module combining a reused linear layer, a constant matrix and Python control flow.

    ``forward`` returns a scalar tensor (the sum of the processed activations).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Constant random matrix stored as a plain tensor attribute (not an
        # nn.Parameter), so it is not among the module's parameters.
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        x = self.linear(x)
        # Multiply by the constant matrix, shift by one, then apply ReLU.
        shifted = torch.mm(x, self.rand_weight.data) + 1
        x = nn.functional.relu(shifted)
        # The same linear layer is applied a second time (shared weights).
        x = self.linear(x)
        # Control flow on tensor values: halve until the norm is at most 1 ...
        while x.norm().item() > 1:
            x /= 2
        # ... then scale up if the norm ended below 0.8.
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

In [26]:
# Batch of 2 rows with 20 features, matching FancyMLP's Linear(20, 20).
X = torch.rand(2, 20)
net = FancyMLP()
# Only the `linear` child is printed; rand_weight is a plain tensor attribute.
print(net)
net(X)

FancyMLP(
  (linear): Linear(in_features=20, out_features=20, bias=True)
)


tensor(1.5027, grad_fn=<SumBackward0>)

In [30]:
# Nested composition: a module whose child is itself a container module.
class NestMLP(nn.Module):
    """Wraps an ``nn.Sequential`` of Linear(40, 30) followed by ReLU."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        layers = [nn.Linear(40, 30), nn.ReLU()]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Delegate to the inner Sequential."""
        inner = self.net
        return inner(x)
# Chain modules of different kinds: NestMLP (40 -> 30), a Linear (30 -> 20),
# then FancyMLP, whose Linear layer takes 20 features and which returns a scalar.
net = nn.Sequential(NestMLP(), 
                    nn.Linear(30, 20),
                   FancyMLP())
X = torch.rand(2, 40)
print(net)
net(X)

Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=40, out_features=30, bias=True)
      (1): ReLU()
    )
  )
  (1): Linear(in_features=30, out_features=20, bias=True)
  (2): FancyMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)


tensor(3.6027, grad_fn=<SumBackward0>)

### 4.2 模型参数的访问，初始化和共享

* nn.init模块，包含了多种模型初始化方法
* module默认初始化
    - nn.Module的模块参数都采取了较为合理的初始化策略
    - 不同layer具体哪一种初始化方法可参考源代码

In [32]:
# Small 4 -> 3 -> ReLU -> 1 network used in the parameter-access cells below.
net = nn.Sequential(nn.Linear(4, 3),
                   nn.ReLU(),
                   nn.Linear(3,1))
# PyTorch has already applied its default initialization at this point.
print(net)
X = torch.rand(2, 4)
# Scalar output, kept so that backward() can be called on it in a later cell.
Y = net(X).sum()

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)


In [33]:
# named_parameters() yields (name, parameter) pairs; for a Sequential the
# names are prefixed with the index of the owning layer.
print(type(net.named_parameters()))
for name, param in net.named_parameters():
    print(name, param.size())

<class 'generator'>
0.weight torch.Size([3, 4])
0.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])


In [34]:
# Index into the Sequential to inspect a single layer; the names then carry
# no index prefix.
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>


In [39]:
# First parameter of layer 0 (its weight, per the listing order shown above).
weight_0 = list(net[0].parameters())[0]
print(weight_0.data)
# No backward pass has run yet, so this prints None.
print(weight_0.grad)
# Y is the scalar computed from net(X) in an earlier cell.
Y.backward()
# After backward() the gradient tensor is populated.
print(weight_0.grad)

tensor([[-0.4827,  0.1340,  0.1847, -0.1361],
        [-0.2787, -0.3143,  0.4554, -0.2498],
        [-0.2253, -0.4342, -0.3020, -0.1727]])
None
tensor([[1.7189e-04, 8.3975e-03, 4.0295e-02, 4.2247e-02],
        [1.9144e-01, 1.2983e-01, 4.9811e-01, 4.6264e-01],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]])


In [51]:
# The same weight tensor, reached through the layer's `weight` attribute.
net[0].weight.data

tensor([[-0.4827,  0.1340,  0.1847, -0.1361],
        [-0.2787, -0.3143,  0.4554, -0.2498],
        [-0.2253, -0.4342, -0.3020, -0.1727]])

In [52]:
# Gradient of that weight, filled in by the earlier Y.backward() call.
net[0].weight.grad

tensor([[1.7189e-04, 8.3975e-03, 4.0295e-02, 4.2247e-02],
        [1.9144e-01, 1.2983e-01, 4.9811e-01, 4.6264e-01],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]])

In [38]:
# torch.nn.Parameter is a subclass of Tensor; assigning one to a module
# attribute automatically adds it to the model's parameter list.
class MyModel(nn.Module):
    """Contrasts a registered ``nn.Parameter`` with a plain tensor attribute."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        registered = nn.Parameter(torch.rand(20, 20))
        unregistered = torch.rand(20, 20)
        self.weight1 = registered    # shows up in named_parameters()
        self.weight2 = unregistered  # plain tensor, not a parameter

    def forward(self, x):
        """Placeholder; this model performs no computation."""
        return None
# Only weight1 is listed: weight2 was assigned as a plain tensor.
n = MyModel()
for name, param in n.named_parameters():
    print(name)

weight1


In [56]:
# Re-initialize with torch.nn.init: weights from a normal distribution
# (mean 0, std 0.01), biases set to the constant 0.
for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)
    if 'bias' in name:
        init.constant_(param, val=0)
        print(name, param.data)

0.weight tensor([[-0.0103,  0.0087, -0.0117,  0.0150],
        [ 0.0062,  0.0135,  0.0027, -0.0094],
        [ 0.0033,  0.0064,  0.0094,  0.0111]])
0.bias tensor([0., 0., 0.])
2.weight tensor([[ 0.0045, -0.0067,  0.0143]])
2.bias tensor([0.])


In [57]:
# A custom initialization function.
def normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with normal(mean, std) samples and return it.

    The fill runs under ``torch.no_grad()`` so it is not recorded by autograd.
    """
    with torch.no_grad():
        filled = tensor.normal_(mean, std)
    return filled

In [63]:
def init_weight_(tensor):
    """Initialize ``tensor`` in place, without autograd tracking.

    Values are drawn uniformly from [-10, 10]; entries whose absolute value
    is below 5 are then multiplied by zero.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where |value| >= 5, 0.0 elsewhere; multiplying zeroes the rest.
        keep_mask = (tensor.abs() >= 5).float()
        tensor.mul_(keep_mask)
# Apply the custom initializer to every weight and shift every bias by one.
for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param.data)
    if 'bias' in name:
        # A parameter's value can also be changed through its .data,
        # which does not affect the gradients.
        param.data += 1
        print(name, param.data)

0.weight tensor([[-0.0000,  8.5236, -6.3441,  5.6359],
        [-0.0000, -6.1763,  8.9667,  0.0000],
        [ 5.6074,  6.4027,  6.4041, -6.1149]])
0.bias tensor([2., 2., 2.])
2.weight tensor([[-0.0000, 9.3943, 7.9373]])
2.bias tensor([2.])


In [64]:
# Sharing model parameters: the same Linear instance is placed in the
# Sequential twice.
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear)
print(net)
# Only one parameter ("0.weight") is yielded, even though two layers are printed.
for name, param in net.named_parameters():
    init.constant_(param, val=3)
    print(name, param.data)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight tensor([[3.]])


In [65]:
# In memory the two linear layers are actually one and the same object.
print(id(net[0]) == id(net[1]))
print(id(net[0].weight) == id(net[1].weight))

True
True


In [66]:
# During backpropagation the gradients of a shared parameter accumulate.
x = torch.ones(1, 1)
# With weight w = 3 applied twice: y = w * w * 1 = 9.
y = net(x).sum()
print(y)
y.backward()
# dy/dw = 2 * w = 6: both uses of the weight contribute to one .grad.
print(net[0].weight.grad)

tensor(9., grad_fn=<SumBackward0>)
tensor([[6.]])


### 4.3 模型参数的延后初始化

### 4.4 自定义层

In [68]:
# A custom layer without any model parameters.
class CenteredLayer(nn.Module):
    """Subtracts the mean over all elements of the input from the input."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        mean_value = x.mean()
        return x - mean_value

In [69]:
# The mean of 1..5 is 3, so the result is centred around zero.
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], 
                  dtype=torch.float))

tensor([-2., -1.,  0.,  1.,  2.])

In [72]:
# The custom layer composes with built-in layers like any other module.
net = nn.Sequential(nn.Linear(8, 128),
                   CenteredLayer())
y = net(torch.rand(4, 8))
# Mean of the centred output: zero up to floating-point error.
y.mean().item()

-6.984919309616089e-09

In [73]:
# A custom layer with model parameters.
# Parameters are defined as Parameter, ParameterList or ParameterDict.
# ParameterList takes a list of Parameter instances as input;
# append and extend can add further parameters at the end of the list.
class MyDense(nn.Module):
    """Chain of matrix products: three 4x4 weights followed by one 4x1 weight."""

    def __init__(self):
        super().__init__()
        square_weights = [nn.Parameter(torch.randn(4, 4)) for _ in range(3)]
        self.params = nn.ParameterList(square_weights)
        final_weight = nn.Parameter(torch.randn(4, 1))
        self.params.append(final_weight)

    def forward(self, x):
        """Right-multiply ``x`` by each stored weight in order."""
        for weight in self.params:
            x = torch.mm(x, weight)
        return x
# Printing the module lists every entry of its ParameterList.
net = MyDense()
print(net)

MyDense(
  (params): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 4x4]
      (1): Parameter containing: [torch.FloatTensor of size 4x4]
      (2): Parameter containing: [torch.FloatTensor of size 4x4]
      (3): Parameter containing: [torch.FloatTensor of size 4x1]
  )
)


In [76]:
class MyDictDense(nn.Module):
    """Keeps several weight matrices in an ``nn.ParameterDict`` and uses one per call."""

    def __init__(self):
        super().__init__()
        initial = {
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1)),
        }
        self.params = nn.ParameterDict(initial)
        # Further entries can be added with update(), as with a dict.
        extra = {'linear3': nn.Parameter(torch.randn(4, 2))}
        self.params.update(extra)

    def forward(self, x, choice='linear1'):
        """Multiply ``x`` by the weight stored under the key ``choice``."""
        weight = self.params[choice]
        return torch.mm(x, weight)

# Printing the module lists every entry of its ParameterDict.
net = MyDictDense()
print(net)

MyDictDense(
  (params): ParameterDict(
      (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
      (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
      (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
  )
)


In [77]:
# The key passed as `choice` selects the weight, and with it the output width.
x = torch.ones(1, 4)
print(net(x, 'linear1'))
print(net(x, 'linear2'))
print(net(x, 'linear3'))

tensor([[ 2.4713, -1.1180,  4.3188,  2.3678]], grad_fn=<MmBackward>)
tensor([[-0.4034]], grad_fn=<MmBackward>)
tensor([[-3.4841,  0.8850]], grad_fn=<MmBackward>)


In [79]:
# Custom layers compose inside nn.Sequential. MyDictDense uses its default
# choice 'linear1' (4x4), so its output fits MyDense's 4x4 first weight.
net = nn.Sequential(
    MyDictDense(),
    MyDense(),
)
print(net)
print(net(x))

Sequential(
  (0): MyDictDense(
    (params): ParameterDict(
        (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
        (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
        (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
  (1): MyDense(
    (params): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x1]
    )
  )
)
tensor([[62.4212]], grad_fn=<MmBackward>)


### 4.5 读取和存储

In [80]:
# Tensors can be stored and read back with the save and load functions.
# They serialize objects with Python's pickle,
# so many kinds of objects can be saved: models, tensors, dicts, etc.
# NOTE: because loading unpickles data, only load files from trusted sources.
x = torch.ones(3)
torch.save(x, 'x.pt')
x2 = torch.load('x.pt')
x2

tensor([1., 1., 1.])

In [81]:
# A list of tensors round-trips through save/load as a list.
y = torch.zeros(4)
torch.save([x, y], 'xy.pt')
xy_list = torch.load('xy.pt')
xy_list

[tensor([1., 1., 1.]), tensor([0., 0., 0., 0.])]

In [82]:
# A dict mapping strings to tensors round-trips the same way.
torch.save({'x': x, 'y':y}, 'xy_dict.pt')
xy = torch.load('xy_dict.pt')
xy

{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}

In [88]:
# Saving and loading models.
# Only layers with learnable parameters have entries in state_dict.
class MLP(nn.Module):
    """Small MLP: Linear(3, 2) -> ReLU -> Linear(2, 1).

    Note: this rebinds the name ``MLP`` that an earlier cell of this
    notebook defined with larger layers.
    """

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        """Run ``x`` through the hidden layer, the activation and the output layer."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)
net = MLP()
# state_dict() maps parameter names to tensors; the parameter-free ReLU
# layer contributes no entry.
net.state_dict()

OrderedDict([('hidden.weight', tensor([[-0.5214,  0.5373,  0.3808],
                      [-0.1490,  0.1347,  0.0907]])),
             ('hidden.bias', tensor([ 0.5216, -0.2874])),
             ('output.weight', tensor([[-0.2790,  0.6809]])),
             ('output.bias', tensor([0.5046]))])

In [89]:
# Optimizers also have a state_dict, holding the optimizer's state
# and the hyperparameters in use.
optimizer = torch.optim.SGD(net.parameters(), 
                            lr=0.001,
                           momentum=0.9)
optimizer.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [4948904128, 4948903728, 4948904288, 4948903808]}]}

##### 保存和加载模型
* 1. 保存和加载state_dict（推荐）
    ```python
    torch.save(model.state_dict(), PATH)
    model = TheModelClass(*args, **kwargs)
    model.load_state_dict(torch.load(PATH))
    ```
* 2. 保存和加载整个模型
    ```python
    torch.save(model, PATH)
    model = torch.load(PATH)
    ```

In [90]:
# Record the model's output on a fixed input, then save only its state_dict.
X = torch.randn(2, 3)
Y = net(X)
PATH = './net.pt'
torch.save(net.state_dict(), PATH)

In [91]:
# Build a fresh model of the same class and load the saved parameters into it.
net2 = MLP()
net2.load_state_dict(torch.load(PATH))
Y2 = net2(X)
# Element-wise comparison with the output recorded before saving.
Y2 == Y

tensor([[True],
        [True]])

### 4.6 GPU计算

* 可以通过nvidia-smi命令查看显卡信息
* 计算设备
    - torch.cuda.is_available() - 查看GPU是否可用
    - torch.cuda.device_count() - 查看GPU数量
    - torch.cuda.current_device() - 当前GPU索引号
    - torch.cuda.get_device_name(0) - 根据索引查看GPU名字
* Tensor的GPU计算
    - .cuda()可以将CPU上的Tensor转换到GPU上
    - .device查看该Tensor所在设备
    - 可以直接在创建时候就指定设备
    - 存储在不同位置中的数据是不可以直接进行计算的
* 模型的GPU计算
    - 模型也可以通过.cuda()转换到GPU上
    - 可以通过检查模型的参数的device属性来查看存放模型的设备