# 5.2 参数管理

In [2]:
import torch
# A two-layer MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = torch.nn.Sequential(
    torch.nn.Linear(4, 8),
    torch.nn.ReLU(),
    torch.nn.Linear(8, 1),
)
# Random batch of 2 samples with 4 features each.
X = torch.rand(2, 4)
# Last expression of the cell: the notebook displays the forward-pass result.
net(X)

tensor([[0.1787],
        [0.2190]], grad_fn=<AddmmBackward0>)

## 5.2.1 参数访问
当通过Sequential类定义模型时， 我们可以通过索引来访问模型的任意层。 

In [4]:
# Index 2 of the Sequential is its last Linear layer; print that layer's state_dict.
output_layer = net[2]
print(output_layer.state_dict())

OrderedDict([('weight', tensor([[ 0.2122,  0.3054, -0.1641,  0.1500, -0.3168,  0.0388,  0.1925, -0.1025]])), ('bias', tensor([0.0189]))])


### 5.2.1.1 目标参数
参数是复合的对象，包含值、梯度和额外信息。

In [5]:
out_bias = net[2].bias
print(type(out_bias))
print(out_bias)          # the bias Parameter object itself
print(out_bias.data)     # the value tensor held by the Parameter

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.0189], requires_grad=True)
tensor([0.0189])


### 5.2.1.2 一次性访问全部参数
通过递归遍历整个模块树，可以一次性提取每个子块的参数。

In [11]:
# Parameter names and shapes of the first layer only.
print(*((name, param.shape) for name, param in net[0].named_parameters()))
# Parameter names and shapes of the whole network.
print(*((name, param.shape) for name, param in net.named_parameters()))
# The same tensors can be looked up by name in the network's state_dict.
print(net.state_dict()['2.bias'].data)

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([0.0189])


### 5.2.1.3. 从嵌套块收集参数

In [12]:
def block1():
    """Build a small MLP block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU."""
    layers = [
        torch.nn.Linear(4, 8),
        torch.nn.ReLU(),
        torch.nn.Linear(8, 4),
        torch.nn.ReLU(),
    ]
    return torch.nn.Sequential(*layers)
def block2():
    """Chain four separate block1() instances, registered as 'block0' .. 'block3'."""
    stacked = torch.nn.Sequential()
    for idx in range(4):
        stacked.add_module(f'block{idx}', block1())
    return stacked
# Nested network: the stack of blocks followed by a final Linear(4, 1) head.
rgnet = torch.nn.Sequential(block2(), torch.nn.Linear(4, 1))
# An assignment alone displays nothing; print the module to show its nested structure.
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [13]:
# rgnet[0] is the stack of blocks, [1] picks the second block ('block1'),
# and [0] picks that block's first Linear layer.
nested_linear = rgnet[0][1][0]
print(nested_linear.bias)

Parameter containing:
tensor([-0.0364, -0.4430, -0.0118, -0.4868, -0.0370,  0.0181,  0.4941,  0.0860],
       requires_grad=True)


## 5.2.2. 参数初始化
默认情况下，PyTorch会根据一个范围均匀地初始化权重和偏置矩阵，这个范围由层的输入维度计算得出（对`Linear`层，权重和偏置均采样自 $U(-\sqrt{k}, \sqrt{k})$，其中 $k = 1/\text{in\_features}$）。
### 5.2.2.1. 内置初始化

In [14]:
def init_normal(m):
    """Initialize a Linear layer with N(0, 0.01^2) weights and a zero bias.

    Written to be passed to ``Module.apply``. Modules whose type is not
    exactly ``torch.nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    if type(m) is not torch.nn.Linear:
        return
    torch.nn.init.normal_(m.weight, mean=0, std=0.01)
    # A layer created with bias=False has m.bias set to None; skip it
    # instead of crashing inside constant_.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, val=0)
# Run init_normal over the modules of net, then display the first layer's
# weight and bias values.
net.apply(init_normal)
first_linear = net[0]
first_linear.weight.data, first_linear.bias.data

(tensor([[ 0.0086,  0.0133,  0.0060,  0.0045],
         [-0.0008, -0.0212,  0.0003,  0.0031],
         [ 0.0183, -0.0003,  0.0064, -0.0015],
         [-0.0028,  0.0152,  0.0014,  0.0048],
         [-0.0241,  0.0154, -0.0002,  0.0225],
         [ 0.0105,  0.0059,  0.0022, -0.0039],
         [-0.0096, -0.0134,  0.0099,  0.0013],
         [-0.0193,  0.0093, -0.0027, -0.0200]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

对某些块应用不同的初始化方法。

In [16]:
def xavier(m):
    """Apply Xavier (Glorot) uniform initialization to a Linear layer's weight.

    Modules whose type is not exactly ``torch.nn.Linear`` are ignored; the
    bias is not modified.
    """
    if type(m) is not torch.nn.Linear:
        return
    torch.nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Fill the weight of a Linear layer with the constant 42.

    Modules whose type is not exactly ``torch.nn.Linear`` are ignored; the
    bias is not modified.
    """
    if type(m) is not torch.nn.Linear:
        return
    torch.nn.init.constant_(m.weight, 42)
# Use a different initializer per layer: xavier for net[0], init_42 for net[2].
for layer, initializer in ((net[0], xavier), (net[2], init_42)):
    layer.apply(initializer)
print(net[0].weight.data[0])   # first row of the first layer's weight
print(net[2].weight.data)

tensor([-0.1841, -0.0263, -0.3763, -0.4992])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


### 5.2.2.2. 自定义初始化
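实现一个`my_init`函数并应用到`net`：先从均匀分布 $U(-10, 10)$ 中采样权重，再把绝对值小于5的权重置零。等价地，每个权重服从如下分布：

$$
w \sim \begin{cases}
U(5, 10) & \text{概率为 } \frac{1}{4} \\
0 & \text{概率为 } \frac{1}{2} \\
U(-10, -5) & \text{概率为 } \frac{1}{4}
\end{cases}
$$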

In [17]:
def my_init(m):
    """Custom initializer for Linear layers.

    Prints the layer's parameter names and shapes, draws the weight from
    U(-10, 10), then zeroes every entry whose absolute value is below 5.
    Modules whose type is not exactly ``torch.nn.Linear`` are skipped.
    """
    if type(m) is not torch.nn.Linear:
        return
    shapes = ((name, param.shape) for name, param in m.named_parameters())
    print("Init", *shapes)
    torch.nn.init.uniform_(m.weight, -10, 10)
    weights = m.weight.data
    # In-place multiply by the boolean mask: entries with |w| >= 5 are kept,
    # all others become zero.
    weights.mul_(weights.abs() >= 5)
# Run my_init over the modules of net, then display the first layer's weight.
net.apply(my_init)
first_linear = net[0]
first_linear.weight

Init ('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
Init ('weight', torch.Size([1, 8])) ('bias', torch.Size([1]))


Parameter containing:
tensor([[ 0.0000,  0.0000, -0.0000, -0.0000],
        [ 8.6211, -8.8919,  0.0000,  6.6446],
        [ 7.6325, -0.0000,  5.1875, -8.0344],
        [-0.0000, -0.0000,  0.0000, -0.0000],
        [-6.7355, -7.8584,  9.2899,  6.2489],
        [-5.1661, -7.0998, -8.3577, -5.6626],
        [ 0.0000, -0.0000,  0.0000, -7.4722],
        [-0.0000, -0.0000,  0.0000, -5.7442]], requires_grad=True)

## 5.2.3. 参数绑定
在多个层间共享参数：我们可以定义一个稠密层，然后使用它的参数来设置另一个层的参数。下面的例子中，`net[2]`和`net[4]`是同一个层对象，因此修改其中一个的参数后，两者的参数仍然完全相同。

In [18]:
# The shared layer: created once and placed at two positions of the network.
shared = torch.nn.Linear(8, 8)
net = torch.nn.Sequential(
    torch.nn.Linear(4, 8),
    torch.nn.ReLU(),
    shared,
    torch.nn.ReLU(),
    shared,
    torch.nn.ReLU(),
    torch.nn.Linear(8, 1),
)
net(X)
first_use, second_use = net[2], net[4]
# Compare the first weight row seen through both positions.
print(first_use.weight.data[0] == second_use.weight.data[0])
# Write through one position, then compare again.
first_use.weight.data[0, 0] = 100
print(first_use.weight.data[0] == second_use.weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
