### **本节介绍torch中层和块的内部参数的相关操作**

In [1]:
# 导入包
import torch
from torch import nn
import numpy as np

**一，参数访问**

In [23]:
# Two-layer MLP: 2 inputs -> 4 hidden units -> ReLU -> 1 output
net = nn.Sequential(
    nn.Linear(2, 4),
    nn.ReLU(),
    nn.Linear(4, 1),
)
print(net)
x = torch.rand((2, 2))
net(x)

Sequential(
  (0): Linear(in_features=2, out_features=4, bias=True)
  (1): ReLU()
  (2): Linear(in_features=4, out_features=1, bias=True)
)


tensor([[-0.1611],
        [-0.2229]], grad_fn=<AddmmBackward0>)

In [None]:
# Inspect the parameters (weight and bias) of one layer inside the Sequential
output_layer_state = net[2].state_dict()
print(output_layer_state)

OrderedDict({'weight': tensor([[ 0.2748, -0.3534, -0.2412,  0.3502]]), 'bias': tensor([0.4890])})


In [7]:
# Access individual parameters of the output layer
out_layer = net[2]
print(type(out_layer.bias))
print(type(out_layer.weight), "\n")
print(out_layer.bias)
print(out_layer.bias.data)
print(type(out_layer.bias.data), "\n")
# .grad stays None until a backward pass has populated it
print(out_layer.weight.grad)

<class 'torch.nn.parameter.Parameter'>
<class 'torch.nn.parameter.Parameter'> 

Parameter containing:
tensor([0.4890], requires_grad=True)
tensor([0.4890])
<class 'torch.Tensor'> 

None


In [8]:
# Walk the parameters recursively: first a single layer, then the whole network
print(*((name, p.shape) for name, p in net[0].named_parameters()), "\n")
print(*((name, p.shape) for name, p in net.named_parameters()))


('weight', torch.Size([4, 2])) ('bias', torch.Size([4])) 

('0.weight', torch.Size([4, 2])) ('0.bias', torch.Size([4])) ('2.weight', torch.Size([1, 4])) ('2.bias', torch.Size([1]))


In [11]:
# Look up a single parameter by its key in the network's state dict
params_by_name = net.state_dict()
print(params_by_name['2.weight'].data)

tensor([[ 0.2748, -0.3534, -0.2412,  0.3502]])


**二，参数初始化**

默认情况下，PyTorch会根据一个范围均匀地初始化权重和偏置矩阵，
这个范围是根据输入和输出维度计算出的。
PyTorch的`nn.init`模块提供了多种预置初始化方法。

下面我们试着调用内置的初始化。

In [None]:
# Normal-distribution initialization
def init_normal(m):
    """Initialize an ``nn.Linear`` layer with N(0, 1) weights and a zero bias.

    Meant to be passed to ``Module.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=1)  # standard normal weights
        # A layer built with bias=False has m.bias set to None; skip it
        # instead of crashing inside nn.init.zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Apply the initializer to every submodule, then show both Linear layers
net.apply(init_normal)
print(*net[0].parameters(), "\n", *net[2].parameters())


Parameter containing:
tensor([[ 0.7972, -0.2714],
        [-0.6211, -0.1857],
        [-0.8210,  0.6690],
        [-0.7066, -0.4542]], requires_grad=True) Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True) 
 Parameter containing:
tensor([[-1.7705, -0.4246,  0.0216, -2.1172]], requires_grad=True) Parameter containing:
tensor([0.], requires_grad=True)


In [None]:
# Constant initialization
def init_constant(m):
    """Initialize an ``nn.Linear`` layer with all-ones weights and a zero bias.

    Meant to be passed to ``Module.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) initialize in place.
    """
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        # A layer built with bias=False has m.bias set to None; skip it
        # instead of crashing inside nn.init.zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Apply the initializer to every submodule, then show both Linear layers
net.apply(init_constant)
print(*net[0].parameters(), "\n", *net[2].parameters())

Parameter containing:
tensor([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True) Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True) 
 Parameter containing:
tensor([[1., 1., 1., 1.]], requires_grad=True) Parameter containing:
tensor([0.], requires_grad=True)


In [16]:
# Use a different initializer for each block
for layer_idx, init_fn in ((0, init_normal), (2, init_constant)):
    net[layer_idx].apply(init_fn)
print(*net[0].parameters(), "\n", *net[2].parameters())

Parameter containing:
tensor([[ 1.0790,  2.4374],
        [ 0.7575,  1.1211],
        [ 0.0847, -0.7577],
        [-0.7633,  2.1732]], requires_grad=True) Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True) 
 Parameter containing:
tensor([[1., 1., 1., 1.]], requires_grad=True) Parameter containing:
tensor([0.], requires_grad=True)


**下面进行自定义初始化**

有时，深度学习框架没有提供我们需要的初始化方法。
在下面的例子中，我们使用以下的分布为任意权重参数$w$定义初始化方法：

$$
\begin{aligned}
    w \sim \begin{cases}
        U(5, 10) & \text{ 概率为 } \frac{1}{4} \\
            0    & \text{ 概率为 } \frac{1}{2} \\
        U(-10, -5) & \text{ 概率为 } \frac{1}{4}
    \end{cases}
\end{aligned}
$$


In [17]:
def my_init(m):
    """Custom initializer for ``nn.Linear`` layers, for use with ``Module.apply``.

    Prints the name and shape of the layer's first parameter, draws the
    weights from U(-10, 10), then zeroes every entry whose absolute value is
    below 5. Modules of any other type are ignored.
    """
    if type(m) != nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the sampled weight is kept, False where it is zeroed
    keep = m.weight.data.abs() >= 5
    m.weight.data *= keep

# Re-initialize every Linear layer with the custom scheme, then show the
# first two rows of the first layer's weight matrix
net.apply(my_init)
net[0].weight[0:2]

Init weight torch.Size([4, 2])
Init weight torch.Size([1, 4])


tensor([[-6.8520, -8.9211],
        [ 5.2097, -0.0000]], grad_fn=<SliceBackward0>)

In [20]:
# Parameters can also be written to directly through .data
first_weight = net[0].weight.data  # shares storage with the parameter
first_weight[:] += 1
first_weight[0, 0] = 42
print(first_weight)
print(first_weight[0])

tensor([[42.0000, -5.9211],
        [ 8.2097,  3.0000],
        [ 3.0000, 12.8282],
        [11.7931,  8.2662]])
tensor([42.0000, -5.9211])


**三，参数绑定**

In [24]:
# Keep a handle on the shared layer so its parameters can be referenced
shared = nn.Linear(8, 8)
net = nn.Sequential(
    nn.Linear(2, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1),
)
net(x)
# Check that the two positions hold equal parameters
first_shared, second_shared = net[2], net[4]
print(first_shared.weight.data[0] == second_shared.weight.data[0])
first_shared.weight.data[0, 0] = 100
# Still equal after mutating one side: they are the same object,
# not merely two layers with equal values
print(first_shared.weight.data[0] == second_shared.weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


**训练一个具有共享参数的网络，观察梯度**

In [33]:
# One layer object placed at two positions, so both positions share parameters
shared = nn.Linear(2, 2)
model = nn.Sequential(shared, shared, nn.ReLU(), nn.Linear(2, 1))
loss_fn = nn.MSELoss()

# Build a tiny dataset
x = torch.rand(4, 2)
# One target per sample, matching the (4, 1) model output. A size-1 target
# would be silently broadcast by MSELoss and raise a size-mismatch UserWarning.
y = torch.rand(4, 1)

# Hyperparameters
lr = 1e-3
wd = 0
num_epoch = 5

# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)

for epoch in range(num_epoch):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    optimizer.zero_grad()
    loss.backward()
    # Print the gradients at every Linear position. The shared layer occupies
    # two positions, so its (single) gradient is printed twice.
    for layer in model:
        if isinstance(layer, nn.Linear):
            print(layer.weight.grad, layer.bias.grad)
    optimizer.step()
    print(loss.item())


tensor([[ 0.4958, -0.4090],
        [-0.2610, -0.0809]]) tensor([ 0.9880, -0.3051])
tensor([[ 0.4958, -0.4090],
        [-0.2610, -0.0809]]) tensor([ 0.9880, -0.3051])
tensor([[-0.4560,  0.0000]]) tensor([-1.6731])
0.7211236953735352
tensor([[ 0.4926, -0.4070],
        [-0.2597, -0.0804]]) tensor([ 0.9834, -0.3036])
tensor([[ 0.4926, -0.4070],
        [-0.2597, -0.0804]]) tensor([ 0.9834, -0.3036])
tensor([[-0.4518,  0.0000]]) tensor([-1.6679])
0.7165688276290894
tensor([[ 0.4895, -0.4051],
        [-0.2584, -0.0800]]) tensor([ 0.9790, -0.3020])
tensor([[ 0.4895, -0.4051],
        [-0.2584, -0.0800]]) tensor([ 0.9790, -0.3020])
tensor([[-0.4478,  0.0000]]) tensor([-1.6628])
0.7120499610900879
tensor([[ 0.4864, -0.4031],
        [-0.2571, -0.0796]]) tensor([ 0.9745, -0.3005])
tensor([[ 0.4864, -0.4031],
        [-0.2571, -0.0796]]) tensor([ 0.9745, -0.3005])
tensor([[-0.4437,  0.0000]]) tensor([-1.6577])
0.7075669765472412
tensor([[ 0.4834, -0.4011],
        [-0.2558, -0.0792]]) tensor(