In [1]:
import torch
from torch import nn
from torch.nn import init

# PyTorch already applies its default initialization when each layer is constructed.
net = nn.Sequential(
    nn.Linear(4, 3),
    nn.ReLU(),
    nn.Linear(3, 1),
)

print(net)
# One forward pass on a random batch, reduced to a scalar so that it can be
# back-propagated in a later cell.
X = torch.rand(2, 4)
Y = torch.sum(net(X))

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)


In [8]:
# named_parameters() returns a generator that yields (name, parameter) pairs.
param_gen = net.named_parameters()
print(type(param_gen))
for name, param in net.named_parameters():
    print(name, param.shape)

<class 'generator'>
0.weight torch.Size([3, 4])
0.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])


In [3]:
# Iterating a single layer: its parameter names carry no layer-index prefix.
first_layer = net[0]
for name, param in first_layer.named_parameters():
    print(name, param)

weight Parameter containing:
tensor([[-0.3822,  0.0535, -0.1759, -0.1902],
        [ 0.4303, -0.4624,  0.4275, -0.0398],
        [ 0.2808, -0.0905, -0.4938,  0.4435]], requires_grad=True)
bias Parameter containing:
tensor([-0.2838,  0.3243, -0.2651], requires_grad=True)


因为这里是单层的所以没有了层数索引的前缀。另外返回的param的类型为torch.nn.parameter.Parameter，其实这是Tensor的子类，和Tensor不同的是如果一个Tensor是Parameter，那么它会自动被添加到模型的参数列表里，来看下面这个例子。


In [4]:
class MyModel(nn.Module):
    """Toy module contrasting an ``nn.Parameter`` attribute with a plain tensor attribute."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Wrapped in nn.Parameter: automatically registered in the module's parameter list.
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        # Plain tensor attribute: not reported by parameters() / named_parameters().
        self.weight2 = torch.rand(20, 20)

    def forward(self, x):
        """Placeholder forward pass: performs no computation and returns None."""
        return None

# Instantiate the toy model and print the names of its registered parameters.
n = MyModel()
registered = n.named_parameters()
for name, param in registered:
    print(name)


weight1


In [5]:
# The first parameter yielded by the first layer is its weight matrix.
weight_0 = next(net[0].parameters())
print(weight_0.data)
print(weight_0.grad)  # no gradient yet: stays None until backward() has run
Y.backward()
print(weight_0.grad)


tensor([[-0.3822,  0.0535, -0.1759, -0.1902],
        [ 0.4303, -0.4624,  0.4275, -0.0398],
        [ 0.2808, -0.0905, -0.4938,  0.4435]])
None
tensor([[0.0000, 0.0000, 0.0000, 0.0000],
        [0.4039, 0.0273, 0.4751, 0.7084],
        [0.0000, 0.0000, 0.0000, 0.0000]])


# 初始化模型参数

In [9]:
# Re-draw every weight from a normal distribution (mean 0, std 0.01);
# parameters whose name does not contain 'weight' are skipped.
for name, param in net.named_parameters():
    if 'weight' not in name:
        continue
    init.normal_(param, mean=0, std=0.01)
    print(name, param.data)

0.weight tensor([[-0.0112, -0.0112,  0.0006, -0.0027],
        [ 0.0046,  0.0021,  0.0078, -0.0041],
        [ 0.0090, -0.0050, -0.0118, -0.0180]])
2.weight tensor([[ 0.0060, -0.0112,  0.0150]])


In [11]:
# Set every bias to the constant 0; parameters whose name does not contain
# 'bias' are skipped.
for name, param in net.named_parameters():
    if 'bias' not in name:
        continue
    init.constant_(param, val=0)
    print(name, param.data)

0.bias tensor([0., 0., 0.])
2.bias tensor([0.])


# 自定义初始化方法

查看`torch.nn.init.normal_`等初始化函数的实现可以看到，它们就是inplace改变Tensor值的函数，而且这个过程是不记录梯度的。类似地，我们来实现一个自定义的初始化方法。在下面的例子里，我们令权重有一半概率初始化为0，有另一半概率初始化为$[-10,-5]$和$[5,10]$两个区间里均匀分布的随机数。

In [12]:
def init_weight_(tensor):
    """Initialize ``tensor`` in place with a custom sparse-uniform scheme.

    Every entry is first drawn uniformly from [-10, 10); entries whose
    absolute value is below 5 are then multiplied by zero. Each entry
    therefore ends up zero with probability one half, and otherwise lies
    in [-10, -5] or [5, 10].

    The work is done under ``torch.no_grad()`` so that it is not recorded
    by autograd. Nothing is returned; ``tensor`` is modified in place.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where the sampled magnitude is at least 5, 0.0 elsewhere.
        keep_mask = tensor.abs() >= 5
        tensor.mul_(keep_mask.float())

# Apply the custom initializer to every weight; parameters whose name does
# not contain 'weight' are skipped.
for name, param in net.named_parameters():
    if 'weight' not in name:
        continue
    init_weight_(param)
    print(name, param.data)

0.weight tensor([[ 6.6246, -9.0926, -0.0000, -0.0000],
        [ 0.0000, -8.4166, -0.0000,  0.0000],
        [ 9.1037,  9.7623, -7.7955, -0.0000]])
2.weight tensor([[-0.0000, -0.0000, -5.3231]])


此外，我们还可以通过改变这些参数的`data`来改写模型参数值，同时不会影响梯度，例如对偏差参数执行`param.data += 1`。

# 共享模型参数

在有些情况下，我们希望在多个层之间共享模型参数。4.1.3节提到了如何共享模型参数: Module类的forward函数里多次调用同一个层。此外，如果我们传入Sequential的模块是同一个Module实例的话参数也是共享的，下面来看一个例子:

In [15]:
# The same Linear instance fills both slots, so both slots share one weight.
linear = nn.Linear(in_features=1, out_features=1, bias=False)
shared_layers = [linear, linear]
net = nn.Sequential(*shared_layers)
print(net)
# Set each listed parameter to the constant 3 and show its name and value.
for name, param in net.named_parameters():
    init.constant_(param, 3)
    print(name, param.data)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight tensor([[3.]])


In [16]:
# Both positions of the Sequential refer to the very same module object.
print(net[0] is net[1])

True


In [17]:
# ...and therefore their weight is one and the same Parameter object.
print(net[0].weight is net[1].weight)

True


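因为模型参数里包含了梯度，所以在反向传播计算时，这些共享的参数的梯度是累加的。下面的例子中共享权重为 $w=3$，输入为 $x=1$，网络输出 $y = w \cdot w \cdot x = 9$，因此 $\partial y / \partial w = 2wx = 6$，即单次梯度3累加了两次: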

In [18]:
# Forward pass through the two weight-sharing layers, reduced to a scalar.
x = torch.ones((1, 1))
y = torch.sum(net(x))
print(y)
# Back-propagate, then inspect the gradient accumulated on the shared weight.
y.backward()
print(net[0].weight.grad)

tensor(9., grad_fn=<SumBackward0>)
tensor([[6.]])
