# 层和块

In [1]:
import torch
from torch import nn
import d2l.torch as d2l
from torch.nn import functional as F

自己定义一个二层 MLP 模型

In [2]:
class MLP(nn.Module):
    def __init__(self, num_input, num_hidden, num_output):
        super().__init__()
        self.hidden_layer = nn.Linear(num_input, num_hidden)
        self.ReLu_layer = nn.ReLU()
        self.output_layer = nn.Linear(num_hidden, num_output)
    
    def forward(self, X):
        return self.output_layer(self.ReLu_layer(self.hidden_layer(X)))

In [3]:
X = torch.randn(1,30)
net = MLP(30,40,10)
print(net(X))

tensor([[-0.2245, -0.0217,  0.2890, -0.2601, -0.2096,  0.1335, -0.0225, -0.0997,
         -0.0738, -0.0008]], grad_fn=<AddmmBackward0>)


这个 MLP 模型可以在 Sequential 里面混用

In [4]:
net = nn.Sequential(MLP(30, 40, 10), nn.Linear(10, 1))
print(net(X))

tensor([[0.0767]], grad_fn=<AddmmBackward0>)


In [5]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self._modules[str(idx)] = module
    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X

In [6]:
net = MySequential(MLP(30, 40, 10), nn.Linear(10, 1))
print(net(X))

tensor([[-0.1957]], grad_fn=<AddmmBackward0>)


# 参数管理

先随便定义一个 $net$

In [7]:
net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),nn.Linear(8,1))

有两个东西, 第一个就是 $weight$ , 第二个是 $bias$ , 然后 `net[2]` 就是 $net$ 的第二个 $Linear$ 层 

In [8]:
print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.1178, -0.3076, -0.1768, -0.2541, -0.2558, -0.2989,  0.1587,  0.0875]])), ('bias', tensor([0.3281]))])


$Parameter$ 是 $torch$ 里面的一个可以优化的参数

直接打印就会有数据和其他的值, 因为有梯度和其他啥的, 所以要通过 $data$ 来进行访问元素

因为我们还没有计算, 所以这个地方没有梯度

In [9]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)
print(net[2].bias.grad == None)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.3281], requires_grad=True)
tensor([0.3281])
True


通过 $named\_parameters()$ , 和 $state\_dict()$ 函数来看参数, 注意这里没有 $ReLU$ 的参数, 因为这种层没有参数

In [10]:
print(*((name, param.shape) for name, param in net.named_parameters()))
print(net.state_dict()['0.weight'])
print(net.state_dict()['0.bias'])

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([[ 0.3162, -0.0132, -0.4074, -0.3726],
        [ 0.2250,  0.0811,  0.3649, -0.0370],
        [ 0.0021, -0.1292,  0.1386, -0.2706],
        [ 0.2814,  0.4665, -0.3999, -0.3446],
        [ 0.4421, -0.3497,  0.2114, -0.4226],
        [-0.2520,  0.2967, -0.3089, -0.2632],
        [-0.3439, -0.4949,  0.1946, -0.1019],
        [-0.3394,  0.1886, -0.4394,  0.4570]])
tensor([-0.1434,  0.3631,  0.2792, -0.1560, -0.1792, -0.0491, -0.4767,  0.0508])


先随便定义一些网络层, 然后来看一下他们长什么样子

In [11]:
small_net1 = nn.Sequential(nn.Linear(10, 8), nn.ReLU(), nn.Linear(8, 6))
small_net2 = nn.Sequential(nn.Linear(8, 6), nn.ReLU(), nn.Linear(6, 4))
small_net3 = nn.Sequential(nn.Linear(4, 2), nn.ReLU(), nn.Linear(2, 1))
all_net = nn.Sequential()

挨个输出看看

In [12]:
print(small_net1)
print(small_net2)
print(small_net3)

Sequential(
  (0): Linear(in_features=10, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=6, bias=True)
)
Sequential(
  (0): Linear(in_features=8, out_features=6, bias=True)
  (1): ReLU()
  (2): Linear(in_features=6, out_features=4, bias=True)
)
Sequential(
  (0): Linear(in_features=4, out_features=2, bias=True)
  (1): ReLU()
  (2): Linear(in_features=2, out_features=1, bias=True)
)


还可以给这些 $module$ 取名, 默认是从 $0...n$

In [13]:
all_net.add_module('block_1', small_net1)
all_net.add_module('block_2', small_net2)
all_net.add_module('block_3', small_net3)
print(all_net)

Sequential(
  (block_1): Sequential(
    (0): Linear(in_features=10, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=6, bias=True)
  )
  (block_2): Sequential(
    (0): Linear(in_features=8, out_features=6, bias=True)
    (1): ReLU()
    (2): Linear(in_features=6, out_features=4, bias=True)
  )
  (block_3): Sequential(
    (0): Linear(in_features=4, out_features=2, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2, out_features=1, bias=True)
  )
)


如何来初始化默认参数

In [14]:
def init_normal(m):
    if type == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
all_net.apply(init_normal)
all_net[0][0].weight.data, all_net[0][0].bias.data

(tensor([[-0.1112, -0.2360,  0.2692,  0.1939,  0.2420,  0.2507, -0.0827,  0.1366,
           0.2814, -0.3108],
         [-0.2188,  0.0254,  0.2787,  0.2183,  0.1600,  0.1530, -0.3022,  0.2801,
          -0.1571,  0.2410],
         [ 0.2288, -0.0040,  0.0439, -0.2514, -0.0246,  0.1456,  0.1470,  0.0927,
          -0.0633,  0.2097],
         [ 0.0089, -0.0130,  0.0644,  0.2716,  0.0814,  0.2445, -0.0807, -0.3156,
          -0.0203, -0.1829],
         [ 0.1457, -0.1785, -0.3043, -0.1521, -0.0594, -0.2300, -0.1402, -0.0358,
          -0.1389, -0.2248],
         [ 0.2265,  0.2877,  0.0612, -0.0908,  0.1627, -0.1247, -0.0958,  0.2580,
          -0.3087,  0.1263],
         [ 0.1166,  0.1267, -0.2627, -0.1237, -0.0869,  0.2036,  0.0854,  0.2087,
           0.2176,  0.2398],
         [ 0.1284, -0.1467,  0.2669,  0.2055,  0.1070,  0.1434, -0.2259, -0.2988,
          -0.1324,  0.1335]]),
 tensor([ 0.2783, -0.1530,  0.2433,  0.0174,  0.2770,  0.2692,  0.1059,  0.2805]))

对不同的层运用不同的初始化方法

In [15]:
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def const_init(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)
all_net[0].apply(xavier)
all_net[1].apply(const_init)
print(all_net[0][0].weight.data[0])
print(all_net[1][0].weight.data[0])

tensor([ 0.0187,  0.3002, -0.0713, -0.5118, -0.4220,  0.0839, -0.3169,  0.0991,
        -0.2650,  0.1380])
tensor([42., 42., 42., 42., 42., 42., 42., 42.])


参数绑定, 共同使用一个层

In [16]:
shared = nn.Linear(8,8)
net = nn.Sequential(
    nn.Linear(10, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8,1)
)

# 自定义层

In [17]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, X):
        return X - X.mean()

layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [18]:
class MyLinear(nn.Module):
    def __init__(self, in_units, out_units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, out_units))
        self.bias = nn.Parameter(torch.randn(out_units))
    def forward(self, X):
        return torch.matmul(X, self.weight.data) + self.bias.data

dense = MyLinear(5, 3)
dense(torch.rand(2, 5))

tensor([[-1.6967,  1.5170, -1.5244],
        [-0.0928,  0.4322, -1.1658]])

# 读取文件

保存一个 $list$ 或者 $dict$ 也可以

In [19]:
x = torch.arange(4)
print(x)
torch.save(x, 'x-file')
x = torch.load('x-file')
print(x)

tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3])


对于一个网络来说, $torch$ 只能储存网络中的数据, 网络的结构需要自己存下来

$state\_dict$ 是一个 $OrderedDict$

In [20]:
class MyLinear(nn.Module):
    def __init__(self, input_units, output_units):
        super().__init__()
        self.params = nn.Parameter(torch.randn(input_units, output_units))
        self.bias = nn.Parameter(torch.randn(output_units))
    def forward(self, X):
        return torch.matmul(X, self.params) + self.bias.data

def get_net():
    return nn.Sequential(MyLinear(10,5), nn.ReLU(), MyLinear(5,1))
net = get_net()
torch.save(net.state_dict(), 'MySequential.params')

In [21]:
net2 = get_net()
net2.load_state_dict(torch.load('MySequential.params'))
X = torch.rand(10)
print(net2(X) == net(X))

tensor([True])
