5.1 层和块

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
# Two-layer perceptron assembled with nn.Sequential: 20 -> 256 (ReLU) -> 10.
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
# Random input batch: 2 samples with 20 features each.
X = torch.rand(2, 20)
# Forward pass; being the cell's last expression, its result is displayed.
net(X)

tensor([[-0.1253,  0.0032, -0.2879,  0.0572, -0.0578, -0.0720, -0.1945,  0.0156,
          0.0490,  0.0347],
        [-0.0368,  0.0838, -0.1363, -0.0526, -0.0099, -0.1662, -0.0993,  0.0712,
          0.1381,  0.0475]], grad_fn=<AddmmBackward>)

In [None]:
#5.1.1 Custom block
class MLP(nn.Module):
  """Multilayer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""
  def __init__(self):
    # Let nn.Module set up its internal bookkeeping before layers are attached.
    super().__init__()
    self.hidden = nn.Linear(20, 256)  # hidden layer
    self.out = nn.Linear(256, 10)  # output layer
  def forward(self, X):
    """Return the output-layer result for input batch X."""
    hidden_activation = F.relu(self.hidden(X))
    return self.out(hidden_activation)
# Instantiate the custom block and run the same input batch X through it.
net = MLP()
net(X)

tensor([[-0.2298, -0.0514, -0.2854,  0.1150, -0.0873,  0.0156,  0.5606, -0.5067,
          0.2351, -0.4470],
        [-0.1189, -0.0888, -0.1041, -0.0574,  0.1027, -0.0381,  0.4731, -0.4992,
          0.1790, -0.1862]], grad_fn=<AddmmBackward>)

In [None]:
#5.1.2 Sequential block
class MySequential(nn.Module):
  """Minimal sequential container: applies its child modules in the order given.

  Args:
    *args: nn.Module instances to chain; the output of each one is fed to the next.
  """
  def __init__(self, *args):
    super().__init__()
    for idx, block in enumerate(args):
      # Register each child under a string index. nn.Module builds parameter
      # and state_dict names by concatenating these keys, so they must be
      # strings; keying by position also keeps a module that is passed more
      # than once as separate entries.
      self.add_module(str(idx), block)
  def forward(self, X):
    """Feed X through every registered child module in registration order."""
    for block in self._modules.values():
      X = block(X)
    return X
# Same architecture as before, now assembled with the hand-written container.
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[ 0.0876,  0.0419,  0.0030, -0.3133,  0.1389, -0.2305, -0.3783, -0.2386,
          0.3681, -0.2610],
        [ 0.0098, -0.0213, -0.0187, -0.1770,  0.1363, -0.1965, -0.1909, -0.2854,
          0.2788, -0.1528]], grad_fn=<AddmmBackward>)

In [None]:
#Combining blocks
class FixedHiddenMLP(nn.Module):
  """Block that mixes a trainable layer, a constant random weight and Python control flow.

  The same nn.Linear(20, 20) layer is applied twice, with a fixed (never
  trained) random 20x20 matrix in between. forward returns a scalar tensor.
  """
  def __init__(self):
    super().__init__()
    # Constant weights that take no part in training. Registered as a
    # non-persistent buffer so that .to()/.cuda()/.double() move it together
    # with the module, while state_dict() keeps exactly the same keys as when
    # it was a plain attribute.
    self.register_buffer('rand_weight', torch.rand((20, 20)), persistent=False)
    self.linear = nn.Linear(20, 20)
  def forward(self, X):
    """Return the sum of the rescaled activations as a 0-dim tensor."""
    X = self.linear(X)
    # Use the constant weights together with relu and matrix multiplication.
    X = F.relu(torch.mm(X, self.rand_weight) + 1)
    # Reuse the same linear layer, i.e. both applications share parameters.
    X = self.linear(X)
    # Plain Python control flow: halve until the L1 norm is at most 1.
    while X.abs().sum() > 1:
      X /= 2
    return X.sum()
class NestMLP(nn.Module):
  """Block that nests an nn.Sequential (20 -> 64 -> 32) followed by a 32 -> 16 linear layer."""
  def __init__(self):
    super().__init__()
    # Build the inner stack first so layers are created in the same order.
    inner_layers = [nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 32), nn.ReLU()]
    self.net = nn.Sequential(*inner_layers)
    self.linear = nn.Linear(32, 16)
  def forward(self, X):
    """Run X through the nested stack, then through the final linear layer."""
    features = self.net(X)
    return self.linear(features)
# Chain the custom blocks: NestMLP (20 -> 16), a linear layer (16 -> 20), then FixedHiddenMLP.
chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(-0.0016, grad_fn=<SumBackward0>)

5.2 参数管理

In [None]:
import torch
from torch import nn
# Small MLP with one hidden layer (4 -> 8 -> 1), used throughout this section.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
# Random input batch: 2 samples with 4 features each.
X = torch.rand(size = (2, 4))
net(X)

tensor([[-0.2427],
        [-0.3475]], grad_fn=<AddmmBackward>)

In [None]:
#5.2.1 Parameter access
# Parameters of the last layer (index 2 of the Sequential), as an ordered dict.
print(net[2].state_dict())
# The bias is an nn.Parameter object ...
print(type(net[2].bias))
print(net[2].bias)
# ... whose underlying tensor value is reached through .data.
print(net[2].bias.data)
# Names and shapes of every parameter in the network.
print(*[(name, param.shape) for name, param in net.named_parameters()])

OrderedDict([('weight', tensor([[-0.2544, -0.0636,  0.2469,  0.1397, -0.0201, -0.0568, -0.1567, -0.1187]])), ('bias', tensor([-0.2337]))])
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2337], requires_grad=True)
tensor([-0.2337])
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [None]:
#Collecting parameters from nested blocks
def block1():
  """Return a fresh 4 -> 8 -> 4 stack with a ReLU after each linear layer."""
  layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
  return nn.Sequential(*layers)
def block2():
  """Return a Sequential holding four block1() instances named block0..block3."""
  stacked = nn.Sequential()
  for child_name in (f'block{i}' for i in range(4)):
    # Each call to block1() builds a new, independently initialized sub-block.
    stacked.add_module(child_name, block1())
  return stacked
# Nested network: the stacked blocks followed by a final 4 -> 1 linear layer.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
print(rgnet)
# Nested indexing: first child -> second sub-block -> its first layer -> bias values.
rgnet[0][1][0].bias.data 

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


tensor([0.0725, 0.0507, 0.1058, 0.0442, 0.0062, 0.0250, 0.4445, 0.0601])

In [None]:
#5.2.2 Parameter initialization
def init_normal(m):
  """Initialize an nn.Linear with N(0, 0.01^2) weights and zero bias; ignore other modules."""
  if type(m) != nn.Linear:
    return
  nn.init.normal_(m.weight, mean = 0, std = 0.01)
  nn.init.zeros_(m.bias)
# apply() calls init_normal on every submodule of net (and on net itself).
net.apply(init_normal)
# Inspect the first weight row and first bias entry of the first layer.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0073, -0.0054,  0.0009,  0.0096]), tensor(0.))

In [None]:
def init_constant(m):
  """Set every weight of an nn.Linear to 1 and its bias to 0; ignore other modules."""
  if type(m) != nn.Linear:
    return
  nn.init.constant_(m.weight, 1)
  nn.init.zeros_(m.bias)
# Overwrite all linear layers of net with the constant initialization.
net.apply(init_constant)
# Inspect the first weight row and first bias entry of the first layer.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [None]:
def xavier(m):
  """Apply Xavier-uniform initialization to the weight of an nn.Linear; bias is left as is."""
  if type(m) != nn.Linear:
    return
  nn.init.xavier_uniform_(m.weight)
def init_42(m):
  """Fill the weight of an nn.Linear with the constant 42; bias is left as is."""
  if type(m) != nn.Linear:
    return
  nn.init.constant_(m.weight, 42)
# Different initializers for different layers: Xavier for the first, constant 42 for the last.
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.5481, -0.0438,  0.2296, -0.1228])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [None]:
#Custom initialization
def my_init(m):
  """Custom initializer for nn.Linear; other module types are ignored.

  Draws weights uniformly from [-10, 10], then zeroes every entry whose
  absolute value is below 5. Prints the name and shape of the layer's
  first parameter as a progress message.
  """
  if type(m) != nn.Linear:
    return
  first_name, first_param = list(m.named_parameters())[0]
  print("Init", first_name, first_param.shape)
  nn.init.uniform_(m.weight, -10, 10)
  # Boolean mask: True where the magnitude is large enough to keep.
  keep_mask = m.weight.data.abs() >= 5
  m.weight.data *= keep_mask
# Run the custom initializer over every submodule of net.
net.apply(my_init)
# Bare expression that is not the cell's last line, so nothing is displayed for it.
net[0].weight[:2]
# Parameters can also be edited directly through .data:
# add 1 to every weight of the first layer, then overwrite a single entry.
net[0].weight.data[:] +=1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([42.0000, -8.1395, 10.3308,  1.0000])

In [None]:
#5.2.3 Parameter tying
# The same nn.Linear instance is placed at positions 2 and 4, so both
# positions refer to one and the same set of parameters.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8,1))
net(X)
# Check that the parameters are identical.
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Still equal after modifying one of them: they are the same object, not copies.
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [None]:
#5.4 自定义层
import torch
from torch import nn
import torch.nn.functional as F
#Layers without parameters
class CenteredLayer(nn.Module):
  """Parameter-free layer that subtracts the mean of its input from the input."""
  def __init__(self):
    super().__init__()
  def forward(self, X):
    """Return X shifted by the mean taken over all of its elements."""
    mean_value = X.mean()
    return X - mean_value
# Sanity check: centering 1..5 (mean 3).
layer = CenteredLayer()
layer(torch.FloatTensor([1,2,3,4,5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [None]:
# The custom layer can be used as a component of a larger model.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
# Mean of the centered output (zero up to floating-point error).
Y.mean()

tensor(1.8626e-09, grad_fn=<MeanBackward0>)

In [None]:
#Layers with parameters
class MyLinear(nn.Module):
  """Fully connected layer with a built-in ReLU activation.

  Args:
    in_units: number of input features.
    units: number of output features.
  """
  def __init__(self, in_units, units):
    super().__init__()
    # Wrapping in nn.Parameter registers the tensors with the module so they
    # show up in parameters() and state_dict().
    self.weight = nn.Parameter(torch.randn(in_units, units))
    self.bias = nn.Parameter(torch.randn(units,))
  def forward(self, X):
    """Return relu(X @ weight + bias)."""
    # Use the parameters themselves rather than their .data: .data is
    # detached from autograd, so no gradient would ever reach weight or bias.
    linear = torch.matmul(X, self.weight) + self.bias
    return F.relu(linear)
# Custom layers compose like built-in ones: 64 -> 8 -> 1.
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[3.7042],
        [3.4447]])

In [1]:
import torch
from torch import nn

# Build device handles for the CPU and for GPUs.
# NOTE(review): torch.cuda.device(...) creates a device context-manager object
# (see the reprs displayed for this cell), not a torch.device descriptor;
# torch.device('cuda') / torch.device('cuda:1') is probably what was intended - confirm.
torch.device('cpu'), torch.cuda.device('cuda'), torch.cuda.device('cuda:1')

(device(type='cpu'),
 <torch.cuda.device at 0x7f68958b79d0>,
 <torch.cuda.device at 0x7f68958cc950>)

In [2]:
# Number of CUDA devices PyTorch reports on this machine.
torch.cuda.device_count()

1

In [3]:
def try_gpu(i=0): 
    """Return the device for GPU `i` if it exists, otherwise the CPU device."""
    gpu_exists = torch.cuda.device_count() >= i + 1
    return torch.device(f'cuda:{i}') if gpu_exists else torch.device('cpu')

def try_all_gpus():
    """Return a list of all available GPU devices, or [cpu] if there is no GPU."""
    gpu_count = torch.cuda.device_count()
    gpus = [torch.device(f'cuda:{idx}') for idx in range(gpu_count)]
    # An empty list is falsy, so fall back to a single-element CPU list.
    return gpus or [torch.device('cpu')]

# Default GPU, a GPU index that is not present here (falls back to CPU), and all GPUs.
try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [4]:
# Create a 2x3 tensor of ones directly on the device chosen by try_gpu().
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [5]:
# Build a one-layer model and move its parameters to the device chosen by try_gpu().
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())

In [6]:
# Forward pass; X and net were both placed via try_gpu().
net(X)

tensor([[-0.8302],
        [-0.8302]], device='cuda:0', grad_fn=<AddmmBackward>)

In [7]:
# Confirm which device the model's weight tensor is stored on.
net[0].weight.data.device

device(type='cuda', index=0)