In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# 层与块

In [2]:
net = nn.Sequential(nn.Linear(20, 256),
                   nn.ReLU(),
                   nn.Linear(256, 10))
X = torch.rand(2, 20)
net(X)

tensor([[-0.2137, -0.1191,  0.0205, -0.0088,  0.0997, -0.0664,  0.0466,  0.0114,
         -0.0651, -0.0118],
        [-0.2836, -0.1509,  0.0952,  0.0233,  0.0470, -0.2109,  0.1672,  0.0276,
         -0.0639, -0.0386]], grad_fn=<AddmmBackward>)

In [5]:
# 自定义块
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
    
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))

In [6]:
net = MLP()
net(X)

tensor([[-0.0348,  0.1272, -0.0426, -0.0770, -0.1656,  0.1914,  0.0140, -0.2210,
          0.0949,  0.0801],
        [-0.1573,  0.0745, -0.0438, -0.0685, -0.0583,  0.0902,  0.2386, -0.0210,
          0.1383,  0.1069]], grad_fn=<AddmmBackward>)

In [12]:
# sequential自己实现
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for block in args:
            self._modules[block] = block # _module为有序字典
    
    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        
        return X

In [13]:
net = MySequential(nn.Linear(20, 256), 
                  nn.ReLU(),
                  nn.Linear(256, 10))
net(X)

tensor([[-0.1042,  0.0628,  0.0286,  0.1234,  0.0082, -0.0929,  0.0604,  0.3298,
          0.1819, -0.1490],
        [-0.0968,  0.0644, -0.0812,  0.0327, -0.0308,  0.0555,  0.0749,  0.2287,
          0.2732, -0.1684]], grad_fn=<AddmmBackward>)

In [14]:
# 带python控制流的模块
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
    
    def forward(self, X):
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        X = self.linear(X)
        
        # 控制流
        while X.abs().sum() > 1:  # L1范数大于1则除以2
            X /= 2
        
        return X.sum()

In [15]:
net = FixedHiddenMLP()
net(X)

tensor(0.1719, grad_fn=<SumBackward0>)

In [16]:
# 各种方法嵌套
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64),
                                nn.ReLU(),
                                nn.Linear(64, 32),
                                nn.ReLU())
        self.linear = nn.Linear(32, 16)
    
    def forward(self, X):
        return self.linear(self.net(X))

In [17]:
net = nn.Sequential(NestMLP(), 
                   nn.Linear(16, 20),
                   FixedHiddenMLP())
net(X)

tensor(0.0491, grad_fn=<SumBackward0>)

# 参数管理

In [18]:
net = nn.Sequential(nn.Linear(4, 8),
                   nn.ReLU(),
                   nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

tensor([[0.1224],
        [0.1116]], grad_fn=<AddmmBackward>)

In [21]:
net[0].state_dict()

OrderedDict([('weight',
              tensor([[-0.2749, -0.1657,  0.1733,  0.3845],
                      [ 0.2843, -0.2361, -0.1812,  0.4631],
                      [-0.0043,  0.3980,  0.2818, -0.0649],
                      [ 0.2311,  0.0828,  0.0759, -0.4160],
                      [-0.4601,  0.1248,  0.4377,  0.0459],
                      [ 0.0071,  0.4649,  0.0105,  0.2418],
                      [ 0.0430,  0.4326,  0.3677, -0.0288],
                      [ 0.2047, -0.1832,  0.1772,  0.3060]])),
             ('bias',
              tensor([ 0.0551,  0.2713,  0.1551,  0.1279,  0.0716, -0.3910, -0.2216,  0.1064]))])

In [22]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.0969], requires_grad=True)
tensor([0.0969])


In [24]:
# 查看梯度，还未反向传播，没有梯度
net[2].weight.grad == None  

True

In [29]:
# 嵌套网络
def block1():
    net = nn.Sequential(nn.Linear(4, 8),
                       nn.ReLU(),
                       nn.Linear(8, 4),
                       nn.ReLU())
    return net

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module("block {}".format(i), block1())
    
    return net

In [31]:
net = nn.Sequential(block2(), nn.Linear(4, 1))
net(X)

tensor([[0.3333],
        [0.3333]], grad_fn=<AddmmBackward>)

In [32]:
net

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [35]:
net[0][1][0].bias

Parameter containing:
tensor([ 0.4387, -0.2747, -0.1397,  0.3192, -0.1078,  0.4106,  0.1199,  0.2653],
       requires_grad=True)

In [37]:
# 初始化
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # nn.init.constant_(m.weight, 1) 常数初始化
        # nn.init.xavier_uniform_(m.weight) xavier初始化
        nn.init.zeros_(m.bias)
net.apply(init_normal)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [40]:
net[0][0][0].weight.data[0]

tensor([ 0.0081,  0.0035,  0.0103, -0.0052])

In [41]:
# 共享参数
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8),
                   nn.ReLU(),
                   shared, 
                   nn.ReLU(),
                   shared,
                   nn.ReLU(),
                   nn.Linear(8, 1))
net(X)

tensor([[-0.3573],
        [-0.3609]], grad_fn=<AddmmBackward>)

In [42]:
net[2].weight.data == net[4].weight.data

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])

# 自定义层

In [43]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, X):
        return X - X.mean()

In [44]:
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [46]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
y = net(torch.rand(4, 8))
y.mean()

tensor(1.1176e-08, grad_fn=<MeanBackward0>)

In [47]:
# 带参数的层
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
        
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

In [48]:
dense = MyLinear(5, 3)
dense.weight

Parameter containing:
tensor([[-0.4174,  1.5641, -0.3035],
        [-0.1439, -0.3188,  1.6736],
        [ 0.5005, -1.2170, -0.2140],
        [-0.9203, -0.3974,  0.9679],
        [ 1.2702, -0.3673, -0.7432]], requires_grad=True)

In [49]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[0.],
        [0.]])

# 读写文件

In [51]:
x = torch.arange(4)
torch.save(x, 'x_file')

In [52]:
x2 = torch.load('x_file')
x2

tensor([0, 1, 2, 3])

In [53]:
# 模型参数
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)
        
    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))

In [56]:
net = MLP()
X = torch.randn((2, 20))
Y = net(X)

In [57]:
# 存储模型
torch.save(net.state_dict(), 'mlp.params')

In [58]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

# GPU

In [60]:
torch.cuda.device('cuda'), torch.device('cpu')

(<torch.cuda.device at 0x7fcf232ca1f0>, device(type='cpu'))

In [61]:
torch.cuda.device_count()

0

In [64]:
def try_all_gpu():
    devices = [
        torch.device('cuda:{}'.format(i)) for i in range(torch.cuda.device_count())
    ]
    
    return devices if devices else [torch.device('cpu')]

In [65]:
try_all_gpu()

[device(type='cpu')]

In [67]:
X = torch.ones(2, 3, device='cpu')
Y = torch.ones(2, 3, device='cuda:0')
Z = X.cuda(0)

X + Y # 一个设备上才能运算  

In [None]:
# 神经网络
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device='cuda:0')