# 模型参数、优化器和学习率

定义一个简单的模型

In [13]:
import torch
import torch.optim as optim


class Model(torch.nn.Module):
    """Toy network: embedding lookup -> two linear layers -> sigmoid.

    Sub-modules are registered in the order ``emb``, ``linear``, ``linear2``,
    which is also the order in which ``parameters()`` and
    ``named_parameters()`` yield their weights and biases.
    """

    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(num_embeddings=5, embedding_dim=5)
        self.linear = torch.nn.Linear(in_features=5, out_features=3)
        self.linear2 = torch.nn.Linear(in_features=3, out_features=1)
        self.activate = torch.nn.Sigmoid()

    def forward(self, x):
        """Embed the integer indices ``x`` and return the sigmoid of the final linear layer."""
        embedded = self.emb(x)
        hidden = self.linear(embedded)
        logits = self.linear2(hidden)
        return self.activate(logits)

    
# Build the model; the bare expression on the last line makes the notebook
# display the module's repr (the sub-module tree shown below).
model = Model()
model

Model(
  (emb): Embedding(5, 5)
  (linear): Linear(in_features=5, out_features=3, bias=True)
  (linear2): Linear(in_features=3, out_features=1, bias=True)
  (activate): Sigmoid()
)

**模型的参数，对应Parameter类，它有两个属性：data（一个tensor，其requires_grad为False）和requires_grad（一般为True）**

如果你直接打印一个parameter，它的可读性很差

In [18]:
# Show the Parameter itself, a separator rule, then the plain tensor stored in
# its ``.data`` attribute, one item per line.
print(model.emb.weight, '=' * 100, model.emb.weight.data, sep='\n')

Parameter containing:
tensor([[-0.8740, -0.3672, -0.4662, -0.8098,  0.8192],
        [ 0.4842,  0.1593,  1.7196, -0.8103, -0.5121],
        [-0.4298,  0.4086, -0.9216,  0.0504, -0.4167],
        [-0.7561,  0.3412, -0.6166,  0.0970, -1.5422],
        [ 0.6535, -1.4815, -0.0965,  1.2927, -2.1822]], requires_grad=True)
tensor([[-0.8740, -0.3672, -0.4662, -0.8098,  0.8192],
        [ 0.4842,  0.1593,  1.7196, -0.8103, -0.5121],
        [-0.4298,  0.4086, -0.9216,  0.0504, -0.4167],
        [-0.7561,  0.3412, -0.6166,  0.0970, -1.5422],
        [ 0.6535, -1.4815, -0.0965,  1.2927, -2.1822]])


In [19]:
# Every item yielded by parameters() is a Parameter; print its type and the
# underlying tensor on separate lines.
for param in model.parameters():
    print(type(param), param.data, sep='\n')

<class 'torch.nn.parameter.Parameter'>
tensor([[-0.8740, -0.3672, -0.4662, -0.8098,  0.8192],
        [ 0.4842,  0.1593,  1.7196, -0.8103, -0.5121],
        [-0.4298,  0.4086, -0.9216,  0.0504, -0.4167],
        [-0.7561,  0.3412, -0.6166,  0.0970, -1.5422],
        [ 0.6535, -1.4815, -0.0965,  1.2927, -2.1822]])
<class 'torch.nn.parameter.Parameter'>
tensor([[-0.3699,  0.0258, -0.1693,  0.3095,  0.1445],
        [ 0.3925,  0.2926,  0.2735,  0.0304, -0.3556],
        [-0.3941, -0.3619,  0.0924, -0.1159, -0.3334]])
<class 'torch.nn.parameter.Parameter'>
tensor([-0.0530,  0.1841,  0.3760])
<class 'torch.nn.parameter.Parameter'>
tensor([[-0.2970, -0.4838,  0.1538]])
<class 'torch.nn.parameter.Parameter'>
tensor([-0.3113])


In [20]:
# named_parameters() yields (name, Parameter) pairs; list each name with its shape.
for name, param in model.named_parameters():
    print(name, param.shape)

emb.weight torch.Size([5, 5])
linear.weight torch.Size([3, 5])
linear.bias torch.Size([3])
linear2.weight torch.Size([1, 3])
linear2.bias torch.Size([1])


optimizer把参数分成若干个param_group，然后分别对它们进行更新。一般情况下，如果模型所有参数的学习率相同，那么一个param_group就够了。

在构建优化器optimizer时，需要传递一个可迭代对象（例如列表）过去，其中的元素要么全部是parameter，要么全部是param_group；param_group就是一个字典：

In [28]:
# One learning rate for all parameters, so Adam ends up with a single
# param_group; the trailing expression displays it.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
optimizer.param_groups

[{'params': [Parameter containing:
   tensor([[-0.8740, -0.3672, -0.4662, -0.8098,  0.8192],
           [ 0.4842,  0.1593,  1.7196, -0.8103, -0.5121],
           [-0.4298,  0.4086, -0.9216,  0.0504, -0.4167],
           [-0.7561,  0.3412, -0.6166,  0.0970, -1.5422],
           [ 0.6535, -1.4815, -0.0965,  1.2927, -2.1822]], requires_grad=True),
   Parameter containing:
   tensor([[-0.3699,  0.0258, -0.1693,  0.3095,  0.1445],
           [ 0.3925,  0.2926,  0.2735,  0.0304, -0.3556],
           [-0.3941, -0.3619,  0.0924, -0.1159, -0.3334]], requires_grad=True),
   Parameter containing:
   tensor([-0.0530,  0.1841,  0.3760], requires_grad=True),
   Parameter containing:
   tensor([[-0.2970, -0.4838,  0.1538]], requires_grad=True),
   Parameter containing:
   tensor([-0.3113], requires_grad=True)],
  'lr': 0.001,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'weight_decay': 0,
  'amsgrad': False}]

可以看出，param_group的键值对有：
- params(list): 当前param_group中所有parameter
- lr(float)：当前param_group的学习率
- betas, eps, weight_decay, amsgrad都是Adam的超参数

控制学习率：根据参数的名字，对参数进行分组，缺失lr的param_group会直接使用全局学习率，也就是optimizer构造方法的第二个位置参数；

然后在不同的epoch改变学习率的值

In [25]:
# Split the parameters by name: everything under ``linear2`` goes into the
# first group (lr=0.1), all remaining parameters into the second (lr=0.01).
named_params = list(model.named_parameters())
custom_param_groups = [
    {
        "params": [p for n, p in named_params if n.startswith('linear2')],
        "lr": 0.1,
    },
    {
        "params": [p for n, p in named_params if not n.startswith('linear2')],
        "lr": 0.01,
    },
]
# Each group carries its own lr, so no global lr is passed to the constructor.
optimizer2 = optim.Adam(custom_param_groups)
optimizer2.param_groups

[{'params': [Parameter containing:
   tensor([[-0.2970, -0.4838,  0.1538]], requires_grad=True),
   Parameter containing:
   tensor([-0.3113], requires_grad=True)],
  'lr': 0.1,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'weight_decay': 0,
  'amsgrad': False},
 {'params': [Parameter containing:
   tensor([[-0.8740, -0.3672, -0.4662, -0.8098,  0.8192],
           [ 0.4842,  0.1593,  1.7196, -0.8103, -0.5121],
           [-0.4298,  0.4086, -0.9216,  0.0504, -0.4167],
           [-0.7561,  0.3412, -0.6166,  0.0970, -1.5422],
           [ 0.6535, -1.4815, -0.0965,  1.2927, -2.1822]], requires_grad=True),
   Parameter containing:
   tensor([[-0.3699,  0.0258, -0.1693,  0.3095,  0.1445],
           [ 0.3925,  0.2926,  0.2735,  0.0304, -0.3556],
           [-0.3941, -0.3619,  0.0924, -0.1159, -0.3334]], requires_grad=True),
   Parameter containing:
   tensor([-0.0530,  0.1841,  0.3760], requires_grad=True)],
  'lr': 0.01,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'weight_decay': 0,
  'amsgrad': False}]

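基于此，便可以实现人工修改学习率：例如在每个epoch开始时执行 `for group in optimizer2.param_groups: group['lr'] = new_lr`，之后的 `step()` 就会使用新的学习率。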