# 一个短小的Transformer示例

示例参考PyTorch官方[tutorial](https://pytorch.org/tutorials/beginner/transformer_tutorial.html)

在我的笔记中，我是推荐度[The Annotated Transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html),它的代码更加简洁明了，容易读懂，这个教程是参照官方是示例添加的中文注释。

如果是先读了哈佛NLP的代码在看这个代码，你会觉得有些奇怪，因为它们输入的形状略有不同。

在PyTorch官方的代码实现中，输入输出的形状为:
- src -- Encoder的输入序列, 形状为$\text{[S, N, E]}$
- tgt -- Decoder的输入序列, 形状为$\text{[T, N, E]}$
- src_mask -- src序列的上三角掩码, 形状为$\text{[S, S]}$
- tgt_mask -- tgt序列的上三角掩码, 形状为$\text{[T, T]}$
- memory_mask -- Encoder输出的三角掩码掩码, 形状为$\text{T, S}$
- src_key_padding_mask -- src序列的padding掩码，掩盖掉padding补全位置的信息，形状为$\text{[N, S]}$
- tgt_key_padding_mask -- tgt序列的padding掩码，掩盖掉padding补全位置的信息，形状为$\text{[N, T]}$
- memory_key_padding_mask -- memory矩阵的padding掩码，掩盖掉src padding补全位置的信息，形状为$\text{[N, S]}$

其中的$\text{S, T, N, E}$分别为源序列长度、目标序列长度、Batch size以及Embedding的长度。

这些输出的形状很关键，这会影响到数据集准备

In [1]:
# import necessary packages
import copy
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from torch.nn import TransformerEncoder, TransformerEncoderLayer

## 定义模型

In [None]:
# 定义一个获取位置编码的模型。得到的矩阵是一个普通的Tensor而不是torch.nn.Parameter对象
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                             (-math.log(10000) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # 考虑到Transformer的输入为[S, N, E], 将PE矩阵拓展成[max_len, 1, dim_embed]
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        # 此处的加法， pe的dimension 1 为1，维度不同会触发广播机制。
        x = x + self.pe[: x.size(0)]
        return self.dropout(x)

In [3]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, dim_embed, nhead, nhid, nlayers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.model_type = "transformer"
        self.src_mask = None
        self.dim_embed = dim_embed
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim_embed)
        self.pos_encoder = PositionalEncoding(dim_embed, dropout=dropout)
        encoder_layer = TransformerEncoderLayer(dim_embed, nhead, nhid, dropout=dropout)
        self.encoder = TransformerEncoder(encoder_layer=encoder_layer, num_layers=nlayers)
        self.decoder = nn.Linear(dim_embed, vocab_size)
        self.init_weights()
        
    def init_weights(self):
        # 对模型的参数进行初始化
        init_range = 0.1
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-init_range, init_range)
        
    def _generate_square_subsequent_mask(self, size):
        """
        生成一个参数size * size大小的方阵，该方阵的对角线以上的区域取值为float("-inf")
        其余区域的取值为0，这个方阵被用于做加法到Attention机制的计算出的注意力矩阵中。
        它的目的是为了让序列中时刻t的变量不看到它之后的变量
        连续语言模型或者Transformer的Decoder的掩码矩阵，就可以明白
        """
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf'))
        return mask.masked_fill(mask == 1, float(0.0))
    
    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            self.src_mask = self._generate_square_subsequent_mask(len(src)).to(device)
        src = self.embedding(src) * math.sqrt(self.dim_embed)
        src = self.pos_encoder(src)
        output = self.encoder(src, mask=self.src_mask)
        return self.decoder(output)

## 构造训练数据

In [4]:
#
TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"), 
                            init_token='<sos>', 
                            eos_token='<eos>', 
                            lower=True)
# 使用torchtext自带的数据集，这个数据集如果本地没有，会从服务器下载到root='.data',dirname="WikiText2"的目录中
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train_txt)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
def batchify(data, batch_size=10):
    data = TEXT.numericalize([data.examples[0].text])
    nbatch = data.size(0)// batch_size
    # data的shape为[length, 1], 在构造成批数据的时候，需要保证length % batch_size==0
    data = data.narrow(0, 0, nbatch * batch_size)
    # 此处使用转置是为了将返回的数据构造成[N, S]的形状
    data = data.view(batch_size, -1).t().contiguous()
    return data.to(device)

In [6]:
batch_size = 20
eval_batch_size = 10
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)

In [7]:
bptt = 35
def get_batch(source: torch.Tensor):
    for batch, index in enumerate(range(0, source.size(0) - 1, bptt)):
        seq_len = min(bptt, len(source) - 1 - index)
        data = source[index: index + seq_len]
        target = source[index + 1: index + 1 + seq_len].view(-1)
        yield batch, data, target

## 训练模型

In [8]:
vocab_size = len(TEXT.vocab.stoi)
dim_embed = 200
nhid = 200 # Encoder中FeedForward层的隐藏层神经元个数
nlayers = 2
nhead = 2
dropout = 0.2
model = TransformerModel(vocab_size, dim_embed, nhead, nhid, nlayers, dropout).to(device)

In [9]:
criterion = nn.CrossEntropyLoss()
lr = 5.0
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [10]:
def train(train_data: torch.Tensor):
    model.train()
    total_loss = 0
    start_time = time.time()
    log_interval = 400
    total_batchs = len(train_data) // bptt
    for batch, data, targets in get_batch(train_data):
        optimizer.zero_grad()
        output =  model(data)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # epoch的值由运行时命名空间提供
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | {:5.2f} ms/batch |' 
                  'loss {:5.2f} | ppl {:8.2f}'.format(epoch, batch, total_batchs, 
                                                      scheduler.get_lr()[0], 
                                                      elapsed * 1000 / log_interval, cur_loss, 
                                                      math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [11]:
def evaluate(eval_model, data_source):
    eval_model.eval()
    total_loss = 0
    vocab_size = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for batch, data, target in get_batch(data_source):
            output = eval_model(data)
            output_flat = output.view(-1, vocab_size)
            total_loss += len(data) * criterion(output_flat, target).item()
    return total_loss / (len(data_source) - 1)

In [12]:
best_val_loss = float('inf')
epochs = 6
best_model = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(train_data)
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss)))
    print('-' * 89)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)
    scheduler.step()

| epoch   1 |   400/ 2981 batches | lr 5.00 | 10.64 ms/batch |loss  7.40 | ppl  1642.61
| epoch   1 |   800/ 2981 batches | lr 5.00 | 10.62 ms/batch |loss  6.30 | ppl   546.74
| epoch   1 |  1200/ 2981 batches | lr 5.00 | 10.63 ms/batch |loss  6.10 | ppl   446.87
| epoch   1 |  1600/ 2981 batches | lr 5.00 | 10.63 ms/batch |loss  6.05 | ppl   422.01
| epoch   1 |  2000/ 2981 batches | lr 5.00 | 10.63 ms/batch |loss  5.96 | ppl   386.34
| epoch   1 |  2400/ 2981 batches | lr 5.00 | 10.63 ms/batch |loss  5.87 | ppl   354.21
| epoch   1 |  2800/ 2981 batches | lr 5.00 | 10.66 ms/batch |loss  5.85 | ppl   348.29
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 33.37s | valid loss  5.82 | valid ppl   337.95
-----------------------------------------------------------------------------------------
| epoch   2 |   400/ 2981 batches | lr 4.51 | 10.66 ms/batch |loss  5.79 | ppl   327.03
| epoch   2 |   800/ 2981 batches | lr 4.5

In [19]:
# 测试集上的Perplexity为多少
best_model.to(device)
test_loss = evaluate(best_model, test_data)
print('-' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(test_loss,  math.exp(test_loss)))
print('-' * 89)

-----------------------------------------------------------------------------------------
| End of training | test loss  5.36 | test ppl   212.69
-----------------------------------------------------------------------------------------
