In [118]:
import math
import torch.nn.functional as F
import collections
import torch
import torch.utils.data as Data
import torch.nn as nn
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

### transformer模型及其组件

#### Position-wise Feed-Forward Networks

$$ FFN(x) = \max(0, xW_1 + b_1)W_2 + b_2 $$

In [119]:
class PositionWiseFFN(nn.Module):
    """Position-wise Feed-Forward Networks"""

    def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs):
        super(PositionWiseFFN, self).__init__()
        self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)

    def forward(self, X):
        return self.dense2(self.relu(self.dense1(X)))


ffn = PositionWiseFFN(4, 5, 8)
ffn.eval()
ffn(torch.ones((2, 3, 4))).shape

torch.Size([2, 3, 8])

In [120]:
class AddNorm(nn.Module):
    """残差连接和层归一化"""

    def __init__(self,
                 # 除mini-batch维度之外其他维度的列表(即进行层归一化的维度)
                 normalized_shape,
                 dropout):
        super(AddNorm, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # 原因:句子长度不一致,并且各个batch的信息没什么关系
        self.ln = nn.LayerNorm(normalized_shape)  # Normalized_shape is input.size()[1:]

    def forward(self, X, Y):
        # 内部为残差连接
        return self.ln(self.dropout(Y) + X)


add_norm = AddNorm([3, 4], 0.5)
add_norm.eval()
add_norm(torch.ones((2, 3, 4)), torch.ones((2, 3, 4))).shape

torch.Size([2, 3, 4])

In [121]:
def masked_softmax(X, valid_lens=None):
    """通过在最后⼀个轴上遮蔽元素来执⾏softmax操作"""

    def sequence_mask(X, valid_len, value=0):
        """Mask irrelevant entries in sequences"""
        maxlen = X.size(1)
        # 广播机制
        mask = torch.arange(maxlen, device=X.device)[None, :] < valid_len[:, None]
        X[~mask] = value
        return X

    if valid_lens is None:
        return F.softmax(X, dim=-1)
    else:
        shape = X.shape
        if valid_lens.dim() == 1:
            valid_lens = torch.repeat_interleave(valid_lens, shape[1])
        else:
            valid_lens = valid_lens.reshape(-1)
        # 被遮蔽的元素使用⼀个非常大的负值替换,使其softmax输出为0
        X = sequence_mask(X.reshape(-1, shape[-1]), valid_lens,
                          value=-1e6)
        return F.softmax(X.reshape(shape), dim=-1)


class DotProductAttention(nn.Module):
    """缩放点积注意力"""

    def __init__(self, dropout):
        super(DotProductAttention, self).__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens=None):
        # queries.shape = (b, ?q, d)
        # keys.shape = (b, ?k, d)
        # scores.shape = (b, ?q, d) x (b, d, ?k) = (b, ?q, ?k)
        d = queries.shape[-1]
        scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(d)
        self.attention_weights = masked_softmax(scores, valid_lens)
        # values.shape=(b, ?k, ?v)
        # 返回值.shape=(b, ?q, ?k) x (b, ?k, ?v) = (b, ?q, ?v)
        return torch.bmm(self.dropout(self.attention_weights), values)


class MultiHeadAttention(nn.Module):
    """多头注意力"""

    def __init__(self,
                 # 查询特征数目(E_q)
                 query_size,
                 # 键特征数目(E_k)
                 key_size,
                 # 值特征数目(E_v)
                 value_size,
                 # 多头数目
                 num_heads, dropout, bias=False):  # 模仿pytorch的参数组成
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        assert query_size % num_heads == 0, "query_size must be divisible by num_heads"
        self.W_q = nn.Linear(query_size, query_size, bias=bias)
        self.W_k = nn.Linear(key_size, query_size, bias=bias)
        self.W_v = nn.Linear(value_size, query_size, bias=bias)
        self.W_o = nn.Linear(query_size, query_size, bias=bias)
        self.attention = DotProductAttention(dropout)

    @staticmethod
    def transpose_qkv(X, num_heads):
        # 输入:X.shape=(N, L or S, E_q)
        # X.shape=(N, L or S, num_heads, E_q / num_heads)
        X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)
        # X.shape=(N, num_heads, L or S, E_q / num_heads)
        X = X.permute(0, 2, 1, 3)
        # 返回值.shape=(N * num_heads, L or S, E_q / num_heads)
        return X.reshape(-1, X.shape[2], X.shape[3])

    def forward(self, queries, keys, values, valid_lens):
        """
        queries: 查询
        keys: 键
        values: 值
        valid_lens: 计算attention_weights的有效长度
        """
        # queries.shape=(N, L, E_q)
        # self.W_q(queries).shape=(N, L, E_q)
        # queries.shape=(N * num_heads, L, E_q / num_heads)

        # keys.shape=(N, S, E_k)
        # self.W_k(queries).shape=(N, S, E_q)
        # keys.shape=(N * num_heads, S, E_q / num_heads)

        # values.shape=(N, S, E_v)
        # self.W_v(values).shape=(N, S, E_q)
        # values.shape=(N * num_heads, S, E_q / num_heads)
        queries = self.transpose_qkv(self.W_q(queries), self.num_heads)
        keys = self.transpose_qkv(self.W_k(keys), self.num_heads)
        values = self.transpose_qkv(self.W_v(values), self.num_heads)

        if valid_lens is not None:
            # E_q维度信息增加到batch_size维度上
            valid_lens = torch.repeat_interleave(valid_lens, repeats=self.num_heads, dim=0)

        # output.shape=(N * num_heads, L, E_q / num_heads)
        output = self.attention(queries, keys, values, valid_lens)
        # output.shape=(N, num_heads, L, E_q / num_heads)
        output = output.reshape(-1, self.num_heads, output.shape[1], output.shape[2])
        # output.shape=(N, L, num_heads, E_q / num_heads)
        output = output.permute(0, 2, 1, 3)
        # output.shape=(N, L, E_q)
        output_concat = output.reshape(output.shape[0], output.shape[1], -1)
        # 返回值.shape=(N, L, E_q)
        return self.W_o(output_concat)

In [122]:
class EncoderBlock(nn.Module):
    """transformer编码器Block"""

    def __init__(self,
                 # 查询特征数目(E_q)
                 query_size,
                 # 键特征数目(E_k)
                 key_size,
                 # 值特征数目(E_v)
                 value_size,
                 # 除mini-batch维度之外其他维度的列表(即进行层归一化的维度)
                 norm_shape,
                 ffn_num_hiddens,
                 # 多头数
                 num_heads, dropout, use_bias=False):
        super(EncoderBlock, self).__init__()
        self.attention = MultiHeadAttention(query_size, key_size, value_size, num_heads, dropout, use_bias)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        # 多头注意力`forward`返回值的shape为:(N, L, E_q)
        # 故PositionWiseFFN第一个线性层的输入维度为E_q
        self.ffn = PositionWiseFFN(query_size, ffn_num_hiddens, query_size)
        self.addnorm2 = AddNorm(norm_shape, dropout)

    def forward(self, X, valid_lens):
        # transformer编码器Block结构为:
        # =+残差连接=>多头注意力==>add & Norm=+残差连接=>Feed Forward=>add & Norm
        Y = self.addnorm1(X, self.attention(X, X, X, valid_lens))  # 多头自注意力
        return self.addnorm2(Y, self.ffn(Y))


encoder_blk = EncoderBlock(query_size=24,
                           key_size=24,
                           value_size=24,
                           norm_shape=[100, 24],
                           ffn_num_hiddens=48,
                           num_heads=8,
                           dropout=0.5)
encoder_blk.eval()

X = torch.ones((2, 100, 24))
valid_lens = torch.tensor([3, 2])

# 可以看出Transformer编码器中的任何层都不会改变其输⼊的形状(可叠加多层)
print(encoder_blk(X, valid_lens).shape)

torch.Size([2, 100, 24])


In [123]:
class PositionalEncoding(nn.Module):
    """位置编码"""

    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # 创建⼀个足够长的P
        self.P = torch.zeros((1, max_len, num_hiddens))
        X = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1) / torch.pow(10000, torch.arange(0, num_hiddens, 2,
                                                                                                      dtype=torch.float32) / num_hiddens)
        self.P[:, :, 0::2] = torch.sin(X)
        self.P[:, :, 1::2] = torch.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

In [124]:
class TransformerEncoder(nn.Module):
    """transformer编码器"""

    def __init__(self,
                 # 单词表的单词数目
                 vocab_size,
                 # 查询特征数目(E_q)
                 query_size,
                 # 键特征数目(E_k)
                 key_size,
                 # 值特征数目(E_v)
                 value_size,
                 # 除mini-batch维度之外其他维度的列表(即进行层归一化的维度)
                 norm_shape,
                 ffn_num_hiddens,
                 # 多头数
                 num_heads,
                 # transformer编码器Block数
                 num_layers,
                 dropout, use_bias=False):
        super(TransformerEncoder, self).__init__()
        self.query_size = query_size
        self.pos_encoding = PositionalEncoding(query_size, dropout)
        self.embedding = nn.Embedding(vocab_size, query_size)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            # 叠加多个transformer编码器Block数
            self.blks.add_module("block" + str(i), EncoderBlock(query_size=query_size,
                                                                key_size=key_size,
                                                                value_size=value_size,
                                                                norm_shape=norm_shape,
                                                                ffn_num_hiddens=ffn_num_hiddens,
                                                                num_heads=num_heads,
                                                                dropout=dropout,
                                                                use_bias=use_bias))

    def forward(self, X, valid_lens):
        # 位置编码值在-1和1之间,因此嵌入值乘以嵌⼊维度的平方根进行缩放
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.query_size))
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            # 上一个Block数输出为下一个Block数的输入
            X = blk(X, valid_lens)
            self.attention_weights[i] = blk.attention.attention.attention_weights
        return X


encoder = TransformerEncoder(
    vocab_size=200,
    query_size=24,
    key_size=24,
    value_size=24,
    norm_shape=[100, 24],
    ffn_num_hiddens=48,
    num_heads=8,
    num_layers=2,
    dropout=0.5)

encoder.eval()
valid_lens = torch.tensor([3, 2])
encoder(torch.ones((2, 100), dtype=torch.long), valid_lens).shape

torch.Size([2, 100, 24])

In [125]:
class DecoderBlock(nn.Module):
    """transformer解码器Block"""

    def __init__(self,
                 # 查询特征数目(E_q)
                 query_size,
                 # 键特征数目(E_k)
                 key_size,
                 # 值特征数目(E_v)
                 value_size,
                 # 除mini-batch维度之外其他维度的列表(即进行层归一化的维度)
                 norm_shape,
                 ffn_num_hiddens,
                 # 多头数
                 num_heads,
                 dropout,
                 # 解码器的第i个Block块
                 i):
        super(DecoderBlock, self).__init__()
        self.i = i
        self.attention1 = MultiHeadAttention(
            query_size=query_size, key_size=key_size,
            value_size=value_size, num_heads=num_heads, dropout=dropout)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.attention2 = MultiHeadAttention(
            query_size=query_size, key_size=key_size,
            value_size=value_size, num_heads=num_heads, dropout=dropout)
        self.addnorm2 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(query_size, ffn_num_hiddens, query_size)
        self.addnorm3 = AddNorm(norm_shape, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_lens = state[0], state[1]
        if state[2][self.i] is None:
            # 训练阶段,输出序列的所有词元都在同一时间处理
            # 预测阶段,第一个词所有Block结构的第一个多头注意力的key和value均为X
            key_values = X
        else:
            # 预测阶段,key_values为之前state[2][self.i]和本次输入X在dim=1上的连接
            # 具体见:transformer解码器预测_help.vsdx
            key_values = torch.cat((state[2][self.i], X), dim=1)
        # 更新每个Block结构对应的state[2][self.i]
        state[2][self.i] = key_values
        if self.training:
            # 如果为训练模式
            batch_size, num_steps, _ = X.shape
            # dec_valid_lens.shape=(batch_size, num_steps)
            # dec_valid_lens[0]=tensor([1, 2, ..., num_steps],即有效长度分别为1,2,...,num_steps
            dec_valid_lens = torch.arange(1, num_steps + 1, device=X.device).repeat(batch_size, 1)
        else:
            # 预测模型时,不知道后面的内容,不需要进行遮蔽操作
            dec_valid_lens = None

        # transformer解码器Block结构为:
        # =+残差连接=>多头注意力1==>add & Norm=+残差连接=>多头注意力2==>add & Norm=+残差连接=>Feed Forward=>add & Norm

        # 可以看出,key_values为state[2][self.i]与本次输入的concat
        print("key_values.shape:", key_values.shape, self.i)
        # 第一个多头注意力的key和value为key_values
        X2 = self.attention1(X, key_values, key_values, valid_lens=dec_valid_lens)
        Y = self.addnorm1(X, X2)
        # 键和值均为编码器的输出
        Y2 = self.attention2(Y, enc_outputs, enc_outputs, valid_lens=enc_valid_lens)
        Z = self.addnorm2(Y, Y2)
        return self.addnorm3(Z, self.ffn(Z)), state


decoder_blk = DecoderBlock(query_size=24, key_size=24,
                           value_size=24, norm_shape=[100, 24],
                           ffn_num_hiddens=48, num_heads=8, dropout=0.5, i=0)

decoder_blk.train()
X = torch.ones((2, 100, 24))
state = [encoder_blk(X, valid_lens), valid_lens, [None]]
decoder_blk(X, state)[0].shape

key_values.shape: torch.Size([2, 100, 24]) 0


torch.Size([2, 100, 24])

In [126]:
class TransformerDecoder(nn.Module):
    """transformer解码器"""

    def __init__(self,
                 # 单词表的单词数目
                 vocab_size,
                 # 查询特征数目(E_q)
                 query_size,
                 # 键特征数目(E_k)
                 key_size,
                 # 值特征数目(E_v)
                 value_size,
                 # 除mini-batch维度之外其他维度的列表(即进行层归一化的维度)
                 norm_shape,
                 ffn_num_hiddens,
                 # 多头数
                 num_heads,
                 # transformer解码器Block数
                 num_layers,
                 dropout):
        super(TransformerDecoder, self).__init__()
        self.query_size = query_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, query_size)
        self.pos_encoding = PositionalEncoding(query_size, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block" + str(i), DecoderBlock(
                query_size=query_size,
                key_size=key_size,
                value_size=value_size,
                norm_shape=norm_shape,
                ffn_num_hiddens=ffn_num_hiddens,
                num_heads=num_heads,
                dropout=dropout,
                i=i))
        # 最后的线性层
        self.dense = nn.Linear(query_size, vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens):
        # 初始化每一个Block结构的state[2][self.i]均为None
        return [enc_outputs, enc_valid_lens, [None] * self.num_layers]

    def forward(self, X, state):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.query_size))
        # 每个Block包含两个多头注意力
        self._attention_weights = [[None] * len(self.blks) for _ in range(2)]
        for i, blk in enumerate(self.blks):
            # 上一个Block输出为下一个Block的输入
            X, state = blk(X, state)  # 更新state
            # 第一个多头的注意力分数
            self._attention_weights[0][i] = blk.attention1.attention.attention_weights
            # 第二个多头的注意力分数
            self._attention_weights[1][i] = blk.attention2.attention.attention_weights
        return self.dense(X), state

    @property
    def attention_weights(self):
        return self._attention_weights

In [127]:
num_heads, num_layers, dropout = 4, 6, 0.1
key_size, query_size, value_size = 24, 24, 24
ffn_num_hiddens = 48
norm_shape = [24]

decoder = TransformerDecoder(
    vocab_size=1000,
    query_size=query_size,
    key_size=key_size,
    value_size=value_size,
    norm_shape=norm_shape,
    ffn_num_hiddens=ffn_num_hiddens,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout)

valid_lens = torch.tensor([3, 2])
X0 = torch.ones((2, 100), dtype=torch.long)
state = decoder.init_state(encoder(X0, valid_lens), valid_lens)
for i in state[2]:
    print(i)  # None
    break

X1 = torch.ones((2, 100), dtype=torch.long)
output, state = decoder(X1, state)
print("output.shape:", output.shape)

print('***********************************')
for i in state[2]:
    print(i[0, :, :])
    print(i[0, :, :].shape)  # [100, 24]
    break

print('***********************************')
X2 = torch.ones((2, 100), dtype=torch.long)
output, state = decoder(X2, state)
print("output.shape:", output.shape)
for i in state[2]:
    print(i[0, :, :])
    print(i[0, :, :].shape)  # [200, 24]
    break

print('***********************************')
X3 = torch.ones((2, 100), dtype=torch.long)
output, state = decoder(X3, state)
print("output.shape:", output.shape)
for i in state[2]:
    print(i[0, :, :])
    print(i[0, :, :].shape)  # [300, 24]
    break

None
key_values.shape: torch.Size([2, 100, 24]) 0
key_values.shape: torch.Size([2, 100, 24]) 1
key_values.shape: torch.Size([2, 100, 24]) 2
key_values.shape: torch.Size([2, 100, 24]) 3
key_values.shape: torch.Size([2, 100, 24]) 4
key_values.shape: torch.Size([2, 100, 24]) 5
output.shape: torch.Size([2, 100, 1000])
***********************************
tensor([[-0.0000, -5.5179, -2.7901,  ..., 10.4995, -4.7555,  8.3973],
        [-0.0536, -6.0287, -2.2927,  ..., 10.4995, -4.7553,  8.3973],
        [ 0.0218, -7.0914, -1.9005,  ..., 10.4995, -4.7550,  8.3973],
        ...,
        [-0.5667, -0.0000, -1.8312,  ..., 10.4984, -4.7323,  8.3971],
        [-1.6256, -7.5393, -1.6814,  ..., 10.4984, -4.7320,  8.3971],
        [-2.0988, -6.5847, -1.7661,  ..., 10.4984, -4.7318,  0.0000]],
       grad_fn=<SliceBackward0>)
torch.Size([100, 24])
***********************************
key_values.shape: torch.Size([2, 200, 24]) 0
key_values.shape: torch.Size([2, 200, 24]) 1
key_values.shape: torch.Size([2, 

In [128]:
class EncoderDecoder(nn.Module):
    """The base class for the encoder-decoder architecture"""

    def __init__(self, encoder, decoder):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder  # 编码器
        self.decoder = decoder  # 解码器

    def forward(self, enc_X, dec_X, enc_valid_lens):
        enc_outputs = self.encoder(enc_X, enc_valid_lens)
        dec_state = self.decoder.init_state(enc_outputs, enc_valid_lens=enc_valid_lens)
        return self.decoder(dec_X, dec_state)

### 数据处理

In [129]:
with open('fra.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# 左边为英文句子,右边为对应的法文句子
print(raw_text[:75])

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [130]:
def preprocess_nmt(text):
    """预处理"英语<--->法语"数据集"""

    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '

    # 使用空格替换不间断空格(non-breaking space)
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # 使用小写字母替换大写字母
    text = text.lower()
    # 在单词和标点符号之间插⼊空格
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)


text = preprocess_nmt(raw_text)
print(text[:80])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !


In [131]:
def tokenize_nmt(text,
                 num_examples=None):  # 参与训练的训练样本数
    """词元化"英语<--->法语"数据数据集"""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        source.append(parts[0].split(' '))  # 英文数据
        target.append(parts[1].split(' '))  # 法文数据
    return source, target


source, target = tokenize_nmt(text)
source[:10], target[:10]  # 每个子列表表示一个句子的切分(根据' '切分)

([['go', '.'],
  ['hi', '.'],
  ['run', '!'],
  ['run', '!'],
  ['who', '?'],
  ['wow', '!'],
  ['fire', '!'],
  ['help', '!'],
  ['jump', '.'],
  ['stop', '!']],
 [['va', '!'],
  ['salut', '!'],
  ['cours', '!'],
  ['courez', '!'],
  ['qui', '?'],
  ['ça', 'alors', '!'],
  ['au', 'feu', '!'],
  ['à', "l'aide", '!'],
  ['saute', '.'],
  ['ça', 'suffit', '!']])

In [132]:
def count_corpus(tokens):
    """Count token frequencies"""
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)


class Vocab:
    """Vocabulary for text"""

    def __init__(self, tokens=None,
                 # The minimum frequency needed to include a token in the vocabulary.
                 min_freq=2,
                 reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)
        # Sort according to frequencies
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The index for the unknown token is 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index for the unknown token"""
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs


# '<unk>':未知词元
# '<pad>':填充词元
# '<sos>':开始词元
# '<eos>':结束词元
src_vocab = Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<sos>', '<eos>'])
len(src_vocab)

10012

In [133]:
def truncate_pad(line, num_steps, padding_token):
    """截断或填充文本序列"""
    if len(line) > num_steps:
        return line[:num_steps]  # 句子截断
    return line + [padding_token] * (num_steps - len(line))  # 句子填充


truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>'])

[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]

In [134]:
def build_array_nmt(lines, vocab, num_steps):
    """将文本序列转换为数值矩阵"""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]  # 使用'<eos>'表示句子的结尾
    array = torch.tensor([truncate_pad(l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = (array != vocab['<pad>']).to(dtype=torch.int32).sum(1)
    return array, valid_len


array, valid_len = build_array_nmt(source, src_vocab, 15)
print(array)  # 通过vocab将句子从token转换为idx(等长,不足部分1填充)
print(array.shape)
print(valid_len)  # 每个句子的实际长度

tensor([[  47,    4,    3,  ...,    1,    1,    1],
        [2944,    4,    3,  ...,    1,    1,    1],
        [ 435,  126,    3,  ...,    1,    1,    1],
        ...,
        [ 381,   60,   26,  ...,  480,   68, 4696],
        [  66,  295,   90,  ...,   10, 1170, 1526],
        [  17,  176,   32,  ...,    8, 1963,   16]])
torch.Size([167130, 15])
tensor([ 3,  3,  3,  ..., 15, 15, 15])


In [135]:
def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator"""
    dataset = Data.TensorDataset(*data_arrays)
    return Data.DataLoader(dataset, batch_size,
                           shuffle=is_train)  # 训练模式下设置shuffle=True


def load_data_nmt(text, batch_size, num_steps, num_examples=None):
    """返回翻译数据集的迭代器和词汇表"""
    source, target = tokenize_nmt(text, num_examples=num_examples)
    # 英文数据词表
    src_vocab = Vocab(source, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])
    # 法文数据词表
    tgt_vocab = Vocab(target, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab


train_iter, src_vocab, tgt_vocab = load_data_nmt(text, batch_size=32, num_steps=15, num_examples=500)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X.type(torch.int32))
    print('valid lengths for X:', X_valid_len)
    print('Y:', Y.type(torch.int32))
    print('valid lengths for Y:', Y_valid_len)
    break

X: tensor([[  7,  89,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 59,  28,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 32,   5,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [  0,  22,   5,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [  7,   0,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 65,  14,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 85,  10,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [  7, 143,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 34,  12,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 10, 106,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 20,  26,   4,   3,   1,   1,   1,   1,   1,   1

### 自定义损失函数

In [136]:
def sequence_mask(X, valid_len, value=0):
    """每行有效长度之外的值进行填充"""
    maxlen = X.size(1)
    # 广播机制
    mask = torch.arange(maxlen, dtype=torch.float32, device=X.device)[None, :] < valid_len[:, None]
    X[~mask] = value
    return X


X = torch.tensor([[1, 2, 3],
                  [4, 5, 6]])
# 通过零值化屏蔽不相关的项,以便后面任何不相关预测的计算都是与零的乘积,结果都等于零
sequence_mask(X, torch.tensor([1, 2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [137]:
class MaskedSoftmaxCELoss(nn.Module):
    """带遮蔽的softmax交叉熵损失函数"""

    def forward(self, pred, label, valid_len):
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        unweighted_loss = nn.CrossEntropyLoss(reduction='none')(pred.permute(0, 2, 1), label)
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        return weighted_loss


loss = MaskedSoftmaxCELoss()
# 指定这些序列的有效⻓度为4、2、0,可以看出第⼀个序列的损失为第二个序列的两倍,第三个序列的损失为零
loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long), torch.tensor([4, 2, 0]))

tensor([2.3026, 1.1513, 0.0000])

### 模型训练

In [138]:
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """训练seq2seq模型"""
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()

    for epoch in range(num_epochs):
        for batch_idx, batch in enumerate(data_iter):
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0], device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher Forcing
            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 3)  # 梯度裁剪
            optimizer.step()
            if batch_idx % 300 == 0:
                print('loss:', l.sum().item())

In [139]:
batch_size, num_steps = 64, 10
train_iter, src_vocab, tgt_vocab = load_data_nmt(text, batch_size, num_steps, num_examples=1000)
lr, num_epochs, device = 0.005, 300, torch.device("cuda" if torch.cuda.is_available() else "cpu")
ffn_num_hiddens, num_heads, num_layers, dropout = 64, 4, 2, 0.1
key_size, query_size, value_size = 32, 32, 32
norm_shape = [32]

encoder = TransformerEncoder(
    vocab_size=len(src_vocab),
    query_size=query_size,
    key_size=key_size,
    value_size=value_size,
    norm_shape=norm_shape,
    ffn_num_hiddens=ffn_num_hiddens,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout)
decoder = TransformerDecoder(
    vocab_size=len(tgt_vocab),
    query_size=query_size,
    key_size=key_size,
    value_size=value_size,
    norm_shape=norm_shape,
    ffn_num_hiddens=ffn_num_hiddens,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout)

net = EncoderDecoder(encoder, decoder)
train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
loss: 174.52337646484375
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([64, 10, 32]) 1
key_values.shape: torch.Size([64, 10, 32]) 0
key_values.shape: torch.Size([

### 模型预测与评估

In [142]:
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps, device):
    """Seq2Seq模型的预测"""
    net.eval()
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [src_vocab['<eos>']]
    enc_valid_len = torch.tensor([len(src_tokens)], device=device)
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = torch.unsqueeze(torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    enc_outputs = net.encoder(enc_X, enc_valid_len)
    # 最终的隐藏状态
    dec_state = net.decoder.init_state(enc_outputs, enc_valid_lens=enc_valid_len)
    dec_X = torch.unsqueeze(torch.tensor([tgt_vocab['<bos>']], dtype=torch.long, device=device), dim=0)
    output_seq, attention_weight_seq = [], []
    for _ in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state)
        # 使用具有预测最高可能性的词元,作为解码器在下⼀时间步的输⼊
        dec_X = Y.argmax(dim=2)
        pred = dec_X.squeeze(dim=0).type(torch.int32).item()
        attention_weight_seq.append(net.decoder.attention_weights)
        if pred == tgt_vocab['<eos>']:  # 如果单词为'<eos>',则表示输出序列预测结束
            break
        output_seq.append(pred)
    # 重新翻译回句子
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq

In [143]:
engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
all_attension_weight_seq = []
for eng, fra in zip(engs, fras):
    translation, dec_attention_weight_seq = predict_seq2seq(net, eng, src_vocab, tgt_vocab, num_steps, device)
    all_attension_weight_seq.append(dec_attention_weight_seq)
    chencherry = SmoothingFunction()
    # 使用bleu指标进行结果评估
    blen = sentence_bleu([translation], fra, weights=(1 / 2.0, 1 / 2.0), smoothing_function=chencherry.method1)
    # 最终翻译效果
    print(f'{eng} => {translation}, bleu {blen:.3f}')

key_values.shape: torch.Size([1, 1, 32]) 0
key_values.shape: torch.Size([1, 1, 32]) 1
key_values.shape: torch.Size([1, 2, 32]) 0
key_values.shape: torch.Size([1, 2, 32]) 1
key_values.shape: torch.Size([1, 3, 32]) 0
key_values.shape: torch.Size([1, 3, 32]) 1
go . => va !, bleu 1.000
key_values.shape: torch.Size([1, 1, 32]) 0
key_values.shape: torch.Size([1, 1, 32]) 1
key_values.shape: torch.Size([1, 2, 32]) 0
key_values.shape: torch.Size([1, 2, 32]) 1
key_values.shape: torch.Size([1, 3, 32]) 0
key_values.shape: torch.Size([1, 3, 32]) 1
key_values.shape: torch.Size([1, 4, 32]) 0
key_values.shape: torch.Size([1, 4, 32]) 1
i lost . => j'ai perdu ., bleu 1.000
key_values.shape: torch.Size([1, 1, 32]) 0
key_values.shape: torch.Size([1, 1, 32]) 1
key_values.shape: torch.Size([1, 2, 32]) 0
key_values.shape: torch.Size([1, 2, 32]) 1
key_values.shape: torch.Size([1, 3, 32]) 0
key_values.shape: torch.Size([1, 3, 32]) 1
key_values.shape: torch.Size([1, 4, 32]) 0
key_values.shape: torch.Size([1, 4,

In [229]:
# 第0句话第2个单词第二个Block结构第二个多头注意力
atw_0111 = all_attension_weight_seq[0][1][1][1]
atw_0111

tensor([[[0.8109, 0.0996, 0.0895, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000]],

        [[0.0225, 0.5829, 0.3947, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000]],

        [[0.2349, 0.3766, 0.3886, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000]],

        [[0.4526, 0.3046, 0.2428, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000]]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [230]:
# atw_0111.shape=(N * num_heads, L, S)
# 这里N=1,num_heads=4,L=1,S=10
atw_0111.shape

torch.Size([4, 1, 10])