In [1]:
import collections
import torch
import torch.utils.data as Data
import torch.nn as nn
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

### 数据处理

In [2]:
with open('fra.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# 左边为英文句子,右边为对应的法文句子
print(raw_text[:75])

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [3]:
def preprocess_nmt(text):
    """预处理"英语<--->法语"数据集"""

    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '

    # 使用空格替换不间断空格(non-breaking space)
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # 使用小写字母替换大写字母
    text = text.lower()
    # 在单词和标点符号之间插⼊空格
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)


text = preprocess_nmt(raw_text)
print(text[:80])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !


In [4]:
def tokenize_nmt(text,
                 num_examples=None):  # 使用的训练样本数
    """词元化"英语<--->法语"数据数据集"""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        source.append(parts[0].split(' '))  # 英文数据
        target.append(parts[1].split(' '))  # 法文数据
    return source, target


source, target = tokenize_nmt(text)
source[:10], target[:10]  # 每个子列表表示一个句子的切分(根据' '切分)

([['go', '.'],
  ['hi', '.'],
  ['run', '!'],
  ['run', '!'],
  ['who', '?'],
  ['wow', '!'],
  ['fire', '!'],
  ['help', '!'],
  ['jump', '.'],
  ['stop', '!']],
 [['va', '!'],
  ['salut', '!'],
  ['cours', '!'],
  ['courez', '!'],
  ['qui', '?'],
  ['ça', 'alors', '!'],
  ['au', 'feu', '!'],
  ['à', "l'aide", '!'],
  ['saute', '.'],
  ['ça', 'suffit', '!']])

In [5]:
def count_corpus(tokens):
    """Count token frequencies"""
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)


class Vocab:
    """Vocabulary for text"""

    def __init__(self, tokens=None, min_freq=2, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)
        # Sort according to frequencies
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The index for the unknown token is 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        """Index for the unknown token"""
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs


# '<unk>':未知词元
# '<pad>':填充词元
# '<sos>':开始词元
# '<eos>':结束词元
src_vocab = Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<sos>', '<eos>'])
len(src_vocab)

10012

In [6]:
def truncate_pad(line, num_steps, padding_token):
    """截断或填充文本序列"""
    if len(line) > num_steps:
        return line[:num_steps]  # 句子截断
    return line + [padding_token] * (num_steps - len(line))  # 句子填充


truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>'])

[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]

In [7]:
def build_array_nmt(lines, vocab, num_steps):
    """将文本序列转换为数值矩阵"""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]  # 使用'<eos>'表示句子的结尾
    array = torch.tensor([truncate_pad(l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = (array != vocab['<pad>']).to(dtype=torch.int32).sum(1)
    return array, valid_len


array, valid_len = build_array_nmt(source, src_vocab, 15)
print(array)  # 通过vocab将句子从token转换为idx(等长,不足部分1填充)
print(array.shape)
print(valid_len)  # 每个句子的实际长度

tensor([[  47,    4,    3,  ...,    1,    1,    1],
        [2944,    4,    3,  ...,    1,    1,    1],
        [ 435,  126,    3,  ...,    1,    1,    1],
        ...,
        [ 381,   60,   26,  ...,  480,   68, 4696],
        [  66,  295,   90,  ...,   10, 1170, 1526],
        [  17,  176,   32,  ...,    8, 1963,   16]])
torch.Size([167130, 15])
tensor([ 3,  3,  3,  ..., 15, 15, 15])


In [8]:
def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator"""
    dataset = Data.TensorDataset(*data_arrays)
    return Data.DataLoader(dataset, batch_size, shuffle=is_train)


def load_data_nmt(text, batch_size, num_steps, num_examples=None):
    """返回翻译数据集的迭代器和词汇表"""
    source, target = tokenize_nmt(text, num_examples=num_examples)
    src_vocab = Vocab(source, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = Vocab(target, min_freq=2,
                      reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab


train_iter, src_vocab, tgt_vocab = load_data_nmt(text, batch_size=32, num_steps=15, num_examples=500)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X.type(torch.int32))
    print('valid lengths for X:', X_valid_len)
    print('Y:', Y.type(torch.int32))
    print('valid lengths for Y:', Y_valid_len)
    break

X: tensor([[  8, 107,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 20,  39,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [  7, 142,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 50,  14,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [  9, 145,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [  6,   0,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 38,  33,   4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 87,  88,   5,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 92,   6,   8,  12,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [ 50,  14,   5,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1],
        [  7,  67,   4,   3,   1,   1,   1,   1,   1,   1

### Seq2Seq模型

In [9]:
class Seq2SeqEncoder(nn.Module):
    """用于序列到序列学习的循环神经⽹络编码器"""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0, bidirectional=False):
        super(Seq2SeqEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers,
                          dropout=dropout, bidirectional=bidirectional)

    def forward(self, X):
        # self.embedding(X).shape=(N, T, C);其中T为序列的长度,N为min-batch的大小,C为输入的特征数目
        X = self.embedding(X)
        # 循环神经默认输入要求为:(T, N, C)
        X = X.permute(1, 0, 2)
        output, state = self.rnn(X)
        # output.shape=(T, N, hidden_size)
        # state.shape=(num_layers, N, hidden_size)
        return output, state


encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, hidden_size=16,
                         num_layers=2)
encoder.eval()

X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
print(output.shape)
print(state.shape)

torch.Size([7, 4, 16])
torch.Size([2, 4, 16])


In [10]:
# state[-1].shape=(N, hidden_size)
state[-1].shape

torch.Size([4, 16])

In [11]:
class Seq2SeqDecoder(nn.Module):
    """⽤于序列到序列学习的循环神经网络解码器"""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers,
                 dropout=0, bidirectional=False):
        super(Seq2SeqDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size + hidden_size, hidden_size, num_layers,
                          dropout=dropout, bidirectional=bidirectional)
        self.dense = nn.Linear(hidden_size, vocab_size)

    def init_state(self, enc_outputs):
        return enc_outputs[1]

    def forward(self, X, state):
        # embedding(X).shape=(N, T, C)
        # embedding(X).permute(1, 0, 2).shape=(T, N, C)
        X = self.embedding(X).permute(1, 0, 2)
        # state.shape=(num_layers, N, hidden_size)
        # state[-1].shape=(N, hidden_size)  # 取最后一层的隐藏状态
        # context.shape=(T, N, hidden_size)
        context = state[-1].repeat(X.shape[0], 1, 1)  # 上下文向量(固定)
        # Notice how the last hidden state is actually the context we pass along to the decoder
        # X_and_context = (T, N, c+hidden_size)
        X_and_context = torch.cat((X, context), 2)
        # print(X_and_context.shape)
        # 使用循环神经⽹络编码器最终的隐藏状态(或是多个隐节状态的加权总和)来初始化解码器的隐藏状态
        output, state = self.rnn(X_and_context, state)
        output = self.dense(output).permute(1, 0, 2)
        return output, state


decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, hidden_size=16,
                         num_layers=2)
decoder.eval()

X = torch.zeros((4, 7), dtype=torch.long)
state = decoder.init_state(encoder(X))
output, state = decoder(X, state)
output.shape, state.shape

(torch.Size([4, 7, 10]), torch.Size([2, 4, 16]))

In [12]:
class EncoderDecoder(nn.Module):
    """The base class for the encoder-decoder architecture"""

    def __init__(self, encoder, decoder):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X):
        enc_outputs = self.encoder(enc_X)
        dec_state = self.decoder.init_state(enc_outputs)
        return self.decoder(dec_X, dec_state)

### 自定义损失函数

In [13]:
def sequence_mask(X, valid_len, value=0):
    """在序列中屏蔽不相关的项"""
    maxlen = X.size(1)
    # 广播机制
    mask = torch.arange((maxlen), dtype=torch.float32, device=X.device)[None, :] < valid_len[:, None]
    X[~mask] = value
    return X


X = torch.tensor([[1, 2, 3],
                  [4, 5, 6]])
# 通过零值化屏蔽不相关的项,以便后⾯任何不相关预测的计算都是与零的乘积,结果都等于零
sequence_mask(X, torch.tensor([1, 2]))

tensor([[1, 0, 0],
        [4, 5, 0]])

In [14]:
class MaskedSoftmaxCELoss(nn.Module):
    """带遮蔽的softmax交叉熵损失函数"""

    def forward(self, pred, label, valid_len):
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        unweighted_loss = nn.CrossEntropyLoss(reduction='none')(pred.permute(0, 2, 1), label)
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        return weighted_loss


loss = MaskedSoftmaxCELoss()
# 指定这些序列的有效⻓度为4、2、0,可以看出第⼀个序列的损失为第二个序列的两倍,第三个序列的损失为零
loss(torch.ones(3, 4, 10), torch.ones((3, 4), dtype=torch.long), torch.tensor([4, 2, 0]))

tensor([2.3026, 1.1513, 0.0000])

### 模型训练

In [15]:
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """训练seq2seq模型"""
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()

    for epoch in range(num_epochs):
        for batch_idx, batch in enumerate(data_iter):
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<sos>']] * Y.shape[0], device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher Forcing
            Y_hat, _ = net(X, dec_input)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 3)  # 梯度裁剪
            optimizer.step()
            if batch_idx % 300 == 0:
                print('loss:', l.sum().item())

In [16]:
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
batch_size, num_steps = 32, 25
lr, num_epochs, device = 0.005, 200, torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, src_vocab, tgt_vocab = load_data_nmt(text, batch_size, num_steps, num_examples=1000)
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers,
                         dropout)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers,
                         dropout)
net = EncoderDecoder(encoder, decoder)

train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)

loss: 33.91590118408203
loss: 18.085134506225586
loss: 17.26862335205078
loss: 17.816463470458984
loss: 17.16301918029785
loss: 15.098230361938477
loss: 12.243768692016602
loss: 11.89013671875
loss: 11.012125015258789
loss: 10.215425491333008
loss: 8.221678733825684
loss: 9.161933898925781
loss: 7.329930782318115
loss: 7.815098762512207
loss: 5.945168972015381
loss: 7.575524806976318
loss: 6.553491592407227
loss: 5.807292938232422
loss: 5.255470275878906
loss: 5.180303573608398
loss: 5.396553993225098
loss: 5.665897369384766
loss: 5.24554443359375
loss: 4.559234619140625
loss: 4.248633861541748
loss: 4.740425109863281
loss: 4.453648567199707
loss: 4.661025524139404
loss: 4.555342674255371
loss: 4.360511779785156
loss: 5.905474662780762
loss: 4.218197345733643
loss: 4.021627426147461
loss: 4.142810821533203
loss: 3.6002283096313477
loss: 4.467589378356934
loss: 3.933612585067749
loss: 4.298162937164307
loss: 3.852544069290161
loss: 3.5493593215942383
loss: 3.612654209136963
loss: 3.3324

### 模型预测与评估

In [17]:
def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps, device):
    """Seq2Seq模型的预测"""
    net.eval()
    src_tokens = src_vocab[src_sentence.lower().split(' ')] + [src_vocab['<eos>']]
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = torch.unsqueeze(torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    enc_outputs = net.encoder(enc_X)
    # 最终的隐藏状态
    dec_state = net.decoder.init_state(enc_outputs)
    # 预测的第一个单词
    dec_X = torch.unsqueeze(torch.tensor([tgt_vocab['<sos>']], dtype=torch.long, device=device), dim=0)
    output_seq = []
    for _ in range(num_steps):
        Y, dec_state = net.decoder(dec_X, dec_state)
        # 使用具有预测最高可能性的词元,作为解码器在下⼀时间步的输⼊
        dec_X = Y.argmax(dim=2)
        pred = dec_X.squeeze(dim=0).type(torch.int32).item()
        if pred == tgt_vocab['<eos>']:  # 如果单词为'<eos>',则表示输出序列预测结束
            break
        output_seq.append(pred)
    # 重新翻译回句子
    return ' '.join(tgt_vocab.to_tokens(output_seq))

In [18]:
engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
for eng, fra in zip(engs, fras):
    translation = predict_seq2seq(net, eng, src_vocab, tgt_vocab, num_steps, device)
    chencherry = SmoothingFunction()
    # 使用bleu指标进行结果评估
    blen = sentence_bleu([translation], fra, weights=(1 / 2.0, 1 / 2.0), smoothing_function=chencherry.method1)
    print(f'{eng} => {translation}, bleu {blen:.3f}')

go . => <unk> !, bleu 0.193
i lost . => je l’ai perdu ., bleu 0.674
he's calm . => il est riche ., bleu 0.695
i'm home . => je suis chez moi, bleu 0.886
