In [4]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim 
import random 
import spacy
import datasets 
import torchtext
import tqdm
import evaluate

In [5]:
# One seed shared by every random number generator used in this notebook.
seed = 1234
# Ask cuDNN for deterministic algorithms so repeated runs give the same
# results, which makes the experiment reproducible.
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

#### Preparing Data
- 获取数据集
- tokenize, 加入special tokens
- train_data's vocabulary
- numericalize
- with_format 修改index的数据类型

In [6]:
# Load the Multi30k dataset and pull out its three splits by name.
dataset = datasets.load_dataset("bentrevett/multi30k")
train_data = dataset["train"]
valid_data = dataset["validation"]
test_data = dataset["test"]

In [7]:
# Each record of the dataset is a dict: the key is the language code and the
# value is the sentence in that language.
print(train_data[0])
type(train_data[0])

{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}


dict

In [8]:
# Tokenization.
# "Token" is a broader term than "word": it covers words, numbers, punctuation
# and any special symbol.
# Load two spaCy pipelines, one for English text and one for German text.
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [9]:
# Quick demo of the English tokenizer on a sample sentence.
# NOTE(review): this assignment rebinds the name `str`, shadowing the builtin
# type for the rest of the kernel session. Any later code that refers to `str`
# (for example in a type check) sees this string, not the type, so do not
# rename or remove this variable without checking the cells further down.
str = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(str)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [10]:
def tokenizer_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentences of one dataset record.

    Args:
        example: mapping with the raw sentences under the keys "en" and "de".
        en_nlp: object whose ``tokenizer(text)`` yields items with a ``.text``
            attribute; used for the English sentence.
        de_nlp: same kind of object, used for the German sentence.
        max_length: maximum number of tokens kept per sentence. The cut is
            applied before the start/end markers are added, so a returned
            list can hold up to ``max_length + 2`` entries.
        lower: when truthy, every token is lower-cased.
        sos_token: marker placed in front of each token list.
        eos_token: marker placed at the end of each token list.

    Returns:
        dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """

    def _prepare(nlp, text):
        # Tokenize, truncate, optionally lower-case, then wrap with the markers.
        pieces = []
        for tok in nlp.tokenizer(text):
            pieces.append(tok.text)
        pieces = pieces[:max_length]
        if lower:
            pieces = [piece.lower() for piece in pieces]
        return [sos_token, *pieces, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }

In [11]:
# Tokenization settings shared by all three splits.
max_length = 1000
sos_token = "<sos>"
eos_token = "<eos>"
lower = True

# Keyword arguments that Dataset.map forwards to tokenizer_example.
fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# map() calls tokenizer_example on every record and adds the key/value pairs
# it returns ("en_tokens", "de_tokens") to the dataset.
train_data, valid_data, test_data = (
    split.map(tokenizer_example, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)

In [10]:
# Inspect the first training record after tokenization.
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [12]:
# Vocabularies.
# A vocabulary maps each token to an integer index; those integers are what
# gets fed to the neural network.
# Ideally the vocabulary would contain every token that can ever occur, but a
# token may be missing from train_data and still show up in valid_data or
# test_data. The <unk> (unknown) token covers that case: it has one fixed
# index that stands for every token not found in the vocabulary.

# Tokens seen fewer than min_freq times are left out of the vocabulary and
# therefore map to <unk>, which also gives the model <unk> examples in training.
min_freq = 2
unk_token = "<unk>"
# Sentences in a batch are padded with <pad> up to the longest one in that batch.
pad_token = "<pad>"
special_tokens = [sos_token, eos_token, unk_token, pad_token]

# Build the vocabularies from the training split only, to avoid leaking
# information from the validation/test splits.
en_vocab, de_vocab = (
    torchtext.vocab.build_vocab_from_iterator(
        train_data[column],
        min_freq=min_freq,
        specials=special_tokens,
    )
    for column in ("en_tokens", "de_tokens")
)

In [12]:
# get_itos() ("int to string") returns a list, so it can be sliced; show the
# first ten entries.
en_vocab.get_itos()[:10]

['<sos>', '<eos>', '<unk>', '<pad>', 'a', '.', 'in', 'the', 'on', 'man']

In [13]:
# get_stoi() ("string to int") returns a dict, which cannot be sliced.
en_vocab.get_stoi()

{'company': 1869,
 'metal': 307,
 'green': 52,
 'track': 302,
 'pours': 2356,
 'vase': 5829,
 'the': 7,
 'crashes': 3951,
 'non': 5340,
 'bonding': 4685,
 'face': 158,
 'doll': 1775,
 'garbage': 1120,
 'workers': 228,
 'winter': 446,
 'taste': 3737,
 'seems': 1207,
 'outdoors': 341,
 'her': 44,
 'activity': 1973,
 'hose': 834,
 'turbulent': 5808,
 'group': 38,
 'gentleman': 524,
 'bald': 683,
 'underway': 4509,
 'a': 4,
 'scaffolds': 5551,
 'outside': 57,
 'drivers': 3393,
 'readying': 3612,
 'jeans': 175,
 'black': 26,
 'to': 18,
 'jungle': 1235,
 'supervision': 5722,
 'or': 258,
 'all': 255,
 'examined': 4955,
 'observes': 1725,
 'i': 956,
 'ball': 68,
 'sandwich': 1293,
 '<sos>': 0,
 'cheerleaders': 927,
 'fisherman': 1421,
 'firetruck': 1419,
 'sunrise': 3726,
 'pub': 2816,
 'fires': 4989,
 'sing': 1209,
 'street': 39,
 'moped': 1504,
 'in': 6,
 'customers': 850,
 'sheepdog': 3654,
 'taping': 5747,
 'mood': 5309,
 'attire': 453,
 'kicked': 2761,
 'probably': 2812,
 'soccer': 123,
 

In [14]:
# Look up the index of a single token through the string-to-int dict.
en_vocab.get_stoi()["in"]

6

In [15]:
# The same lookup can be written by indexing the vocab directly.
en_vocab["in"]

6

In [16]:
# Vocabulary sizes: German first, then English.
len(de_vocab), len(en_vocab)

(7853, 5893)

In [17]:
# "The" contains an upper-case letter, so it cannot be in en_vocab
# (all tokens were lower-cased).
print("The" in en_vocab)
# en_vocab["The"]  # would raise an error here; a default index can be set that is returned for tokens missing from the vocab

False


In [13]:
# The index of <unk> is not necessarily 0: it depends on the token's position
# in the `specials` list. Both vocabularies must agree on the <pad> and <unk>
# indices because a single value of each is used for both languages below.
assert all(
    en_vocab[special] == de_vocab[special]
    for special in (pad_token, unk_token)
)

unk_index, pad_index = en_vocab[unk_token], en_vocab[pad_token]

In [14]:
# Looking up a token outside the vocabulary raised an error above; setting the
# default index makes such lookups return unk_index instead.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [20]:
# An out-of-vocabulary token now maps to the default (<unk>) index.
en_vocab["The"]

2

In [21]:
# A handy method, lookup_indices: takes a list of tokens and returns the
# corresponding list of indices.

tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 2, 821]

In [22]:
# The inverse method, lookup_tokens: maps indices back to tokens.
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', 'shows']

In [15]:
def numericalize_example(example, en_vocab, de_vocab):
    """Convert the token lists of one record into lists of vocabulary indices.

    Meant to be applied through ``Dataset.map``, like the tokenization step.

    Args:
        example: mapping with token lists under "en_tokens" and "de_tokens".
        en_vocab: vocabulary used for the English tokens; must provide
            ``lookup_indices(tokens)``.
        de_vocab: vocabulary used for the German tokens; same requirement.

    Returns:
        dict with the keys "en_ids" and "de_ids" holding the looked-up indices.
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [16]:
# fn_kwargs is short for "function keyword arguments"; Dataset.map forwards
# them to numericalize_example for every record.
fn_kwargs = dict(en_vocab=en_vocab, de_vocab=de_vocab)
train_data, valid_data, test_data = (
    split.map(numericalize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)

In [25]:
# Inspect the first training record after numericalization.
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [0, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 1],
 'de_ids': [0, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 1]}

In [26]:
# Sanity check: map the ids back to tokens and compare with the token list.
en_vocab.lookup_tokens(train_data[0]['en_ids'])

['<sos>',
 'two',
 'young',
 ',',
 'white',
 'males',
 'are',
 'outside',
 'near',
 'many',
 'bushes',
 '.',
 '<eos>']

In [17]:
# The ids are still plain Python ints; have the datasets return them as
# PyTorch tensors instead.
data_type = "torch"
format_columns = ["en_ids", "de_ids"]
# with_format converts the listed columns to `type`. By default only those
# columns are returned; output_all_columns=True keeps all other features too.
train_data, valid_data, test_data = (
    split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )
    for split in (train_data, valid_data, test_data)
)

In [28]:
# Inspect the first training record after the format change (ids are now tensors).
train_data[0]

{'en_ids': tensor([   0,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            1]),
 'de_ids': tensor([   0,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    1]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [18]:
def get_collate_fn(pad_index):
    """Build the collate function that pads one batch ("collate" = collect and combine).

    Written as a closure so the returned function keeps access to
    ``pad_index`` without a global variable or a dedicated class.

    Args:
        pad_index: value used to fill the padded positions.

    Returns:
        A function taking a list of examples (each providing "en_ids" and
        "de_ids" tensors) and returning a dict with the same two keys, each
        holding one padded batch tensor.
    """

    def collate_fn(batch):
        padded = {}
        for key in ("en_ids", "de_ids"):
            sequences = [example[key] for example in batch]
            # pad_sequence defaults to batch_first=False, so the result is
            # [max_length, batch_size]: every column is one padded sequence.
            # batch_first=True would give [batch_size, max_length] instead.
            padded[key] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_fn

In [19]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap a dataset in a PyTorch DataLoader that pads every batch.

    Args:
        dataset: dataset to iterate over.
        batch_size: number of examples per batch.
        pad_index: padding value handed to ``get_collate_fn``.
        shuffle: whether the loader shuffles the examples (default False).

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )


In [20]:
# For best performance the batch size should be chosen to fit the GPU memory.
batch_size = 128

# Shuffle only the training data (gives more stable training); there is no
# need to shuffle the validation and test data.
train_data_loader = get_data_loader(
    dataset=train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True
)
valid_data_loader = get_data_loader(
    dataset=valid_data, batch_size=batch_size, pad_index=pad_index
)
test_data_loader = get_data_loader(
    dataset=test_data, batch_size=batch_size, pad_index=pad_index
)

In [32]:
# Printing the batches shows that the collated data is laid out column-wise
# (one sequence per column). Left disabled:
# for data in train_data_loader:
#     print(data)

In [21]:
class Encoder(nn.Module):
    """Embeds a source sequence and runs it through a multi-layer LSTM.

    Only the final hidden and cell states are returned; they are used as the
    decoder's initial state.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Args:
            input_dim: size of the source vocabulary, i.e. the number of rows
                of the embedding matrix (one learned vector per token).
            embedding_dim: size of each embedding vector.
            hidden_dim: size of the LSTM hidden and cell states.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, used on the embeddings and passed
                to the LSTM as its ``dropout`` argument.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` (token ids, [src_length, batch_size]).

        Returns:
            (hidden, cell): the last-time-step state of every LSTM layer,
            each of shape [n_layers, batch_size, hidden_dim].
        """
        # [src_length, batch_size, embedding_dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)
        # The per-step outputs of the top layer are not needed, only the
        # final (hidden, cell) pair.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state
        return hidden, cell
        

In [24]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one input token per sequence in, scores for the next token out."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Args:
            output_dim: size of the target vocabulary; one score is predicted
                per vocabulary entry.
            embedding_dim: size of each embedding vector.
            hidden_dim: size of the LSTM hidden and cell states.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, used on the embeddings and passed
                to the LSTM as its ``dropout`` argument.
        """
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: token ids for the current step, shape [batch_size].
            hidden: LSTM hidden state, [n_layers, batch_size, hidden_dim].
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            (prediction, hidden, cell): unnormalized scores of shape
            [batch_size, output_dim] and the updated LSTM state.
        """
        # Add a sequence dimension of length 1: [1, batch_size].
        step_input = input.unsqueeze(0)
        # [1, batch_size, embedding_dim]
        embedded = self.dropout(self.embedding(step_input))
        # output: [1, batch_size, hidden_dim]; the state keeps its shape.
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # Drop the length-1 sequence dimension before the projection.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [25]:
class Seq2seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence one step at a time."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder: module returning ``(hidden, cell)`` for a source batch.
            decoder: module performing one decoding step per call.
            device: device on which the output tensor is allocated.

        Raises:
            AssertionError: if encoder and decoder differ in ``hidden_dim``
                or ``n_layers`` (the encoder state is fed to the decoder as is).
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "N_layers dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Produce next-token scores for every target position.

        Args:
            src: source token ids, [src_length, batch_size].
            trg: target token ids, [trg_length, batch_size].
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own prediction into the next step.

        Returns:
            Tensor of shape [trg_length, batch_size, trg_vocab_size]; row 0 is
            never written and stays all zeros.
        """
        trg_length, batch_size = trg.shape[0], src.shape[1]
        # One slot of scores per target position.
        outputs = torch.zeros(
            trg_length, batch_size, self.decoder.output_dim, device=self.device
        )
        hidden, cell = self.encoder(src)
        # The first decoder input is the first token of every target sequence.
        decoder_input = trg[0]
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = output
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[t]
            else:
                # output is [batch_size, vocab_size]; argmax over the vocab
                # axis gives the predicted token index of every sequence.
                decoder_input = output.argmax(1)
        return outputs

In [26]:
# Hyperparameters. The model translates German (source vocabulary) into
# English (target vocabulary).
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

encoder = Encoder(
    input_dim=input_dim,
    embedding_dim=encoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=encoder_dropout,
)

decoder = Decoder(
    output_dim=output_dim,
    embedding_dim=decoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=decoder_dropout,
)

model = Seq2seq(encoder=encoder, decoder=decoder, device=device).to(device)

In [27]:
def init_weights(m):
    """Re-initialize every parameter of ``m`` (weights and biases) uniformly in [-0.08, 0.08].

    The parameters of ``m`` and of all its submodules are covered. When this
    function is passed to ``Module.apply``, which already visits every
    submodule, nested parameters are simply re-drawn more than once; the final
    values still come from the same distribution.
    """
    for param in m.parameters():
        # The trailing underscore marks an in-place operation: the tensor is
        # overwritten directly instead of a modified copy being returned.
        nn.init.uniform_(param.data, a=-0.08, b=0.08)
# Module.apply calls init_weights on every submodule of the model and on the
# model itself.
model.apply(init_weights)

Seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [61]:
def count_parameters(model):
    """Return the total number of scalar elements in the trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            # numel() is the tensor method for "number of elements".
            total += param.numel()
    return total
# Report the count with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters.")

The model has 13,898,501 trainable parameters.


In [31]:
# Adam over all model parameters; no hyperparameters are passed, so the
# library defaults apply.
optimizer = optim.Adam(model.parameters())

In [32]:
# Cross-entropy loss; target positions equal to pad_index are ignored, so
# padding does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [64]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)`` and
            returning scores of shape [trg_length, batch_size, vocab_size].
        data_loader: iterable of batches, each a dict with "de_ids" (source)
            and "en_ids" (target) tensors of shape [seq_length, batch_size].
        optimizer: optimizer updating the model parameters.
        criterion: loss taking (scores, targets).
        clip: maximum gradient norm used for gradient clipping.
        teacher_forcing_ratio: forwarded to the model.
        device: device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    batch_losses = []
    for batch in data_loader:
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)
        optimizer.zero_grad()
        # [trg_length, batch_size, trg_vocab_size]
        logits = model(src, trg, teacher_forcing_ratio)
        vocab_size = logits.shape[-1]
        # Position 0 of the output is always zero, so skip it and flatten to
        # [(trg_length - 1) * batch_size, trg_vocab_size].
        flat_logits = logits[1:].view(-1, vocab_size)
        # Matching targets: [(trg_length - 1) * batch_size].
        flat_targets = trg[1:].view(-1)
        # The criterion receives raw (pre-softmax) class scores.
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(data_loader)
        

In [29]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the mean per-batch loss of ``model`` over ``data_loader`` without training.

    The model is put in eval mode, gradients are disabled, and teacher forcing
    is switched off (ratio 0), so the decoder always consumes its own
    predictions.

    Args:
        model: callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: iterable of batches with "de_ids" and "en_ids" tensors.
        criterion: loss taking (scores, targets).
        device: device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in data_loader:
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # No teacher forcing during evaluation.
            logits = model(src, trg, 0)
            vocab_size = logits.shape[-1]
            # Skip position 0 and flatten, exactly as in training.
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)
            batch_losses.append(criterion(flat_logits, flat_targets).item())
    return sum(batch_losses) / len(data_loader)

In [66]:
# Training configuration.
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

# Lowest validation loss seen so far; decides when to write a checkpoint.
best_valid_loss = float('inf')

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model=model,
        data_loader=train_data_loader,
        optimizer=optimizer,
        criterion=criterion,
        clip=clip,
        teacher_forcing_ratio=teacher_forcing_ratio,
        device=device,
    )
    valid_loss = evaluate_fn(
        model=model,
        data_loader=valid_data_loader,
        criterion=criterion,
        device=device,
    )
    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    # Perplexity (PPL) is exp(loss).
    print(f"\tTrain_loss:{train_loss:7.3f} | Train PPL:{np.exp(train_loss):7.3f}")
    print(f"\tValid_loss:{valid_loss:7.3f} | Valid PPL:{np.exp(valid_loss):7.3f}")

 10%|█████████▍                                                                                    | 1/10 [05:02<45:21, 302.38s/it]

	Train_loss:  5.033 | Train PPL:153.356
	Valid_loss:  4.917 | Valid PPL:136.613


 20%|██████████████████▊                                                                           | 2/10 [10:59<44:38, 334.84s/it]

	Train_loss:  4.384 | Train PPL: 80.185
	Valid_loss:  4.703 | Valid PPL:110.317


 30%|████████████████████████████▏                                                                 | 3/10 [16:58<40:20, 345.72s/it]

	Train_loss:  4.107 | Train PPL: 60.776
	Valid_loss:  4.550 | Valid PPL: 94.603


 40%|█████████████████████████████████████▌                                                        | 4/10 [22:46<34:38, 346.45s/it]

	Train_loss:  3.895 | Train PPL: 49.174
	Valid_loss:  4.326 | Valid PPL: 75.614


 50%|███████████████████████████████████████████████                                               | 5/10 [28:39<29:05, 349.03s/it]

	Train_loss:  3.702 | Train PPL: 40.546
	Valid_loss:  4.277 | Valid PPL: 72.005


 60%|████████████████████████████████████████████████████████▍                                     | 6/10 [41:47<33:12, 498.13s/it]

	Train_loss:  3.524 | Train PPL: 33.935
	Valid_loss:  4.108 | Valid PPL: 60.822


 70%|█████████████████████████████████████████████████████████████████▊                            | 7/10 [58:21<33:00, 660.23s/it]

	Train_loss:  3.396 | Train PPL: 29.841
	Valid_loss:  4.002 | Valid PPL: 54.707


 80%|█████████████████████████████████████████████████████████████████████████▌                  | 8/10 [1:14:54<25:32, 766.38s/it]

	Train_loss:  3.206 | Train PPL: 24.682
	Valid_loss:  3.930 | Valid PPL: 50.915


 90%|██████████████████████████████████████████████████████████████████████████████████▊         | 9/10 [1:31:25<13:56, 836.48s/it]

	Train_loss:  3.076 | Train PPL: 21.681
	Valid_loss:  3.829 | Valid PPL: 45.998


100%|███████████████████████████████████████████████████████████████████████████████████████████| 10/10 [1:48:05<00:00, 648.59s/it]

	Train_loss:  2.962 | Train PPL: 19.341
	Valid_loss:  3.739 | Valid PPL: 42.071





In [33]:
# Restore the checkpoint saved at the best validation loss and score it on the
# test split.
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"\tTest_loss:{test_loss:7.3f} | Test PPL:{np.exp(test_loss):7.3f}")

	Test_loss:  3.767 | Test PPL: 43.239


In [53]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        sentence: either a raw string, which is tokenized with
            ``de_nlp.tokenizer``, or an iterable of already-split tokens.
        model: object exposing ``eval()``, ``encoder(src)`` and
            ``decoder(input, hidden, cell)``.
        en_nlp: unused; kept so existing call sites keep working.
        de_nlp: tokenizer provider for the source language.
        en_vocab: target vocabulary (``lookup_indices``, ``lookup_tokens``
            and item access by token).
        de_vocab: source vocabulary (``lookup_indices``).
        lower: when truthy, the source tokens are lower-cased.
        sos_token: start-of-sequence marker.
        eos_token: end-of-sequence marker; decoding stops once it is predicted.
        device: device on which the tensors are created.
        max_output_length: maximum number of decoding steps.

    Returns:
        The generated tokens, starting with ``sos_token``. The list ends with
        ``eos_token`` only if the model predicted it within
        ``max_output_length`` steps.
    """
    # Refer to the builtin explicitly: the bare name `str` can be rebound in a
    # notebook session, and the previous `type(str)` check silently depended on
    # such a rebinding (on a fresh kernel it treated strings as token lists and
    # split them into single characters).
    import builtins

    model.eval()
    with torch.no_grad():
        # A raw string has to be tokenized first; anything else is taken as an
        # iterable of tokens.
        if isinstance(sentence, builtins.str):
            tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            tokens = list(sentence)
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = de_vocab.lookup_indices(tokens)
        # Add a trailing batch dimension: [src_length, 1].
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        # Generate one token per step, always feeding back the latest one.
        inputs = en_vocab.lookup_indices([sos_token])
        eos_index = en_vocab[eos_token]
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            # Index of the highest-scoring token.
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == eos_index:
                break
        return en_vocab.lookup_tokens(inputs)

In [47]:
# Take the first test example: the German source sentence and its English
# reference translation.
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]
sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [54]:
# Translate the sample sentence with the trained model.
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab, 
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device
)
translation

['<sos>',
 'a',
 'man',
 'with',
 'a',
 'white',
 'hat',
 'is',
 'looking',
 'at',
 '.',
 '<eos>']

In [55]:
# Translate every German sentence of the test split, one sentence at a time,
# with a progress bar.
translations = [
    translate_sentence(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    ) for example in tqdm.tqdm(test_data)
]

100%|██████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:25<00:00, 39.08it/s]


In [56]:
# Load the BLEU metric through the evaluate library.
bleu = evaluate.load('bleu')

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [57]:
# Join the generated tokens of each translation into one string for BLEU.
# The leading <sos> is always dropped. The last token is dropped only when it
# really is <eos>: a translation that hit the length limit without emitting
# <eos> ends in a regular word, which must be kept.
predictions = [
    " ".join(translation[1:-1] if translation[-1] == eos_token else translation[1:])
    for translation in translations
]
# BLEU expects a list of references per prediction; there is one reference each.
references = [[example['en']] for example in test_data]

In [58]:
# Compare the first prediction with its reference.
predictions[0], references[0]

('a man with a white hat is looking at .',
 ['A man in an orange hat starring at something.'])

In [62]:
def get_tokenizer_fn(nlp, lower):
    """Build a ``str -> list[str]`` tokenizer function around ``nlp.tokenizer``.

    Args:
        nlp: object whose ``tokenizer(text)`` yields items with a ``.text``
            attribute.
        lower: when truthy, the returned function lower-cases every token.

    Returns:
        A function that takes a string and returns its list of token strings.
    """

    def tokenizer_fn(s):
        texts = [token.text for token in nlp.tokenizer(s)]
        return [text.lower() for text in texts] if lower else texts

    return tokenizer_fn

In [63]:
# Build the English tokenizer function and compare how the first prediction
# and its reference are tokenized.
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)
tokenizer_fn(predictions[0]), tokenizer_fn(references[0][0])

(['a', 'man', 'with', 'a', 'white', 'hat', 'is', 'looking', 'at', '.'],
 ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.'])

In [64]:
# Compute BLEU over all test predictions, tokenizing both predictions and
# references with tokenizer_fn.
results = bleu.compute(
    predictions=predictions,
    references=references,
    tokenizer=tokenizer_fn
)

In [65]:
# Show the BLEU score together with the other values returned by compute().
results

{'bleu': 0.13863526787905198,
 'precisions': [0.483822601010101,
  0.1949965729952022,
  0.09464017991004497,
  0.0467328370554177],
 'brevity_penalty': 0.9699983984332067,
 'length_ratio': 0.9704395772706387,
 'translation_length': 12672,
 'reference_length': 13058}