In [48]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
import random 
import spacy
import datasets 
import torchtext
import tqdm
import evaluate

In [49]:
seed = 1234
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)
# Ask cuDNN for deterministic behaviour so that repeated runs give the same
# results and the experiment can be reproduced.
torch.backends.cudnn.deterministic = True

#### Preparing Data
- 获取数据集
- tokenize, 加入special tokens
- train_data's vocabulary
- numericalize
- with_format 修改index的数据类型

In [50]:
# Load the Multi30k dataset and pull out its three splits.
dataset = datasets.load_dataset("bentrevett/multi30k")
# print(dataset)
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [51]:
# Each example in the dataset is a dict: the key is the language code and the
# value is the sentence in that language.
print(train_data[0])
type(train_data[0])

{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}


dict

In [52]:
# Tokenization
# "Token" is a broader term than "word": it covers words, numbers,
# punctuation and any other special symbol.
# Load the two spaCy pipelines used to process English and German text.
en_nlp, de_nlp = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "de_core_news_sm")
)

In [53]:
# Quick demo of the English tokenizer on a sample sentence.
# The sample is bound to `sentence` rather than `str`: rebinding `str` would
# shadow the builtin type for the rest of the kernel session and break any
# later `str(...)` call.
sentence = "What a lovely day it is today!"
[token.text for token in en_nlp.tokenizer(sentence)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [54]:
def tokenizer_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentences of one dataset example.

    Args:
        example: mapping with the raw sentences under the keys "en" and "de".
        en_nlp: object whose ``tokenizer`` splits English text into tokens
            exposing a ``.text`` attribute.
        de_nlp: same as ``en_nlp`` but for German text.
        max_length: keep at most this many tokens per sentence (applied
            before the start/end markers are added).
        lower: if truthy, lowercase every token.
        sos_token: marker placed at the start of each token list.
        eos_token: marker placed at the end of each token list.

    Returns:
        A dict with the keys "en_tokens" and "de_tokens", each holding the
        list of token strings wrapped in ``sos_token`` / ``eos_token``.
    """

    def _prepare(nlp, sentence):
        # Tokenize, truncate, optionally lowercase, then add the markers.
        words = [tok.text for tok in nlp.tokenizer(sentence)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }

In [55]:
# Tokenization settings shared by all three splits.
max_length = 1000
sos_token = "<sos>"
eos_token = "<eos>"
lower = True

fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# map() calls tokenizer_example on every example, passing fn_kwargs as keyword
# arguments, and adds the returned key-value pairs to the dataset.
train_data, valid_data, test_data = (
    split.map(tokenizer_example, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [56]:
# Show the first training example; it now also carries en_tokens / de_tokens.
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [57]:
# Vocabularies
# A vocabulary maps every token to an integer index; the network is fed these
# integers rather than the token strings.
# Ideally the vocabulary would contain every token that can occur, but a token
# may show up in valid_data / test_data without ever appearing in train_data.
# Such tokens are represented by <unk> (the "unknown" token), which has one
# fixed index and stands in for anything missing from the vocabulary.

# Tokens that occur fewer than min_freq times in train_data are left out of
# the vocabulary and are therefore treated as <unk>; this deliberately puts
# <unk> tokens into the training data.
min_freq = 2
unk_token = "<unk>"
# Sentences are fed in batches; every sentence in a batch is padded with
# <pad> up to the length of the longest sentence in that batch.
pad_token = "<pad>"
special_tokens = [sos_token, eos_token, unk_token, pad_token]

# Build the vocabularies from train_data only, to avoid leaking information
# from the validation / test splits.
en_vocab, de_vocab = (
    torchtext.vocab.build_vocab_from_iterator(
        train_data[token_column],
        min_freq=min_freq,
        specials=special_tokens,
    )
    for token_column in ("en_tokens", "de_tokens")
)

In [58]:
# get_itos() is the index-to-string mapping; it is stored as a list, so it can
# be sliced. These are its first ten entries.
en_vocab.get_itos()[:10]

['<sos>', '<eos>', '<unk>', '<pad>', 'a', '.', 'in', 'the', 'on', 'man']

In [59]:
# get_stoi() is the string-to-index mapping; it is a dict, so it cannot be
# sliced.
en_vocab.get_stoi()

{'company': 1869,
 'metal': 307,
 'green': 52,
 'track': 302,
 'pours': 2356,
 'vase': 5829,
 'the': 7,
 'crashes': 3951,
 'non': 5340,
 'bonding': 4685,
 'face': 158,
 'doll': 1775,
 'garbage': 1120,
 'workers': 228,
 'winter': 446,
 'taste': 3737,
 'seems': 1207,
 'outdoors': 341,
 'her': 44,
 'activity': 1973,
 'hose': 834,
 'turbulent': 5808,
 'group': 38,
 'gentleman': 524,
 'bald': 683,
 'underway': 4509,
 'a': 4,
 'scaffolds': 5551,
 'outside': 57,
 'drivers': 3393,
 'readying': 3612,
 'jeans': 175,
 'black': 26,
 'to': 18,
 'jungle': 1235,
 'supervision': 5722,
 'or': 258,
 'all': 255,
 'examined': 4955,
 'observes': 1725,
 'i': 956,
 'ball': 68,
 'sandwich': 1293,
 '<sos>': 0,
 'cheerleaders': 927,
 'fisherman': 1421,
 'firetruck': 1419,
 'sunrise': 3726,
 'pub': 2816,
 'fires': 4989,
 'sing': 1209,
 'street': 39,
 'moped': 1504,
 'in': 6,
 'customers': 850,
 'sheepdog': 3654,
 'taping': 5747,
 'mood': 5309,
 'attire': 453,
 'kicked': 2761,
 'probably': 2812,
 'soccer': 123,
 

In [60]:
# Look up the index of a single token through the string-to-index dict.
en_vocab.get_stoi()["in"]

6

In [61]:
# The same lookup can be written by indexing the vocab object directly.
en_vocab["in"]

6

In [62]:
# Vocabulary sizes, German first and English second.
len(de_vocab), len(en_vocab)

(7853, 5893)

In [63]:
# "The" contains an uppercase letter, so it cannot be in en_vocab.
print("The" in en_vocab)
# en_vocab["The"] # raises an error; a default index can be set that is returned for words missing from the vocab

False


In [64]:
# The index of <unk> is not necessarily 0; it depends on the token's position
# in the `specials` argument.
# Check that both vocabularies assign the same index to each special token.
assert en_vocab[pad_token] == de_vocab[pad_token]
assert en_vocab[unk_token] == de_vocab[unk_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [65]:
# Without a default index, looking up a token outside the vocabulary raises an
# error; set unk_index as the index returned for such tokens.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [66]:
# With the default index set, an out-of-vocabulary token maps to unk_index.
en_vocab["The"]

2

In [67]:
# A handy method, lookup_indices, takes a list of tokens and returns the
# corresponding indices.

tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 2, 821]

In [68]:
# The counterpart, lookup_tokens, maps a list of indices back to tokens.
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', 'shows']

In [69]:
def numericalize_example(example, en_vocab, de_vocab):
    """Convert the token lists of one example into lists of vocabulary indices.

    Written to be applied through ``map`` in the same way as the
    tokenization function.

    Args:
        example: mapping with token lists under "en_tokens" and "de_tokens".
        en_vocab: vocabulary whose ``lookup_indices`` maps English tokens
            to indices.
        de_vocab: vocabulary whose ``lookup_indices`` maps German tokens
            to indices.

    Returns:
        A dict with the keys "en_ids" and "de_ids".
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [70]:
# fn_kwargs is short for "function keyword arguments".
fn_kwargs = dict(en_vocab=en_vocab, de_vocab=de_vocab)
train_data, valid_data, test_data = (
    split.map(numericalize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [71]:
# Show the first training example; it now also carries en_ids / de_ids.
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [0, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 1],
 'de_ids': [0, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 1]}

In [72]:
# Sanity check: mapping the ids back to tokens should give the original tokens.
en_vocab.lookup_tokens(train_data[0]['en_ids'])

['<sos>',
 'two',
 'young',
 ',',
 'white',
 'males',
 'are',
 'outside',
 'near',
 'many',
 'bushes',
 '.',
 '<eos>']

In [73]:
# The indices are currently plain Python ints; convert them to the "torch"
# format for use with PyTorch.
data_type = "torch"
format_columns = ["en_ids", "de_ids"]
# with_format converts the listed columns to the given type. By default only
# the features named in `columns` are returned; output_all_columns=True keeps
# all the other features as well.
train_data, valid_data, test_data = (
    split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )
    for split in (train_data, valid_data, test_data)
)

In [74]:
# Show the first training example after with_format.
train_data[0]

{'en_ids': tensor([   0,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            1]),
 'de_ids': tensor([   0,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    1]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [75]:
# Pad every batch ("collate" means collect and combine).

# get_collate_fn is written as a closure so that the inner function keeps
# access to pad_index without a global variable or a dedicated class.
def get_collate_fn(pad_index):
    """Return a collate function that pads each batch with ``pad_index``.

    Args:
        pad_index: value used to fill sequences shorter than the longest
            sequence in the batch.

    Returns:
        A function taking a batch (a list of examples holding "en_ids" and
        "de_ids" tensors) and returning a dict with the padded "en_ids" and
        "de_ids" tensors, one per language.
    """

    def collate_fn(batch):
        def _pad_column(key):
            # pad_sequence is called with its default batch_first=False, so
            # the result is [max_length, batch_size]: each column is one
            # padded sequence. batch_first=True would instead give
            # [batch_size, max_length], one sequence per row.
            sequences = [example[key] for example in batch]
            return nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)

        return {key: _pad_column(key) for key in ("en_ids", "de_ids")}

    return collate_fn

In [76]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a PyTorch DataLoader that pads each batch.

    Args:
        dataset: dataset to iterate over.
        batch_size: number of examples per batch.
        pad_index: padding value handed to ``get_collate_fn``.
        shuffle: whether the DataLoader shuffles the data (default False).

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )


In [77]:
# For best performance, batch_size should be chosen to fit the GPU's memory.
batch_size = 128

# Shuffle only the training data (the author notes this gives more stable
# results); shuffling the validation and test data is unnecessary.
train_data_loader, valid_data_loader, test_data_loader = (
    get_data_loader(split, batch_size, pad_index, shuffle=shuffle_split)
    for split, shuffle_split in (
        (train_data, True),
        (valid_data, False),
        (test_data, False),
    )
)

In [80]:
# Print the batches to show that the collated data is laid out column-wise.
# for data in train_data_loader:
#     print(data)

{'en_ids': tensor([[  0,   0,   0,  ...,   0,   0,   0],
        [  4,   4,  46,  ...,   4,   4,   4],
        [ 14,   9, 120,  ...,  14, 224,  33],
        ...,
        [  3,   3,   3,  ...,   3,   3,   3],
        [  3,   3,   3,  ...,   3,   3,   3],
        [  3,   3,   3,  ...,   3,   3,   3]]), 'de_ids': tensor([[  0,   0,   0,  ...,   0,   0,   0],
        [  8,   5,   8,  ...,   8,   5,   5],
        [ 16,  13,  16,  ...,  16, 550,  25],
        ...,
        [  3,   3,   3,  ...,   3,   3,   3],
        [  3,   3,   3,  ...,   3,   3,   3],
        [  3,   3,   3,  ...,   3,   3,   3]])}
{'en_ids': tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 4, 46,  4,  ...,  4,  4, 24],
        [34,  9,  9,  ..., 24,  9, 55],
        ...,
        [ 3,  3,  3,  ...,  3,  3,  3],
        [ 3,  3,  3,  ...,  3,  3,  3],
        [ 3,  3,  3,  ...,  3,  3,  3]]), 'de_ids': tensor([[  0,   0,   0,  ...,   0,   0,   0],
        [  5,   5,   5,  ...,   5,   6,   5],
        [ 26,  13,  13,  ...,