In [4]:
import json

import torch
from torch.utils.data import Dataset, random_split

from data import TRANS

# Split configuration.
# NOTE(review): max_dataset_size is not referenced in this cell; it is kept
# because code outside this cell may read it — confirm before removing.
max_dataset_size = 220000
train_dataset_size = 200000
# Fixed seed so the train/valid partition (and therefore train_data[0], which
# later cells print) is the same on every Restart & Run All.
split_seed = 42

data = TRANS('../../data/translation2019zh/translation2019zh_train.json')

# random_split requires the lengths to sum to exactly len(data). Deriving the
# validation size from the loaded dataset avoids a hard-coded count that
# raises ValueError as soon as the file or the loader changes.
valid_dataset_size = len(data) - train_dataset_size
if valid_dataset_size <= 0:
    raise ValueError(
        f'dataset has {len(data)} examples, which leaves no validation data '
        f'after taking {train_dataset_size} training examples'
    )

train_data, valid_data = random_split(
    data,
    [train_dataset_size, valid_dataset_size],
    generator=torch.Generator().manual_seed(split_seed),
)
# The corpus' own "valid" file is used as the held-out test set.
test_data = TRANS('../../data/translation2019zh/translation2019zh_valid.json')
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
print(next(iter(train_data)))

train set size: 200000
valid set size: 20001
test set size: 39323
{'english': 'Part 2 discusses some approaches to tracking down memory problems.', 'chinese': '第 2 部分将讨论一些跟踪内存问题的方法。'}


In [5]:
from transformers import AutoTokenizer

# Hub identifier of the Helsinki-NLP OPUS-MT zh -> en checkpoint; the same
# name is used to look up the matching tokenizer files.
model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"

# Loads the tokenizer for that checkpoint (files are downloaded on first use).
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

Downloading tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]



In [8]:
# Take the first training pair and split it into source / target text.
first_pair = train_data[0]
zh_sentence, en_sentence = first_pair["chinese"], first_pair["english"]

# Source side (Chinese): the tokenizer's default mode.
inputs = tokenizer(zh_sentence)

# Note: the target text must be tokenized inside the context manager.
# Without it the English sentence goes through the source-side tokenizer:
# wrong_targets = tokenizer(en_sentence)
# print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))

# Target side (English): switch the tokenizer into target mode for this call.
with tokenizer.as_target_tokenizer():
    targets = tokenizer(en_sentence)

# Show the token strings for the source first, then the target.
for encoded in (inputs, targets):
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))

['▁第', '▁2', '▁', '部分', '将', '讨论', '一些', '跟踪', '内', '存', '问题', '的方法', '。', '</s>']
['▁Part', '▁2', '▁discusses', '▁some', '▁approaches', '▁to', '▁tracking', '▁down', '▁memory', '▁problems', '.', '</s>']


In [9]:
import torch

# Truncation limits (in tokens) for the source and target sequences.
max_input_length = 128
max_target_length = 128

# Small demo batch: the first four training pairs.
batch_pairs = [train_data[s_idx] for s_idx in range(4)]
inputs = [pair["chinese"] for pair in batch_pairs]
targets = [pair["english"] for pair in batch_pairs]

# Encode the source sentences, padded to the longest one in the batch.
model_inputs = tokenizer(
    inputs,
    padding=True,
    max_length=max_input_length,
    truncation=True,
    return_tensors="pt"
)
# Encode the target sentences in target mode; only the token ids are needed.
with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        targets,
        padding=True,
        max_length=max_target_length,
        truncation=True,
        return_tensors="pt"
    )["input_ids"]

# Replace everything after the end-of-sequence token with -100 (the default
# ignore_index of torch.nn.CrossEntropyLoss) so padding does not contribute
# to the loss.
# torch.where returns the row AND column of every match; pairing them keeps
# each mask on its own row even if a row has no EOS or more than one, where
# enumerating the column list alone would shift later rows out of alignment.
eos_row_index, end_token_index = torch.where(labels == tokenizer.eos_token_id)
for row, end_idx in zip(eos_row_index.tolist(), end_token_index.tolist()):
    labels[row, end_idx + 1:] = -100

print('batch_X shape:', {k: v.shape for k, v in model_inputs.items()})
print('batch_y shape:', labels.shape)
print(model_inputs)
print(labels)

batch_X shape: {'input_ids': torch.Size([4, 31]), 'attention_mask': torch.Size([4, 31])}
batch_y shape: torch.Size([4, 35])
{'input_ids': tensor([[  440,   413,     7,  1054,    96,   621,   617,  7119,   475,  6060,
           112,  3465,     9,     0, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000],
        [    7, 13874,   521, 10612, 62719,     2,   176, 13874,     9,     0,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000],
        [  196,   288, 21116,   449, 18051,  1924,  5318, 16539,  1447, 31324,
          2702,  7035, 19320,  6781,  5978, 10161,   864,     9,   318,   397,
          2220,  1437,   316,  1336,   449,   430,  1191,  8174, 17744,     9,
             0],
        [11363,  2271,  1491, 17615, 39577,   272,  1744,  2795,   265, 38809,
          2452,     2