## Data sourcing and processing

In [21]:
import json

import torch
from torch.utils.data import Dataset,random_split

class trans_data(Dataset):
    """Map-style dataset over a JSON Lines translation corpus.

    Every non-blank line of the file is parsed as one JSON document and kept
    in memory under a consecutive integer index starting at 0.
    """

    def __init__(self, data_file):
        """Eagerly load all samples from ``data_file``.

        Args:
            data_file: Path to a UTF-8 encoded file with one JSON document per line.
        """
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` into an ``{index: sample}`` dict.

        Blank or whitespace-only lines are skipped instead of being handed to
        ``json.loads`` (which would raise on an empty string). Indices are
        assigned from the number of samples stored so far, so they stay
        contiguous even when lines are skipped.

        Args:
            data_file: Path to a UTF-8 encoded file with one JSON document per line.

        Returns:
            dict mapping ``0..n-1`` to the parsed JSON value of each non-blank line.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        data = {}
        with open(data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate empty lines (e.g. extra newlines at end of file).
                    continue
                data[len(data)] = json.loads(line)
        return data

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the parsed sample stored under ``index``."""
        return self.data[index]

# Fixed seed so the train/validation split is the same on every Restart & Run All.
SPLIT_SEED = 42

data = trans_data('../input/translation2019zh/translation2019zh_train.json')
# 90% of the samples (rounded down) go to training; the remainder is the validation split,
# so the two lengths always sum to len(data).
train_size = int(len(data) * 0.9)
valid_size = len(data) - train_size
train_data, valid_data = random_split(
    data,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# The file named "..._valid.json" is loaded separately and used as the test set.
test_data = trans_data('../input/translation2019zh/translation2019zh_valid.json')


In [22]:
# Report the size of every split (one per line), then show the first training sample.
print(
    f'train set size: {len(train_data)}',
    f'valid set size: {len(valid_data)}',
    f'test set size: {len(test_data)}',
    sep='\n',
)
print(next(iter(train_data)))

train set size: 4645290
valid set size: 516144
test set size: 39323
{'english': 'Results of observation showed that applying the wax-chromium Replica technique to the transmission electron microscope was an effective way…', 'chinese': '观察结果说阴，用于透射电镜的复型技术是研究木材细胞壁表面形貌的一种具有高分辨率的有效方法。'}


## 0x02 data processing

In [23]:
from transformers import AutoTokenizer 
# Chinese-to-English translation checkpoint published by Helsinki-NLP.
model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
zh_sentence = train_data[0]['chinese']
en_sentence = train_data[0]['english']

# NOTE(review): `input` shadows the Python builtin of the same name; the name is kept
# unchanged in case later cells read it.
input = tokenizer(zh_sentence)
# `text_target=` encodes the text with the target-language side of the tokenizer; it
# replaces the deprecated `tokenizer.as_target_tokenizer()` context manager.
target = tokenizer(text_target=en_sentence)

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]



In [24]:
import torch

# Maximum number of tokens kept per source / target sentence (longer ones are truncated).
max_input_length = 128
max_target_length = 128

# Small demo batch: the first four training pairs.
inputs = [train_data[s_idx]["chinese"] for s_idx in range(4)]
targets = [train_data[s_idx]["english"] for s_idx in range(4)]

model_inputs = tokenizer(
    inputs,
    padding=True,
    max_length=max_input_length,
    truncation=True,
    return_tensors="pt"
)
# `text_target=` encodes with the target-language side of the tokenizer; it replaces the
# deprecated `tokenizer.as_target_tokenizer()` context manager.
labels = tokenizer(
    text_target=targets,
    padding=True,
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt"
)["input_ids"]

# Overwrite every position strictly after a row's first EOS token with -100
# (presumably so the padding is ignored by the loss - confirm against the training loop).
# The mask is computed per row: `cumsum - is_eos` counts the EOS tokens seen *before* each
# position, so rows with no EOS are left untouched and rows with several EOS tokens are
# still masked from their first one, instead of relying on exactly one EOS per row.
is_eos = (labels == tokenizer.eos_token_id).long()
after_first_eos = (is_eos.cumsum(dim=1) - is_eos) > 0
labels[after_first_eos] = -100

print('batch_X shape:', {k: v.shape for k, v in model_inputs.items()})
print('batch_y shape:', labels.shape)
print(model_inputs)
print(labels)

batch_X shape: {'input_ids': torch.Size([4, 31]), 'attention_mask': torch.Size([4, 31])}
batch_y shape: torch.Size([4, 29])
{'input_ids': tensor([[    7,  6215,  1686,   300, 17944,     2,  1022, 12556,  7233,  3534,
         25472,    11,  7597,  3514,   653,    69,   751, 12835, 23293, 25483,
         21603,  8054, 17021,  7130,   744,  1072, 42437,  7538,  1187,     9,
             0],
        [    7,  5583,  7262,    11, 14455, 43322, 13699, 25900,    11, 40144,
         52319,    36,  4177, 29137, 47797,   188,  8181,  8223, 12072,  6534,
         24238, 10154,     9,     0, 65000, 65000, 65000, 65000, 65000, 65000,
         65000],
        [ 5233,    96, 23758,   454,  2923, 23011,  3410,   268,     7,  2658,
          3094,     9,     0, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000],
        [  335,  4718,   747, 12856,  1488,  4655, 62643,  8424, 11789,     9,
             0, 65000