In [1]:
from torch.utils.data import Dataset, random_split
import json
from data import TRANS

max_dataset_size = 220000
train_dataset_size = 200000
valid_dataset_size = 20001

data = TRANS('../../data/translation2019zh/translation2019zh_train.json')
train_data, valid_data = random_split(data, [train_dataset_size, valid_dataset_size])
test_data = TRANS('../../data/translation2019zh/translation2019zh_valid.json')    
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
print(next(iter(train_data)))

train set size: 200000
valid set size: 20001
test set size: 39323
{'english': '“We did find other tools that capture and analyze both dynamic and static queries, but they were limited to single queries,” Matthews says.', 'chinese': '“我们未发现其他工具能够同时采集和分析动态和静态查询，它们仅限于单种查询，” Matthews 说。'}


# 数据预处理

In [2]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [3]:
zh_sentence = train_data[0]["chinese"]
en_sentence = train_data[0]["english"]

inputs = tokenizer(zh_sentence)

# # 注意添加上下文管理器
# wrong_targets = tokenizer(en_sentence)
# print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))

with tokenizer.as_target_tokenizer():
    targets = tokenizer(en_sentence)

print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁“', '我们', '未', '发现', '其他', '工具', '能够', '同时', '采集', '和分析', '动态', '和', '静', '态', '查询', ',', '它们', '仅限于', '单', '种', '查询', ',', '”', '▁Ma', 't', 'th', 'ew', 's', '▁', '说', '。', '</s>']
['▁“', 'We', '▁did', '▁find', '▁other', '▁tools', '▁that', '▁capture', '▁and', '▁analyze', '▁both', '▁dynamic', '▁and', '▁static', '▁queries', ',', '▁but', '▁they', '▁were', '▁limited', '▁to', '▁single', '▁queries', ',', '”', '▁Matthew', 's', '▁says', '.', '</s>']




In [4]:
import torch

max_input_length = 128
max_target_length = 128

inputs = [train_data[s_idx]["chinese"] for s_idx in range(4)]
targets = [train_data[s_idx]["english"] for s_idx in range(4)]

model_inputs = tokenizer(
    inputs, 
    padding=True, 
    max_length=max_input_length, 
    truncation=True,
    return_tensors="pt"
)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        targets, 
        padding=True, 
        max_length=max_target_length, 
        truncation=True,
        return_tensors="pt"
    )["input_ids"]

end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
for idx, end_idx in enumerate(end_token_index):
    labels[idx][end_idx+1:] = -100 # 是否可以认为，pad设置为-100是为了让softmax之后的权重置为0？

print('batch_X shape:', {k: v.shape for k, v in model_inputs.items()})
print('batch_y shape:', labels.shape)
print(model_inputs)
print(labels)

batch_X shape: {'input_ids': torch.Size([4, 41]), 'attention_mask': torch.Size([4, 41])}
batch_y shape: torch.Size([4, 34])
{'input_ids': tensor([[  196,   230,   997,  2075,   628,  1904,   577,  1120, 29517, 10116,
          9241,    16, 13271, 20797, 12586,     2,   896, 14273,  3681,  4657,
         12586,     2,   215,  4403,    59,   496, 10676,    22,     7,   300,
             9,     0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000],
        [    7,  5508,    11, 21332, 63068,  3244,   300,    35,  2272,    65,
         21094,  1003,  2476,     2,    69, 23809,  1444,   149, 63275,  9602,
            11, 52918,     2,  1444,  3848,    11, 15025, 17020,  8648, 51373,
             2, 47969,    11,  7356, 28239,  1345,    11, 21938,  8906,     9,
             0],
        [ 3954,     2,  2130,    65, 44997,    11,  8283,  5518,  3215,   408,
           486,     2,   330,  2130,  2433,  4925,     2,  1027, 16285, 17721,
             9,     0, 65000, 65000, 6

In [5]:
from transformers import AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

max_input_length = 128
max_target_length = 128

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample['chinese'])
        batch_targets.append(sample['english'])
    batch_data = tokenizer(
        batch_inputs,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors='pt'
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch_targets,
            padding=True,
            max_length=max_target_length,
            truncation=True,
            return_tensors='pt'
        )["input_ids"]
        batch_data["decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(labels)
        end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
        for idx, end_idx in enumerate(end_token_index):
            labels[idx][end_idx + 1:] = -100
        batch_data['labels'] = labels
    return batch_data

train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=32, shuffle=False, collate_fn=collote_fn)

Using cuda device


In [6]:
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'])
batch shape: {'input_ids': torch.Size([32, 46]), 'attention_mask': torch.Size([32, 46]), 'decoder_input_ids': torch.Size([32, 48]), 'labels': torch.Size([32, 48])}
{'input_ids': tensor([[    7,  5700,  3321,  ..., 65000, 65000, 65000],
        [    7,  2620,   207,  ..., 65000, 65000, 65000],
        [19979, 22446,   142,  ..., 65000, 65000, 65000],
        ...,
        [ 6302,    51,  1977,  ..., 65000, 65000, 65000],
        [    7, 39443,  9913,  ..., 65000, 65000, 65000],
        [ 1243,  4510,  6029,  ..., 65000, 65000, 65000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'decoder_input_ids': tensor([[65000,    24,  5090,  ..., 65000, 65000, 65000],
        [65000, 16591,  2415,  ..., 65000, 65000, 65000],
        [65



# 训练代码 

In [7]:
from modeling import MarianForMT # 为什么这里直接import另外一个文件中的类会报错？显示缺失参数？
from transformers import AutoConfig
from torch import nn
from transformers.models.marian import MarianPreTrainedModel, MarianModel, MarianMTModel
import torch

# class MarianForMT(MarianMTModel):
#     def __init__(self, config):
#         super().__init__(config)
#         self.model = MarianMTModel(config=config)
#         target_vocab_size = config.decoder_vocab_size
#         # 这行代码的意思是在PyTorch模型中注册一个名为"final_logits_bias"的缓冲区，并初始化为一个全零的张量，维度为(1, target_vocab_size)。
#         # 这个缓冲区可以被模型访问和使用，通常用于存储模型的参数或其他需要持久化的数据。
#         self.register_buffer("final_logits_bias", torch.zeros((1, target_vocab_size)))
#         self.lm_head = nn.Linear(config.d_model, target_vocab_size, bias=False)
#         self.post_init() # 这个方法在对象初始化完成后自动调用，可以用来执行一些需要在对象创建后立即执行的操作。
        
#     def forward(self, x):
#         output = self.model(**x)
#         sequence_output = output.last_hidden_state
#         lm_logits = self.lm_head(sequence_output) + self.final_logits_bias
#         return lm_logits
    
#     def other_func(self):
#         pass


config = AutoConfig.from_pretrained(model_checkpoint)
model = MarianForMT.from_pretrained(model_checkpoint, config=config).to(device)
print(model)

Some weights of the model checkpoint at Helsinki-NLP/opus-mt-zh-en were not used when initializing MarianForMT: ['model.decoder.layers.4.encoder_attn.q_proj.weight', 'model.decoder.layers.5.encoder_attn.out_proj.bias', 'model.encoder.layers.0.self_attn.q_proj.weight', 'model.encoder.layers.4.self_attn.q_proj.bias', 'model.decoder.layers.3.self_attn.k_proj.weight', 'model.encoder.layers.0.fc2.weight', 'model.encoder.layers.2.fc2.bias', 'model.decoder.layers.2.self_attn.k_proj.weight', 'model.decoder.layers.2.encoder_attn.q_proj.weight', 'model.decoder.layers.0.self_attn.q_proj.weight', 'model.decoder.layers.0.encoder_attn.out_proj.weight', 'model.encoder.layers.1.fc2.bias', 'model.encoder.layers.3.final_layer_norm.bias', 'model.decoder.layers.1.self_attn.out_proj.bias', 'model.decoder.layers.2.fc2.weight', 'model.decoder.layers.3.self_attn.k_proj.bias', 'model.encoder.layers.1.fc2.weight', 'model.encoder.layers.1.self_attn.q_proj.bias', 'model.decoder.layers.0.self_attn.v_proj.bias', 'm

MarianForMT(
  (model): MarianMTModel(
    (model): MarianModel(
      (shared): Embedding(65001, 512, padding_idx=65000)
      (encoder): MarianEncoder(
        (embed_tokens): Embedding(65001, 512, padding_idx=65000)
        (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
        (layers): ModuleList(
          (0-5): 6 x MarianEncoderLayer(
            (self_attn): MarianAttention(
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (activation_fn): SiLUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, 

# 模型评测

In [8]:
from sacrebleu.metrics import BLEU

predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
bad_predictions_1 = ["This This This This"]
bad_predictions_2 = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

bleu = BLEU()
print(bleu.corpus_score(predictions, references).score)
print(bleu.corpus_score(bad_predictions_1, references).score)
print(bleu.corpus_score(bad_predictions_2, references).score)

46.750469682990165
1.683602693167689
0.0


In [9]:
from sacrebleu.metrics import BLEU

predictions = [
    "我在复旦大学学习摆烂，复旦大学很sb。"
]

# references = [
#     [
#         "我在环境优美的复旦大学学习躺平。"
#     ]
# ]
references = [
    "我在环境优美的复旦大学学习躺平。"
]

bleu = BLEU(tokenize='zh')
print(f'BLEU: {bleu.corpus_score(predictions, references).score}')
bleu = BLEU()
print(f'wrong BLEU: {bleu.corpus_score(predictions, references).score}')

BLEU: 2.1476912089159055
wrong BLEU: 0.0


In [10]:
model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
orig_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
orig_model = orig_model.to(device)

sentence = ["我在环境优美的复旦大学学习躺平。", "我在环境优美的复旦大学卷生卷死。"]
sentence_inputs = tokenizer(sentence, return_tensors="pt", padding=True).to(device)
sentence_generated_tokens = orig_model.generate(
    sentence_inputs["input_ids"],
    attention_mask=sentence_inputs["attention_mask"],
    max_length=128
)
# sentence_decoded_pred = tokenizer.decode(sentence_generated_tokens[1], skip_special_tokens=True)
sentence_decoded_pred = tokenizer.batch_decode(sentence_generated_tokens, skip_special_tokens=True)
print(sentence_decoded_pred)



['I studied flattened at the University of Rehabilitation in a beautiful environment.', 'I was born and died at the University of Rehabilitation in a beautiful environment.']


In [11]:
from transformers import AdamW, get_scheduler
from run_sim_cls import train_loop, test_loop

learning_rate = 2e-5
epoch_num = 3
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=epoch_num * len(train_dataloader),
    num_warmup_steps=0
)
total_loss = 0
best_bleu = 0
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    valid_bleu = test_loop(tokenizer, valid_dataloader, model, mode='Valid')
    if valid_bleu > best_bleu:
        best_bleu = valid_bleu
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'../model/seq2seq/epoch_{t+1}_valid_bleu_model_weights.bin')
print("Done!")

Epoch 1/3
-------------------------------




  0%|          | 0/6250 [00:00<?, ?it/s]

TypeError: linear(): argument 'input' (position 1) must be Tensor, not NoneType

In [None]:
import numpy as np
import tqdm

test_data = TRANS('../../data/translation2019zh/translation2019zh_valid.json')
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collote_fn)

import json

model.load_state_dict(torch.load('../model/seq2seq/epoch_1_valid_bleu_model_weights.bin'))

model.eval()
with torch.no_grad():
    print('evaluating on test set...')
    sources, preds, labels = [], [], []
    for batch_data in tqdm(test_dataloader):
        batch_data = batch_data.to(device)
        generated_tokens = model.generate(
            batch_data["input_ids"],
            attention_mask=batch_data["attention_mask"],
            max_length=max_target_length,
        ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_sources = tokenizer.batch_decode(
            batch_data["input_ids"].cpu().numpy(), 
            skip_special_tokens=True, 
            use_source_tokenizer=True
        )
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        sources += [source.strip() for source in decoded_sources]
        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    bleu_score = bleu.corpus_score(preds, labels).score
    print(f"Test BLEU: {bleu_score:>0.2f}\n")
    results = []
    print('saving predicted results...')
    for source, pred, label in zip(sources, preds, labels):
        results.append({
            "sentence": source, 
            "prediction": pred, 
            "translation": label[0]
        })
    with open('../model/seq2seq/test_data_pred.json', 'wt', encoding='utf-8') as f:
        for exapmle_result in results:
            f.write(json.dumps(exapmle_result, ensure_ascii=False) + '\n')

# 解码

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)