In [1]:
from torch.utils.data import Dataset, random_split
import json
from data import TRANS

max_dataset_size = 220000
train_dataset_size = 200000
valid_dataset_size = 20001

data = TRANS('../../data/translation2019zh/translation2019zh_train.json')
train_data, valid_data = random_split(data, [train_dataset_size, valid_dataset_size])
test_data = TRANS('../../data/translation2019zh/translation2019zh_valid.json')    
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
print(next(iter(train_data)))

train set size: 200000
valid set size: 20001
test set size: 39323
{'english': 'Zock said it\'s too soon to tell people to swear off spray cleaners altogether, but added, "Nevertheless, from the perspective of precaution, we may recommend to use sprays only when really necessary.', 'chinese': 'Zock说，现在就告诫人们远离喷雾清洁剂还为时尚早，但他补充说：“毫无疑问，从预防的角度来看，我们推荐仅仅在必要的时候才使用喷雾剂。'}


# 数据预处理

In [2]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [3]:
zh_sentence = train_data[0]["chinese"]
en_sentence = train_data[0]["english"]

inputs = tokenizer(zh_sentence)

# # 注意添加上下文管理器
# wrong_targets = tokenizer(en_sentence)
# print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))

with tokenizer.as_target_tokenizer():
    targets = tokenizer(en_sentence)

print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁Z', 'ock', '说', ',', '现在就', '告诫', '人们', '远离', '喷', '雾', '清洁', '剂', '还为', '时', '尚', '早', ',', '但他', '补充说', ':“', '毫无疑问', ',', '从', '预防', '的', '角度来看', ',', '我们', '推荐', '仅仅', '在', '必要', '的时候', '才', '使用', '喷', '雾', '剂', '。', '</s>']
['▁Z', 'ock', '▁said', '▁it', "'", 's', '▁too', '▁soon', '▁to', '▁tell', '▁people', '▁to', '▁swear', '▁off', '▁spray', '▁cleaner', 's', '▁altogether', ',', '▁but', '▁added', ',', '▁"', 'Never', 'the', 'less', ',', '▁from', '▁the', '▁perspective', '▁of', '▁precaution', ',', '▁we', '▁may', '▁recommend', '▁to', '▁use', '▁spray', 's', '▁only', '▁when', '▁really', '▁necessary', '.', '</s>']




In [4]:
import torch

max_input_length = 128
max_target_length = 128

inputs = [train_data[s_idx]["chinese"] for s_idx in range(4)]
targets = [train_data[s_idx]["english"] for s_idx in range(4)]

model_inputs = tokenizer(
    inputs, 
    padding=True, 
    max_length=max_input_length, 
    truncation=True,
    return_tensors="pt"
)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        targets, 
        padding=True, 
        max_length=max_target_length, 
        truncation=True,
        return_tensors="pt"
    )["input_ids"]

end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
for idx, end_idx in enumerate(end_token_index):
    labels[idx][end_idx+1:] = -100 # 是否可以认为，pad设置为-100是为了让softmax之后的权重置为0？

print('batch_X shape:', {k: v.shape for k, v in model_inputs.items()})
print('batch_y shape:', labels.shape)
print(model_inputs)
print(labels)

batch_X shape: {'input_ids': torch.Size([4, 40]), 'attention_mask': torch.Size([4, 40])}
batch_y shape: torch.Size([4, 46])
{'input_ids': tensor([[ 6932, 14872,   300,     2, 20172, 37620,  2283, 20999, 14522, 26668,
         10696,  5898, 16604,   142,  5942,  4966,     2, 22937, 25217,  7306,
         30508,     2,   233,  1889,    11, 22487,     2,   230, 17586,  8195,
            36,  4634,  2408,  1880,   421, 14522, 26668,  5898,     9,     0],
        [    7, 36783,    36,  5564,  3156,   150, 13633,  4571,  2806,     2,
          5454,   242,    16,   391,  2936,    11, 55220, 20303,   123,  3286,
             2,   333,  1473,  1400,    36,  1284, 26531,     9,     0, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000],
        [ 1243,  1956,  7870, 23809, 16376,  5264,     2,  1132, 25995,  2505,
          2837,  8509,  1041,   485, 22857, 37082,     2, 33218, 27876,  3284,
          8359, 19522,    11, 27955,     9,     0, 65000, 65000, 65000

In [5]:
from transformers import AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

max_input_length = 128
max_target_length = 128

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample['chinese'])
        batch_targets.append(sample['english'])
    batch_data = tokenizer(
        batch_inputs,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors='pt'
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch_targets,
            padding=True,
            max_length=max_target_length,
            truncation=True,
            return_tensors='pt'
        )["input_ids"]
        batch_data["decode_input_ids"] = model.prepare_decoder_input_ids_from_labels(labels)
        end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
        for idx, end_idx in enumerate(end_token_index):
            labels[idx][end_idx + 1:] = -100
        batch_data['labels'] = labels
    return batch_data

train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=32, shuffle=False, collate_fn=collote_fn)

Using cuda device


In [6]:
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'decode_input_ids', 'labels'])
batch shape: {'input_ids': torch.Size([32, 41]), 'attention_mask': torch.Size([32, 41]), 'decode_input_ids': torch.Size([32, 44]), 'labels': torch.Size([32, 44])}
{'input_ids': tensor([[ 3636, 32728,  2582,  ..., 65000, 65000, 65000],
        [    7, 13882, 36601,  ..., 65000, 65000, 65000],
        [  799,  5657, 17030,  ..., 65000, 65000, 65000],
        ...,
        [    7,  6713,  6281,  ..., 65000, 65000, 65000],
        [13263, 39057,  8116,  ..., 65000, 65000, 65000],
        [    7, 38443,     2,  ..., 65000, 65000, 65000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'decode_input_ids': tensor([[65000,  2919,  2431,  ..., 65000, 65000, 65000],
        [65000,    24, 33014,  ..., 65000, 65000, 65000],
        [65000



# 训练代码 

In [17]:
# from modeling import MarianForMT # 为什么这里直接import另外一个文件中的类会报错？显示缺失参数？
from transformers import AutoConfig

from torch import nn
from transformers.models.marian import MarianPreTrainedModel, MarianModel
import torch
class MarianForMT(MarianPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model = MarianModel(config=config)
        target_vocab_size = config.decoder_vocab_size
        # 这行代码的意思是在PyTorch模型中注册一个名为"final_logits_bias"的缓冲区，并初始化为一个全零的张量，维度为(1, target_vocab_size)。
        # 这个缓冲区可以被模型访问和使用，通常用于存储模型的参数或其他需要持久化的数据。
        self.register_buffer("final_logits_bias", torch.zeros((1, target_vocab_size)))
        self.lm_head = nn.Linear(config.d_model, target_vocab_size, bias=False)
        self.post_init() # 这个方法在对象初始化完成后自动调用，可以用来执行一些需要在对象创建后立即执行的操作。
        
    def forward(self, x):
        output = self.model(**x)
        sequence_output = output.last_hidden_state
        lm_logits = self.lm_head(sequence_output) + self.final_logits_bias
        return lm_logits
    
    def other_func(self):
        pass


config = AutoConfig.from_pretrained(model_checkpoint)
model = MarianForMT.from_pretrained(model_checkpoint, config=config).to(device)
print(model)

Some weights of MarianForMT were not initialized from the model checkpoint at Helsinki-NLP/opus-mt-zh-en and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MarianForMT(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), e

# 模型评测

In [21]:
from sacrebleu.metrics import BLEU

predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
bad_predictions_1 = ["This This This This"]
bad_predictions_2 = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

bleu = BLEU()
print(bleu.corpus_score(predictions, references).score)
print(bleu.corpus_score(bad_predictions_1, references).score)
print(bleu.corpus_score(bad_predictions_2, references).score)

46.750469682990165
1.683602693167689
0.0


In [30]:
from sacrebleu.metrics import BLEU

predictions = [
    "我在复旦大学学习摆烂，复旦大学很sb。"
]

# references = [
#     [
#         "我在环境优美的复旦大学学习躺平。"
#     ]
# ]
references = [
    "我在环境优美的复旦大学学习躺平。"
]

bleu = BLEU(tokenize='zh')
print(f'BLEU: {bleu.corpus_score(predictions, references).score}')
bleu = BLEU()
print(f'wrong BLEU: {bleu.corpus_score(predictions, references).score}')

BLEU: 2.1476912089159055
wrong BLEU: 0.0


In [29]:
model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
orig_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
orig_model = orig_model.to(device)

sentence = ["我在环境优美的复旦大学学习躺平。", "我在环境优美的复旦大学卷生卷死。"]
sentence_inputs = tokenizer(sentence, return_tensors="pt", padding=True).to(device)
sentence_generated_tokens = orig_model.generate(
    sentence_inputs["input_ids"],
    attention_mask=sentence_inputs["attention_mask"],
    max_length=128
)
# sentence_decoded_pred = tokenizer.decode(sentence_generated_tokens[1], skip_special_tokens=True)
sentence_decoded_pred = tokenizer.batch_decode(sentence_generated_tokens, skip_special_tokens=True)
print(sentence_decoded_pred)

['I studied flattened at the University of Rehabilitation in a beautiful environment.', 'I was born and died at the University of Rehabilitation in a beautiful environment.']


In [31]:
from transformers import AdamW, get_scheduler
from run_sim_cls import train_loop, test_loop

learning_rate = 2e-5
epoch_num = 3
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=epoch_num * len(train_dataloader),
    num_warmup_steps=0
)
total_loss = 0
best_bleu = 0
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    valid_bleu = test_loop(tokenizer, valid_dataloader, model, mode='Valid')
    if valid_bleu > best_bleu:
        best_bleu = valid_bleu
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin')
print("Done!")

Epoch 1/3
-------------------------------




  0%|          | 0/6250 [00:00<?, ?it/s]



AttributeError: 'MarianForMT' object has no attribute 'prepare_decoder_input_ids_from_labels'