In [1]:
from datasets import load_dataset

dataset = load_dataset("opus100", "en-zh")

In [2]:
max_length = 256

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-zh")


In [3]:
article = dataset['validation'][2]['translation']['en']

input_text = f">>cmn_Hans<< {article}"
inputs = tokenizer(input_text, return_tensors="pt")

translated_tokens = model.generate(**inputs,  max_length=256)
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

'A/AC.154/363 2005年9月20日美利坚合众国常驻联合国代表给委员会主席的信[阿、中、英、法、俄、西]'

In [4]:
dataset['validation'][2]['translation']['zh']

'A/AC.154/363 2005年9月20日美利坚合众国常驻联合国代表给委员会主席的信 [阿、中、英、法、俄、西]'

In [5]:
def preprocess_function(examples):
    inputs = [f">>cmn_Hans<< {ex['en']}" for ex in examples["translation"]]
    targets = [ex["zh"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
    # Use as_target_tokenizer for MarianMT
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
tokenized_datasets_train = dataset['train'].map(
    preprocess_function,
    batched= True,
    remove_columns=dataset["train"].column_names,
    batch_size = 2000
)

tokenized_datasets_validation = dataset['validation'].map(
    preprocess_function,
    batched= True,
    remove_columns=dataset["validation"].column_names,
    batch_size = 32
)

tokenized_datasets_test = dataset['test'].map(
    preprocess_function,
    batched= True,
    remove_columns=dataset["test"].column_names,
    batch_size = 32)

In [7]:
# Verify preprocessing

for i in range(5):
    print(f"\nSample {i}")
    print("Raw label:", dataset['validation'][i]['translation']['zh'])
    print("Tokenized label ids:", tokenized_datasets_validation[i]['labels'])
    print("Decoded label:", tokenizer.decode(tokenized_datasets_validation[i]['labels'], skip_special_tokens=True))
    print("Label length:", len(tokenized_datasets_validation[i]['labels']))


Sample 0
Raw label: 然而，劳工组织行政法庭相对于联合国行政法庭似乎有些优势。
Tokenized label ids: [1244, 2, 6244, 31735, 18629, 33802, 3247, 3452, 12722, 10, 0]
Decoded label: 然而,劳工组织行政法庭相对于联合国行政法庭似乎有些优势。
Label length: 11

Sample 1
Raw label: 这样吧，我载你们到5英里外的贝拉餐厅
Tokenized label ids: [5296, 498, 2, 124, 8124, 1350, 274, 152, 19403, 1024, 12, 39571, 27232, 0]
Decoded label: 这样吧,我载你们到5英里外的贝拉餐厅
Label length: 14

Sample 2
Raw label: A/AC.154/363 2005年9月20日美利坚合众国常驻联合国代表给委员会主席的信 [阿、中、英、法、俄、西]
Tokenized label ids: [80, 42, 1701, 858, 14254, 2459, 117, 784, 33, 215, 44, 449, 77, 3675, 6535, 47357, 345, 1434, 14, 82, 14, 4000, 14, 357, 14, 5525, 14, 1392, 393, 0]
Decoded label: A/AC.154/363 2005年9月20日美利坚合众国常驻联合国代表给委员会主席的信 [阿、中、英、法、俄、西]
Label length: 30

Sample 3
Raw label: - 他人在戒备森严的牢里
Tokenized label ids: [30, 319, 12171, 14574, 7096, 4322, 13975, 12, 18265, 571, 0]
Decoded label: - 他人在戒备森严的牢里
Label length: 11

Sample 4
Raw label: 在整个十九世纪和二十世纪上半叶，正如我们先前听到的那样，有6 000多万欧洲人移民海外，主要移向美洲——我重复一遍，6 000万。
Tokenized label ids: [8, 8

In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [9]:
# Access the layers and freeze the specified number of layers
# Specify the number of layers to freeze from the end

for parameter in model.parameters():
    parameter.requires_grad = True
num_layers_to_freeze = 10  # Adjust as needed
for layer_index, layer in enumerate(model.model.encoder.layers):
    print
    if layer_index < len(model.model.encoder.layers) - num_layers_to_freeze:
        for parameter in layer.parameters():
            parameter.requires_grad = False

num_layers_to_freeze = 10  # Adjust as needed
for layer_index, layer in enumerate(model.model.decoder.layers):
    print
    if layer_index < len(model.model.encoder.layers) - num_layers_to_freeze:
        for parameter in layer.parameters():
            parameter.requires_grad = False

In [10]:
import evaluate

metric = evaluate.load("sacrebleu")

import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [11]:
import torch

# Check if a GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import Seq2SeqTrainingArguments

model.to(device)
training_args = Seq2SeqTrainingArguments(
    f"finetuned-nlp-en-zh",
    gradient_checkpointing=True,
    per_device_train_batch_size=32,
    learning_rate=1e-5,
    warmup_steps=300,
    max_steps=3000,
    fp16=True,
    optim='adamw_torch',
    per_device_eval_batch_size=32,
    eval_strategy="steps",
    eval_steps=300, 
    metric_for_best_model="bleu",
    predict_with_generate=True,
    push_to_hub=False,
)

In [12]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_validation,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss,Bleu
300,No log,1.700684,10.267808
600,1.857700,1.690102,14.789708
900,1.857700,1.688226,15.349514
1200,1.854400,1.68748,15.081833
1500,1.820000,1.68381,13.209658
1800,1.820000,1.682773,10.344941
2100,1.832200,1.681652,13.199083
2400,1.832200,1.681083,12.709203
2700,1.824500,1.679697,15.196545
3000,1.806400,1.679275,15.159275




TrainOutput(global_step=3000, training_loss=1.8325392049153646, metrics={'train_runtime': 2906.5997, 'train_samples_per_second': 33.028, 'train_steps_per_second': 1.032, 'total_flos': 2180927578963968.0, 'train_loss': 1.8325392049153646, 'epoch': 0.096})

In [15]:
model.save_pretrained("../model")
tokenizer.save_pretrained("../model")

('../model\\tokenizer_config.json',
 '../model\\special_tokens_map.json',
 '../model\\vocab.json',
 '../model\\source.spm',
 '../model\\target.spm',
 '../model\\added_tokens.json')