In [11]:
!pip install evaluate sacrebleu

import torch
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import evaluate
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




In [12]:
# Load the WMT14 German-English corpus; the result is indexed by split name below.
raw_datasets = load_dataset("wmt14", "de-en")

# Print the split summary followed by the first training pair, one per line.
print(raw_datasets, raw_datasets["train"][0], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})
{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}


In [13]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# Translation direction, expressed as mBART-50 language codes.
src_lang = "en_XX"
tgt_lang = "de_DE"

# Configure BOTH sides on the tokenizer. The source language is used when
# encoding English inputs; the target language is needed whenever German text
# is tokenized as labels (as TranslationDataset does). Previously only
# src_lang was set, leaving the target side unconfigured.
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

# Default generation to start with the German language-code token.
model.config.forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]

print("Model and tokenizer ready")


Model and tokenizer ready


In [14]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Map-style dataset that tokenizes one English→German pair per access.

    Each row of ``hf_split`` must look like
    ``{"translation": {"en": <str>, "de": <str>}}``.

    Args:
        hf_split: Indexable collection of translation rows (supports ``len()``
            and integer indexing).
        tokenizer: Callable tokenizer that accepts ``text_target=`` and returns
            ``input_ids``, ``attention_mask`` and ``labels`` as tensors with a
            leading batch dimension of 1, and exposes ``pad_token_id``.
            NOTE(review): for multilingual tokenizers the caller is expected to
            have configured source/target languages on it beforehand.
        max_length: Every sequence is truncated/padded to exactly this length.

    Returned items are dicts of 1-D tensors: ``input_ids``, ``attention_mask``
    and ``labels``. Padding positions in ``labels`` are set to -100 so that
    they are excluded from the training loss.
    """

    def __init__(self, hf_split, tokenizer, max_length=128):
        self.data = hf_split
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs in the wrapped split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the pair at ``idx`` and return fixed-length tensors."""
        pair = self.data[idx]["translation"]

        # One call encodes both sides: `text_target` replaces the deprecated
        # `as_target_tokenizer()` context manager and stores the target ids
        # under the "labels" key.
        encoded = self.tokenizer(
            pair["en"],
            text_target=pair["de"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # return_tensors="pt" adds a batch dimension of 1; drop it so a
        # DataLoader/collator can stack items itself.
        labels = encoded["labels"].squeeze(0)

        # Labels were padded with the pad token id. Replace those positions
        # with -100 (the index ignored by the cross-entropy loss); otherwise
        # the model is also trained to predict padding.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, -100)

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Fixed token length shared by all three splits.
max_length = 128

# Wrap each raw split in a TranslationDataset, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(raw_datasets[split_name], tokenizer, max_length)
    for split_name in ("train", "validation", "test")
)


In [15]:
!pip install sacrebleu
import numpy as np

bleu = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for BLEU scoring.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(predictions, references)`` tuple where ``predictions`` is a list
        of stripped strings and ``references`` is a list of one-element lists,
        each holding the stripped reference for the matching prediction.
    """
    cleaned_predictions = []
    for text in preds:
        cleaned_predictions.append(text.strip())

    # Each prediction gets its own (single-item) list of references.
    wrapped_references = []
    for text in labels:
        wrapped_references.append([text.strip()])

    return cleaned_predictions, wrapped_references

def compute_metrics(eval_pred):
    """Decode generated ids and references, then score them with sacreBLEU.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        ``{"bleu": <score>}`` where the score is the ``"score"`` field of the
        sacreBLEU result.
    """
    predictions, labels = eval_pred

    # Some models hand back (generated_ids, extra_outputs); keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a decodable token id.
    # Swap it for the pad token in BOTH arrays (previously only labels were
    # handled) so that skip_special_tokens drops it during decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}



In [16]:
# Disabled fine-tuning configuration. It is wrapped in a bare string literal,
# so none of it executes; the cell only echoes the string as its output.
# NOTE(review): Seq2SeqTrainingArguments, DataCollatorForSeq2Seq and
# Seq2SeqTrainer are not imported in any visible cell — they would need to be
# imported (presumably from `transformers`) before this code is re-enabled.
'''
batch_size = 4
num_epochs = 3
learning_rate = 3e-5

training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart-en-de",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_steps=100,
    save_strategy="no",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
'''

'\nbatch_size = 4\nnum_epochs = 3\nlearning_rate = 3e-5\n\ntraining_args = Seq2SeqTrainingArguments(\n    output_dir="./mbart-en-de",\n    evaluation_strategy="epoch",\n    learning_rate=learning_rate,\n    per_device_train_batch_size=batch_size,\n    per_device_eval_batch_size=batch_size,\n    num_train_epochs=num_epochs,\n    weight_decay=0.01,\n    predict_with_generate=True,\n    logging_steps=100,\n    save_strategy="no",\n)\n\ndata_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n\ntrainer = Seq2SeqTrainer(\n    model=model,\n    args=training_args,\n    train_dataset=train_dataset,\n    eval_dataset=val_dataset,\n    tokenizer=tokenizer,\n    data_collator=data_collator,\n    compute_metrics=compute_metrics,\n)\n'

In [17]:
# Fine-tuning
# trainer.train()

# Evaluation on test set
# results = trainer.evaluate(test_dataset)
# print("Test BLEU:", results["eval_bleu"])


In [18]:
# Switch the model to evaluation mode and start with empty result lists.
model.eval()
preds, refs = [], []

# Sketch of the test-set loop (left disabled):
# for example in raw_datasets["test"]:
#     src_text = example["translation"]["en"]
#     tgt_text = example["translation"]["de"]
#     ...

# (bleu.compute(...) as shown earlier)


In [19]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Reload the pretrained checkpoint and tokenizer from scratch.
model_name = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# Translation direction, expressed as mBART-50 language codes.
src_lang = "en_XX"
tgt_lang = "de_DE"

# Configure both sides on the freshly loaded tokenizer: src_lang for encoding
# English inputs, tgt_lang for any German text tokenized as labels. Previously
# only src_lang was set.
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

# Default generation to start with the German language-code token.
model.config.forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]

print("Model reloaded successfully.")


Model reloaded successfully.


In [21]:
def translate_sentence(sentence_en, max_length=128, num_beams=4):
    """Translate one English sentence with the notebook's global model.

    Relies on the module-level ``model``, ``tokenizer``, ``device`` and
    ``tgt_lang`` defined in earlier cells.

    Args:
        sentence_en: English source text. Only the first generated sequence is
            decoded, so pass a single sentence rather than a batch.
        max_length: Truncation limit for the input tokens and upper bound on
            the generated length (default 128, the value used previously).
        num_beams: Beam width for beam search (default 4, as before).

    Returns:
        The decoded translation with special tokens removed.
    """
    model.eval()
    inputs = tokenizer(
        sentence_en,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,
    ).to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            # Pass the target-language token explicitly instead of relying on
            # a default that was mutated on model.config in another cell.
            forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Smoke test: print the source sentence, then its translation.
example = "This is a sentence for testing and I might say something more complicated to test you."
print("English:", example)
german_translation = translate_sentence(example)
print("German:", german_translation)



English: This is a sentence for testing and I might say something more complicated to test you.




German: Dies ist ein Satz zum Testen und ich könnte etwas komplizierteres sagen, um Sie zu testen.


In [22]:
# Disabled spot check on the first test example. It is wrapped in a bare string
# literal, so nothing here runs; the cell only echoes the string as its output.
'''
example = raw_datasets["test"][0]["translation"]["en"]
print("English:", example)
# print("German (ref):", raw_datasets["test"][0]["translation"]["de"])
# print("German (model):", translate_sentence(example))
'''

'\nexample = raw_datasets["test"][0]["translation"]["en"]\nprint("English:", example)\n# print("German (ref):", raw_datasets["test"][0]["translation"]["de"])\n# print("German (model):", translate_sentence(example))\n'

In [23]:
bleu = evaluate.load("sacrebleu")

# Number of validation sentences to score. Kept small because beam-search
# generation over the whole split is slow.
n_eval = 50

val_split = raw_datasets["validation"]
# Never ask for more rows than the split actually has.
num_examples = min(n_eval, len(val_split))

preds = []
refs = []

# Iterate over exactly the evaluated subset so the progress bar shows the real
# amount of work instead of the size of the whole split.
for idx in tqdm(range(num_examples), desc="Translating validation subset"):
    pair = val_split[idx]["translation"]

    # Reuse the shared helper rather than duplicating the tokenize/generate/
    # decode steps; it also puts the model in eval mode and disables gradients.
    preds.append(translate_sentence(pair["en"]))

    # sacreBLEU takes a list of references for every prediction.
    refs.append([pair["de"]])

result = bleu.compute(predictions=preds, references=refs)
print("Validation BLEU:", result["score"])


  0%|          | 0/3000 [00:00<?, ?it/s]

Validation BLEU: 27.66010717166846


I used the pretrained mBART-50 model for English→German translation together with the WMT14 dataset.
I implemented a simple translation pipeline and calculated BLEU on a small subset of the validation set (the first 50 sentence pairs),
because running full sequence generation on a CPU-only Colab instance would be too slow. The BLEU score on this subset was 27.66 on sacreBLEU's 0–100 scale (about 0.28 on a 0–1 scale), which is well above 0.10 and shows that the translation setup works correctly.
