In [1]:
!pip install transformers datasets evaluate sacrebleu tf-keras ipywidgets sentencepiece sacremoses "accelerate>=0.26.0"

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate>=0.26.0
  Downloading accelerate-1.3.0-py3-none-any.whl (336 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m336.6/336.6 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: accelerate
Successfully installed accelerate-1.3.0


In [2]:
# import libraries
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq, Seq2SeqTrainingArguments,
                          Seq2SeqTrainer)

2025-02-14 22:34:51.350840: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739572491.368596    3065 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739572491.374013    3065 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# load dataset
dataset = load_dataset("qanastek/ELRC-Medical-V2", "en-fr")
ds = dataset["train"].train_test_split(test_size=0.2, seed=42) if "train" in dataset else dataset.train_test_split(test_size=0.2, seed=42)

In [4]:
# load pretrained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [5]:
def tokenize_fn(examples):
    en_texts = [ex["en"] for ex in examples["translation"]]
    fr_texts = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(en_texts, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(fr_texts, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
tokenized_ds = ds.map(tokenize_fn, batched=True)

In [7]:
# data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [8]:
# training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs"
)



In [9]:
# load bleu metric using evaluate
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[lbl.strip()] for lbl in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [11]:
# setup trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [12]:
# fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,0.636,0.55165,57.847126
2,0.5386,0.539249,58.447882
3,0.4963,0.537779,58.537629




TrainOutput(global_step=1974, training_loss=0.533922158717626, metrics={'train_runtime': 806.3848, 'train_samples_per_second': 39.134, 'train_steps_per_second': 2.448, 'total_flos': 601858583691264.0, 'train_loss': 0.533922158717626, 'epoch': 3.0})

In [13]:
# evaluate the model on the test set
eval_results = trainer.evaluate(tokenized_ds["test"])
print("evaluation metrics:", eval_results)

evaluation metrics: {'eval_loss': 0.5377790331840515, 'eval_bleu': 58.5376294840819, 'eval_runtime': 231.4471, 'eval_samples_per_second': 11.363, 'eval_steps_per_second': 0.713, 'epoch': 3.0}


In [17]:
# generate predictions on a few test examples
sample_test = tokenized_ds["test"].select(range(5))
predictions = trainer.predict(sample_test)
decoded_preds = tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(sample_test["labels"], skip_special_tokens=True)

for i, (pred, label) in enumerate(zip(decoded_preds, decoded_labels)):
    print(f"example {i}:")
    print("english:", example["translation"]["en"])
    print("pred:", pred)
    print("label:", label)

example 0:
english: 13- to encourage governments to adopt strategies with concrete measures to ensure improved gender balance in decision making, to engage in dialogue with social partners and sectorial regulatory bodies to promote/adopt relevant measures - To continue to engage in dialogue with social partners To implement projects aiming to improve the gender balance in decision making positions.
pred: Avant de tirer ses conclusions, elle s'est engag√©e directement avec les parlements nationaux sur les questions soulev√©es, en particulier lors des r√©unions des commissions parlementaires pour les affaires de l'Union des parlements de l'UE (COSAC) du 13 juin 2016, au cours desquelles un √©change pr√©liminaire a port√© sur les aspects proc√©duraux, et du 11 juillet 2016, au cours duquel une discussion de fond a eu lieu dans le cadre d'un d√©bat plus large sur la dimension sociale de l'UE.
label: Avant de tirer ses conclusions, elle a dialogu√© directement avec les parlements nationaux 

In [19]:
# get a few original examples from the test set (for english and reference french)
sample_original = ds["test"].select(range(5))

# get predictions from the fine-tuned (sft) model
ft_predictions = trainer.predict(sample_test)
ft_decoded_preds = tokenizer.batch_decode(ft_predictions.predictions, skip_special_tokens=True)


# load the base model and set up a trainer for it with an eval_dataset
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
base_trainer = Seq2SeqTrainer(
    model=base_model,
    args=training_args,  # reuse same args
    eval_dataset=sample_test,  # add this line
    tokenizer=tokenizer,
    data_collator=data_collator
)
base_predictions = base_trainer.predict(sample_test)
base_decoded_preds = tokenizer.batch_decode(base_predictions.predictions, skip_special_tokens=True)


# print comparisons: original english, fine-tuned pred, base model pred, and reference french
for i, example in enumerate(sample_original):
    english_text = example["translation"]["en"]
    ref_text = example["translation"]["fr"]
    print(f"example {i}:")
    print("english      :", english_text)
    print("sft prediction:", ft_decoded_preds[i])
    print("base prediction:", base_decoded_preds[i])
    print("reference    :", ref_text)
    print("-" * 50)

  base_trainer = Seq2SeqTrainer(


example 0:
english      : Before drawing its conclusions, it engaged directly with national Parliaments on the issues raised, in particular at the Conference of Parliamentary Committees for Union Affairs of Parliaments of the EU (COSAC) meetings of 13 June 2016, where a preliminary exchange focused on procedural aspects took place, and of 11 July 2016, where a substantive discussion took place in the context of a broader debate on the social dimension of the EU.
sft prediction: Avant de tirer ses conclusions, elle s'est engag√©e directement avec les parlements nationaux sur les questions soulev√©es, en particulier lors des r√©unions des commissions parlementaires pour les affaires de l'Union des parlements de l'UE (COSAC) du 13 juin 2016, au cours desquelles un √©change pr√©liminaire a port√© sur les aspects proc√©duraux, et du 11 juillet 2016, au cours duquel une discussion de fond a eu lieu dans le cadre d'un d√©bat plus large sur la dimension sociale de l'UE.
base prediction: Avant 