In [1]:
from datasets import load_dataset
import pandas as pd
import os

base_path = "/content/drive/MyDrive/nlp_7th_sem/lab4"

train_file = f"{base_path}/train.csv"
validation_file = f"{base_path}/test.csv"

if not (os.path.exists(train_file) and os.path.exists(validation_file)):
    raise RuntimeError(f"Error: Dataset files not found. Please ensure {train_file} and {validation_file} are in the current directory.")

raw_datasets = load_dataset(
        "csv",
        data_files={
            "train": train_file,
            "validation": validation_file
        }
    )
print("Successfully loaded datasets:")
print(raw_datasets)


Successfully loaded datasets:
DatasetDict({
    train: Dataset({
        features: ['en', 'uk'],
        num_rows: 3500
    })
    validation: Dataset({
        features: ['en', 'uk'],
        num_rows: 1000
    })
})


In [2]:
from transformers import AutoTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer

MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-en-uk"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = MarianMTModel.from_pretrained(MODEL_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import numpy as np

def get_token_lengths(dataset, language_key):
    lengths = []

    if language_key == 'uk':
        with tokenizer.as_target_tokenizer():
            for text in dataset[language_key]:
                lengths.append(len(tokenizer.tokenize(text)))
    else:
        for text in dataset[language_key]:
            lengths.append(len(tokenizer.tokenize(text)))

    return lengths

train_en_lengths = get_token_lengths(raw_datasets['train'], 'en')
train_uk_lengths = get_token_lengths(raw_datasets['train'], 'uk')

print("--- English (Source) Token Lengths in Training Data ---")
print(f"Min Length: {np.min(train_en_lengths)}")
print(f"Max Length: {np.max(train_en_lengths)}")
print(f"Mean Length: {np.mean(train_en_lengths):.2f}")
print(f"95th Percentile Length: {np.percentile(train_en_lengths, 95)}")


print("\n--- Ukrainian (Target) Token Lengths in Training Data ---")
print(f"Min Length: {np.min(train_uk_lengths)}")
print(f"Max Length: {np.max(train_uk_lengths)}")
print(f"Mean Length: {np.mean(train_uk_lengths):.2f}")
print(f"95th Percentile Length: {np.percentile(train_uk_lengths, 95)}")



--- English (Source) Token Lengths in Training Data ---
Min Length: 4
Max Length: 43
Mean Length: 13.90
95th Percentile Length: 22.0

--- Ukrainian (Target) Token Lengths in Training Data ---
Min Length: 4
Max Length: 59
Mean Length: 15.20
95th Percentile Length: 27.0


In [13]:
max_input_length = 32
max_target_length = 36

def preprocess_function(examples):
    model_inputs = tokenizer(examples['en'], max_length=max_input_length, truncation=True, padding=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['uk'], max_length=max_target_length, truncation=True, padding=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [10]:
%pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/104.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [12]:
%pip install unbabel-comet

Collecting unbabel-comet
  Downloading unbabel_comet-2.2.7-py3-none-any.whl.metadata (19 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m55.5/55.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<2.0.0,>=1.20.0 (from unbabel-comet)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x8

In [17]:
import evaluate
import numpy as np


metric_bleu = evaluate.load("sacrebleu")
metric_meteor = evaluate.load("meteor")
metric_comet = evaluate.load("comet")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    references_for_bleu_meteor = [[label] for label in decoded_labels]
    references_for_comet = decoded_labels

    bleu_result = metric_bleu.compute(predictions=decoded_preds, references=references_for_bleu_meteor)

    meteor_result = metric_meteor.compute(predictions=decoded_preds, references=references_for_bleu_meteor)

    sources = raw_datasets['validation']['en']
    comet_result = metric_comet.compute(
        predictions=decoded_preds,
        references=references_for_comet,
        sources=sources
    )

    return {
        "bleu": round(bleu_result["score"], 4),
        "meteor": round(meteor_result["meteor"], 4),
        "comet_score": round(comet_result["mean_score"], 4)
    }

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [20]:
from transformers import Seq2SeqTrainingArguments
from transformers import MarianMTModel, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir=f"{base_path}/opus_mt_en_uk_fine_tuned",
    eval_strategy="steps",
    eval_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [21]:
print("--- Baseline Metrics BEFORE Fine-tuning ---")
baseline_metrics = trainer.evaluate()
print(baseline_metrics)

--- Baseline Metrics BEFORE Fine-tuning ---


INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'eval_loss': 0.9664192795753479, 'eval_model_preparation_time': 0.0028, 'eval_bleu': 30.8987, 'eval_meteor': 0.5501, 'eval_comet_score': 0.8259, 'eval_runtime': 49.0335, 'eval_samples_per_second': 20.394, 'eval_steps_per_second': 1.285}


In [22]:
print("\n--- Starting Fine-tuning ---")
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.



--- Starting Fine-tuning ---


Step,Training Loss,Validation Loss,Model Preparation Time,Bleu,Meteor,Comet Score
200,No log,0.40436,0.0028,42.5423,0.6702,0.8586
400,No log,0.378903,0.0028,44.4399,0.6788,0.8639
600,0.444200,0.370784,0.0028,44.5999,0.6794,0.8639


INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


TrainOutput(global_step=657, training_loss=0.41379066943396536, metrics={'train_runtime': 242.7049, 'train_samples_per_second': 43.262, 'train_steps_per_second': 2.707, 'total_flos': 88983207936000.0, 'train_loss': 0.41379066943396536, 'epoch': 3.0})

In [23]:
print("\n--- Final Metrics AFTER Fine-tuning ---")
final_metrics = trainer.evaluate()
print(final_metrics)


--- Final Metrics AFTER Fine-tuning ---


INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'eval_loss': 0.3704577386379242, 'eval_model_preparation_time': 0.0028, 'eval_bleu': 44.7703, 'eval_meteor': 0.6803, 'eval_comet_score': 0.8645, 'eval_runtime': 54.1249, 'eval_samples_per_second': 18.476, 'eval_steps_per_second': 1.164, 'epoch': 3.0}
