In [1]:
!pip install transformers[sentencepiece] #==4.51.3
!pip install -U datasets #==3.6.0
!pip install evaluate #==0.4.3
!pip install accelerate #==1.6.0
!pip install sacrebleu #==2.5.1
#datasets depends on fsspec==2025.3.0

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [2]:
#clone repo from Github and navigate to correct working directory
!git clone https://github.com/fubotz/BMT_2025S
%cd /content/BMT_2025S/week10_files

Cloning into 'BMT_2025S'...
remote: Enumerating objects: 516, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 516 (delta 95), reused 8 (delta 8), pack-reused 368 (from 1)[K
Receiving objects: 100% (516/516), 172.48 MiB | 15.54 MiB/s, done.
Resolving deltas: 100% (270/270), done.
Updating files: 100% (64/64), done.
/content/BMT_2025S/week10_files


In [3]:
#txt to json
import os
import json

# json from huggingface
#{ "translation": { "en": "Others have dismissed him as a joke.", "ro": "Alții l-au numit o glumă." } }
#{ "translation": { "en": "And some are holding out for an implosion.", "ro": "Iar alții așteaptă implozia." } }

def txt2json(src_trg, src_file, trg_file, out_file):
    if not os.path.exists(src_file):
        print(f"Source file not found: {src_file}")
        return
    if not os.path.exists(trg_file):
        print(f"Target file not found: {trg_file}")
        return

    with open(src_file, 'r', encoding="utf-8") as src, \
         open(trg_file, 'r', encoding="utf-8") as trg, \
         open(out_file, 'w', encoding="utf-8") as out_json:

        src_id, trg_id = src_trg.split('-')

        for line_s, line_t in zip(src, trg):
            line_s = line_s.strip()
            line_t = line_t.strip()
            out = { "translation": { src_id: line_s, trg_id: line_t } }
            x = json.dumps(out, ensure_ascii=False)
            out_json.write(x + "\n")

    print(f"JSON file created: {out_file}")

#NB: updated txt2json

In [4]:
lang_pair = "en-de"

train_src = "Vienna_Environmental.en-de.train.en"
train_trg = "Vienna_Environmental.en-de.train.de"
train_json = "Vienna_Environmental.en-de.train.json"

txt2json(lang_pair, train_src, train_trg, train_json)

#valid_src = "Vienna_Environmental.en-de.valid.en.txt"
#valid_trg = "Vienna_Environmental.en-de.valid.de.txt"
#valid_json = "Vienna_Environmental.en-de.valid.json"

#txt2json(lang_pair, valid_src, valid_trg, valid_json)

JSON file created: Vienna_Environmental.en-de.train.json


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM,  AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, DatasetDict
import os
import evaluate
import numpy as np


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def finetune_vienna():

    model_id = "Helsinki-NLP/opus-mt-en-de"
    max_length = 250

    code2lang = {
    "de": "German",
    "fr": "French",
    "en": "English",
    "nl": "Dutch",
    "pt": "Portuguese",
    "ru": "Russian",
    "zh": "Chinese",
    "ro": "Romanian",
    "es": "Spanish"
    }

    source_code = 'en'
    target_code = 'de'
    data_files = "Vienna_Environmental.en-de.train.json"
    output_dir = 'models/opus-finetune-vienna'
    train_bs = 6
    grad_acc = 1
    lr = 5e-5
    w_steps = 0.03
    n_epoch = 5
    lr_scheduler_type = "linear"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.src_lang = source_code  #for multilingual models ??
    tokenizer.tgt_lang = target_code
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map={"": 0})


    print_trainable_parameters(model)
    print(model)

    metric = evaluate.load("sacrebleu")

    def preprocess_parallel_function(examples):
        inputs = [ex[source_code] for ex in examples["translation"]]
        targets = [ex[target_code] for ex in examples["translation"]]
        #inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=max_length, padding=False, truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_length, padding=False, truncation=True)


        model_inputs["labels"] = labels["input_ids"]

        return model_inputs

    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]

        return preds, labels

    def compute_metrics(eval_preds, ignore_pad_token_for_loss=False):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result = {'bleu' : result['score']}
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    data = load_dataset("json", data_files=data_files)
    data_split = data['train'].train_test_split(test_size=0.1, seed=42)
    data = DatasetDict({'train': data_split['train'],
                        'valid': data_split['test']})   #validation set

    column_names = data["train"].column_names
    #print(column_names)
    data = data.map(preprocess_parallel_function, batched=True)
    label_pad_token_id = -100

    trainer = Seq2SeqTrainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=Seq2SeqTrainingArguments(
            per_device_train_batch_size=train_bs,
            gradient_accumulation_steps=grad_acc,
            per_device_eval_batch_size=2,
            eval_accumulation_steps=2,
            warmup_ratio=w_steps,
            lr_scheduler_type=lr_scheduler_type,
            num_train_epochs=n_epoch,
            predict_with_generate=True,
            metric_for_best_model='bleu',
            load_best_model_at_end=True,
            learning_rate=lr,
            save_total_limit=2,
            generation_num_beams=5,
            save_strategy="epoch",
            eval_strategy="epoch",
            output_dir=output_dir,
            report_to="none",
        ),
        data_collator=DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id, model=model),
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    model.config.use_cache=False  #Silence the warnings. Please re-enable for inference!
    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [6]:
!ls
!head -n 200 Vienna_Environmental.en-de.train.de
!tail -n 200 Vienna_Environmental.en-de.train.de

#!paste
#!shuf
#!cut
#!grep

cut_dataset.py
EuroPat.de-en.20k.de
EuroPat.de-en.20k.en
EuroPat.de-en.20k.train.clean.json
EuroPat.de-en.20k.train.json
finetune_MarianMT_BMT2025S.ipynb
finetune_MarianMT_BMT2025S_SCHAMBECK_Fabian.ipynb
Vienna_Environmental.en-de.test.de
Vienna_Environmental.en-de.test.en
Vienna_Environmental.en-de.train.de
Vienna_Environmental.en-de.train.en
Vienna_Environmental.en-de.train.json
Dies ist auch der Grund, warum der Ausbau der Fernwärme eine wichtige Maßnahme sowohl im „Klimaschutzprogramm der Stadt Wien“ (KliP) als auch bei der „Urbanen Luft Initiative“ (ULI) zur Reduktion der Luftschadstoffe und beim „Städtischen Energieeffizienzprogramm“ (STEP) zur Steigerung der Energieeffizienz darstellt. 
Derzeit speisen 15 Erzeugungsanlagen an zehn Standorten Heißwasser in das Fernwärmeverbundnetz ein. 
Die Abwärme aus den Hausmüllverbrennungsanlagen Flötzersteig und Spittelau sowie der Sonderabfall- und Klärschlammverbrennungsanlage Simmeringer Haide wird als Grundlast ganzjährig verwendet. 
Zur

In [8]:
#fine-tune pre-trained OPUS on vienna corpus
finetune_vienna()

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

trainable params: 74410496 || all params: 74410496 || trainable%: 100.0
MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=5

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.22206,26.2248,33.98
2,No log,1.167482,30.0918,29.44
3,No log,1.201287,26.5937,34.19
4,0.897800,1.216067,23.7727,38.93
5,0.897800,1.231448,25.7446,34.37


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


In [9]:
#use models
#import models and tokenizer (now: local ft model)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("models/opus-finetune-vienna") #path of your directory with the finetuned (NB: dont load from checkpoint but parent folder? --> contains final version of ft model)

model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-finetune-vienna", device_map={"":0}) #NOTE load in gpu!!!!



In [10]:
%cd /content/BMT_2025S/week10_files

/content/BMT_2025S/week10_files


In [11]:
#upload source file and read
import codecs

file_name = "Vienna_Environmental.en-de.test.en"
mt_output = []

with codecs.open(file_name, 'r', 'utf-8') as src:
    for line in src:
        line = line.strip()
        encoded = tokenizer(line, return_tensors="pt", padding=True).to("cuda") #tokenize; NOTE add to gpu!!!
        generated_tokens = model.generate(**encoded, num_beams=6, early_stopping=True) #add beam search 6
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) #de tokenize
        print(translation[0])

Abfallwirtschaft
» Abfallbilanz
» Abfallsammlung
» Entsorgung von Abfällen
» Abfallvermeidung
» Kontaminierte Standorte
Grundlagen der Wiener Abfallwirtschaft sind Abfallvermeidung, Abfallzerlegung und Abfallverwertung
Abfallbilanz
Zusätzlich zum Bundesabfallwirtschaftsgesetz wird die Abfallwirtschaft in Wien durch das Wiener Abfallwirtschaftsgesetz (Wiener AWG) geregelt.
Zwei städtische Abteilungen sind in Wien für die Abfallwirtschaft zuständig, die MA 48 für Abfallwirtschaft, Straßenreinigung und Fahrzeugflotte sowie die MA 22 für Umweltschutz.
Während die MA 48 für die gemeinsame Sammlung und Behandlung von Abfällen von privaten Haushalten und Betrieben verantwortlich ist, hat die MA 22 die Aufgabe, die Umsetzung der Abfallvorschriften zu überwachen.
Auf strategischer Ebene arbeiten sie z. B. bei der Umsetzung der „Strategischen Umweltprüfung (SEA) für den Wiener Abfallwirtschaftsplan oder bei der Initiierung von Abfallvermeidungsprojekten zusammen.
Die Grundprinzipien der Wiener A

In [12]:
#!pip freeze > requirements.txt

In [13]:
#!cat requirements.txt

# TODO

1. Select a corpus from ([opus](https://opus.nlpl.eu/results/en&de/corpus-result-table))
2. sample 20k segments
3. clean corpus (optional)
4. fine-tune for 5 epochs (NB: fine-tune pre-trained or fine-tuned model? --> I fine-tuned pre-trained again on different corpus)
5. translate with vienna test file

# Files

1. de translation marianft with greedy search
2. de translation marianft with beam search  
3. de translation marianft similar corpus with beam search

# Step 1: Get Data

- Dataset downloaded from Opus en-de in moses format (EuroPat - v3): https://opus.nlpl.eu/EuroPat/en&de/v3/EuroPat

- Created local script cut_dataset.py that extracts first 20k segments from files EuroPat.de-en.de and EuroPat.de-en.en in folder EuroPat.de-en.txt.zip:

```python
import zipfile

def extract_first_20k_from_zip(zip_path, out_src_path, out_tgt_path, src_file="EuroPat.de-en.en", tgt_file="EuroPat.de-en.de", max_lines=20000):
    with zipfile.ZipFile(zip_path, 'r') as z:
        with z.open(src_file) as src_f, z.open(tgt_file) as tgt_f:
            with open(out_src_path, 'w', encoding='utf-8') as out_src, \
                 open(out_tgt_path, 'w', encoding='utf-8') as out_tgt:

                for i, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
                    if i >= max_lines:
                        break
                    out_src.write(src_line.decode('utf-8'))
                    out_tgt.write(tgt_line.decode('utf-8'))

    print(max_lines)
    print(out_src_path)
    print(out_tgt_path)


if __name__ == "__main__":
    extract_first_20k_from_zip(
        zip_path="EuroPat.de-en.txt.zip",
        out_src_path="EuroPat.de-en.20k.en",
        out_tgt_path="EuroPat.de-en.20k.de"
    )
```

- The script and cut files were subsequentially uploaded to GitHub (https://github.com/fubotz/BMT_2025S)

In [14]:
#create bilingual .json
lang_pair = "en-de"

train_ft_src = "EuroPat.de-en.20k.en"
train_ft_trg = "EuroPat.de-en.20k.de"
train_ft_json = "EuroPat.de-en.20k.train.json"

txt2json(lang_pair, train_ft_src, train_ft_trg, train_ft_json)

JSON file created: EuroPat.de-en.20k.train.json


In [15]:
!head EuroPat.de-en.20k.train.json

{"translation": {"en": "Similarly, a perfect longitudinal guide along the guide track can also be achieved.", "de": "Gleichfalls kann damit aber auch eine einwandfreie Längsführung entlang der Führungsbahn erzielt werden."}}
{"translation": {"en": "Example 2 Cloning and Expression of Botulinum Neurotoxin Type B (BoNT/B)", "de": "Beispiel 2: Klonierung und Expression von Botulinum-Neurotoxin Typ B (BoNT/B)"}}
{"translation": {"en": "The active compounds are administered, directly or in the form of suitable preparations, enterally, parenterally, dermally, nasally, by treatment of the environment or with the aid of active-compound-containing shaped articles such as, for example, strips, plates, bands, collars, ear tags, limb bands, marking devices.", "de": "Die Anwendung der Wirkstoffe erfolgt direkt oder in Form von geeigneten Zubereitungen enteral, parenteral, dermal, nasal, durch Behandlung der Umgebung oder mit Hilfe wirkstoffhaltiger Formkörper wie z.B. Streifen, Platten Bänder, Hals

In [18]:
!wc -l EuroPat.de-en.20k.train.json #before cleaning

20000 EuroPat.de-en.20k.train.json


# Step 2: Clean Data

In [19]:
import re

class TextCleaner:
    def __init__(self, min_len=3, max_len=100):
        self.min_len = min_len
        self.max_len = max_len
        self.skip_patterns = [
            r'\bFIG\.', r'\bFigur\b', r'\bExample\s*\d+', r'\bBeispiel\s*\d+',
            r'\bTable\s*\d+', r'\bTabelle\s*\d+'
        ]

    def should_skip(self, text):
        return any(re.search(p, text) for p in self.skip_patterns)

    def clean_text(self, text):
        text = re.sub(r'\s+', ' ', text)    #normalize spaces
        text = re.sub(r'\s+([.,;!?])', r'\1', text)     #remove space before punctuation
        return text.strip()

    def is_reasonable_length(self, text):
        tokens = text.strip().split()
        return self.min_len <= len(tokens) <= self.max_len

In [20]:
input_path = "EuroPat.de-en.20k.train.json"
output_path = "EuroPat.de-en.20k.train.clean.json"

cleaner = TextCleaner()
kept, skipped = 0, 0

with open(input_path, 'r', encoding='utf-8') as fin, \
     open(output_path, 'w', encoding='utf-8') as fout:

    for line in fin:
        try:
            entry = json.loads(line)
            trans = entry.get("translation", {})

            src_lang, tgt_lang = list(trans.keys())
            src_text, tgt_text = trans[src_lang], trans[tgt_lang]

            src_text = cleaner.clean_text(src_text)
            tgt_text = cleaner.clean_text(tgt_text)

            if cleaner.should_skip(src_text) or cleaner.should_skip(tgt_text):
                skipped += 1
                continue

            if not cleaner.is_reasonable_length(src_text) or not cleaner.is_reasonable_length(tgt_text):
                skipped += 1
                continue

            cleaned_entry = {"translation": {src_lang: src_text, tgt_lang: tgt_text}}

            fout.write(json.dumps(cleaned_entry, ensure_ascii=False) + "\n")
            kept += 1

        except Exception as e:
            print(f"Skipped due to error: {e}")
            skipped += 1

print(f"Cleaning finished. Kept: {kept} || Skipped: {skipped}")
print(f"Cleaned file: {output_path}")

Cleaning finished. Kept: 18246 || Skipped: 1754
Cleaned file: EuroPat.de-en.20k.train.clean.json


# Step 3: Finetune

In [21]:
def finetune_europat():

    model_id = "Helsinki-NLP/opus-mt-en-de"     #NB: ID for pretrained loaded model from hf
    max_length = 250

    code2lang = {
    "de": "German",
    "fr": "French",
    "en": "English",
    "nl": "Dutch",
    "pt": "Portuguese",
    "ru": "Russian",
    "zh": "Chinese",
    "ro": "Romanian",
    "es": "Spanish"
    }

    source_code = 'en'
    target_code = 'de'
    data_files = "EuroPat.de-en.20k.train.clean.json"
    output_dir = 'models/opus-finetune-europat'
    train_bs = 6
    grad_acc = 1
    lr = 5e-5
    w_steps = 0.03
    n_epoch = 5
    lr_scheduler_type = "linear"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.src_lang = source_code  #for multilingual models ??
    tokenizer.tgt_lang = target_code
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map={"": 0})


    print_trainable_parameters(model)
    print(model)

    metric = evaluate.load("sacrebleu")

    def preprocess_parallel_function(examples):
        inputs = [ex[source_code] for ex in examples["translation"]]
        targets = [ex[target_code] for ex in examples["translation"]]
        #inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=max_length, padding=False, truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_length, padding=False, truncation=True)


        model_inputs["labels"] = labels["input_ids"]

        return model_inputs

    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]

        return preds, labels

    def compute_metrics(eval_preds, ignore_pad_token_for_loss=False):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result = {'bleu' : result['score']}
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    data = load_dataset("json", data_files=data_files)
    data_split = data['train'].train_test_split(test_size=0.1, seed=42)
    data = DatasetDict({'train': data_split['train'],
                        'valid': data_split['test']})   #validation set

    column_names = data["train"].column_names
    #print(column_names)
    data = data.map(preprocess_parallel_function, batched=True)
    label_pad_token_id = -100

    trainer = Seq2SeqTrainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=Seq2SeqTrainingArguments(
            per_device_train_batch_size=train_bs,
            gradient_accumulation_steps=grad_acc,
            per_device_eval_batch_size=2,
            eval_accumulation_steps=2,
            warmup_ratio=w_steps,
            lr_scheduler_type=lr_scheduler_type,
            num_train_epochs=n_epoch,
            predict_with_generate=True,
            metric_for_best_model='bleu',
            load_best_model_at_end=True,
            learning_rate=lr,
            save_total_limit=2,
            generation_num_beams=5,
            save_strategy="epoch",
            eval_strategy="epoch",
            output_dir=output_dir,
            report_to="none",
        ),
        data_collator=DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=label_pad_token_id, model=model),
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    model.config.use_cache=False  #Silence the warnings. Please re-enable for inference!
    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [22]:
#fine-tune pre-trained OPUS on europat corpus
finetune_europat()

trainable params: 74410496 || all params: 74410496 || trainable%: 100.0
MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=5

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16421 [00:00<?, ? examples/s]



Map:   0%|          | 0/1825 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.3412,1.221214,32.7506,38.7644
2,1.0511,1.201837,32.4117,43.5304
3,0.8668,1.209816,33.6462,40.2159
4,0.7158,1.221281,34.021,39.7907
5,0.6127,1.230739,34.0449,39.8762


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


# Step 4: Inference


## Marian FT vienna

In [24]:
#use models
#import models and tokenizer (now: local ft model)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("models/opus-finetune-vienna") #path of your directory with the finetuned (NB: dont load from checkpoint but parent folder? --> contains final version of ft model)

model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-finetune-vienna", device_map={"":0}) #NOTE load in gpu!!!!



In [25]:
%cd /content/BMT_2025S/week10_files

/content/BMT_2025S/week10_files


In [27]:
#upload source file and read
import codecs

file_name = "Vienna_Environmental.en-de.test.en"
mt_output = open('Vienna_Environmental.en-de.test.marian.vienna.greedy.de', 'w', encoding='utf-8')

with codecs.open(file_name, 'r', 'utf-8') as src:
  for line in src:
    line = line.strip()
    encoded = tokenizer(line, return_tensors="pt", padding=True).to("cuda") #tokenize
    generated_tokens = model.generate(**encoded) #default: greedy search
    #generated_tokens = model.generate(**encoded, num_beams=6, early_stopping=True) #beam size 6
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) #de tokenize
    print(translation[0], file=mt_output)

mt_output.close()

## Marian FT europat

In [28]:
#use models
#import models and tokenizer (now: local ft model)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("models/opus-finetune-europat") #path of your directory with the finetuned (NB: dont load from checkpoint but parent folder? --> contains final version of ft model)

model = AutoModelForSeq2SeqLM.from_pretrained("models/opus-finetune-europat", device_map={"":0}) #NOTE load in gpu!!!!

In [29]:
%cd /content/BMT_2025S/week10_files

/content/BMT_2025S/week10_files


In [31]:
#upload source file and read
import codecs

file_name = "Vienna_Environmental.en-de.test.en"
mt_output = open('Vienna_Environmental.en-de.test.marian.europat.greedy.de', 'w', encoding='utf-8')

with codecs.open(file_name, 'r', 'utf-8') as src:
  for line in src:
    line = line.strip()
    encoded = tokenizer(line, return_tensors="pt", padding=True).to("cuda") #tokenize
    generated_tokens = model.generate(**encoded) #default: greedy search
    #generated_tokens = model.generate(**encoded, num_beams=6, early_stopping=True) #beam size 6
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) #de tokenize
    print(translation[0], file=mt_output)

mt_output.close()

In [23]:
#translate Vienna...:

#greedy
#beam_6
#similar corpus with beam_6 (pre and ft!)