<a href="https://colab.research.google.com/github/emmammolloy/arousal-in-translation/blob/main/MELD_S2_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install unbabel-comet optuna

Collecting unbabel-comet
  Downloading unbabel_comet-2.2.6-py3-none-any.whl.metadata (19 kB)
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<2.0.0,>=1.20.0 (from unbabel-comet)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytorch-lightning<3.0.0,>=2.0.0 (from un

In [None]:
import os
import json
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from comet import download_model, load_from_checkpoint
from sklearn.model_selection import KFold
from functools import partial

import torch
torch.set_float32_matmul_precision("high")

import optuna

In [None]:
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
#update path as needed
#path="/content/drive/MyDrive/CCS2/"
path = "/content/drive/MyDrive/Uni/ITC/ComputationalCogSci/CCS2/CCS2"
#path="/content/"

In [None]:
def translate(input_sentences, model, tokenizer, tgt_lang, max_length=45):
    """Translates English sentences into the target language using mBART."""

    #tokenise
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang =  tgt_lang
    inputs = tokenizer(
        input_sentences,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length
    ).to(model.device)

    #generate translation
    translated_tokens = model.generate(
        **inputs,
        num_beams=4,
        max_length=max_length,
        early_stopping=True,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang) #force decoder to generate in tgt
    )
    #decode translation
    translations = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    return translations

In [None]:
def preprocess(batch):
    inputs = tokenizer(
        [f"<{lang}> {text}" for text, lang in zip(batch["Utterance"], batch["tgt_lang"])],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
    labels = []
    for gold, lang in zip(batch["Utterance"], batch["tgt_lang"]):
        label = tokenizer(gold, max_length=max_length, truncation=True, padding="max_length")["input_ids"]
        label = [(token if token != tokenizer.pad_token_id else -100) for token in label] #replace padding token ID with -100
        labels.append(label)
    inputs["labels"] = labels
    inputs["src"] = batch["Utterance"] #add og source for COMET eval
    return inputs

# Baseline test set

In [None]:
#base model - mbart already finetuned for translation and has our languages, and can have custom tokens added
model_name = "facebook/mbart-large-50-one-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [None]:
#load test data
df = pd.read_csv(os.path.join(path, "./hand_in/test_set_zupan.csv"))
df_test = df[["Sentence", "Emotion"]]
df_test.head()

Unnamed: 0,Sentence,Emotion
0,That dog is such a character.,Amusement
1,The look on your face was priceless.,Amusement
2,That was entertaining.,Amusement
3,That was priceless.,Amusement
4,What a funny joke!,Amusement


In [None]:
#translate test sentences with base model
df_test["de_baseline"] = translate(df_test["Sentence"].tolist(), model, tokenizer, tgt_lang="de_DE")
df_test["ru_baseline"] = translate(df_test["Sentence"].tolist(), model, tokenizer, tgt_lang="ru_RU")
df_test["pl_baseline"] = translate(df_test["Sentence"].tolist(), model, tokenizer, tgt_lang="pl_PL")
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["de_baseline"] = translate(df_test["Sentence"].tolist(), model, tokenizer, tgt_lang="de_DE")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["ru_baseline"] = translate(df_test["Sentence"].tolist(), model, tokenizer, tgt_lang="ru_RU")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_

Unnamed: 0,Sentence,Emotion,de_baseline,ru_baseline,pl_baseline
0,That dog is such a character.,Amusement,Dieser Hund ist so ein Charakter.,Этот собака такой характер.,Ten pies jest taki charakter.
1,The look on your face was priceless.,Amusement,Der Blick auf dein Gesicht war unschätzbar.,Выгляд на твоем лице был бесценным.,Wygląd na twarzy był bezcenny.
2,That was entertaining.,Amusement,Das war unterhaltsam.,Это было забавно.,To było zabawne.
3,That was priceless.,Amusement,Das war unschätzbar.,Это было бесценное.,Było to bezcenne.
4,What a funny joke!,Amusement,Was für ein lustiger Witz!,Какая смешная шутка!,Co za zabawny żart!


In [None]:
df_test.to_csv(os.path.join(path, "./hand_in/test_set_zupan_baselinetranslations.csv"))

# create training data

In [None]:
#load data
df = pd.read_csv(os.path.join(path, "./hand_in/arousal_data.csv"))
df.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,Subtitle_de,Subtitle_ru,Subtitle_pl,StartTime,EndTime,File_ID,utt_id,arousal_value
0,43,You had no right to tell me you ever had feeli...,Ross,anger,negative,5,0,2,7,"Du hattest kein Recht, mir zu sagen, dass du i...","У тебя не было права говорить, что у тебя были...",Nie miałaś prawa mówić mi o tym.,0 days 00:18:28.565000,0 days 00:18:31.234000,dia5_utt0.mp4,dia5_utt0,0.764808
1,43,What?,Rachel,surprise,negative,5,1,2,7,Was?,Что?,Co?,0 days 00:18:31.693000,0 days 00:18:33.528000,dia5_utt1.mp4,dia5_utt1,0.913514
2,44,I was doing great with Julie before I found ou...,Ross,anger,negative,5,2,2,7,"Ich hab mich glänzend mit Julie verstanden, be...","У меня с Джули всё было замечательно, пока я н...",Było mi dobrze z Julie!,0 days 00:18:31.693000,0 days 00:18:35.196000,dia5_utt2.mp4,dia5_utt2,0.894707
3,45,"Hey, I was doin' great before I found out abou...",Rachel,anger,negative,5,3,2,7,"Ich konnte auch nicht klagen, bevor ich das vo...","У меня с Джули всё было замечательно, пока я н...","Mnie też było, bez ciebie! Łatwo mi patrzeć na...",0 days 00:18:35.364000,0 days 00:18:39.951000,dia5_utt3.mp4,dia5_utt3,0.819732
4,46,The point is I...,Ross,anger,negative,5,4,2,7,,,"Chodzi o to, że...",0 days 00:19:12.818000,0 days 00:19:14.030000,dia5_utt4.mp4,dia5_utt4,0.717749


In [None]:
#create input column with high/low arousal token prepended
def add_arousal_token(row):
    if row["arousal_value"] >= 0.5:
        return "<high_arousal> " + row["Utterance"]
    else:
        return "<low_arousal> " + row["Utterance"]

df["model_input"] = df.apply(add_arousal_token, axis=1)
df.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,Subtitle_de,Subtitle_ru,Subtitle_pl,StartTime,EndTime,File_ID,utt_id,arousal_value,model_input
0,43,You had no right to tell me you ever had feeli...,Ross,anger,negative,5,0,2,7,"Du hattest kein Recht, mir zu sagen, dass du i...","У тебя не было права говорить, что у тебя были...",Nie miałaś prawa mówić mi o tym.,0 days 00:18:28.565000,0 days 00:18:31.234000,dia5_utt0.mp4,dia5_utt0,0.764808,<high_arousal> You had no right to tell me you...
1,43,What?,Rachel,surprise,negative,5,1,2,7,Was?,Что?,Co?,0 days 00:18:31.693000,0 days 00:18:33.528000,dia5_utt1.mp4,dia5_utt1,0.913514,<high_arousal> What?
2,44,I was doing great with Julie before I found ou...,Ross,anger,negative,5,2,2,7,"Ich hab mich glänzend mit Julie verstanden, be...","У меня с Джули всё было замечательно, пока я н...",Było mi dobrze z Julie!,0 days 00:18:31.693000,0 days 00:18:35.196000,dia5_utt2.mp4,dia5_utt2,0.894707,<high_arousal> I was doing great with Julie be...
3,45,"Hey, I was doin' great before I found out abou...",Rachel,anger,negative,5,3,2,7,"Ich konnte auch nicht klagen, bevor ich das vo...","У меня с Джули всё было замечательно, пока я н...","Mnie też było, bez ciebie! Łatwo mi patrzeć na...",0 days 00:18:35.364000,0 days 00:18:39.951000,dia5_utt3.mp4,dia5_utt3,0.819732,"<high_arousal> Hey, I was doin' great before I..."
4,46,The point is I...,Ross,anger,negative,5,4,2,7,,,"Chodzi o to, że...",0 days 00:19:12.818000,0 days 00:19:14.030000,dia5_utt4.mp4,dia5_utt4,0.717749,<high_arousal> The point is I...


In [None]:
df.to_csv(os.path.join(path, "./hand_in/arousal_data_inputs.csv"), index=False)

In [None]:
df_de = df[["Utterance", "Subtitle_de"]].rename(columns={"Subtitle_de": "Subtitle"})
df_de["tgt_lang"] = "de_DE"

df_ru = df[["Utterance", "Subtitle_ru"]].rename(columns={"Subtitle_ru": "Subtitle"})
df_ru["tgt_lang"] = "ru_RU"

df_pl = df[["Utterance", "Subtitle_pl"]].rename(columns={"Subtitle_pl": "Subtitle"})
df_pl["tgt_lang"] = "pl_PL"

#concatenate into one multilingual dataset
df_mt = pd.concat([df_de, df_ru, df_pl], ignore_index=True)
df_mt["arousal"] = df["model_input"]

df_mt

Unnamed: 0,Utterance,Subtitle,tgt_lang,arousal
0,You had no right to tell me you ever had feeli...,"Du hattest kein Recht, mir zu sagen, dass du i...",de_DE,<high_arousal> You had no right to tell me you...
1,What?,Was?,de_DE,<high_arousal> What?
2,I was doing great with Julie before I found ou...,"Ich hab mich glänzend mit Julie verstanden, be...",de_DE,<high_arousal> I was doing great with Julie be...
3,"Hey, I was doin' great before I found out abou...","Ich konnte auch nicht klagen, bevor ich das vo...",de_DE,"<high_arousal> Hey, I was doin' great before I..."
4,The point is I...,,de_DE,<high_arousal> The point is I...
...,...,...,...,...
1246,You or me?,Ty czy ja?,pl_PL,
1247,"I got it. Uh, Joey, women don't have Adam's ap...","Przejmuję. Joey, kobiety nie mają jabłka Adama.",pl_PL,
1248,"You guys are messing with me, right?",Nabieracie mnie?,pl_PL,
1249,Yeah.,Tak!,pl_PL,


In [None]:
#drop nas
df_mt = df_mt.dropna(subset=["Subtitle"]).sort_values("tgt_lang").reset_index(drop=True)
df_mt = df_mt.dropna(subset=["arousal"]).sort_values("tgt_lang").reset_index(drop=True)
df_mt

Unnamed: 0,Utterance,Subtitle,tgt_lang,arousal
0,You had no right to tell me you ever had feeli...,"Du hattest kein Recht, mir zu sagen, dass du i...",de_DE,<high_arousal> You had no right to tell me you...
1,"""Oh, Chandler, now, now, that's it. There, fas...","Oh, Chandler! Jetzt! Ja! Weiter so. Ja! Und sc...",de_DE,"<high_arousal> ""Oh, Chandler, now, now, that's..."
2,"We laugh, we play.","Wir werden spielen und lachen,",de_DE,"<low_arousal> We laugh, we play."
3,We have a good time.,Wir werden schon unseren Spaß haben.,de_DE,<high_arousal> We have a good time.
4,"Carol, we've been through this before, ok?",Ich mach das doch nicht zum ersten Mal.,de_DE,"<low_arousal> Carol, we've been through this b..."
...,...,...,...,...
386,"Yes, hello.","Ja, hallo.",de_DE,"<high_arousal> Yes, hello."
387,Transit Authority?,"Verkehrsbetriebe, hallo.",de_DE,<high_arousal> Transit Authority?
388,Hi.,Hi!,de_DE,<high_arousal> Hi.
389,"I miss Janice though. ""Hello, Chandler Bing.""","Ich vermisse Janice. ""Hallo, Chandler Bing!""",de_DE,"<high_arousal> I miss Janice though. ""Hello, C..."


In [None]:
ds_mt = Dataset.from_pandas(df_mt[["Utterance", "Subtitle", "tgt_lang"]])
ds_mt

Dataset({
    features: ['Utterance', 'Subtitle', 'tgt_lang'],
    num_rows: 391
})

# hyperparameter tuning

In [None]:
#base model - mbart already finetuned for translation and has our languages, and can have custom tokens added
model_name = "facebook/mbart-large-50-one-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#load COMET for eval
comet_model_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(comet_model_path)

max_length = 45

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.40k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [None]:
# hyperparam tuning
def compute_metrics(eval_preds, source_inputs):
    """Compute COMET score"""
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    sources = source_inputs[:len(decoded_preds)]
    data = [{"src": src, "mt": mt, "ref": ref} for mt, ref, src in zip(decoded_preds, decoded_labels, sources)]
    comet_scores = comet_model.predict(data, batch_size=8, gpus=1).scores
    return {"comet": np.mean(comet_scores)}


def run_kfold_training(
    dataset,
    dropout,
    training_args,
    k=5,
    model_checkpoint=model_name,
    early_stopping=True,
    early_stopping_patience=2,
    compute_metrics=compute_metrics,
    output_base_dir="./kfold_models"
):
    from copy import deepcopy
    import shutil

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_metrics = []

    best_score = float('-inf')
    best_fold_dir = None

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        print(f"\nFold {fold + 1}/{k}")

        fold_dir = os.path.join(output_base_dir, f"fold_{fold+1}")
        os.makedirs(fold_dir, exist_ok=True)

        training_args_fold = deepcopy(training_args)
        training_args_fold.output_dir = fold_dir

        train_dataset = dataset.select(train_idx)
        val_dataset = dataset.select(val_idx)

        model = MBartForConditionalGeneration.from_pretrained(model_checkpoint).to(training_args.device)
        model.config.dropout = dropout
        model.config.attention_dropout = dropout
        model.config.activation_dropout = dropout

        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args_fold,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)] if early_stopping else None,
            compute_metrics=lambda eval_preds: compute_metrics(eval_preds, source_inputs=val_dataset["src"])
        )

        trainer.train()

        metrics = trainer.evaluate()
        fold_metrics.append(metrics)

        # Save COMET score
        with open(os.path.join(fold_dir, "comet_score.txt"), "w") as f:
            f.write(str(metrics.get("comet", "N/A")))

        # Track best fold
        if "comet" in metrics and metrics["comet"] > best_score:
            best_score = metrics["comet"]
            best_fold_dir = fold_dir

    # Save best model
    if best_fold_dir:
        best_model_path = os.path.join(best_fold_dir, "checkpoint-best")
        target_path = os.path.join(output_base_dir, "best_kfold_model")
        shutil.copytree(best_model_path, target_path, dirs_exist_ok=True)
        print(f"\nBest model copied from {best_model_path} to {target_path} (COMET={best_score:.4f})")

    # Print average eval loss
    eval_losses = [m["eval_loss"] for m in fold_metrics]
    print(f"\nAverage eval loss across {k} folds: {np.mean(eval_losses):.4f} ± {np.std(eval_losses):.4f}")

    return fold_metrics


def objective(trial):
    lr = trial.suggest_float("learning_rate", 1e-6, 3e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [2, 4, 8])
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    label_smoothing = trial.suggest_float("label_smoothing_factor", 0.0, 0.2)

    training_args = Seq2SeqTrainingArguments(
        output_dir="./optuna_trial",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=weight_decay,
        num_train_epochs=3,
        logging_strategy="no",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_comet",
        greater_is_better=True,
        eval_strategy="epoch",
        predict_with_generate=True,
        generation_max_length=max_length,
        generation_num_beams=4,
        fp16=torch.cuda.is_available(),
        report_to="none",
        disable_tqdm=True,
        label_smoothing_factor=label_smoothing
    )

    fold_metrics = run_kfold_training(
        dataset=preproc_dataset,
        training_args=training_args,
        dropout=dropout
    )
    comet_scores = [m["comet"] for m in fold_metrics if "comet" in m]
    return np.mean(comet_scores)


# Run the search
raw_dataset = ds_mt
preproc_dataset = raw_dataset.map(preprocess, batched=True)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)

print(study.best_params)
print(study.best_value)  # Best COMET score


Map:   0%|          | 0/1251 [00:00<?, ? examples/s]

[I 2025-06-01 23:43:13,067] A new study created in memory with name: no-name-92c3546e-d612-4ec4-b65d-245c9640571c



Fold 1/5



`tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.

INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.29it/s]


{'eval_loss': 0.5148106217384338, 'eval_comet': 0.9772949950153609, 'eval_runtime': 58.8379, 'eval_samples_per_second': 4.266, 'eval_steps_per_second': 2.141, 'epoch': 1.0}




INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.08it/s]


{'eval_loss': 0.5099208950996399, 'eval_comet': 0.9776058472485182, 'eval_runtime': 59.1863, 'eval_samples_per_second': 4.241, 'eval_steps_per_second': 2.129, 'epoch': 2.0}


INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 16.94it/s]


{'eval_loss': 0.5022395253181458, 'eval_comet': 0.977737036121794, 'eval_runtime': 59.4926, 'eval_samples_per_second': 4.219, 'eval_steps_per_second': 2.118, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 493.3552, 'train_samples_per_second': 6.081, 'train_steps_per_second': 3.04, 'train_loss': 0.5774346516927084, 'epoch': 3.0}


INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.52it/s]


{'eval_loss': 0.5022395253181458, 'eval_comet': 0.977737036121794, 'eval_runtime': 61.0929, 'eval_samples_per_second': 4.108, 'eval_steps_per_second': 2.062, 'epoch': 3.0}

Fold 2/5



`tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.

INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 16.19it/s]


{'eval_loss': 0.5146510004997253, 'eval_comet': 0.977733092546463, 'eval_runtime': 57.5833, 'eval_samples_per_second': 4.342, 'eval_steps_per_second': 2.171, 'epoch': 1.0}




INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.17it/s]


{'eval_loss': 0.5059395432472229, 'eval_comet': 0.9782774801254273, 'eval_runtime': 56.3383, 'eval_samples_per_second': 4.437, 'eval_steps_per_second': 2.219, 'epoch': 2.0}


INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.53it/s]


{'eval_loss': 0.5044344663619995, 'eval_comet': 0.9784809496402741, 'eval_runtime': 56.0453, 'eval_samples_per_second': 4.461, 'eval_steps_per_second': 2.23, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 494.7543, 'train_samples_per_second': 6.07, 'train_steps_per_second': 3.038, 'train_loss': 0.5697078742904815, 'epoch': 3.0}


INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.45it/s]


{'eval_loss': 0.5044344663619995, 'eval_comet': 0.9784809496402741, 'eval_runtime': 56.1606, 'eval_samples_per_second': 4.452, 'eval_steps_per_second': 2.226, 'epoch': 3.0}

Fold 3/5



`tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.

INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.54it/s]


{'eval_loss': 0.5096697807312012, 'eval_comet': 0.9778678681850433, 'eval_runtime': 56.5861, 'eval_samples_per_second': 4.418, 'eval_steps_per_second': 2.209, 'epoch': 1.0}




INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.10it/s]


{'eval_loss': 0.5023887753486633, 'eval_comet': 0.9781196961402893, 'eval_runtime': 56.5856, 'eval_samples_per_second': 4.418, 'eval_steps_per_second': 2.209, 'epoch': 2.0}


INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.17it/s]


{'eval_loss': 0.5014681816101074, 'eval_comet': 0.9782134761810303, 'eval_runtime': 57.5204, 'eval_samples_per_second': 4.346, 'eval_steps_per_second': 2.173, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 489.1733, 'train_samples_per_second': 6.139, 'train_steps_per_second': 3.073, 'train_loss': 0.5748183808164921, 'epoch': 3.0}


INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 17.45it/s]


{'eval_loss': 0.5014681816101074, 'eval_comet': 0.9782134761810303, 'eval_runtime': 56.799, 'eval_samples_per_second': 4.401, 'eval_steps_per_second': 2.201, 'epoch': 3.0}

Fold 4/5



`tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.

INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 32/32 [00:01<00:00, 16.75it/s]


{'eval_loss': 0.5165628790855408, 'eval_comet': 0.9761563212871551, 'eval_runtime': 61.201, 'eval_samples_per_second': 4.085, 'eval_steps_per_second': 2.042, 'epoch': 1.0}




[W 2025-06-02 00:13:34,112] Trial 0 failed with parameters: {'learning_rate': 8.733238379842811e-06, 'batch_size': 2, 'weight_decay': 0.03148118567057991, 'dropout': 0.4631643861526208, 'label_smoothing_factor': 0.02891469735724841} because of the following error: SafetensorError('Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })').
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-46-dc924dbce7c1>", line 121, in objective
    fold_metrics = run_kfold_training(
                   ^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-46-dc924dbce7c1>", line 64, in run_kfold_training
    trainer.train()
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2240, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/us

SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

#finetune on data without arousal

{
  "learning_rate": 8.733e-6,
  "batch_size": 2,
  "weight_decay": 0.0315,
  "dropout": 0.4632,
  "label_smoothing_factor": 0.0289
}

In [None]:
from datasets import Dataset, DatasetDict
from transformers import MBartTokenizerFast, MBartForConditionalGeneration
from transformers import DataCollatorForSeq2Seq, EarlyStoppingCallback

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

ds_mt = Dataset.from_pandas(df_mt[["Utterance", "Subtitle", "tgt_lang"]])
df_mt = df_mt.sort_values("tgt_lang").reset_index(drop=True)

max_length = 45

def preprocess(batch):
    # Prepend <tgt_lang> to source inputs (MBART expects this format)
    inputs = tokenizer(
        [f"<{lang}> {text}" for text, lang in zip(batch["Utterance"], batch["tgt_lang"])],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    labels = []
    for subtitle, lang in zip(batch["Subtitle"], batch["tgt_lang"]):
        tokenizer.tgt_lang = lang
        label = tokenizer(
            subtitle,
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )["input_ids"]
        # Replace padding token ID with -100
        label = [(token if token != tokenizer.pad_token_id else -100) for token in label]
        labels.append(label)

    inputs["labels"] = labels
    return inputs

#ds_de_tok = ds_de.map(preprocess, batched=True, remove_columns=ds_de.column_names)
ds_tok = ds_mt.map(preprocess, batched=True, batch_size=18, remove_columns=["Utterance", "Subtitle", "tgt_lang"])

#train/test split
dataset_split = ds_tok.train_test_split(test_size=0.2, seed=42, shuffle =True)
dataset_dict = DatasetDict({
    "train": dataset_split["train"],
    "test": dataset_split["test"]
})

#set up training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart-meld-de-mt", #where to save model checkpoints
    overwrite_output_dir=False,

    #training schedule
    num_train_epochs=5, #how many epochs, 3-5 probs good normally, but small dataset needs more
    per_device_train_batch_size=2, #how many batches per gpu/cpu between weight updating
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Simulate batch size of 4

    #optimiser
    learning_rate=8.733e-6, #initial lr, weight updating. mBART usually prefers lower LR than mT5: (typically 1e-5 to 3e-5) — higher values like 3e-4 can destabilize fine-tuning
    weight_decay=0.0315, #loss function penalty to reduce reliance on a single weight
    label_smoothing_factor=0.0289, #one hot vector made 0.9 and final 0.1 spread out to help reduce overfitting

    #logging
    logging_dir="./logs", #where to write tensorboard logs - for loss, lr, etc
    logging_strategy="steps",
    logging_steps=10, #when to log metrics

    #evaluation
    eval_strategy="epoch", #steps or epoch
    save_strategy="epoch", # when to save checkpoint
    save_total_limit=2, #how many most recent checkpoints to keep
    predict_with_generate=True, #needed for MT
    generation_max_length=max_length,
    generation_num_beams=4, # Beam search width: keeps multiple decoding options - how many translation candidates at each step? (higher = better, slower)

    #model recovery
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", # or bleu/comet etc
    greater_is_better=False,

    fp16=torch.cuda.is_available(),  # if using GPU
    seed = 42
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)], #good for small data.
    tokenizer=tokenizer
)

#dropout to help with overfitting (high validation loss)
model.config.dropout = 0.4632
model.config.attention_dropout = 0.4632
model.config.activation_dropout = 0.4632

Map:   0%|          | 0/391 [00:00<?, ? examples/s]


`tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.



In [None]:
trainer.train()
trainer.save_model(os.path.join(path, "./hand_in/models/mbart-MELD-mt"))
model.save_pretrained(os.path.join(path, "./hand_in/models/mbart-MELD-mt"))
tokenizer.save_pretrained(os.path.join(path, "./hand_in/models/mbart-MELD-mt"))



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33memmammolloy[0m ([33memmammolloy-k-benhavns-universitet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,3.0511,2.925813
2,2.4544,2.827463
3,2.168,2.814702
4,2.0064,2.814164
5,1.9909,2.827736


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('mbart-MELD-mt/tokenizer_config.json',
 'mbart-MELD-mt/special_tokens_map.json',
 'mbart-MELD-mt/sentencepiece.bpe.model',
 'mbart-MELD-mt/added_tokens.json',
 'mbart-MELD-mt/tokenizer.json')

# Finetuning with arousal

In [None]:
# Load model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Add special arousal tokens
special_tokens = {"additional_special_tokens": ["<low_arousal>", "<high_arousal>"]}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load dataset from df_mt
ds = Dataset.from_pandas(df_mt[["Utterance", "Subtitle", "arousal", "tgt_lang"]])

max_length = 45

# Preprocessing
def preprocess(batch):
    # Prefix input with language token (mBART-style)
    sources = [f"<{lang}> {arousal} {text}" for lang, arousal, text in zip(batch["tgt_lang"], batch["arousal"], batch["Utterance"])]

    tokenizer.src_lang = "en_XX"
    inputs = tokenizer(sources, max_length=max_length, truncation=True, padding="max_length")

    # Tokenize targets
    targets = tokenizer(batch["Subtitle"], max_length=max_length, truncation=True, padding="max_length")
    inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label_seq]
        for label_seq in targets["input_ids"]
    ]
    return inputs

# Tokenize dataset
ds_tok = ds.map(preprocess, batched=True, remove_columns=ds.column_names)

# Split into train/test
dataset_split = ds_tok.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict({
    "train": dataset_split["train"],
    "test": dataset_split["test"]
})

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart-meld-de-mt", #where to save model checkpoints
    overwrite_output_dir=False,

    #training schedule
    num_train_epochs=5, #how many epochs, 3-5 probs good normally, but small dataset needs more
    per_device_train_batch_size=2, #how many batches per gpu/cpu between weight updating
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Simulate batch size of 4

    #optimiser
    learning_rate=8.733e-6, #initial lr, weight updating. mBART usually prefers lower LR than mT5: (typically 1e-5 to 3e-5) — higher values like 3e-4 can destabilize fine-tuning
    weight_decay=0.0315, #loss function penalty to reduce reliance on a single weight
    label_smoothing_factor=0.0289, #one hot vector made 0.9 and final 0.1 spread out to help reduce overfitting

    #logging
    logging_dir="./logs", #where to write tensorboard logs - for loss, lr, etc
    logging_strategy="steps",
    logging_steps=10, #when to log metrics

    #evaluation
    eval_strategy="epoch", #steps or epoch
    save_strategy="epoch", # when to save checkpoint
    save_total_limit=2, #how many most recent checkpoints to keep
    predict_with_generate=True, #needed for MT
    generation_max_length=max_length,
    generation_num_beams=4, # Beam search width: keeps multiple decoding options - how many translation candidates at each step? (higher = better, slower)

    #model recovery
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", # or bleu/comet etc
    greater_is_better=False,

    fp16=torch.cuda.is_available(),  # if using GPU
    seed = 42
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

model.config.dropout = 0.4632
model.config.attention_dropout = 0.4632
model.config.activation_dropout = 0.4632

trainer.train()
trainer.save_model(os.path.join(path, "./hand_in/models/mbart-arousal-mt"))
model.save_pretrained(os.path.join(path, "./hand_in/models/mbart-arousal-mt"))
tokenizer.save_pretrained(os.path.join(path, "./hand_in/models/mbart-arousal-mt"))

Map:   0%|          | 0/391 [00:00<?, ? examples/s]


`tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.



Epoch,Training Loss,Validation Loss
1,3.0489,2.758944
2,2.5229,2.650702
3,2.2465,2.628766
4,2.0999,2.634876
5,1.874,2.643313




There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('mbart-arousal-mt/tokenizer_config.json',
 'mbart-arousal-mt/special_tokens_map.json',
 'mbart-arousal-mt/sentencepiece.bpe.model',
 'mbart-arousal-mt/added_tokens.json',
 'mbart-arousal-mt/tokenizer.json')

# Testing models

In [None]:
test_sents = os.path.join(path, "./hand_in/test_set_zupan_baselinetranslations.csv")
df_test = pd.read_csv(test_sents)
sentences = df_test["Sentence"].tolist()

In [None]:
#Trained on MELD data
model_name = os.path.join(path, "./hand_in/models/mbart-MELD-mt")
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

de_baseline = translate(sentences, model, tokenizer, tgt_lang="de_DE", max_length=45)
ru_baseline = translate(sentences, model, tokenizer, tgt_lang="ru_RU", max_length=45)
pl_baseline = translate(sentences, model, tokenizer, tgt_lang="pl_PL", max_length=45)

df_test["MELD_de"] = de_baseline
df_test["MELD_ru"] = ru_baseline
df_test["MELD_pl"] = pl_baseline
df_test.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Emotion,de_baseline,ru_baseline,pl_baseline,MELD_de,MELD_ru,MELD_pl
0,0,That dog is such a character.,Amusement,Dieser Hund ist so ein Charakter.,Этот собака такой характер.,Ten pies jest taki charakter.,Der Hund ist ein solcher Charakter.,Этот собака такой персонаж.,Ten pies jest takim charakterem.
1,1,The look on your face was priceless.,Amusement,Der Blick auf dein Gesicht war unschätzbar.,Выгляд на твоем лице был бесценным.,Wygląd na twarzy był bezcenny.,Der Blick auf dein Gesicht war unbezahlbar.,Ваш взгляд был бесценным.,Wygląd na twoim twarzy był bezcenny.
2,2,That was entertaining.,Amusement,Das war unterhaltsam.,Это было забавно.,To było zabawne.,Das war unterhaltsam.,Это было забавно.,To było zabawne.
3,3,That was priceless.,Amusement,Das war unschätzbar.,Это было бесценное.,Było to bezcenne.,Das war unbezahlbar.,Это было бесценно.,To było nieocenione.
4,4,What a funny joke!,Amusement,Was für ein lustiger Witz!,Какая смешная шутка!,Co za zabawny żart!,Was für ein lustiger Witz!,Какая смешная шутка!,Co za śmieszny żart!


In [None]:
#trained on arousal
model_name = os.path.join(path, "./hand_in/models/mbart-arousal-mt")
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

#high arousal
df_test["high_arousal"] = ["<high arousal> " + sentence for sentence in df_test["Sentence"]]
high_arousal_sentences = df_test["high_arousal"].tolist()

de_higharo = translate(high_arousal_sentences, model, tokenizer, tgt_lang="de_DE", max_length=45)
ru_higharo = translate(high_arousal_sentences, model, tokenizer, tgt_lang="ru_RU", max_length=45)
pl_higharo = translate(high_arousal_sentences, model, tokenizer, tgt_lang="pl_PL", max_length=45)

df_test["higharo_de"] = de_higharo
df_test["higharo_ru"] = ru_higharo
df_test["higharo_pl"] = pl_higharo

#low arousal
df_test["low_arousal"] = ["<low arousal> " + sentence for sentence in df_test["Sentence"]]
low_arousal_sentences = df_test["low_arousal"].tolist()

de_lowaro = translate(low_arousal_sentences, model, tokenizer, tgt_lang="de_DE", max_length=45)
ru_lowaro = translate(low_arousal_sentences, model, tokenizer, tgt_lang="ru_RU", max_length=45)
pl_lowaro = translate(low_arousal_sentences, model, tokenizer, tgt_lang="pl_PL", max_length=45)

df_test["lowaro_de"] = de_lowaro
df_test["lowaro_ru"] = ru_lowaro
df_test["lowaro_pl"] = pl_lowaro

df_test.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Emotion,de_baseline,ru_baseline,pl_baseline,MELD_de,MELD_ru,MELD_pl,high_arousal,higharo_de,higharo_ru,higharo_pl,low_arousal,lowaro_de,lowaro_ru,lowaro_pl
0,0,That dog is such a character.,Amusement,Dieser Hund ist so ein Charakter.,Этот собака такой характер.,Ten pies jest taki charakter.,Der Hund ist ein solcher Charakter.,Этот собака такой персонаж.,Ten pies jest takim charakterem.,<high arousal> That dog is such a character.,Der Hund ist ein solcher Charakter.,<Большого возбуждения>,Ten pies jest takim charakterem.,<low arousal> That dog is such a character.,Der Hund ist ein solcher Charakter.,<низкое возбуждение> Этот собака такой персонаж.,Ten pies jest takim charakterem.
1,1,The look on your face was priceless.,Amusement,Der Blick auf dein Gesicht war unschätzbar.,Выгляд на твоем лице был бесценным.,Wygląd na twarzy był bezcenny.,Der Blick auf dein Gesicht war unbezahlbar.,Ваш взгляд был бесценным.,Wygląd na twoim twarzy był bezcenny.,<high arousal> The look on your face was price...,Der Blick auf dein Gesicht war unbezahlbar.,<Большого возбуждения> Ваш взгляд был бесценным.,Wygląd na twoim twarzy był bezcenny.,<low arousal> The look on your face was pricel...,Der Blick auf dein Gesicht war unbezahlbar.,<низкое возбуждение> Ваш взгляд был бесценным.,<niektóre pobudzenie> Wasze spojrzenie na twar...
2,2,That was entertaining.,Amusement,Das war unterhaltsam.,Это было забавно.,To było zabawne.,Das war unterhaltsam.,Это было забавно.,To było zabawne.,<high arousal> That was entertaining.,Das war unterhaltsam.,Это было забавно.,To było zabawne.,<low arousal> That was entertaining.,Das war unterhaltsam.,Это было забавно.,To było zabawne.
3,3,That was priceless.,Amusement,Das war unschätzbar.,Это было бесценное.,Było to bezcenne.,Das war unbezahlbar.,Это было бесценно.,To było nieocenione.,<high arousal> That was priceless.,Das war wertlos.,Это было бесценно.,To było nieocenione.,<low arousal> That was priceless.,Das war wertlos.,Это было бесценно.,To było nieocenione.
4,4,What a funny joke!,Amusement,Was für ein lustiger Witz!,Какая смешная шутка!,Co za zabawny żart!,Was für ein lustiger Witz!,Какая смешная шутка!,Co za śmieszny żart!,<high arousal> What a funny joke!,Was für ein lustiger Witz!,Какая смешная шутка!,Jaki śmieszny żart!,<low arousal> What a funny joke!,Was für ein lustiger Witz!,Какая смешная шутка!,Jaki śmieszny żart!


In [None]:
df_test.to_csv(os.path.join(path, "./hand_in/test_outputs.csv"), index=False)

# parameter tuning

{
  "learning_rate": 8.733e-6,
  "batch_size": 2,
  "weight_decay": 0.0315,
  "dropout": 0.4632,
  "label_smoothing_factor": 0.0289
}


eval vs steps:
- epoch good for small datasets,less than 10-20 epochs
- steps better for more frequent feedback (eg 500-2000 steps) - so for long training, early stopping, hyperparam tuning

no of epochs
- 3 to 10 for less than 5k samples
- 2 to 5 for 5-50k
- 1 to 3 for large dataset

learning rate:
- Loss drops too fast then explodes	--> Lower LR (e.g. 1e-4)
- Loss doesn’t drop at all	Try higher LR (e.g. 3e-4) --> Training unstable	Add warmup_steps or reduce LR
- Great training loss but bad validation -->	Reduce LR, add weight_decay

batch size:
- small (2-8) more steps but slower and noisier, lower memory usage
- med (16-32): needs 16GB+ GPU
- large (64+): v big GPU needed, faster training

weight decay:
- 0.01 standard
- v small dataset/with dropout 0.001 to 0.01
- overfitting? increase to 0.05