In [None]:
import os
import re
import pickle
import time
from tqdm import tqdm
from typing import Union

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
from matplotlib.colors import ListedColormap
import seaborn as sns

import torch

import huggingface_hub
from datasets import load_dataset, Dataset
import transformers
from transformers import BitsAndBytesConfig
import evaluate
from evaluate import evaluator
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from sacrebleu.tokenizers.tokenizer_zh import TokenizerZh

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Decoding settings read as globals by the translate_list_of_str_* helpers below.
# Beam width handed to model.generate.
num_beams = 5
# Generation budget; the NLLB helper passes it as max_length, the others as max_new_tokens.
max_new_tokens = 512
# Nucleus-sampling threshold, used by the helpers that call generate with do_sample=True.
top_p = 0.9
# Sampling temperature, used by the same helpers.
temperature = 0.6

# Utils from evaluation pipeline

In [None]:
#################################   NLLB

def get_input_targets_NLLB(dataset_wnt_format, source_lang, target_lang):
    """
    Collect the source sentences and reference translations of one direction.

    NLLB consumes the raw sentence directly, so the same list is returned both
    as "sources" and as "inputs", giving the (sources, inputs, targets) layout
    used by the other get_input_targets_* helpers.
    """
    pair_key = f"{source_lang}-{target_lang}"
    inputs, targets = [], []
    for example in dataset_wnt_format[pair_key]:
        inputs.append(example[source_lang])
        targets.append(example[target_lang])
    return inputs, inputs, targets

def translate_list_of_str_NLLB(list_str, tokenizer, model, to_laguage):
    """
    Translate one batch of sentences with an NLLB model.

    Args:
        list_str: list of source sentences forming the batch.
        tokenizer: tokenizer of the model; also used to look up the id of the
            target-language token.
        model: model exposing `generate`.
        to_laguage: two-letter target language code; must be a key of
            equivalence_language_to_FLORES below.

    Returns:
        A list of str, one translation per entry of list_str.

    Raises:
        KeyError: if to_laguage has no entry in equivalence_language_to_FLORES.
    """
    equivalence_language_to_FLORES = {"en": "eng_Latn", "de": "deu_Latn", "ru": "rus_Cyrl", "is": "isl_Latn", "zh": "zho_Hans", "cs": "ces_Latn"}
    # Resolve the language first so an unsupported code fails before any tokenization.
    language_tgt_FLORES = equivalence_language_to_FLORES[to_laguage]
    with torch.no_grad():
        inputs = tokenizer(list_str, return_tensors="pt", padding=True)
        translated = model.generate(inputs["input_ids"].to(device),
                                    # Forward the padding mask explicitly instead of relying on
                                    # generate() to guess it from the pad token id.
                                    attention_mask=inputs["attention_mask"].to(device),
                                    forced_bos_token_id=tokenizer.convert_tokens_to_ids(language_tgt_FLORES),
                                    num_beams=num_beams, max_length=max_new_tokens, early_stopping=True,
                                    ).cpu()
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return translated_text

def translate_batched_NLLB(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` with NLLB, batch_size sentences at a time.

    For 8GB VRAM, use batch_size = 4
    For 16GB VRAM, use batch_size = 8 (better working with unbatch version to avoid pad noise).

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size sentences, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_NLLB(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   ALMA

def get_input_targets_ALMA(dataset, source_lang, target_lang):
    """
    Build the ALMA prompts for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, the prompts built
        from them, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled); translate_list_of_str_ALMA
    # strips the prompt with the very same names, so both tables must stay in sync.
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # Prompt layout: "Translate from Chinese to English:\nChinese: 我爱机器翻译。 \nEnglish:"
    inputs = [(
        f"Translate from {source_lang_name} to {target_lang_name}:"
        + f"\n{source_lang_name}: {source} \n{target_lang_name}:")
        for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def translate_list_of_str_ALMA(list_str, tokenizer, model, target_language):
    """
    Translate one batch of ALMA prompts and return only the generated translations.

    Args:
        list_str: list of prompts as built by get_input_targets_ALMA.
        tokenizer: tokenizer of the model.
        model: model exposing `generate`.
        target_language: two-letter target language code, used to locate the
            "<Language>:" marker that ends the prompt.

    Returns:
        A list of str, one per prompt.

    Raises:
        KeyError: if target_language is not a key of language_name.
        IndexError: if a decoded output contains the "<Language>:" marker fewer
            than twice.
    """
    # Must use the same spelling as get_input_targets_ALMA ("Icelandic").
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    tgt_language_name = language_name[target_language]
    with torch.no_grad():
        inputs = tokenizer(list_str, return_tensors="pt", padding=True)
        translated = model.generate(inputs["input_ids"].to(device),
                                    # Forward the padding mask so padded positions are not attended to.
                                    attention_mask=inputs["attention_mask"].to(device),
                                    num_beams=num_beams, max_new_tokens=max_new_tokens, do_sample=True,
                                    temperature=temperature, top_p=top_p
                                    ).cpu()
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        # The marker occurs twice in the prompt (instruction line and final cue);
        # piece 2 is what follows the final cue, i.e. the generated text.
        translated_text = [t.split(f"{tgt_language_name}:")[2] for t in translated_text] # Remove prompt
    return translated_text

def translate_batched_ALMA(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` with ALMA, batch_size prompts at a time.

    For 8GB VRAM, use batch_size=1
    For 16GB VRAM, use batch_size=3

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size prompts, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_ALMA(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   Llama 3

def get_input_targets_Llama3(dataset, source_lang, target_lang):
    """
    Build the Llama 3 chat conversations for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, one
        [system, user] message list per sentence, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled in the prompt).
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # User message layout: "Translate from Chinese to English:\nChinese: 我爱机器翻译。 \nEnglish:"
    inputs = [
        [{"role": "system", "content": "You are a translator, you output only the translation in the desired language."},
         {"role": "user",
          "content": f"Translate from {source_lang_name} to {target_lang_name}:"
          + f"\n{source_lang_name}: {source} \n{target_lang_name}:"
         }] for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def extract_translation_Llama3(translated_prompt):
    """
    Pull the assistant's answer out of a decoded Llama 3 conversation.

    Args:
        translated_prompt: decoded text (special tokens kept) containing the
            prompt followed by the generated answer.

    Returns:
        The answer text with the Llama 3 header and end markers removed.
    """
    # Keep what follows the last assistant header.
    answer = translated_prompt.split("<|start_header_id|>assistant<|end_header_id|>\n\n")[-1]
    translation_only = answer.split("<|end_of_text|>")[0]
    # Fallbacks for outputs whose assistant header is not in the exact form above.
    translation_only = translation_only.split("<|eot_id|><|start_header_id|>assistant\n")[-1]
    translation_only = translation_only.split("<|eot_id|><|start_header_id|>")[-1]
    # Drop the end-of-turn marker (and any repeated ones used as padding) that
    # would otherwise stay glued to the end of the translation.
    return translation_only.split("<|eot_id|>")[0]

def translate_list_of_str_Llama3(list_str, tokenizer, model, target_language=None):
    """
    Translate one batch of chat conversations with a Llama 3 model.

    list_str holds one message list per sample; it is rendered with the
    tokenizer's chat template, left-padded, and generated from on `device`.
    target_language is not used by this helper.
    Returns a list with one extracted translation per conversation.
    """
    with torch.no_grad():
        prompts = tokenizer.apply_chat_template(list_str, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(prompts, padding=True, padding_side='left', return_tensors="pt")
        generated = model.generate(**encoded.to(device),
                                   num_beams=num_beams, do_sample=True,
                                   temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens)
        decoded = tokenizer.batch_decode(generated)
    # Special tokens are kept by batch_decode; the extractor relies on them.
    return [extract_translation_Llama3(text) for text in decoded]
    
def translate_batched_Llama3(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` with Llama 3, batch_size conversations at a time.

    For 8GB VRAM use
        batch_size=20 with Llama3 1B,
        batch_size=4 with Llama3 3B
    For 16 GB VRAM use 
        batch_size=40 with Llama3 1B,
        batch_size=10 with Llama3 3B,
        batch_size=5 with Llama3 8B,

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size conversations, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_Llama3(inputs[start : start + batch_size], tokenizer, model, target_language=target_language)
        preds.extend(tslt)
    return preds

#################################   Falcon 3 (Normal + Mamba)

def get_input_targets_Falcon3(dataset, source_lang, target_lang):
    """
    Build the chat conversations for one translation direction.
    This function is valid for Falcon 3 and its Mamba version.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, one
        [system, user] message list per sentence, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled in the prompt).
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # User message layout: the instruction immediately followed by the source sentence.
    inputs = [
        [{"role": "system", "content": "You are a translator, you output only the translation in the desired language."},
         {"role": "user",
          "content": f"Translate from {source_lang_name} to {target_lang_name}:"
          + f"{source}"
         }] for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def extract_translation_Falcon3Mamba(translated_prompt):
    """
    Return the assistant's answer from a decoded Falcon3-Mamba conversation:
    the text after the last assistant header, cut at the first "<|im_end|>".
    """
    after_header = translated_prompt.rpartition("<|im_end|>\n<|im_start|>assistant\n")[2]
    translation_only, _, _ = after_header.partition("<|im_end|>")
    return translation_only

def translate_list_of_str_Falcon3Mamba(list_str, tokenizer, model, target_language=None):
    """
    Translate one batch of chat conversations with Falcon3-Mamba.

    list_str holds one message list per sample; it is rendered with the
    tokenizer's chat template, left-padded, and moved to the model's device.
    target_language is not used by this helper.
    Returns a list with one extracted translation per conversation.
    """
    with torch.no_grad():
        prompts = tokenizer.apply_chat_template(list_str, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(prompts, padding=True, padding_side='left', return_tensors="pt").to(model.device)
        generated = model.generate(**encoded,
                                   num_beams=num_beams, do_sample=True,
                                   temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens)
        decoded = tokenizer.batch_decode(generated)
    # Special tokens are kept by batch_decode; the extractor relies on them.
    return [extract_translation_Falcon3Mamba(text) for text in decoded]
    
def translate_batched_Falcon3Mamba(inputs, model, tokenizer, batch_size, target_language=None):
    """
    Translate `inputs` with Falcon3-Mamba, batch_size conversations at a time.

    For 16GB VRAM, use
        batch_size=4 with Falcon Mamba 7B (8 bits quantization),
        batch_size=4 with Falcon Mamba 7B (4 bits quantization),

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size conversations, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_Falcon3Mamba(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

def extract_translation_Falcon3(translated_prompt):
    """
    Pull the assistant's answer out of a decoded Falcon 3 conversation.

    Args:
        translated_prompt: decoded text (special tokens kept) containing the
            prompt followed by the generated answer.

    Returns:
        The answer with chat markers, padding and leading punctuation or
        whitespace removed. Leading letters and digits of any script are kept.
    """
    answerpadded = translated_prompt.split("\n<|assistant|>\n")[-1]
    # Cut at end-of-text BEFORE dealing with padding: an answer followed by
    # "<|endoftext|><|pad|>..." would otherwise collapse to the empty string
    # when the last "<|pad|>"-separated piece is taken.
    answer = answerpadded.split("<|endoftext|>")[0]
    # Whatever "<|pad|>" remains now sits in front of the text.
    answer = answer.split("<|pad|>")[-1]
    # Strip leading non-word characters (and underscores). \W is Unicode-aware for
    # str patterns, so Cyrillic, Chinese or accented first letters are preserved.
    translation_only = re.sub(r"^[\W_]*", "", answer)
    return translation_only.replace("assistant|>\n", "")

def translate_list_of_str_Falcon3(list_str, tokenizer, model, target_language=None):
    """
    Translate one batch of chat conversations with Falcon 3.

    list_str holds one message list per sample; it is rendered with the
    tokenizer's chat template, left-padded, and moved to the model's device.
    target_language is not used by this helper.
    Returns a list with one extracted translation per conversation.
    """
    with torch.no_grad():
        prompts = tokenizer.apply_chat_template(list_str, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(prompts, padding=True, padding_side='left', return_tensors="pt").to(model.device)
        generated = model.generate(**encoded,
                                   num_beams=num_beams, do_sample=True,
                                   temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens)
        decoded = tokenizer.batch_decode(generated)
    # Special tokens are kept by batch_decode; the extractor relies on them.
    return [extract_translation_Falcon3(text) for text in decoded]
    
def translate_batched_Falcon3(inputs, model, tokenizer, batch_size, target_language=None):
    """
    Translate `inputs` with Falcon 3, batch_size conversations at a time.

    For 16GB VRAM, use
        batch_size=8 with Falcon 7B (8 bits quantization),
        batch_size=4 with Falcon 3B,
        batch_size=12 with Falcon 1B

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size conversations, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_Falcon3(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   Qwen 2.5

def get_input_targets_Qwen2_5(dataset, source_lang, target_lang):
    """
    Build the Qwen 2.5 chat conversations for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, one
        [system, user] message list per sentence, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled in the prompt).
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # User message layout: the instruction immediately followed by the source sentence.
    inputs = [
        [{"role": "system", "content": "You are a translator, you output only the translation in the desired language."},
         {"role": "user",
          "content": f"Translate from {source_lang_name} to {target_lang_name}:"
          + f"{source}"
         }] for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def extract_translation_Qwen2_5(translated_prompt):
    """
    Return the assistant's answer from a decoded Qwen 2.5 conversation: the text
    after the last assistant header, cut at the first "<|im_end|>", with any
    "<|endoftext|>" markers removed.
    """
    after_header = translated_prompt.rpartition("\n<|im_start|>assistant\n")[2]
    reply = after_header.partition("<|im_end|>")[0]
    return reply.replace("<|endoftext|>", "")

def translate_list_of_str_Qwen2_5(list_str, tokenizer, model, target_language=None):
    """
    Translate one batch of chat conversations with Qwen 2.5.

    list_str holds one message list per sample; it is rendered with the
    tokenizer's chat template, left-padded, and moved to the model's device.
    target_language is not used by this helper.
    Returns a list with one extracted translation per conversation.
    """
    with torch.no_grad():
        prompts = tokenizer.apply_chat_template(list_str, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(prompts, padding=True, padding_side='left', return_tensors="pt").to(model.device)
        generated = model.generate(**encoded,
                                   num_beams=num_beams, do_sample=True,
                                   temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens)
        decoded = tokenizer.batch_decode(generated)
    # Special tokens are kept by batch_decode; the extractor relies on them.
    return [extract_translation_Qwen2_5(text) for text in decoded]

def translate_batched_Qwen2_5(inputs, model, tokenizer, batch_size, target_language=None):
    """
    Translate `inputs` with Qwen 2.5, batch_size conversations at a time.

    For 16GB VRAM, use batch_size=100 (up to)

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size conversations, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_Qwen2_5(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   Mistral


def get_input_targets_Mistral(dataset, source_lang, target_lang):
    """
    Build the Mistral chat conversations for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, one
        [system, user] message list per sentence, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled in the prompt).
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # User message layout: the instruction immediately followed by the source sentence.
    inputs = [
        [{"role": "system", "content": "You are a translator, you output only the translation in the desired language."},
         {"role": "user",
          "content": f"Translate from {source_lang_name} to {target_lang_name}:"
          + f"{source}"
         }] for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def extract_translation_Mistral(translated_prompt):
    """
    Return the text generated by Mistral: what follows the last "[/INST] "
    marker, cut at the first "</s>".
    """
    after_instruction = translated_prompt.rpartition("[/INST] ")[2]
    return after_instruction.partition("</s>")[0]

def translate_list_of_str_Mistral(list_str, tokenizer, model, target_language=None):
    """
    Translate one batch of chat conversations with Mistral.

    list_str holds one message list per sample; it is rendered with the
    tokenizer's chat template, left-padded, and moved to the model's device.
    target_language is not used by this helper.
    Returns a list with one extracted translation per conversation.
    """
    with torch.no_grad():
        prompts = tokenizer.apply_chat_template(list_str, tokenize=False, add_generation_prompt=True)
        encoded = tokenizer(prompts, padding=True, padding_side='left', return_tensors="pt").to(model.device)
        generated = model.generate(**encoded,
                                   num_beams=num_beams, do_sample=True,
                                   temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens)
        decoded = tokenizer.batch_decode(generated)
    # Special tokens are kept by batch_decode; the extractor relies on them.
    return [extract_translation_Mistral(text) for text in decoded]

def translate_batched_Mistral(inputs, model, tokenizer, batch_size, target_language=None):
    """
    Translate `inputs` with Mistral, batch_size conversations at a time.

    For 16GB VRAM, use batch_size=2 (up to)

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size conversations, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_Mistral(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   BayLing

def get_input_targets_BayLing(dataset, source_lang, target_lang):
    """
    Build the BayLing prompts for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, the prompts built
        from them, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled); translate_list_of_str_BayLing
    # strips the prompt with the very same names, so both tables must stay in sync.
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # Prompt layout: "Translate from Chinese to English:\nChinese: 我爱机器翻译。 \nEnglish:"
    inputs = [(
        f"Translate from {source_lang_name} to {target_lang_name}:"
        + f"\n{source_lang_name}: {source} \n{target_lang_name}:")
        for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def translate_list_of_str_BayLing(list_str, tokenizer, model, target_language):
    """
    Translate one batch of BayLing prompts and return only the generated translations.

    Args:
        list_str: list of prompts as built by get_input_targets_BayLing.
        tokenizer: tokenizer of the model.
        model: model exposing `generate`.
        target_language: two-letter target language code, used to locate the
            "<Language>:" marker that ends the prompt.

    Returns:
        A list of str, one per prompt.

    Raises:
        KeyError: if target_language is not a key of language_name.
        IndexError: if a decoded output contains the "<Language>:" marker fewer
            than twice.
    """
    # Must use the same spelling as get_input_targets_BayLing ("Icelandic").
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    tgt_language_name = language_name[target_language]
    with torch.no_grad():
        inputs = tokenizer(list_str, return_tensors="pt", padding=True)
        translated = model.generate(inputs["input_ids"].to(device),
                                    # Forward the padding mask so padded positions are not attended to.
                                    attention_mask=inputs["attention_mask"].to(device),
                                    num_beams=num_beams, max_new_tokens=max_new_tokens, do_sample=True,
                                    temperature=temperature, top_p=top_p
                                    ).cpu()
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        # The marker occurs twice in the prompt (instruction line and final cue);
        # piece 2 is what follows the final cue, i.e. the generated text.
        translated_text = [t.split(f"{tgt_language_name}:")[2] for t in translated_text] # Remove prompt
    return translated_text

def translate_batched_BayLing(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` with BayLing, batch_size prompts at a time.

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size prompts, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_BayLing(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   BLOOM & BLOOMZ

def get_input_targets_BLOOM(dataset, source_lang, target_lang):
    """
    Build the BLOOM / BLOOMZ prompts for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, the prompts built
        from them, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled); translate_list_of_str_BLOOM
    # strips the prompt with the very same names, so both tables must stay in sync.
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # Prompt layout: "Translate from Chinese to English:\nChinese: 我爱机器翻译。 \nEnglish:"
    inputs = [(
        f"Translate from {source_lang_name} to {target_lang_name}:"
        + f"\n{source_lang_name}: {source} \n{target_lang_name}:")
        for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def translate_list_of_str_BLOOM(list_str, tokenizer, model, target_language):
    """
    Translate one batch of BLOOM prompts and return only the generated translations.

    Args:
        list_str: list of prompts as built by get_input_targets_BLOOM.
        tokenizer: tokenizer of the model.
        model: model exposing `generate`.
        target_language: two-letter target language code, used to locate the
            "<Language>:" marker that ends the prompt.

    Returns:
        A list of str, one per prompt.

    Raises:
        KeyError: if target_language is not a key of language_name.
        IndexError: if a decoded output contains the "<Language>:" marker fewer
            than twice.
    """
    # Must use the same spelling as get_input_targets_BLOOM ("Icelandic").
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    tgt_language_name = language_name[target_language]
    with torch.no_grad():
        inputs = tokenizer(list_str, return_tensors="pt", padding=True)
        translated = model.generate(inputs["input_ids"].to(device),
                                    # Forward the padding mask so padded positions are not attended to.
                                    attention_mask=inputs["attention_mask"].to(device),
                                    num_beams=num_beams, max_new_tokens=max_new_tokens, do_sample=True,
                                    temperature=temperature, top_p=top_p
                                    ).cpu()
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        # The marker occurs twice in the prompt (instruction line and final cue);
        # piece 2 is what follows the final cue, i.e. the generated text.
        translated_text = [t.split(f"{tgt_language_name}:")[2] for t in translated_text] # Remove prompt
    return translated_text

def translate_batched_BLOOM(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` with BLOOM / BLOOMZ, batch_size prompts at a time.

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size prompts, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_BLOOM(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   OPT & OPT Instruct

def get_input_targets_OPT(dataset, source_lang, target_lang):
    """
    Build the OPT / OPT Instruct prompts for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, the prompts built
        from them, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled); translate_list_of_str_OPT
    # strips the prompt with the very same names, so both tables must stay in sync.
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    # Prompt layout: "Translate from Chinese to English:\nChinese: 我爱机器翻译。 \nEnglish:"
    inputs = [(
        f"Translate from {source_lang_name} to {target_lang_name}:"
        + f"\n{source_lang_name}: {source} \n{target_lang_name}:")
        for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def translate_list_of_str_OPT(list_str, tokenizer, model, target_language):
    """
    Translate one batch of OPT prompts and return only the generated translations.

    Args:
        list_str: list of prompts as built by get_input_targets_OPT.
        tokenizer: tokenizer of the model.
        model: model exposing `generate`.
        target_language: two-letter target language code, used to locate the
            "<Language>:" marker that ends the prompt.

    Returns:
        A list of str, one per prompt, cut at the first "\\n[END]" if present.

    Raises:
        KeyError: if target_language is not a key of language_name.
        IndexError: if a decoded output contains the "<Language>:" marker fewer
            than twice.
    """
    # Must use the same spelling as get_input_targets_OPT ("Icelandic").
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    tgt_language_name = language_name[target_language]
    with torch.no_grad():
        inputs = tokenizer(list_str, return_tensors="pt", padding=True)
        translated = model.generate(inputs["input_ids"].to(device),
                                    # Forward the padding mask so padded positions are not attended to.
                                    attention_mask=inputs["attention_mask"].to(device),
                                    num_beams=num_beams, max_new_tokens=max_new_tokens, do_sample=True,
                                    temperature=temperature, top_p=top_p
                                    ).cpu()
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        # The marker occurs twice in the prompt (instruction line and final cue);
        # piece 2 is what follows the final cue, i.e. the generated text.
        translated_text = [t.split(f"{tgt_language_name}:")[2] for t in translated_text] # Remove prompt
        # Drop everything from the "[END]" line onwards.
        translated_text = [t.split("\n[END]")[0] for t in translated_text]
    return translated_text

def translate_batched_OPT(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` with OPT / OPT Instruct, batch_size prompts at a time.

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size prompts, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_OPT(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

#################################   MPT

def get_input_targets_MPT(dataset, source_lang, target_lang):
    """
    Build the MPT instruct prompts for one translation direction.

    Args:
        dataset: mapping whose "<source_lang>-<target_lang>" entry is a sequence
            of examples indexable by language code.
        source_lang: two-letter code of the language to translate from.
        target_lang: two-letter code of the language to translate into.

    Returns:
        (sources, inputs, targets): the raw source sentences, the prompts built
        from them, and the reference translations.

    Raises:
        KeyError: if a language code is not a key of language_name.
    """
    # "is" is spelled "Icelandic" (it used to be misspelled in the prompt).
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
    source_lang_name = language_name[source_lang]
    target_lang_name = language_name[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    # Use the instruct template
    sources = [example[source_lang] for example in examples]
    inputs = [(
        "Below is an instruction that describes a task. Write a response that appropriately completes the request. \n### Instruction:"
        + f"Translate from {source_lang_name} to {target_lang_name}: {source}"
        + "\n### Response:")
        for source in sources]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets

def translate_list_of_str_MPT(list_str, tokenizer, model, target_language=None):
    """
    Translate one batch of MPT instruct prompts and return only the responses.

    Args:
        list_str: list of prompts as built by get_input_targets_MPT.
        tokenizer: tokenizer of the model.
        model: model exposing `generate`.
        target_language: not used by this helper.

    Returns:
        A list of str: for each prompt, the text after the last "\\n### Response:".
    """
    with torch.no_grad():
        inputs = tokenizer(list_str, return_tensors="pt", padding=True)
        translated = model.generate(inputs["input_ids"].to(device),
                                    # Forward the padding mask so padded positions are not attended to.
                                    attention_mask=inputs["attention_mask"].to(device),
                                    num_beams=num_beams, max_new_tokens=max_new_tokens, do_sample=True,
                                    temperature=temperature, top_p=top_p
                                    ).cpu()
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        translated_text = [t.split("\n### Response:")[-1] for t in translated_text] # Remove prompt
    return translated_text

def translate_batched_MPT(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` with MPT, batch_size prompts at a time.

    Returns a list with one translation per input, in input order. The last
    batch may hold fewer than batch_size prompts, so no input is dropped.
    """
    preds = []
    # Iterate over batch start offsets so the trailing partial batch is translated too.
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_MPT(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

In [None]:
def reduce_flores_to_some_languages(ds_flores, directions: list[str]):
    """
    Keep only the FLORES rows whose language appears in `directions`.

    Each direction is read as two two-letter codes at positions 0-1 and 3-4
    (e.g. "en-de"). Chinese rows are kept only for glottocode "beij1234".
    Returns a dataset built from the retained rows.
    """
    print("Extracting all languages in directions from FLORES...")

    # Unique language codes, in order of first appearance.
    list_languages = []
    for direction in directions:
        for code in (direction[0:2], direction[3:5]):
            if code not in list_languages:
                list_languages.append(code)

    language_to_iso = {"en": "eng", "de": "deu", "cs": "ces", "is": "isl", "zh": "cmn", "ru": "rus"}

    def row_matches(elem, lang):
        # ISO code must match; for Chinese the glottocode is checked as well.
        if elem["iso_639_3"] != language_to_iso[lang]:
            return False
        return lang != "zh" or elem["glottocode"] == "beij1234"

    ds_list = [elem for elem in ds_flores for lang in list_languages if row_matches(elem, lang)]
    return Dataset.from_list(ds_list)

def transform_to_WNT_style(ds_flores, lang, lang_start="en"):
    """
    Pair the sentences of two languages of a FLORES dataset (or a fraction of it)
    into a dataset formatted as WNT23.

    The result has a single "<lang_start>-<lang>" entry holding one
    {lang_start: sentence, lang: sentence} dict per sentence, paired by order
    of appearance. Chinese rows are used only for glottocode "beij1234".
    Returns a dataset
    """
    language_to_iso = {"en": "eng", "de": "deu", "cs": "ces", "is": "isl", "zh": "cmn", "ru": "rus"}
    list_sentence_lang, list_sentence_lang_start = [], []
    for elem in ds_flores:
        if elem["iso_639_3"] == language_to_iso[lang]:
            # Target-side row; Chinese additionally needs the right glottocode.
            if lang != "zh" or elem["glottocode"] == "beij1234":
                list_sentence_lang.append(elem["text"])
        elif elem["iso_639_3"] == language_to_iso[lang_start]:
            # Start-side row, same Chinese restriction.
            if lang_start != "zh" or elem["glottocode"] == "beij1234":
                list_sentence_lang_start.append(elem["text"])
    # Both sides must line up one-to-one before they can be paired.
    assert len(list_sentence_lang) == len(list_sentence_lang_start)
    final_text_list = [
        {f"{lang_start}": start_sentence, f"{lang}": lang_sentence}
        for start_sentence, lang_sentence in zip(list_sentence_lang_start, list_sentence_lang)
    ]
    return Dataset.from_dict({f"{lang_start}-{lang}": final_text_list})

In [None]:
def load_model_benchmark(model_name: str, model_size: Union[str, None] = None) -> tuple:
    """
    Load model and tokenizer for the models considered in the benchmark
    Returns (tokenizer, model)
    """
    if model_name == "alma":
        tokenizer = transformers.LlamaTokenizer.from_pretrained("haoranxu/ALMA-7B", padding_side='left')
        Q_config = BitsAndBytesConfig(load_in_8bit=True) 
        model = transformers.AutoModelForCausalLM.from_pretrained("haoranxu/ALMA-7B", torch_dtype="auto", device_map=device, quantization_config=Q_config)
        
    elif model_name == "nllb":
        tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
        model = transformers.AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", torch_dtype="auto", device_map=device)

    elif model_name == "llama3":
        from credentials import hf_token
        huggingface_hub.login(token = hf_token)
        if model_size=="1B" or model_size=="3B":
            tokenizer = transformers.AutoTokenizer.from_pretrained(f"meta-llama/Llama-3.2-{model_size}-Instruct")
            model = transformers.AutoModelForCausalLM.from_pretrained(f"meta-llama/Llama-3.2-{model_size}-Instruct", torch_dtype="auto", device_map=device)
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
            nQ_cofig = BitsAndBytesConfig(load_in_8bit=True)
            model = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
        tokenizer.pad_token = tokenizer.eos_token
        model.generation_config.pad_token_id = tokenizer.pad_token_id
    
    elif model_name == "llama3-NI-4bit":
        from credentials import hf_token
        huggingface_hub.login(token = hf_token)
        tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
        nQ_cofig = BitsAndBytesConfig(load_in_4bit=True,
                                      bnb_4bit_quant_type="nf4",
                                      bnb_4bit_compute_dtype=getattr(torch, "float16"),
                                      bnb_4bit_use_double_quant=False)
        model = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B", torch_dtype="auto", device_map=device, quantization_config=Q_config)
        tokenizer.pad_token = tokenizer.eos_token
        model.generation_config.pad_token_id = tokenizer.pad_token_id
        
    elif model_name == "falcon3-mamba":
        tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/Falcon3-Mamba-7B-Instruct")
        Q_config = BitsAndBytesConfig(load_in_8bit=True)
        model = transformers.AutoModelForCausalLM.from_pretrained("tiiuae/Falcon3-Mamba-7B-Instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
    
    elif model_name == "falcon3":
        if model_size=="1B" or model_size=="3B":
            tokenizer = transformers.AutoTokenizer.from_pretrained(f"tiiuae/Falcon3-{model_size}-Instruct")
            model = transformers.AutoModelForCausalLM.from_pretrained(f"tiiuae/Falcon3-{model_size}-Instruct", torch_dtype="auto", device_map=device)
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Instruct")
            Q_config = BitsAndBytesConfig(load_in_8bit=True)
            model = transformers.AutoModelForCausalLM.from_pretrained("tiiuae/Falcon3-7B-Instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
        model.generation_config.pad_token_id = tokenizer.pad_token_id
        
    elif model_name == "qwen2.5":
        if model_size=="0.5B" or model_size=="1.5B" or model_size=="3B":
            tokenizer = transformers.AutoTokenizer.from_pretrained(f"Qwen/Qwen2.5-{model_size}-Instruct")
            model = transformers.AutoModelForCausalLM.from_pretrained(f"Qwen/Qwen2.5-{model_size}-Instruct", torch_dtype="auto", device_map=device)
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
            Q_config = BitsAndBytesConfig(load_in_8bit=True)
            model = transformers.AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B-Instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
    
    elif model_name == "mistral":
        from credentials import hf_token
        huggingface_hub.login(token = hf_token)
        tokenizer = transformers.AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
        tokenizer.pad_token = tokenizer.eos_token
        Q_config = BitsAndBytesConfig(load_in_8bit=True)
        model = transformers.AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", torch_dtype="auto", device_map=device, quantization_config=Q_config)
        model.generation_config.pad_token_id = tokenizer.pad_token_id
    
    elif model_name == "bayling":
        tokenizer = transformers.AutoTokenizer.from_pretrained("ICTNLP/bayling-2-7b")
        tokenizer.pad_token = tokenizer.eos_token
        Q_config = BitsAndBytesConfig(load_in_8bit=True)
        model = transformers.AutoModelForCausalLM.from_pretrained("ICTNLP/bayling-2-7b", torch_dtype="auto", device_map=device, quantization_config=Q_config)
    
    elif model_name == "bloom":
        if model_size=="0.5B":
            tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-560m")
            model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", torch_dtype=torch.bfloat16, device_map=device)
        elif model_size=="1B":
            tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-1b7")
            model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b7", torch_dtype="auto", device_map=device)
        elif model_size=="3B":
            tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-3b")
            model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b", torch_dtype="auto", device_map=device)
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-7b1")
            Q_config = BitsAndBytesConfig(load_in_8bit=True)
            model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype="auto", device_map=device, quantization_config=Q_config)

    elif model_name == "bloomz":
        if model_size=="1B":
            tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloomz-1b7")
            model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloomz-1b7", torch_dtype="auto", device_map=device)
        elif model_size=="3B":
            tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloomz-3b")
            model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloomz-3b", torch_dtype="auto", device_map=device)
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloomz-7b1")
            Q_config = BitsAndBytesConfig(load_in_8bit=True)
            model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloomz-7b1", torch_dtype="auto", device_map=device, quantization_config=Q_config)
    
    elif model_name == "opt":
        if model_size=="0.1B":
            tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m")
            model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto", device_map=device)
        elif model_size=="0.3B":
            tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-350m")
            model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", device_map=device)
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-6.7b")
            Q_config = BitsAndBytesConfig(load_in_8bit=True)
            model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b", torch_dtype="auto", device_map=device, quantization_config=Q_config)
    
    elif model_name == "opt-instruct":
        tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-iml-1.3b")
        model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-iml-1.3b", torch_dtype="auto", device_map=device)
    
    elif model_name == "mpt":
        tokenizer = transformers.AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
        tokenizer.pad_token = tokenizer.eos_token
        Q_config = BitsAndBytesConfig(load_in_8bit=True)
        model = transformers.AutoModelForCausalLM.from_pretrained("mosaicml/mpt-7b-instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
        model.generation_config.pad_token_id = tokenizer.pad_token_id
        
    return tokenizer, model

def get_support_fn_benchmark(model_name: str) -> tuple:
    """
    Return the two helper functions associated with a benchmark model: the
    function generating rightly formatted inputs (and targets) from a dataset,
    and the function using the translation model in inference.

    Inputs:
        - model_name: str, one of "alma", "nllb", "llama3", "llama3-NI-4bit",
          "falcon3-mamba", "falcon3", "qwen2.5", "mistral", "bayling", "bloom",
          "bloomz", "opt", "opt-instruct", "mpt"

    Returns:
        - (get_input_targets_fn, tslt_fn)

    Raises:
        - ValueError if model_name is not one of the names above (this used to
          surface as an UnboundLocalError on the return statement)
    """
    # Each branch returns directly, so only the helpers of the requested model
    # are looked up when the function is called.
    if model_name == "alma":
        return get_input_targets_ALMA, translate_batched_ALMA

    if model_name == "nllb":
        return get_input_targets_NLLB, translate_batched_NLLB

    if model_name == "llama3":
        return get_input_targets_Llama3, translate_batched_Llama3

    if model_name == "llama3-NI-4bit":
        return get_input_targets_Llama3NI4bit, translate_batched_Llama3NI4bit

    # Both Falcon3 variants share the same input formatting
    if model_name == "falcon3-mamba":
        return get_input_targets_Falcon3, translate_batched_Falcon3Mamba

    if model_name == "falcon3":
        return get_input_targets_Falcon3, translate_batched_Falcon3

    if model_name == "qwen2.5":
        return get_input_targets_Qwen2_5, translate_batched_Qwen2_5

    if model_name == "mistral":
        return get_input_targets_Mistral, translate_batched_Mistral

    if model_name == "bayling":
        return get_input_targets_BayLing, translate_batched_BayLing

    if model_name == "bloom" or model_name == "bloomz":
        return get_input_targets_BLOOM, translate_batched_BLOOM

    if model_name == "opt" or model_name == "opt-instruct":
        return get_input_targets_OPT, translate_batched_OPT

    if model_name == "mpt":
        return get_input_targets_MPT, translate_batched_MPT

    raise ValueError(f"Unknown benchmark model name: {model_name!r}")

def get_inp_tgt_lang(direction: str) -> tuple[str, str]:
    """
    Split a direction string "xx-yy" into (source language, target language).
    The codes are read by position: characters 0-1 and characters 3-4.
    """
    source_language = direction[:2]
    target_language = direction[3:5]
    return source_language, target_language

def reduce_dataset(inputs: list[str], sources: list[str], targets: list[str], final_nb: int) -> tuple[list[str], list[str], list[str]]:
    """
    Selects randomly the samples of the evaluation corpus.

    The same indices are applied to the three lists, which are returned in the
    order they were given, so the three stay aligned whatever the order in
    which the caller passes them.

    Inputs:
        - inputs, sources, targets: three aligned lists of str
        - final_nb: int, number of indices to draw

    Returns:
        - the three reduced lists, each of length final_nb

    Notes:
        - The draw uses a fixed seed (42), so every call with the same corpus size
          and final_nb selects the same samples.
        - np.random.choice draws WITH replacement by default, so a sample can be
          selected several times. This is kept as is: changing it would change the
          selected samples and desynchronise already generated translations.
    """
    # A local RandomState(42) yields the same draws as np.random.seed(42) followed by
    # np.random.choice, without resetting the global NumPy random state of the kernel.
    rng = np.random.RandomState(42)
    idx = rng.choice(np.arange(len(inputs)), final_nb)
    return [inputs[i] for i in idx], [sources[i] for i in idx], [targets[i] for i in idx]

def get_translations_filename(direction: str, dataset_name: str, model_name: str, model_size: Union[str, None], reduce_size: Union[int, None], translation_folder: Union[str, None] = None) -> str:
    """
    Build the path of the pkl file holding the generated translations.

    The file lives in ./generated_translations/<translation_folder>/ (folder
    "evaluations" when translation_folder is None). The model size is appended
    to the model name, after a dash, only when it is not None.
    """
    size_suffix = "" if model_size is None else "-" + model_size
    if translation_folder is None:
        translation_folder = "evaluations"
    basename = f"{dataset_name}_{model_name}{size_suffix}_{direction}_red-{reduce_size}.pkl"
    return f"./generated_translations/{translation_folder}/{basename}"

def get_eval_filename(direction: str, dataset_name: str, model_name: str, model_size: Union[str, None], reduce_size: Union[int, None]) -> str:
    """
    Build the path of the pkl file (in ./evaluations/) holding the computed metrics.
    The model size is appended to the model name, after a dash, only when it is not None.
    """
    if model_size is None:
        full_model = f"{model_name}"
    else:
        full_model = f"{model_name}" + "-" + model_size
    return f"./evaluations/raw_{dataset_name}_{full_model}_{direction}_red-{reduce_size}.pkl"


# Main function to generate translations
def generate_translation_different_directions(directions: list[str],
                                              dataset_name: str,
                                              model_name: str,
                                              batch_size: int,
                                              reduce_size: Union[int, None] = None,
                                              model_size: Union[str, None] = None,
                                              load_model_and_tokenizer_fn = load_model_benchmark,
                                              get_input_targets_fn = None,
                                              tslt_fn = None,
                                              translation_folder = None) -> None:
    """
    Translate the evaluation corpus with one model for several directions and
    save the translations of each direction in a pkl file.

    Inputs:
        - directions: list of strings
        - dataset name: str, either "flores" or "wnt23"
        - model_name: str
        - batch_size: int (advised 1 to avoid padding - or make sure your tokenizer is correctly parametrized)
        - reduce_size: int, the number of random samples to use. Samples are sampled using seed to have same
          samples for each models. If reduce_size=None, take all the dataset samples.
        - model_size: str or None

        - load_model_and_tokenizer_fn: a function returning a tuple
          SIGNATURE : load_model_and_tokenizer_fn(model_name: str, model_size: Union[str, None]) -> tokenizer, model

        - get_input_targets_fn: a function returning a tuple of three lists of str given the dataset and the source and target language:
          SIGNATURE : get_input_targets_fn(ds: HF_dataset, input_language: str, target_language: str) -> sources, inputs, targets: list[str], list[str], list[str]
            sources are the initial sentences (used in COMET metric)
            inputs are the complete prompts to the model (only one string, apply the instruct template in get_input_targets_fn)
            targets are the target translations
          If None, the benchmark function associated with model_name is used.

        - tslt_fn: a function prompting the model and generating a list of translations given a list of prompts, the tokenizer and the model.
          It must include a batch_size argument (the batched processing is not necessary to implement in the function). Include also the
          target_language as argument for consistency with other functions.
          SIGNATURE : tslt_fn(inputs: list[str], model: HF_model, tokenizer: HF_tokenizer, batch_size: int, target_language: Union[str, None]) -> translation_pred: list[str]
          If None, the benchmark function associated with model_name is used.

        - translation_folder: str or None, sub-folder of ./generated_translations where the
          pkl files are written ("evaluations" if None)

    Raises:
        - ValueError if dataset_name is neither "flores" nor "wnt23"
    """
    # Fail before the (expensive) model loading rather than on an undefined dataset later
    if dataset_name not in ("flores", "wnt23"):
        raise ValueError(f"Unknown dataset name: {dataset_name!r} (expected 'flores' or 'wnt23')")

    # Loading full flores (if necessary)
    if dataset_name == "flores":
        from credentials import hf_token
        huggingface_hub.login(token = hf_token)
        ds_flores = load_dataset("openlanguagedata/flores_plus")["devtest"]

    # Loading corresponding model
    print("Loading model...")
    tokenizer, model = load_model_and_tokenizer_fn(model_name, model_size)

    # Each helper is resolved independently: a helper given by the caller is never
    # overwritten, and a missing one falls back to the benchmark helper.
    if get_input_targets_fn is None or tslt_fn is None:
        default_input_targets_fn, default_tslt_fn = get_support_fn_benchmark(model_name)
        if get_input_targets_fn is None:
            get_input_targets_fn = default_input_targets_fn
        if tslt_fn is None:
            tslt_fn = default_tslt_fn

    translation_folder = "evaluations" if translation_folder is None else translation_folder

    for direction in directions:
        print(f"Translating {direction} with model {model_name}"
              +(f"-{model_size}" if model_size is not None else "")
              +f" for dataset {dataset_name}...")
        input_language, target_language = get_inp_tgt_lang(direction)

        # Getting the right split corresponding to the translation direction
        if dataset_name == "flores":
            ds = transform_to_WNT_style(ds_flores, lang=target_language, lang_start=input_language)
        else:  # "wnt23"
            if direction != "cs-en":
                ds = load_dataset("haoranxu/WMT23-Test", direction)["test"]
            else:
                # cs-en is built from the en-cs split
                ds = load_dataset("haoranxu/WMT23-Test", "en-cs")["test"]
                ds = Dataset.from_dict({f"cs-en": ds["en-cs"][::-1]}) # Reverse list to avoid having same sentences (if reduce_size not None)

        # Extracting input & targets
        sources, inputs, targets = get_input_targets_fn(ds, input_language, target_language)
        print(f"Total number of samples: {len(sources)}" + ("" if reduce_size is None else f"; reduced to {reduce_size} (numpy seed = 42)"))
        if reduce_size is not None:
            # reduce_dataset returns the lists in the order they are given
            sources, inputs, targets = reduce_dataset(sources, inputs, targets, reduce_size)
        translation_pred = tslt_fn(inputs, model, tokenizer, batch_size, target_language)

        # Saving translations
        os.makedirs(f"./generated_translations/{translation_folder}", exist_ok=True)
        translations_filename = get_translations_filename(direction, dataset_name, model_name, model_size, reduce_size, translation_folder)

        with open(translations_filename, "wb") as f:
            pickle.dump(translation_pred, f, pickle.HIGHEST_PROTOCOL)

    # De-load model from GPU to enable calling this function with another model without restarting kernel
    model.cpu()
    del model, tokenizer

# Wrappers for several models and several datasets
def generate_translation_several_models(directions, dataset_name, model_names, model_sizes, batch_size, reduce_size,
                                        load_model_and_tokenizer_fn = load_model_benchmark,
                                        get_input_targets_fn = None,
                                        tslt_fn = None,
                                        translation_folder = None) -> None:
    """
    Generate the translations of one dataset with several models, one after the other.

    model_names and model_sizes are read in pairs (the pairing stops at the
    shortest of the two). All the other arguments are forwarded unchanged to
    generate_translation_different_directions.
    """
    # Arguments shared by every model
    shared_kwargs = {
        "directions": directions,
        "dataset_name": dataset_name,
        "batch_size": batch_size,
        "reduce_size": reduce_size,
        "load_model_and_tokenizer_fn": load_model_and_tokenizer_fn,
        "get_input_targets_fn": get_input_targets_fn,
        "tslt_fn": tslt_fn,
        "translation_folder": translation_folder,
    }
    for current_name, current_size in zip(model_names, model_sizes):
        generate_translation_different_directions(model_name=current_name,
                                                  model_size=current_size,
                                                  **shared_kwargs)
        
def generate_translation_several_datasets(directions, dataset_names, model_names, model_sizes, batch_size, reduce_size,
                                          load_model_and_tokenizer_fn = load_model_benchmark,
                                          get_input_targets_fn = None,
                                          tslt_fn = None,
                                          translation_folder = None) -> None:
    """
    Generate the translations of several datasets with several models.

    Calls generate_translation_several_models once per dataset name, forwarding
    all the other arguments unchanged.
    """
    # Optional hooks forwarded as they are to the per-dataset wrapper
    forwarded_hooks = {
        "load_model_and_tokenizer_fn": load_model_and_tokenizer_fn,
        "get_input_targets_fn": get_input_targets_fn,
        "tslt_fn": tslt_fn,
        "translation_folder": translation_folder,
    }
    for current_dataset in dataset_names:
        generate_translation_several_models(directions, current_dataset, model_names, model_sizes,
                                            batch_size, reduce_size, **forwarded_hooks)

In [None]:
# Functions to compute metrics given:
#   the metric,
#   the list of source sentences,
#   the list of target translations,
#   the list of translated sentences,
#   the target language

# Each metric is computed sample by sample. The output is a dictionary containing
#   the full list of scores
#   the mean score
#   the standard deviation
#   the sample standard deviation (ddof=1), stored under "std_unbias_score"

def eval_rouge(metric, sources, targets, translation_infered, target_language):
    """
    Compute the ROUGE scores of the translations, sample by sample.

    Inputs:
        - metric: object whose compute(predictions=..., references=..., use_aggregator=False)
          returns, for each of "rouge1", "rouge2", "rougeL" and "rougeLsum", the per-sample scores
        - sources, target_language: unused, kept to share the signature of the other eval functions
        - targets: the target translations
        - translation_infered: the translations to evaluate

    Returns:
        - a dictionary with one entry per ROUGE variant, each containing "scores" (the
          per-sample scores), "mean_score", "std_score" and "std_unbias_score" (ddof=1)
    """
    out_rouge = metric.compute(predictions=translation_infered,
                               references=targets,
                               use_aggregator=False)
    # For further statistical treatment
    results_rouge = {}
    for key in ["rouge1", "rouge2", "rougeL", "rougeLsum"]:
        per_sample = out_rouge[key]
        results_rouge[key] = {
            # Per-sample scores are kept, like in the other eval functions
            "scores": list(per_sample),
            "mean_score": np.mean(per_sample).item(),
            "std_score": np.std(per_sample).item(),
            "std_unbias_score": np.std(per_sample, ddof=1).item(),
        }
    return results_rouge

def eval_bleu(metric, sources, targets, translation_infered, target_language):
    """
    Compute the BLEU score of each translation, sample by sample.

    Inputs:
        - metric: object whose compute(predictions=..., references=..., tokenizer=...)
          returns a dictionary with the keys "bleu" and "brevity_penalty"
        - sources: unused, kept to share the signature of the other eval functions
        - targets: the target translations
        - translation_infered: the translations to evaluate
        - target_language: "zh" selects TokenizerZh, any other value Tokenizer13a

    Returns:
        - {"bleu": {"scores", "brevity_penalty", "mean_score", "std_score", "std_unbias_score"}}
    """
    # The tokenizer only depends on the target language: build it once instead of once per sample
    tokenizer = TokenizerZh() if target_language=="zh" else Tokenizer13a()

    results_bleu = {"scores": [], "brevity_penalty": []}
    for trans, tgt in zip(translation_infered, targets):
        try:
            bleu_out = metric.compute(predictions=[trans],
                                      references=[[tgt]],
                                      tokenizer=tokenizer)
        except ZeroDivisionError:
            # A sample on which the metric divides by zero is scored 0
            bleu_out = {"bleu": 0., "brevity_penalty": 0.}

        results_bleu["scores"].append(bleu_out["bleu"])
        results_bleu["brevity_penalty"].append(bleu_out["brevity_penalty"])
    # For further statistical treatment
    results_bleu["mean_score"] = np.mean(results_bleu["scores"]).item()
    results_bleu["std_score"] = np.std(results_bleu["scores"]).item()
    results_bleu["std_unbias_score"] = np.std(results_bleu["scores"], ddof=1).item()
    return {"bleu": results_bleu}

def eval_sacrebleu(metric, sources, targets, translation_infered, target_language):
    """
    Compute the SacreBLEU score of each translation, sample by sample.

    metric.compute is called with one prediction and one reference at a time and with
    tokenize="zh" when the target language is "zh", "13a" otherwise. A sample on which
    the metric raises ZeroDivisionError gets a score and a brevity penalty of 0.

    Returns {"sacrebleu": {"scores", "brevity_penalty", "mean_score", "std_score", "std_unbias_score"}}.
    """
    if target_language == "zh":
        tokenize_mode = "zh"
    else:
        tokenize_mode = "13a"

    sample_scores = []
    sample_penalties = []
    for hypothesis, reference in zip(translation_infered, targets):
        try:
            sample_out = metric.compute(predictions=[hypothesis],
                                        references=[[reference]],
                                        tokenize=tokenize_mode)
        except ZeroDivisionError:
            sample_out = {"score": 0., "bp": 0.}
        sample_scores.append(sample_out["score"])
        sample_penalties.append(sample_out["bp"])

    # For further statistical treatment
    summary = {
        "scores": sample_scores,
        "brevity_penalty": sample_penalties,
        "mean_score": np.mean(sample_scores).item(),
        "std_score": np.std(sample_scores).item(),
        "std_unbias_score": np.std(sample_scores, ddof=1).item(),
    }
    return {"sacrebleu": summary}

def eval_chrf_and_chrfplusplus(metric, sources, targets, translation_infered, target_language):
    """
    Compute chrF and chrF++ of each translation, sample by sample, with the same metric object.

    chrF is requested with word_order=0 and eps_smoothing=False, chrF++ with word_order=2
    and eps_smoothing=True. A sample on which the metric raises ZeroDivisionError is scored 0.

    Returns {"chrf": {...}, "chrfplusplus": {...}}, each entry containing "scores",
    "mean_score", "std_score" and "std_unbias_score".
    """
    def score_sample(hypothesis, reference, word_order, eps_smoothing):
        # One metric call for one sample, with the 0 fallback
        try:
            return metric.compute(predictions=[hypothesis],
                                  references=[[reference]],
                                  word_order=word_order,
                                  eps_smoothing=eps_smoothing)
        except ZeroDivisionError:
            return {"score": 0.}

    def summarize(sample_scores):
        # For further statistical treatment
        return {"scores": sample_scores,
                "mean_score": np.mean(sample_scores).item(),
                "std_score": np.std(sample_scores).item(),
                "std_unbias_score": np.std(sample_scores, ddof=1).item()}

    chrf_scores = []
    chrfpp_scores = []
    for hypothesis, reference in zip(translation_infered, targets):
        plain_out = score_sample(hypothesis, reference, 0, False)
        plusplus_out = score_sample(hypothesis, reference, 2, True)
        chrf_scores.append(plain_out['score'])
        chrfpp_scores.append(plusplus_out['score'])

    return {"chrf": summarize(chrf_scores),
            "chrfplusplus": summarize(chrfpp_scores)}

def eval_comet(metric, sources, targets, translation_infered, target_language):
    """
    Compute COMET on the whole corpus in a single metric call (sources, targets and translations).

    The dictionary returned by the metric is completed with "std_score" and
    "std_unbias_score", both computed from its "scores" entry, and returned under the key "comet".
    """
    comet_output = metric.compute(sources=sources,
                                  references=targets,
                                  predictions=translation_infered)
    # For further statistical treatment
    sample_scores = comet_output["scores"]
    spread = np.std(sample_scores).item()
    spread_unbiased = np.std(sample_scores, ddof=1).item()
    comet_output.update(std_score=spread, std_unbias_score=spread_unbiased)
    return {"comet": comet_output}

def eval_bleurt(metric, sources, targets, translation_infered, target_language):
    """
    Compute BLEURT on the whole corpus in a single metric call (targets and translations).

    The dictionary returned by the metric is completed with "mean_score", "std_score"
    and "std_unbias_score", computed from its "scores" entry, and returned under the key "bleurt".
    """
    bleurt_output = metric.compute(references=targets,
                                   predictions=translation_infered)
    # For further statistical treatment
    sample_scores = bleurt_output["scores"]
    statistics = {"mean_score": np.mean(sample_scores).item(),
                  "std_score": np.std(sample_scores).item(),
                  "std_unbias_score": np.std(sample_scores, ddof=1).item()}
    bleurt_output.update(statistics)
    return {"bleurt": bleurt_output}

def eval_bertscore(metric, sources, targets, translation_infered, target_language):
    """
    Compute BERTScore on the whole corpus in a single metric call; the target language is
    passed to the metric as lang.

    The dictionary returned by the metric is completed with "mean_score", "std_score"
    and "std_unbias_score", computed from its "f1" entry, and returned under the key "bertscore".
    """
    bert_output = metric.compute(lang=target_language,
                                 references=targets,
                                 predictions=translation_infered)
    # For further statistical treatment (statistics are taken on the F1 values)
    f1_values = bert_output["f1"]
    statistics = {"mean_score": np.mean(f1_values).item(),
                  "std_score": np.std(f1_values).item(),
                  "std_unbias_score": np.std(f1_values, ddof=1).item()}
    bert_output.update(statistics)
    return {"bertscore": bert_output}

def eval_meteor(metric, sources, targets, translation_infered, target_language):
    """
    Compute the METEOR score of each translation, sample by sample
    (one prediction and one reference per metric call).

    Returns {"meteor": {"scores", "mean_score", "std_score", "std_unbias_score"}}.
    """
    sample_scores = [
        metric.compute(predictions=[hypothesis], references=[reference])["meteor"]
        for hypothesis, reference in zip(translation_infered, targets)
    ]
    # For further statistical treatment
    summary = {"scores": sample_scores}
    summary["mean_score"] = np.mean(sample_scores).item()
    summary["std_score"] = np.std(sample_scores).item()
    summary["std_unbias_score"] = np.std(sample_scores, ddof=1).item()
    return {"meteor": summary}

def get_eval_fn(metric_name):
    """
    Return the evaluation function associated with a metric name.

    Inputs:
        - metric_name: str, one of "rouge", "bleu", "sacrebleu", "chrf", "comet",
          "bleurt", "bertscore", "meteor" ("chrf" computes both chrF and chrF++)

    Raises:
        - ValueError if metric_name is not one of the names above (None used to be
          returned, which only failed later when the result was called)
    """
    if metric_name == "rouge":
        return eval_rouge
    elif metric_name == "bleu":
        return eval_bleu
    elif metric_name == "sacrebleu":
        return eval_sacrebleu
    elif metric_name == "chrf":
        return eval_chrf_and_chrfplusplus
    elif metric_name == "comet":
        return eval_comet
    elif metric_name == "bleurt":
        return eval_bleurt
    elif metric_name == "bertscore":
        return eval_bertscore
    elif metric_name == "meteor":
        return eval_meteor
    raise ValueError(f"Unknown metric name: {metric_name!r}")

def load_metric(metric_name):
    """
    Load a metric with evaluate.load() given its name.

    Inputs:
        - metric_name: str, one of "rouge", "bleu", "sacrebleu", "chrf", "comet",
          "bleurt", "bertscore", "meteor"

    Returns:
        - the object returned by evaluate.load()

    Raises:
        - ValueError if metric_name is not one of the names above (None used to be
          returned silently)
    """
    # Arguments given to evaluate.load() for each metric name
    load_args = {
        "rouge": ("rouge",),
        "bleu": ("bleu",),
        "sacrebleu": ("sacrebleu",),
        "chrf": ("chrf",),
        "comet": ("comet",),
        "bleurt": ("bleurt", "bleurt-large-512"),  # BLEURT is loaded with an explicit checkpoint name
        "bertscore": ("bertscore",),
        "meteor": ("meteor",),
    }
    if metric_name not in load_args:
        raise ValueError(f"Unknown metric name: {metric_name!r}")
    return evaluate.load(*load_args[metric_name])

In [None]:
def eval_one_metric_one_model(metric_name: str, metric, directions: list[str], dataset_name: str, model_name: str, model_size: Union[str, None], reduce_size: Union[int, None],
                              input_and_generate_fn = get_support_fn_benchmark,
                              translation_folder: Union[str, None] = None) -> None:
    """
    Compute the evaluation according to one metric of one model on several directions,
    from the translations previously generated and saved in pkl files.

    Inputs:
        - metric_name: str, a name understood by get_eval_fn
          ("rouge", "bleu", "sacrebleu", "chrf", "comet", "bleurt", "bertscore" or "meteor")
        - metric: the huggingface metric returned by evaluate.load()
        - directions: list of strings
        - dataset_name: str, either "flores" or "wnt23"
        - model_name, model_size: the model whose translations are evaluated
        - reduce_size: int or None. Must be the value used to generate the translations, so
          that the same samples are selected
        - input_and_generate_fn: function returning (get_input_targets_fn, tslt_fn) given the
          model name; refer to the translations generation function
        - translation_folder: str or None, sub-folder of ./generated_translations where the
          translations were saved ("evaluations" if None)

    Saves directly the computed evaluations (merged with the evaluations already saved for
    the same model and direction), returns None.

    Raises:
        - ValueError if the metric name or the dataset name is unknown, or if the number of
          saved translations differs from the number of targets
    """
    # Getting right evaluation function
    metric_eval_fn = get_eval_fn(metric_name)
    if metric_eval_fn is None:
        raise ValueError(f"Unknown metric name: {metric_name!r}")
    if dataset_name not in ("flores", "wnt23"):
        raise ValueError(f"Unknown dataset name: {dataset_name!r} (expected 'flores' or 'wnt23')")

    # Loading full flores (if necessary)
    if dataset_name == "flores":
        from credentials import hf_token
        huggingface_hub.login(token = hf_token)
        ds_flores = load_dataset("openlanguagedata/flores_plus")["devtest"]
        ds_flores = reduce_flores_to_some_languages(ds_flores, directions)

    # The input formatting function only depends on the model: resolve it once
    get_input_targets_fn, _ = input_and_generate_fn(model_name)

    for direction in directions:
        print(f"Evaluating translations {direction} with model {model_name}"
              +(f"-{model_size}" if model_size is not None else "")
              +f" for dataset {dataset_name}...")
        input_language, target_language = get_inp_tgt_lang(direction)

        # Loading previous eval if existing
        # NOTE: pickle.load can execute arbitrary code; only files written by this pipeline should be read
        eval_filename = get_eval_filename(direction, dataset_name, model_name, model_size, reduce_size)
        os.makedirs("./evaluations", exist_ok=True)
        if os.path.exists(eval_filename):
            with open(eval_filename, "rb") as f:
                complete_eval = pickle.load(f)
        else:
            complete_eval = {}

        # Getting the right split corresponding to the translation direction
        if dataset_name == "flores":
            ds = transform_to_WNT_style(ds_flores, lang=target_language, lang_start=input_language)
        else:  # "wnt23"
            if direction != "cs-en":
                ds = load_dataset("haoranxu/WMT23-Test", direction)["test"]
            else:
                # cs-en is built from the en-cs split
                ds = load_dataset("haoranxu/WMT23-Test", "en-cs")["test"]
                ds = Dataset.from_dict({f"cs-en": ds["en-cs"][::-1]}) # Reverse list to avoid having same sentences (if reduce_size not None)

        # Extracting input & targets
        sources, inputs, targets = get_input_targets_fn(ds, input_language, target_language)
        print(f"Total number of samples: {len(sources)}" + ("" if reduce_size is None else f"; reduced to {reduce_size} (numpy seed = 42)"))
        if reduce_size is not None:
            # /!\ Use same reduce size and same seed to ensure sources and previous inputs are the same /!\
            sources, inputs, targets = reduce_dataset(sources, inputs, targets, reduce_size)

        # Loading precomputed translations
        translations_filename = get_translations_filename(direction, dataset_name, model_name, model_size, reduce_size, translation_folder)
        with open(translations_filename, "rb") as f:
            translation_pred = pickle.load(f)

        # Sample-wise metrics zip translations and targets: a length mismatch would be silently truncated
        if len(translation_pred) != len(targets):
            raise ValueError(f"{translations_filename} contains {len(translation_pred)} translations "
                             f"but {len(targets)} targets were selected; check reduce_size")

        # Evaluation translation for this direction
        eval_dict = metric_eval_fn(metric, sources, targets, translation_pred, target_language)
        complete_eval.update(eval_dict)

        with open(eval_filename, "wb") as f:
            pickle.dump(complete_eval, f, pickle.HIGHEST_PROTOCOL)

# Wrapper to perform several evaluations
def eval_one_metric(metric_name, directions, dataset_names, model_names, model_sizes, reduce_sizes):
    """
    Evaluate one metric for every (dataset, model) combination.

    The metric is loaded once. dataset_names is paired with reduce_sizes and
    model_names with model_sizes (each pairing stops at the shortest list).
    """
    print(f"Computing evaluations with {metric_name}...")
    loaded_metric = load_metric(metric_name)
    for current_dataset, current_reduce_size in zip(dataset_names, reduce_sizes):
        for current_model, current_size in zip(model_names, model_sizes):
            eval_one_metric_one_model(metric_name=metric_name,
                                      metric=loaded_metric,
                                      directions=directions,
                                      dataset_name=current_dataset,
                                      model_name=current_model,
                                      model_size=current_size,
                                      reduce_size=current_reduce_size)

def eval_metrics(metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes):
    """
    Evaluate several metrics, one after the other: eval_one_metric is called
    once per metric name with all the other arguments unchanged.
    """
    for current_metric in metric_names:
        eval_one_metric(metric_name=current_metric,
                        directions=directions,
                        dataset_names=dataset_names,
                        model_names=model_names,
                        model_sizes=model_sizes,
                        reduce_sizes=reduce_sizes)

In [None]:
def parallelCoordinatesPlot(title, N, data, category, ynames, colors=None, category_names=None, savepath=None):
    """
    Draw a parallel coordinates plot: one vertical axis per entry of data, one curve per data set.
    A legend is added, if category_names is not None.

    :param title: The title of the plot.
    :param N: Number of data sets (i.e., lines).
    :param data: A list containing one array per parallel axis, each containing N data points.
    :param category: An array containing the category of each data set. Categories are used as
        indices into colors and into the legend entries.
    :param category_names: Labels of the categories. Must have the same length as set(category).
    :param ynames: The labels of the parallel axes.
    :param colors: A sequence of colors indexed by category (tab10 colors if None).
    :param savepath: If not None, path where the figure is saved.
    :return: None (the figure is shown, then closed).
    """

    fig, host = plt.subplots(figsize=(24, 8))

    # organize the data: one row per data set, one column per axis
    ys = np.dstack(data)[0]
    n_axes = ys.shape[1]
    ymins = ys.min(axis=0)
    ymaxs = ys.max(axis=0)
    dys = ymaxs - ymins
    ymins -= dys * 0.05  # add 5% padding below and above
    ymaxs += dys * 0.05
    # an axis whose values are all equal has a zero range: give it an arbitrary
    # non-zero one to avoid dividing by zero in the rescaling below
    flat = dys == 0
    ymins[flat] -= 0.5
    ymaxs[flat] += 0.5
    dys = ymaxs - ymins

    # transform all data to be compatible with the main axis
    zs = np.zeros_like(ys)
    zs[:, 0] = ys[:, 0]
    zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]

    axes = [host] + [host.twinx() for i in range(n_axes - 1)]
    for i, ax in enumerate(axes):
        ax.set_ylim(ymins[i], ymaxs[i])
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        if ax != host:
            ax.spines['left'].set_visible(False)
            ax.yaxis.set_ticks_position('right')
            ax.spines["right"].set_position(("axes", i / (n_axes - 1)))

    host.set_xlim(0, n_axes - 1)
    host.set_xticks(range(n_axes))
    host.set_xticklabels(ynames, fontsize=7)
    host.tick_params(axis='x', which='major', pad=7)
    host.spines['right'].set_visible(False)
    host.xaxis.tick_top()
    host.set_title(title, fontsize=15)

    if colors is None:
        colors = plt.cm.tab10.colors
    if category_names is not None:
        legend_handles = [None for _ in category_names]
    else:
        legend_handles = [None for _ in set(category)]

    # x-coordinate of the bezier control vertices: at each integer (for the axes) and two inbetween.
    # It depends on the number of AXES (not on the number of lines), so that the curves reach
    # the last axis even when there are fewer lines than axes.
    control_xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2, endpoint=True)

    for j in range(N):
        # to just draw straight lines between the axes:
        # host.plot(range(ys.shape[1]), zs[j,:], c=colors[(category[j] - 1) % len(colors) ])

        # create bezier curves
        # for each axis, there will a control vertex at the point itself, one at 1/3rd towards the previous and one
        #   at one third towards the next axis; the first and last axis have one less control vertex
        # y-coordinate: repeat every point three times, except the first and last only twice
        verts = list(zip(control_xs, np.repeat(zs[j, :], 3)[1:-1]))
        # for x,y in verts: host.plot(x, y, 'go') # to show the control points of the beziers
        codes = [Path.MOVETO] + [Path.CURVE4 for _ in range(len(verts) - 1)]
        path = Path(verts, codes)
        patch = patches.PathPatch(path, facecolor='none', lw=1, edgecolor=colors[category[j]])
        legend_handles[category[j]] = patch
        host.add_patch(patch)

    # The legend is built once, after all the lines are drawn
    if category_names is not None:
        # only the categories that actually have a line get a legend entry
        drawn = [(handle, name) for handle, name in zip(legend_handles, category_names) if handle is not None]
        if drawn:
            handles, names = zip(*drawn)
            host.legend(handles, names,
                        loc='lower center', bbox_to_anchor=(0.5, -0.18),
                        ncol=max(1, len(category_names)//2),  # at least one column
                        fancybox=True, shadow=True)

    plt.tight_layout()
    if savepath is not None:
        plt.savefig(savepath)
    plt.show()
    plt.close()



def barPlot(title, metric_name, directions, results_per_model, colors, savepath=None):
    """
    Draw a grouped bar plot: one group of bars per direction, one bar per model in each group,
    with error bars.

    Inputs:
        - title: the title of the plot
        - metric_name: str, used in the label of the y axis
        - directions: the labels of the groups (x axis)
        - results_per_model: dictionary {model label: results}, where results["mean_score"] gives
          the height of the bars and results["std_unbias_score"] the error bars, one value per direction
        - colors: sequence of colors, indexed by the position of the model in results_per_model
        - savepath: if not None, path where the figure is saved

    Returns None (the figure is shown, then closed).
    """
    x = np.arange(len(directions))  # the label locations
    width = 0.05  # the width of the bars
    nb_model = len(results_per_model)

    # The layout is handled by plt.tight_layout() below; asking for a constrained layout
    # here as well was overridden by it and only triggered a warning.
    fig, ax = plt.subplots(figsize=(24, 8))

    for i, (model, results) in enumerate(results_per_model.items()):
        # the i-th model is shifted by i bar widths inside each group
        ax.bar(x + i * width, results["mean_score"], width, label=model, yerr=results["std_unbias_score"], align='center', ecolor='black', capsize=2, color = colors[i])

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(f"{metric_name} score")
    ax.set_ylim(0)
    ax.set_title(title, fontsize=15)
    # bars are centred on x, x + width, ..., x + (nb_model - 1) * width: the middle of the group
    # is half-way between the first and the last bar
    ax.set_xticks(x + max(nb_model - 1, 0) / 2 * width, directions)
    ax.legend(loc='upper center', ncols=max(1, nb_model//2), fancybox=True, shadow=True)  # at least one column
    plt.tight_layout()
    if savepath is not None:
        plt.savefig(savepath)
    plt.show()
    plt.close()

In [None]:
def get_full_model_name(model_name, model_size):
    """Return "<model_name>-<model_size>", or just "<model_name>" when model_size is None."""
    if model_size is None:
        return f"{model_name}"
    return f"{model_name}-{model_size}"

def concatenate_results_parrPlot(directions, models, model_sizes, datasets, reduce_sizes, metrics_names, agg_keys, verbose=False):
    """
    Gather aggregated scores from the pickled evaluation files of every run.

    A run is one (dataset, model, direction) combination; datasets are paired with
    reduce_sizes and models with model_sizes. Runs are visited dataset first, then
    model, then direction.

    agg_keys lists the aggregation keys to read for every requested metric
    (e.g. "mean_score", "std_score", "std_unbias_score", or a subset of them).

    Returns a dict mapping each aggregation key to a list with one entry per metric
    (in the order of metrics_names); each entry is the list of values over all runs.
    """
    display_name_to_key = dict([
        ("ROUGE-1", "rouge1"),
        ("ROUGE-2", "rouge2"),
        ("ROUGE-L", "rougeL"),
        ("ROUGE-Lsum", "rougeLsum"),
        ("BLEU", "bleu"),
        ("SacreBLEU", "sacrebleu"),
        ("chrF", "chrf"),
        ("chrF++", "chrfplusplus"),
        ("COMET", "comet"),
        ("BLEURT", "bleurt"),
        ("BERTscore", "bertscore"),
        ("METEOR", "meteor"),
    ])
    # Follow the caller's ordering of metrics rather than the mapping's own order
    metric_keys = list(map(display_name_to_key.__getitem__, metrics_names))

    data = {}
    for agg_key in agg_keys:
        data[agg_key] = [[] for _ in metric_keys]

    print("Extracting and concatenating metrics...")
    runs = ((dataset_name, reduce_size, model_name, model_size, direction)
            for dataset_name, reduce_size in zip(datasets, reduce_sizes)
            for model_name, model_size in zip(models, model_sizes)
            for direction in directions)
    for dataset_name, reduce_size, model_name, model_size, direction in runs:
        path = get_eval_filename(direction, dataset_name, model_name, model_size, reduce_size)
        if verbose:
            print(path)
        with open(path, "rb") as handle:
            scores = pickle.load(handle)
        for column, metric_key in enumerate(metric_keys):
            for agg_key in agg_keys:
                data[agg_key][column].append(scores[metric_key][agg_key])
    return data

def make_parallel_plot(directions,
                       models, model_sizes,
                       datasets, reduce_sizes,
                       metrics_names,
                       list_colors_per, colors=None, verbose=False, savepath=None):
    """
    Load the mean scores of every (dataset, model, direction) run and draw them
    in a parallel coordinates plot with one axis per metric.

    Parameters
    ----------
    directions : translation directions (e.g. "en-de").
    models, model_sizes : paired lists; model_sizes entries may be None.
    datasets, reduce_sizes : paired lists; dataset names must be "wnt23" or "flores".
    metrics_names : display names of the metrics, one plot axis per metric.
    list_colors_per : collection containing any of "dataset", "model", "direction";
        runs sharing the same value(s) for the listed criteria share a color.
    colors : optional indexable collection of colors, one per category. When None,
        a default palette is chosen (see below).
    verbose : forwarded to concatenate_results_parrPlot.
    savepath : forwarded to parallelCoordinatesPlot.
    """
    # Aggregate eval data
    data = concatenate_results_parrPlot(directions, models, model_sizes, datasets, reduce_sizes,
                                        metrics_names, agg_keys=["mean_score"], verbose=verbose)
    data = data["mean_score"]

    # Generate plot categories
    print(f"Generating categories based {list_colors_per} type ('list_colors_per' param)...")

    dataset_name2real_name = {"wnt23": "WNT23", "flores": "FLORES+"}
    dataset_labels = {}
    for dataset_name, reduce_size in zip(datasets, reduce_sizes):
        dataset_labels[dataset_name] = dataset_name2real_name[dataset_name] + f" - reduct to {reduce_size} samples"
    model_labels = [get_full_model_name(model_name, model_size)
                    for model_name, model_size in zip(models, model_sizes)]

    # A criterion only contributes to the category label when it was requested
    # and there is at least one value for it.
    by_dataset = "dataset" in list_colors_per and len(datasets) > 0
    by_model = "model" in list_colors_per and len(model_labels) > 0
    by_direction = "direction" in list_colors_per and len(directions) > 0

    def _category_label(dataset_name, model_label, direction):
        # Join the active criteria with " - ", in dataset / model / direction order.
        parts = []
        if by_dataset:
            parts.append(dataset_labels[dataset_name])
        if by_model:
            parts.append(model_label)
        if by_direction:
            parts.append(direction)
        return " - ".join(parts)

    ## All combinations of the active criteria (a single "" category when none is active)
    category_names = [_category_label(dataset_name, model_label, direction)
                      for dataset_name in (datasets if by_dataset else [None])
                      for model_label in (model_labels if by_model else [None])
                      for direction in (directions if by_direction else [None])]
    elem2cat = {cat_name: i for i, cat_name in enumerate(category_names)}

    ## Category index of every run, in the same order the scores were concatenated
    category = [elem2cat[_category_label(dataset_name, model_label, direction)]
                for dataset_name in datasets
                for model_label in model_labels
                for direction in directions]

    if colors is None:
        if len(list_colors_per) == 1:
            if "dataset" in list_colors_per:
                colors = plt.cm.Accent.colors
            elif "direction" in list_colors_per:
                colors = plt.cm.tab20.colors
            else:
                colors = plt.cm.Dark2.colors + plt.cm.tab10.colors[0:7] + plt.cm.tab10.colors[8:]
        # Previously `colors` stayed None when several (or no) criteria were given, which
        # made the plot fail when indexing it. Also cover the case where the fixed
        # palette has fewer entries than there are categories.
        if colors is None or len(colors) < len(category_names):
            colors = sns.color_palette("husl", len(category_names))

    # Plot
    print("Plotting in parallel coordinates plot...")
    n_directions, n_models, n_datasets = len(directions), len(models), len(datasets)
    parallelCoordinatesPlot(title = f"Influence of {list_colors_per} on translation performances",
                            N = n_datasets*n_directions*n_models,
                            data = data,
                            category = category,
                            category_names = category_names,
                            ynames = metrics_names,
                            colors=colors,
                            savepath=savepath)

def concatenate_results_barPlot(directions, models, model_sizes, dataset_name, reduce_size, metric_name, verbose=False):
    """
    Collect, for a single metric and dataset, the "mean_score" and "std_unbias_score"
    of every model over all directions.

    Returns a dict keyed by the full model name (see get_full_model_name) whose values
    are {"mean_score": [...], "std_unbias_score": [...]}, each list following the
    order of `directions`.
    """
    metric = {
        "ROUGE-1": "rouge1", "ROUGE-2": "rouge2", "ROUGE-L": "rougeL", "ROUGE-Lsum": "rougeLsum",
        "BLEU": "bleu", "SacreBLEU": "sacrebleu",
        "chrF": "chrf", "chrF++": "chrfplusplus",
        "COMET": "comet", "BLEURT": "bleurt", "BERTscore": "bertscore", "METEOR": "meteor",
    }[metric_name]

    results_per_model = {}
    for model_name, model_size in zip(models, model_sizes):
        results_per_model[get_full_model_name(model_name, model_size)] = {"mean_score": [], "std_unbias_score": []}

    print("Extracting and concatenating metrics...")
    for model_name, model_size in zip(models, model_sizes):
        model_results = results_per_model[get_full_model_name(model_name, model_size)]
        for direction in directions:
            path = get_eval_filename(direction, dataset_name, model_name, model_size, reduce_size)
            if verbose:
                print(path)
            with open(path, "rb") as handle:
                scores = pickle.load(handle)
            model_results["mean_score"].append(scores[metric]["mean_score"])
            model_results["std_unbias_score"].append(scores[metric]["std_unbias_score"])
    return results_per_model

def make_bar_plot(directions,
                    model_names, model_sizes,
                    dataset_name, reduce_size,
                    metric_names,
                    cmap=None,
                    savepath = None):
    """
    Draw one grouped bar plot per metric (mean score with unbiased std as error bars).

    Parameters
    ----------
    directions : translation directions, one group of bars each.
    model_names, model_sizes : paired lists describing the models to compare.
    dataset_name, reduce_size : identify the evaluation files to load.
    metric_names : display names of the metrics; one figure is produced per metric.
    cmap : palette name passed to seaborn.color_palette ("Spectral" when None).
    savepath : optional base path. "_<metric_name>" is added to it for every figure;
        when the path ends with a usual image extension the suffix is inserted before
        the extension ("bar.png" -> "bar_BLEU.png") so the file format stays recognisable.
    """
    image_extensions = {".png", ".pdf", ".svg", ".svgz", ".jpg", ".jpeg",
                        ".eps", ".ps", ".tif", ".tiff", ".webp"}
    palette_name = "Spectral" if cmap is None else cmap

    for metric_name in metric_names:
        title = f"{metric_name} translation evaluation on dataset {dataset_name} (mean score with unbiased std)"
        results_per_model = concatenate_results_barPlot(directions, model_names, model_sizes, dataset_name, reduce_size, metric_name, verbose=False)
        # One hex color per model
        colors = sns.color_palette(palette_name, len(results_per_model)).as_hex()

        if savepath is None:
            metric_savepath = None
        else:
            root, ext = os.path.splitext(savepath)
            if ext.lower() in image_extensions:
                metric_savepath = f"{root}_{metric_name}{ext}"
            else:
                # No recognised extension: keep the historical "<savepath>_<metric>" naming
                metric_savepath = savepath + f"_{metric_name}"

        barPlot(title,
                metric_name,
                directions,
                results_per_model,
                colors = colors,
                savepath = metric_savepath)

# ICL pipeline

## For Causal LM

In [None]:
def get_input_tgt_icl_fn_CausalModel(number_examples):
    """Works for ALMA, OPT-instruct, BLOOMz, and any GPT model non instruct

    Returns a function building in-context-learning prompts in which every sentence to
    translate is preceded by `number_examples` example translations drawn (with
    replacement) from the other sentence pairs of the same direction.
    """
    def get_input_targets_icl_CausalModel(dataset, source_lang, target_lang):
        """
        dataset[f"{source_lang}-{target_lang}"] must be a sequence of examples, each
        indexable by `source_lang` and `target_lang`.

        Returns (sources, inputs, targets): the source sentences, the prompts (plain
        strings) and the reference translations, all in the same order.
        Raises ValueError when examples are requested but only one sentence pair exists.
        """
        # "is" is Icelandic (the prompt previously spelled it "Islandic")
        language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
        source_lang_name = language_name[source_lang]
        target_lang_name = language_name[target_lang]

        # Look the direction up once, and size everything on ITS number of examples
        # (len(dataset) is not the number of pairs when dataset is keyed by direction).
        pairs = dataset[f"{source_lang}-{target_lang}"]
        n_pairs = len(pairs)
        if number_examples > 0 and n_pairs == 1:
            raise ValueError("In-context examples must differ from the sentence to translate: "
                             "at least 2 sentence pairs are needed.")

        sources = [example[source_lang] for example in pairs]
        targets = [example[target_lang] for example in pairs]

        inputs = []
        offset_seed = 0
        print("Generating prompts for In-Context learning...")
        for i in tqdm(range(n_pairs)):
            np.random.seed(i + offset_seed)
            idx = np.random.choice(n_pairs, number_examples)
            while i in idx: # Make sure the translation to do is not in the examples
                offset_seed += 1
                np.random.seed(i + offset_seed)
                idx = np.random.choice(n_pairs, number_examples)
            examples = [pairs[int(n)] for n in idx]
            # Use base formulation "<source language>: <text> \n<target language>: <text>"
            inp = f"Here are examples of translations from {source_lang_name} to {target_lang_name}:"
            for example in examples:
                example_source, example_target = example[source_lang], example[target_lang]
                inp += f"[START]\n{source_lang_name}: {example_source} \n{target_lang_name}: {example_target}\n[END]"
            inp += f"\n Using the examples, translate from {source_lang_name} to {target_lang_name}:"
            input_source = pairs[i][source_lang]
            inp += f"[START]\n{source_lang_name}: {input_source} \n{target_lang_name}:"
            inputs.append(inp)

        return sources, inputs, targets
    return get_input_targets_icl_CausalModel

**Careful: the code runs for ALMA, but 16 GB of memory is not enough to run 7B models quantized to 8 bits, even with a single in-context example.**

In [None]:
# English <-> X in both directions, for every non-English language
directions = [pair
              for lang in ("de", "cs", "is", "zh", "ru")
              for pair in (f"en-{lang}", f"{lang}-en")]
dataset_names = ["wnt23"]

model_names, model_sizes = ["opt-instruct"], [None]

batch_size, reduce_size = 1, 4

# Number of in-context examples placed in every prompt
number_examples = 1

generate_translation_several_datasets(
    directions, dataset_names, model_names, model_sizes, batch_size, reduce_size,
    load_model_and_tokenizer_fn=load_model_benchmark,
    get_input_targets_fn=get_input_tgt_icl_fn_CausalModel(number_examples),
    tslt_fn=translate_batched_OPT,
    translation_folder=f"evaluationsICL_{number_examples}examples",
)

## For Instruct Causal LM

In [None]:
def get_input_tgt_icl_fn_Instruct(number_examples):
    """
    Returns a function building chat-style in-context-learning prompts in which every
    sentence to translate is preceded by `number_examples` example translations drawn
    (with replacement) from the other sentence pairs of the same direction.
    """
    def get_input_targets_icl_Instruct(dataset, source_lang, target_lang):
        """
        Work at least for Qwen2.5 and Llama3

        dataset[f"{source_lang}-{target_lang}"] must be a sequence of examples, each
        indexable by `source_lang` and `target_lang`.

        Returns (sources, inputs, targets): the source sentences, the prompts (each a
        list with a "system" and a "user" message) and the reference translations,
        all in the same order.
        Raises ValueError when examples are requested but only one sentence pair exists.
        """
        # "is" is Icelandic (the prompt previously spelled it "Islandic")
        language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
        source_lang_name = language_name[source_lang]
        target_lang_name = language_name[target_lang]

        # Look the direction up once, and size everything on ITS number of examples
        # (len(dataset) is not the number of pairs when dataset is keyed by direction).
        pairs = dataset[f"{source_lang}-{target_lang}"]
        n_pairs = len(pairs)
        if number_examples > 0 and n_pairs == 1:
            raise ValueError("In-context examples must differ from the sentence to translate: "
                             "at least 2 sentence pairs are needed.")

        sources = [example[source_lang] for example in pairs]
        targets = [example[target_lang] for example in pairs]

        inputs = []
        offset_seed = 0
        print("Generating prompts for In-Context learning...")
        for i in tqdm(range(n_pairs)):
            np.random.seed(i + offset_seed)
            idx = np.random.choice(n_pairs, number_examples)
            while i in idx: # Make sure the translation to do is not in the examples
                offset_seed += 1
                np.random.seed(i + offset_seed)
                idx = np.random.choice(n_pairs, number_examples)
            examples = [pairs[int(n)] for n in idx]
            inp = f"Here are examples of translations from {source_lang_name} to {target_lang_name}:"
            for n, example in enumerate(examples):
                example_source, example_target = example[source_lang], example[target_lang]
                inp += f"\n[EXAMPLE {n+1}]\n{source_lang_name}: {example_source} \n{target_lang_name}: {example_target}"
            inp += f"\n Using the examples, translate from {source_lang_name} to {target_lang_name}:"
            input_source = pairs[i][source_lang]
            inp += f"[TASK]\n{source_lang_name}: {input_source} \n{target_lang_name}:"
            inputs.append([
                {"role": "system", "content": "You are a translator, you output only the translation in the desired language."},
                {"role": "user", "content": f"{inp}"}])

        return sources, inputs, targets
    return get_input_targets_icl_Instruct

In [None]:
# English <-> X in both directions, for every non-English language
directions = [pair
              for lang in ("de", "cs", "is", "zh", "ru")
              for pair in (f"en-{lang}", f"{lang}-en")]
dataset_names = ["wnt23"]

model_names, model_sizes = ["llama3"], ["3B"]

batch_size, reduce_size = 1, 50

# One full generation run per number of in-context examples
for number_examples in range(1, 5): #More than 4 is OOM
    generate_translation_several_datasets(
        directions, dataset_names, model_names, model_sizes, batch_size, reduce_size,
        load_model_and_tokenizer_fn=load_model_benchmark,
        get_input_targets_fn=get_input_tgt_icl_fn_Instruct(number_examples),
        tslt_fn=translate_batched_Llama3,
        translation_folder=f"evaluationsICL_{number_examples}examples",
    )

In [None]:
# Sanity check: for each number of in-context examples, reload the pickled en-de
# translations and print the first entry.
# NOTE(review): these files are presumably the ones written by the Llama3 generation
# cell above (same folder naming scheme) — confirm.
# NOTE(review): `pickle` is already imported in the notebook's first cell, so this
# re-import is redundant.
import pickle
for number_examples in [1, 2, 3, 4]:
    # pickle.load can execute arbitrary code: only open files produced locally.
    with open(f"./generated_translations/evaluationsICL_{number_examples}examples/wnt23_llama3-3B_en-de_red-50.pkl", "rb") as f:
        translations = pickle.load(f)
    print("First translation:", translations[0])