In [None]:
import pickle
import time
from tqdm import tqdm

import matplotlib.pyplot as plt

import torch

import huggingface_hub
from datasets import load_dataset
import transformers
from transformers import BitsAndBytesConfig


from utils import (
    generate_translation_several_datasets,
    generate_translation_different_directions,
    generate_translation_several_models,
    load_model_benchmark,
    eval_metrics,
    make_parallel_plot,
    make_bar_plot,
    make_bar_plot_all_metrics
)

from utils.eval_params import num_beams, temperature, max_new_tokens, top_p
#num_beams = 5
#max_new_tokens = 512
#top_p = 0.9
#temperature = 0.6

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
language_tested = ["en", "de", "cs", "is", "zh", "ru"] # Only from or to english
metrics_available = ["bleu", "rouge", "bleurt", "sacrebleu", "comet", "meteor", "chrf", "bert_score"]
models_available = [
    # NLLB
    "facebook/nllb-200-distilled-600M",
    # ALMA
    "haoranxu/ALMA-7B",
    # Llama 3 Instruct
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    # Falcon 3 Mamba Instruct
    "tiiuae/Falcon3-Mamba-7B-Instruct",
    # Falcon 3 Instruct
    "tiiuae/Falcon3-1B-Instruct",
    "tiiuae/Falcon3-3B-Instruct",
    "tiiuae/Falcon3-7B-Instruct",
    # Qwen 2.5 Instruct
    "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    # Mistral Instruct
    "mistralai/Mistral-7B-Instruct-v0.3",
    # BayLing
    "ICTNLP/bayling-2-7b",
    # Bloom & Bloomz
    "bigscience/bloom-560m",
    "bigscience/bloom-1b7",
    "bigscience/bloom-3b",
    "bigscience/bloom-7b1",
    "bigscience/bloomz-1b7",
    "bigscience/bloomz-3b",
    "bigscience/bloomz-7b1",
    # OPT
    "facebook/opt-125m",
    "facebook/opt-350m",
    "facebook/opt-6.7b",
    "facebook/opt-iml-1.3b",
    # MPT
    "mosaicml/mpt-7b-instruct",
]


ds_available = ["haoranxu/WMT23-Test",
                "openlanguagedata/flores_plus"]

## Inference functions

In [None]:
from utils import (
get_input_targets_NLLB,
translate_list_of_str_NLLB,
translate_batched_NLLB,
get_input_targets_ALMA,
translate_list_of_str_ALMA,
translate_batched_ALMA,
get_input_targets_Llama3,
extract_translation_Llama3,
translate_list_of_str_Llama3,
translate_batched_Llama3,
get_input_targets_Falcon3,
extract_translation_Falcon3Mamba,
translate_list_of_str_Falcon3Mamba,
translate_batched_Falcon3Mamba,
extract_translation_Falcon3,
translate_list_of_str_Falcon3,
translate_batched_Falcon3,
get_input_targets_Qwen2_5,
extract_translation_Qwen2_5,
translate_list_of_str_Qwen2_5,
translate_batched_Qwen2_5,
get_input_targets_Mistral,
extract_translation_Mistral,
translate_list_of_str_Mistral,
translate_batched_Mistral,
get_input_targets_BayLing,
translate_list_of_str_BayLing,
translate_batched_BayLing,
get_input_targets_BLOOM,
translate_list_of_str_BLOOM,
translate_batched_BLOOM,
get_input_targets_OPT,
translate_list_of_str_OPT,
translate_batched_OPT,
get_input_targets_MPT,
translate_list_of_str_MPT,
translate_batched_MPT,
)

## Dataset handling
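We use the WMT23 test set in the authors' preprocessed split (`haoranxu/WMT23-Test`) and the FLORES+ dataset, which is reformatted to the same structure as WMT23. Note that the code refers to WMT23 under the identifier `wnt23`.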

In [None]:
from utils import reduce_flores_to_some_languages, transform_to_WNT_style

### WMT23

In [None]:
ds_wnt = load_dataset("haoranxu/WMT23-Test", "en-cs")["test"]
print(len(ds_wnt), ds_wnt[0:4])

### FLORES

In [None]:
from credentials import hf_token
huggingface_hub.login(token = hf_token)
ds_flores = load_dataset("openlanguagedata/flores_plus")["devtest"]

In [None]:
directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]
ds_flores_reduced = reduce_flores_to_some_languages(ds_flores, directions)

In [None]:
t1 = time.time()
ds_flores_wnt_style = transform_to_WNT_style(ds_flores, lang="zh", lang_start="en")
print(f"Time to compute: {time.time()-t1:.2f}s", ds_flores_wnt_style[0:4])

t1 = time.time()
ds_flores_wnt_style_reduced = transform_to_WNT_style(ds_flores_reduced, lang="zh", lang_start="en")
print(f"Time to compute: {time.time()-t1:.2f}s", ds_flores_wnt_style_reduced[0:4])

In [None]:
from utils import get_translations_filename

dataset_names = ["wnt23", "flores"]
reduce_sizes = [100, 200]

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["alma",
               "nllb",
               "llama3", "llama3", "llama3",
               "falcon3-mamba",
               "falcon3", "falcon3", "falcon3",
               "qwen2.5", "qwen2.5", "qwen2.5", "qwen2.5",
               "mistral",
               "bloomz",
               "opt-instruct"]
model_sizes = [None,
               None,
               "1B", "3B", "8B",
               None,
               "1B", "3B", "7B",
               "0.5B", "1.5B", "3B", "7B",
               None,
               "7B",
               None]


# Print a few sample translations (indices 0, 5, 10, 15, 20) from every saved translation file,
# one file per (dataset, model, direction) combination.
for dataset_name, reduce_size in zip(dataset_names, reduce_sizes):
    for model_name, model_size in zip(model_names, model_sizes):
        model_size_p = "-"+model_size if model_size is not None else ""
        for direction in directions:
            translations_filename = get_translations_filename(direction, dataset_name, model_name, model_size, reduce_size, None)
            # NOTE: pickle can execute arbitrary code on load; only open files produced by this project.
            with open(translations_filename, "rb") as f:
                tslt = pickle.load(f)
            # Bound the sample indices by the number of stored translations so that a short
            # file (fewer than 21 entries) prints what it has instead of raising IndexError.
            for sample_idx in range(0, min(len(tslt), 25), 5):
                print(f"[{dataset_name} red_{reduce_size}] [{model_name}{model_size_p}] [{direction} idx {sample_idx}]", tslt[sample_idx])
            print()


## Models & inference loops

### NLLB

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = transformers.AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", torch_dtype="auto", device_map=device)

In [None]:
sources, inputs, targets = get_input_targets_NLLB(ds_wnt, source_lang="en", target_lang="de")

In [None]:
for i in range(8):
    print(translate_list_of_str_NLLB(inputs[i:i+1], tokenizer, model, "de"))

### ALMA

In [None]:
# Load the ALMA-7B tokenizer (left-padded) and the model quantized to 8 bits
tokenizer = transformers.LlamaTokenizer.from_pretrained("haoranxu/ALMA-7B", padding_side='left')
Q_config = BitsAndBytesConfig(load_in_8bit=True) 
model = transformers.AutoModelForCausalLM.from_pretrained("haoranxu/ALMA-7B", torch_dtype="auto", device_map=device, quantization_config=Q_config)

In [None]:
sources, inputs, targets = get_input_targets_ALMA(ds_wnt, source_lang="en", target_lang="de")

In [None]:
for i in range(4):
    print(translate_list_of_str_ALMA(inputs[i:i+1], tokenizer, model, "de"))

### Llama 3 Instruct

In [None]:
# Log in to the Hugging Face Hub, then load ONE Llama 3 Instruct checkpoint.
# Keep the active tokenizer line and the active model line on the same checkpoint.
from credentials import hf_token
huggingface_hub.login(token = hf_token)
tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
# tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
# tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
# The model matching the active tokenizer must be loaded here: with every model line commented
# out, the cells below would silently reuse the `model` left over from the previous section.
model = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", torch_dtype="auto", device_map=device)
# model = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct", torch_dtype="auto", device_map=device)
# Q_config = BitsAndBytesConfig(load_in_8bit=True) 
# model = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)

In [None]:
sources, inputs, targets = get_input_targets_Llama3(ds_wnt, source_lang="en", target_lang="cs")

In [None]:
translate_list_of_str_Llama3(inputs[0:5], tokenizer, model)

### Llama 3.2 3B base (not instruction-tuned), 4-bit quantized (for comparison to the fine-tuned version)

In [None]:
# Log in to the Hugging Face Hub and load the base (non-instruct) Llama 3.2 3B in 4-bit.
from credentials import hf_token
huggingface_hub.login(token = hf_token)
# Left padding (as already done for ALMA above): with a decoder-only model, right padding
# would put pad tokens between a short prompt and its generated continuation in a batch.
tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B", padding_side="left")
# 4-bit NF4 quantization, float16 compute dtype, no double quantization.
Q_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.float16,
                                bnb_4bit_use_double_quant=False)
model = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B", torch_dtype="auto", device_map=device, quantization_config=Q_config)
# Reuse EOS as the padding token (the helpers below tokenize with padding=True) and tell
# generate() about it so that it pads finished sequences with the same id.
tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Language code -> display name. Shared by prompt construction and output clean-up so that
# both always agree on the "<Language>:" markers.
LLAMA3NI4BIT_LANGUAGE_NAMES = {"en": "English", "de": "German", "ru": "Russian",
                               "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}


def get_input_targets_Llama3NI4bit(dataset, source_lang, target_lang):
    """
    Build the sources, prompts and reference translations for one translation direction.

    Each prompt follows the completion-style template
    "Translate from <Source> to <Target> and end your answer as soon as the task is finished:",
    followed by a "<Source>: <text>" line and an open "<Target>:" line for the model to complete.

    Args:
        dataset: mapping-like object where `dataset["<source_lang>-<target_lang>"]` is an
            iterable of examples, each indexable by `source_lang` and `target_lang`.
        source_lang: language code of the source text (a key of LLAMA3NI4BIT_LANGUAGE_NAMES).
        target_lang: language code of the translation (a key of LLAMA3NI4BIT_LANGUAGE_NAMES).

    Returns:
        (sources, inputs, targets): three lists of equal length holding the raw source
        sentences, the prompts, and the reference translations.

    Raises:
        KeyError: if a language code or the direction key is unknown.
    """
    source_lang_name = LLAMA3NI4BIT_LANGUAGE_NAMES[source_lang]
    target_lang_name = LLAMA3NI4BIT_LANGUAGE_NAMES[target_lang]
    examples = dataset[f"{source_lang}-{target_lang}"]
    sources = [example[source_lang] for example in examples]
    inputs = [
        f"Translate from {source_lang_name} to {target_lang_name} and end your answer as soon as the task is finished:"
        f"\n{source_lang_name}: {source} \n{target_lang_name}:"
        for source in sources
    ]
    targets = [example[target_lang] for example in examples]
    return sources, inputs, targets


def _keep_first_turn_Llama3NI4bit(generated_text):
    """Cut `generated_text` where the model starts a new "<newline><Language>:" turn, if it does."""
    cut = len(generated_text)
    for language_name in LLAMA3NI4BIT_LANGUAGE_NAMES.values():
        marker_position = generated_text.find(f"\n{language_name}:")
        if marker_position != -1:
            cut = min(cut, marker_position)
    return generated_text[:cut]


def translate_list_of_str_Llama3NI4bit(list_str, tokenizer, model, target_language):
    """
    Translate a batch of prompts and return one translation string per prompt.

    Generation uses the module-level `num_beams`, `max_new_tokens`, `temperature` and `top_p`
    settings with sampling enabled.

    Args:
        list_str: list of prompts, as produced by `get_input_targets_Llama3NI4bit`.
        tokenizer: tokenizer matching `model`; it must define a padding token.
        model: causal language model exposing `generate` and `device`.
        target_language: language code of the translation; it is validated against
            LLAMA3NI4BIT_LANGUAGE_NAMES before any generation is run.

    Returns:
        List of generated translations, in the order of `list_str`.

    Raises:
        KeyError: if `target_language` is not a known language code.
    """
    # Fail fast on an unknown language instead of after an expensive generation.
    LLAMA3NI4BIT_LANGUAGE_NAMES[target_language]
    if not list_str:
        return []
    with torch.no_grad():
        inputs = tokenizer(list_str, return_tensors="pt", padding=True).to(model.device)
        translated = model.generate(**inputs,
                                    num_beams=num_beams, max_new_tokens=max_new_tokens, do_sample=True,
                                    temperature=temperature, top_p=top_p
                                    ).cpu()
    # generate() returns the (padded) prompt followed by the new tokens: decode only the new
    # tokens rather than searching the decoded text for the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    translated_text = tokenizer.batch_decode(translated[:, prompt_length:], skip_special_tokens=True)
    # A base model may keep going and invent further "<Language>: ..." pairs; keep only the
    # text that directly answers the prompt.
    return [_keep_first_turn_Llama3NI4bit(text) for text in translated_text]


def translate_batched_Llama3NI4bit(inputs, model, tokenizer, batch_size, target_language):
    """
    Translate `inputs` in consecutive batches of at most `batch_size` prompts.

    Every prompt is translated: the last batch is simply smaller when `len(inputs)` is not a
    multiple of `batch_size`, so the result always has the same length as `inputs`.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    preds = []
    for start in tqdm(range(0, len(inputs), batch_size)):
        tslt = translate_list_of_str_Llama3NI4bit(inputs[start : start + batch_size], tokenizer, model, target_language)
        preds.extend(tslt)
    return preds

In [None]:
sources, inputs, targets = get_input_targets_Llama3NI4bit(ds_wnt, source_lang="en", target_lang="de")

In [None]:
for i in range(4):
    print(translate_list_of_str_Llama3NI4bit(inputs[i:i+1], tokenizer, model, target_language="de"))

### Falcon 3 Instruct (mamba and transformer)

In [None]:
# tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/Falcon3-Mamba-7B-Instruct")
# Q_config = BitsAndBytesConfig(load_in_8bit=True)
# model = transformers.AutoModelForCausalLM.from_pretrained("tiiuae/Falcon3-Mamba-7B-Instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Instruct")
Q_config = BitsAndBytesConfig(load_in_8bit=True)
model = transformers.AutoModelForCausalLM.from_pretrained("tiiuae/Falcon3-7B-Instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
# tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/Falcon3-3B-Instruct")
# model = transformers.AutoModelForCausalLM.from_pretrained("tiiuae/Falcon3-3B-Instruct", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/Falcon3-1B-Instruct")
# model = transformers.AutoModelForCausalLM.from_pretrained("tiiuae/Falcon3-1B-Instruct", torch_dtype="auto", device_map=device)

In [None]:
sources, inputs, targets = get_input_targets_Falcon3(ds_wnt, source_lang="en", target_lang="de")

In [None]:
translate_list_of_str_Falcon3Mamba(inputs[0:4], tokenizer, model)

In [None]:
translate_list_of_str_Falcon3(inputs[0:8], tokenizer, model)

### Qwen 2.5

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = transformers.AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
# model = transformers.AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
# model = transformers.AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8")
# model = transformers.AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8", torch_dtype="auto", device_map=device)

In [None]:
sources, inputs, targets = get_input_targets_Qwen2_5(ds_wnt, source_lang="en", target_lang="de")

In [None]:
for i in range(10):
    print(translate_list_of_str_Qwen2_5(inputs[i:i+1], tokenizer, model))

### Mistral 7B

In [None]:
from credentials import hf_token
huggingface_hub.login(token = hf_token)
tokenizer = transformers.AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
tokenizer.pad_token = tokenizer.eos_token
Q_config = BitsAndBytesConfig(load_in_8bit=True)
model = transformers.AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", torch_dtype="auto", device_map=device, quantization_config=Q_config)

In [None]:
sources, inputs, targets = get_input_targets_Mistral(ds_wnt, source_lang="en", target_lang="de")

In [None]:
for i in range(4):
    print(translate_list_of_str_Mistral(inputs[i:i+1], tokenizer, model))

### Bayling

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("ICTNLP/bayling-2-7b")
tokenizer.pad_token = tokenizer.eos_token
Q_config = BitsAndBytesConfig(load_in_8bit=True)
model = transformers.AutoModelForCausalLM.from_pretrained("ICTNLP/bayling-2-7b", torch_dtype="auto", device_map=device, quantization_config=Q_config)

In [None]:
sources, inputs, targets = get_input_targets_BayLing(ds_wnt, source_lang="en", target_lang="de")

In [None]:
for i in range(4):
    print(translate_list_of_str_BayLing(inputs[i:i+1], tokenizer, model, target_language="de"))

### Bloom & Bloom Z

In [None]:
# tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-560m")
# model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", torch_dtype=torch.bfloat16, device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-1b7")
# model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b7", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-3b")
# model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloom-7b1")
# Q_config = BitsAndBytesConfig(load_in_8bit=True)
# model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloom-7b1", torch_dtype="auto", device_map=device, quantization_config=Q_config)
# tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloomz-1b7")
# model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloomz-1b7", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloomz-3b")
# model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloomz-3b", torch_dtype="auto", device_map=device)
tokenizer = transformers.AutoTokenizer.from_pretrained("bigscience/bloomz-7b1")
Q_config = BitsAndBytesConfig(load_in_8bit=True)
model = transformers.AutoModelForCausalLM.from_pretrained("bigscience/bloomz-7b1", torch_dtype="auto", device_map=device, quantization_config=Q_config)

In [None]:
sources, inputs, targets = get_input_targets_BLOOM(ds_wnt, "en", "de")

In [None]:
translate_batched_BLOOM(inputs[0:2], model, tokenizer, batch_size=1, target_language="de")

### OPT & OPT Instruct

In [None]:
# tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m")
# model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-350m")
# model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", device_map=device)
# tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-6.7b")
# Q_config = BitsAndBytesConfig(load_in_8bit=True)
# model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-6.7b", torch_dtype="auto", device_map=device, quantization_config=Q_config)
tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-iml-1.3b")
model = transformers.AutoModelForCausalLM.from_pretrained("facebook/opt-iml-1.3b", torch_dtype="auto", device_map=device)

In [None]:
sources, inputs, targets = get_input_targets_OPT(ds_wnt, "en", "de")

In [None]:
translate_batched_OPT(inputs[4:6], model, tokenizer, batch_size=1, target_language="de")

### MPT

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
tokenizer.pad_token = tokenizer.eos_token
Q_config = BitsAndBytesConfig(load_in_8bit=True)
model = transformers.AutoModelForCausalLM.from_pretrained("mosaicml/mpt-7b-instruct", torch_dtype="auto", device_map=device, quantization_config=Q_config)
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [None]:
sources, inputs, targets = get_input_targets_MPT(ds_wnt, source_lang="en", target_lang="de")

In [None]:
for i in range(4,6):
    print(translate_list_of_str_MPT(inputs[i:i+1], tokenizer, model, target_language="de"))

## Translation part

In [None]:
## Global benchmark

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]
model_names = ["alma",
               "nllb",
               "llama3", "llama3", "llama3",
               "falcon3-mamba",
               "falcon3", "falcon3", "falcon3",
               "qwen2.5", "qwen2.5", "qwen2.5", "qwen2.5",
               "mistral",
               "bloomz",
               "opt-instruct"]
model_sizes = [None,
               None,
               "1B", "3B", "8B",
               None,
               "1B", "3B", "7B",
               "0.5B", "1.5B", "3B", "7B",
               None,
               "7B",
               None]

generate_translation_several_models(directions,
                                    dataset_name="wnt23",
                                    model_names=model_names,
                                    model_sizes=model_sizes,
                                    batch_size=1,
                                    reduce_size=100)

generate_translation_several_models(directions,
                                    dataset_name="flores",
                                    model_names=model_names,
                                    model_sizes=model_sizes,
                                    batch_size=1,
                                    reduce_size=200)

In [None]:
## For RAG and ICL comparison, 0 examples

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]
model_names = ["llama3"]
model_sizes = ["3B"]

generate_translation_several_models(directions,
                                    dataset_name="wnt23",
                                    model_names=model_names,
                                    model_sizes=model_sizes,
                                    batch_size=1,
                                    reduce_size=50)

In [None]:
## Wanted to do, but the computation time was too expensive

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]
model_names = ["bloom", "bloom", "bloom", "bloom",
               "bloomz", "bloomz",
               "opt", "opt", "opt",
               "bayling",
               "mpt"]
model_sizes = ["0.5B", "1B", "3B", "7B",
               "1B", "3B",
               "0.1B", "0.3B", "7B",
               None,
               None]

generate_translation_several_models(directions,
                                    dataset_name="wnt23",
                                    model_names=model_names,
                                    model_sizes=model_sizes,
                                    batch_size=1,
                                    reduce_size=100)

generate_translation_several_models(directions,
                                    dataset_name="flores",
                                    model_names=model_names,
                                    model_sizes=model_sizes,
                                    batch_size=1,
                                    reduce_size=200)

## Metrics from predictions: evaluation function

### Evaluations

In [None]:
metric_names = ["bleurt"]

dataset_names = ["wnt23"]
reduce_sizes = [50]

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["llama3"]
model_sizes = ["3B"]

In [None]:
eval_metrics(metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes)

In [None]:
metric_names = ["rouge", "bleu", "sacrebleu", "chrf", "comet", "meteor", "bertscore"]

dataset_names = ["wnt23", "flores"]
reduce_sizes = [100, 200]

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["alma",
               "nllb",
               "llama3", "llama3", "llama3",
               "falcon3-mamba",
               "falcon3", "falcon3", "falcon3",
               "qwen2.5", "qwen2.5", "qwen2.5", "qwen2.5",
               "mistral",
               "bloomz",
               "opt-instruct",]
model_sizes = [None,
               None,
               "1B", "3B", "8B",
               None,
               "1B", "3B", "7B",
               "0.5B", "1.5B", "3B", "7B",
               None,
               "7B",
               None,]

In [None]:
eval_metrics(metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes)

In [None]:
# Compute the BLEURT metric alone because it is not offloaded from the GPU, so the kernel needs to be restarted...
metric_names = ["bleurt"]

dataset_names = ["flores", "wnt23"]
reduce_sizes = [200, 100]

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["bloomz",
               "opt-instruct"]
model_sizes = ["7B",
               None]

In [None]:
eval_metrics(metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes)

### Plot

In [None]:
## Parallel plots with all evaluation trajectories
metric_names = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
                "BLEU", "SacreBLEU", "chrF", "chrF++",
                "COMET", "BLEURT", "BERTscore", "METEOR"]

dataset_names = ["wnt23", "flores"]
reduce_sizes = [100, 200]

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["alma",
               "nllb",
               "llama3", "llama3", "llama3",
               "falcon3-mamba",
               "falcon3", "falcon3", "falcon3",
               "qwen2.5", "qwen2.5", "qwen2.5", "qwen2.5",
               "mistral",
               "bloomz",
               "opt-instruct",]
model_sizes = [None,
               None,
               "1B", "3B", "8B",
               None,
               "1B", "3B", "7B",
               "0.5B", "1.5B", "3B", "7B",
               None,
               "7B",
               None,]

make_parallel_plot(directions,
                    model_names, model_sizes,
                    dataset_names, reduce_sizes,
                    metric_names,
                    list_colors_per = ["dataset"],
                    colors=None,
                    savepath = "./results/evaluations_figures/all_dataset")
make_parallel_plot(directions,
                    model_names, model_sizes,
                    dataset_names, reduce_sizes,
                    metric_names,
                    list_colors_per = ["direction"],
                    colors=None,
                    savepath = "./results/evaluations_figures/all_direction")
make_parallel_plot(directions,
                    model_names, model_sizes,
                    dataset_names, reduce_sizes,
                    metric_names,
                    list_colors_per = ["model"],
                    colors=None,
                    savepath = "./results/evaluations_figures/all_model")

In [None]:
## Parallel plot with chinese trajectories
metric_names = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
                "BLEU", "SacreBLEU", "chrF", "chrF++",
                "COMET", "BLEURT", "BERTscore", "METEOR"]

dataset_names = ["flores"]
reduce_sizes = [200]

directions = ["en-zh", "zh-en"]

model_names = ["alma", "nllb",
               "llama3",
               "falcon3-mamba",
               "falcon3",
               "qwen2.5", "qwen2.5", "qwen2.5",
               "mistral",
               "bloomz"]
model_sizes = [None, None,
               "8B",
            None,
            "7B",
               "3B", "7B", "1.5B",
               None,
               "7B"]

make_parallel_plot(directions,
                    model_names, model_sizes,
                    dataset_names, reduce_sizes,
                    metric_names,
                    list_colors_per = ["direction", "model"],
                    colors = plt.cm.tab20.colors,
                    savepath = "./results/evaluations_figures/chinese_dir_model_flores")

In [None]:
## Parallel plot NLLB v.s. ALMA
metric_names = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
                "BLEU", "SacreBLEU", "chrF", "chrF++",
                "COMET", "BLEURT", "BERTscore", "METEOR"]

dataset_names = ["flores", "wnt23"]
reduce_sizes = [200, 100]

directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["alma",
               "nllb"]

model_sizes = [None,
               None]

make_parallel_plot(directions,
                    model_names, model_sizes,
                    dataset_names, reduce_sizes,
                    metric_names,
                    list_colors_per = ["direction", "model"],
                    colors = plt.cm.tab20.colors[::2]+plt.cm.tab20.colors[1::2],
                    savepath = "./results/evaluations_figures/alma-nllb_direction_model")

make_parallel_plot(directions,
                    model_names, model_sizes,
                    ["flores"], [200],
                    metric_names,
                    list_colors_per = ["direction", "model"],
                    colors = plt.cm.tab20.colors[::2]+plt.cm.tab20.colors[1::2],
                    savepath = "./results/evaluations_figures/alma-nllb_flores_direction_model")

make_parallel_plot(directions,
                    model_names, model_sizes,
                    ["wnt23"], [100],
                    metric_names,
                    list_colors_per = ["direction", "model"],
                    colors = plt.cm.tab20.colors[::2]+plt.cm.tab20.colors[1::2],
                    savepath = "./results/evaluations_figures/alma-nllb_wnt23_direction_model")

In [None]:
## Bar plot per metric
directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["alma",
               "nllb",
               "llama3", "llama3", "llama3",
               "falcon3-mamba",
               "falcon3", "falcon3", "falcon3",
               "qwen2.5", "qwen2.5", "qwen2.5", "qwen2.5",
               "mistral",
               "bloomz",
               "opt-instruct",]
model_sizes = [None,
               None,
               "1B", "3B", "8B",
               None,
               "1B", "3B", "7B",
               "0.5B", "1.5B", "3B", "7B",
               None,
               "7B",
               None,]

dataset_name = "flores"
reduce_size = 200

metric_names = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
                "BLEU", "SacreBLEU", "chrF", "chrF++",
                "COMET", "BLEURT", "BERTscore", "METEOR"]

make_bar_plot(directions,
                model_names, model_sizes,
                dataset_name, reduce_size,
                metric_names,
                savepath = "./results/evaluations_figures/barplot_all_models")

In [None]:
## Bar plot per direction
directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

model_names = ["alma",
               "nllb",
               "llama3", "llama3", "llama3",
               "falcon3-mamba",
               "falcon3", "falcon3", "falcon3",
               "qwen2.5", "qwen2.5", "qwen2.5", "qwen2.5",
               "mistral",
               "bloomz",
               "opt-instruct",]
model_sizes = [None,
               None,
               "1B", "3B", "8B",
               None,
               "1B", "3B", "7B",
               "0.5B", "1.5B", "3B", "7B",
               None,
               "7B",
               None,]

dataset_name = "flores"
reduce_size = 200

metric_names = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
                "BLEU", "SacreBLEU", "chrF", "chrF++",
                "COMET", "BLEURT", "BERTscore", "METEOR"]

make_bar_plot_all_metrics(directions, model_names, model_sizes, dataset_name, reduce_size, metric_names,
                        savepath = "./results/evaluations_figures/barplot_all_metrics_all_models")

### Benchmark table

In [None]:
import pandas as pd
import pickle
import os

eval_directory = "./evaluations/"
models = ["alma","bloomz","falcon3","falcon3","falcon3","falcon3-mamba","llama3","llama3","llama3","mistral","nllb","opt-instruct","qwen2.5","qwen2.5","qwen2.5","qwen2.5"]
models_sizes = [None,"7B","1B","3B","7B",None,"1B","3B","8B",None,None,None,"0.5B","1.5B","3B","7B"]
directions = ["en-de", "de-en",
              "en-cs", "cs-en",
              "en-is", "is-en",
              "en-zh", "zh-en",
              "en-ru", "ru-en"]

In [None]:
def extract_metrics_per_model_direction_dataset(model_name, model_size, direction, dataset_name, eval_dir=None):
    """
    Load the pickled raw evaluation of one model on one direction of one dataset.

    The file read is "<eval_dir>/raw_<dataset>_<model>[-<size>]_<direction>_red-<N>.pkl", where
    N is the reduce size associated with the dataset ("200" for flores, "100" for wnt23).

    Args:
        model_name: model family name used in the file name (e.g. "llama3").
        model_size: size suffix (e.g. "3B"), or None when the file name has no size.
        direction: translation direction, e.g. "en-de".
        dataset_name: "flores" or "wnt23".
        eval_dir: directory holding the evaluation files; defaults to the module-level
            `eval_directory`.

    Returns:
        The unpickled evaluation object.

    Raises:
        ValueError: if `dataset_name` is not a known dataset.
        FileNotFoundError: if the evaluation file does not exist.
    """
    reduce_size_per_dataset = {"flores": "200", "wnt23": "100"}
    # Validate first: the original code left the reduce size unbound for an unknown dataset
    # and crashed later with an unrelated-looking UnboundLocalError.
    if dataset_name not in reduce_size_per_dataset:
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; expected one of {sorted(reduce_size_per_dataset)}"
        )
    red_size = reduce_size_per_dataset[dataset_name]
    if eval_dir is None:
        eval_dir = eval_directory
    model_tag = model_name if model_size is None else f"{model_name}-{model_size}"
    eval_file_name = os.path.join(eval_dir, f"raw_{dataset_name}_{model_tag}_{direction}_red-{red_size}.pkl")

    # NOTE: pickle can execute arbitrary code on load; only open files produced by this project.
    with open(eval_file_name, "rb") as f:
        evaluation_pred = pickle.load(f)

    return evaluation_pred

def extract_metrics_per_direction_dataset(direction, dataset_name):
    """
    Build a table of mean metric scores for every benchmarked model.

    Iterates over the module-level `models` / `models_sizes` lists, loads each model's
    evaluation for `direction` on `dataset_name`, and keeps the "mean_score" entry of
    every metric.

    Args:
        direction: translation direction, e.g. "en-de".
        dataset_name: dataset identifier passed to `extract_metrics_per_model_direction_dataset`.

    Returns:
        A DataFrame with one row per model, indexed by "<model>" or "<model>-<size>", and
        one column per metric. The frame is empty when no model is configured.
    """
    rows = []
    for model_name, model_size in zip(models, models_sizes):
        eval_dict = extract_metrics_per_model_direction_dataset(model_name, model_size, direction, dataset_name)
        mean_scores = {metric: [scores["mean_score"]] for metric, scores in eval_dict.items()}
        row_label = model_name if model_size is None else f"{model_name}-{model_size}"
        rows.append(pd.DataFrame(mean_scores, index=[row_label]))

    # Concatenate once at the end: growing a frame inside the loop is quadratic, and
    # concatenating onto an initially empty DataFrame is deprecated in recent pandas.
    return pd.concat(rows) if rows else pd.DataFrame()

def load_evaluations_as_tables(output_directory="./results/evaluations_tables/"):
    """
    Export the evaluation results as CSV tables.

    For each dataset ("wnt23" and "flores"), writes one table per direction of the
    module-level `directions` list ("eval_<dataset>_<direction>_all-models.csv") and one
    table averaging each model's scores over all directions
    ("eval_<dataset>_all-directions.csv").

    Args:
        output_directory: directory receiving the CSV files; it is created if missing.
            A trailing slash is optional.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_directory, exist_ok=True)
    for dataset_name in ['wnt23','flores']:
        df_eval_list = []
        for direction in directions:
            evaluation_file = os.path.join(output_directory, f"eval_{dataset_name}_{direction}_all-models.csv")
            df_eval = extract_metrics_per_direction_dataset(direction, dataset_name)
            df_eval_list.append(df_eval)
            df_eval.to_csv(evaluation_file)
        # Rows sharing a model label across the per-direction tables are averaged together.
        df_eval_avg = pd.concat(df_eval_list).groupby(level=0).mean()
        df_eval_avg.to_csv(os.path.join(output_directory, f"eval_{dataset_name}_all-directions.csv"))

In [None]:
load_evaluations_as_tables()