In [None]:
import os

import pickle
from tqdm import tqdm

import numpy as np
import torch

from utils import (
    generate_translation_several_datasets,
    load_model_benchmark,
    translate_batched_OPT,
    translate_batched_Llama3,
    eval_metrics,
    make_parallel_plot,
    make_bar_plot,
    make_bar_plot_all_metrics
)

from utils.eval_params import num_beams, temperature, max_new_tokens, top_p
#num_beams = 5
#max_new_tokens = 512
#top_p = 0.9
#temperature = 0.6

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# For Causal LM

In [None]:
def get_input_tgt_icl_fn_CausalModel(number_examples):
    """Build a prompt generator for in-context-learning translation with causal LMs.

    Works for ALMA, OPT-instruct, BLOOMz, and any GPT model non instruct.

    Args:
        number_examples: number of demonstration pairs inserted in each prompt
            before the sentence to translate.

    Returns:
        A function ``(dataset, source_lang, target_lang)`` returning the tuple
        ``(sources, inputs, targets)`` where ``inputs[i]`` is the prompt string
        for ``sources[i]``.
    """
    def _pick_example_indices(query_index, pool_size):
        """Choose demonstration indices in [0, pool_size), never `query_index`."""
        if number_examples <= 0:
            return []
        # Remove the query up front instead of rejection-sampling: this always
        # terminates and can never leak the sentence to translate.
        candidates = np.delete(np.arange(pool_size), query_index)
        if candidates.size == 0:
            raise ValueError(
                "In-context examples require at least two sentence pairs in the dataset."
            )
        # Local generator seeded by the query index: reproducible, and the
        # global NumPy random state is left untouched.
        rng = np.random.default_rng(query_index)
        # Distinct demonstrations whenever the pool is large enough.
        with_replacement = number_examples > candidates.size
        chosen = rng.choice(candidates, number_examples, replace=with_replacement)
        return [int(k) for k in chosen]

    def get_input_targets_icl_CausalModel(dataset, source_lang, target_lang):
        """Return (sources, prompts, targets) for one translation direction.

        Raises:
            KeyError: if a language code or the direction is unknown.
            ValueError: if examples are requested but the direction holds a
                single sentence pair.
        """
        language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
        source_lang_name = language_name[source_lang]
        target_lang_name = language_name[target_lang]

        # Everything is driven by the sentence pairs of this direction, so
        # sources, inputs and targets always have the same length.
        pairs = dataset[f"{source_lang}-{target_lang}"]
        pool_size = len(pairs)
        sources = [example[source_lang] for example in pairs]
        targets = [example[target_lang] for example in pairs]

        inputs = []
        print("Generating prompts for In-Context learning...")
        for i in tqdm(range(pool_size)):
            inp = f"Here are examples of translations from {source_lang_name} to {target_lang_name}:"
            for n in _pick_example_indices(i, pool_size):
                example_source, example_target = pairs[n][source_lang], pairs[n][target_lang]
                inp += f"[START]\n{source_lang_name}: {example_source} \n{target_lang_name}: {example_target}\n[END]"
            inp += f"\n Using the examples, translate from {source_lang_name} to {target_lang_name}:"
            input_source = pairs[i][source_lang]
            inp += f"[START]\n{source_lang_name}: {input_source} \n{target_lang_name}:"
            inputs.append(inp)

        return sources, inputs, targets
    return get_input_targets_icl_CausalModel

**Careful: this code runs with ALMA, but 16GB of memory is not enough to run 7B models quantized to 8 bits with one in-context example...**

In [None]:
# Every language is translated both from and into English.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]
dataset_names = ["wnt23"]

model_names, model_sizes = ["opt-instruct"], [None]

batch_size, reduce_size = 1, 4

number_examples = 1

# Generate the one-shot translations with the plain causal-LM prompt format.
generate_translation_several_datasets(
    directions, dataset_names, model_names, model_sizes, batch_size, reduce_size,
    load_model_and_tokenizer_fn=load_model_benchmark,
    get_input_targets_fn=get_input_tgt_icl_fn_CausalModel(number_examples),
    tslt_fn=translate_batched_OPT,
    translation_folder=f"evaluationsICL_{number_examples}examples",
)

# For Instruct Causal LM

In [None]:
def get_input_tgt_icl_fn_Instruct(number_examples):
    """Build a chat-prompt generator for in-context-learning translation.

    Args:
        number_examples: number of demonstration pairs inserted in each prompt
            before the sentence to translate.

    Returns:
        A function ``(dataset, source_lang, target_lang)`` returning the tuple
        ``(sources, inputs, targets)`` where ``inputs[i]`` is a two-message chat
        (system + user) asking for the translation of ``sources[i]``.
    """
    def _pick_example_indices(query_index, pool_size):
        """Choose demonstration indices in [0, pool_size), never `query_index`."""
        if number_examples <= 0:
            return []
        # Remove the query up front instead of rejection-sampling: this always
        # terminates and can never leak the sentence to translate.
        candidates = np.delete(np.arange(pool_size), query_index)
        if candidates.size == 0:
            raise ValueError(
                "In-context examples require at least two sentence pairs in the dataset."
            )
        # Local generator seeded by the query index: reproducible, and the
        # global NumPy random state is left untouched.
        rng = np.random.default_rng(query_index)
        # Distinct demonstrations whenever the pool is large enough.
        with_replacement = number_examples > candidates.size
        chosen = rng.choice(candidates, number_examples, replace=with_replacement)
        return [int(k) for k in chosen]

    def get_input_targets_icl_Instruct(dataset, source_lang, target_lang):
        """
        Work at least for Qwen2.5 and Llama3

        Prompts are cached on disk under ./cache_perso. The cache file name only
        contains the direction and the number of examples, so a cached entry is
        reused only when its sources and targets match the current dataset;
        otherwise the prompts are regenerated and the file is overwritten.

        Raises:
            KeyError: if a language code or the direction is unknown.
            ValueError: if examples are requested but the direction holds a
                single sentence pair.
        """
        # Everything is driven by the sentence pairs of this direction, so
        # sources, inputs and targets always have the same length.
        pairs = dataset[f"{source_lang}-{target_lang}"]
        pool_size = len(pairs)
        sources = [example[source_lang] for example in pairs]
        targets = [example[target_lang] for example in pairs]

        cache_path = f"./cache_perso/ICL_{source_lang}-{target_lang}_{number_examples}"
        os.makedirs("./cache_perso", exist_ok=True)
        if os.path.exists(cache_path):
            # NOTE(security): pickle can execute arbitrary code on load; this is
            # only acceptable because the file is a cache written by this function.
            with open(cache_path, "rb") as f:
                cached = pickle.load(f)
            cache_is_valid = (
                isinstance(cached, dict)
                and "inputs" in cached
                and cached.get("sources") == sources
                and cached.get("targets") == targets
            )
            if cache_is_valid:
                print("Using cached data...")
                return cached["sources"], cached["inputs"], cached["targets"]
            print("Cached prompts do not match the current dataset, regenerating...")

        language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
        source_lang_name = language_name[source_lang]
        target_lang_name = language_name[target_lang]

        inputs = []
        print("Generating prompts for In-Context learning...")
        for i in tqdm(range(pool_size)):
            inp = f"Here are examples of translations from {source_lang_name} to {target_lang_name}:"
            for n, picked in enumerate(_pick_example_indices(i, pool_size)):
                example_source, example_target = pairs[picked][source_lang], pairs[picked][target_lang]
                inp += f"\n[EXAMPLE {n+1}]\n{source_lang_name}: {example_source} \n{target_lang_name}: {example_target}"
            inp += f"\n Using the examples, translate from {source_lang_name} to {target_lang_name}:"
            input_source = pairs[i][source_lang]
            inp += f"[TASK]\n{source_lang_name}: {input_source} \n{target_lang_name}:"
            inputs.append([
                {"role": "system", "content": "You are a translator, you output only the translation in the desired language."},
                {"role": "user", "content": inp}])

        with open(cache_path, "wb") as f:
            pickle.dump({"sources": sources, "inputs": inputs, "targets": targets}, f, pickle.HIGHEST_PROTOCOL)
        return sources, inputs, targets
    return get_input_targets_icl_Instruct

In [None]:
# Every language is translated both from and into English.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]
dataset_names = ["wnt23"]

model_names, model_sizes = ["llama3"], ["3B"]

batch_size, reduce_size = 1, 50

# One generation run per number of in-context examples; more than 4 is OOM.
for number_examples in range(1, 5):
    generate_translation_several_datasets(
        directions, dataset_names, model_names, model_sizes, batch_size, reduce_size,
        load_model_and_tokenizer_fn=load_model_benchmark,
        get_input_targets_fn=get_input_tgt_icl_fn_Instruct(number_examples),
        tslt_fn=translate_batched_Llama3,
        translation_folder=f"evaluationsICL_{number_examples}examples",
    )

In [None]:
# Every language is translated both from and into English.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]
dataset_names = ["flores"]

model_names, model_sizes = ["llama3"], ["3B"]

batch_size, reduce_size = 4, 100

# One generation run per number of in-context examples (1 to 5).
for number_examples in range(1, 6):
    generate_translation_several_datasets(
        directions, dataset_names, model_names, model_sizes, batch_size, reduce_size,
        load_model_and_tokenizer_fn=load_model_benchmark,
        get_input_targets_fn=get_input_tgt_icl_fn_Instruct(number_examples),
        tslt_fn=translate_batched_Llama3,
        translation_folder=f"evaluationsICL_{number_examples}examples",
    )

In [None]:
# Sanity check: show the first stored en-de translation of each wnt23 ICL run.
# The pickles are the notebook's own generated outputs (pickle must never be
# used on untrusted files).
for number_examples in range(1, 5):
    with open(
        f"./generated_translations/evaluationsICL_{number_examples}examples/wnt23_llama3-3B_en-de_red-50.pkl",
        mode="rb",
    ) as f:
        translations = pickle.load(f)
    print("First translation:", translations[0])

In [None]:
# Sanity check: show the first stored en-de translation of each flores ICL run.
# The pickles are the notebook's own generated outputs (pickle must never be
# used on untrusted files).
for number_examples in range(1, 6):
    with open(
        f"./generated_translations/evaluationsICL_{number_examples}examples/flores_llama3-3B_en-de_red-100.pkl",
        mode="rb",
    ) as f:
        translations = pickle.load(f)
    print("First translation:", translations[0])

# Evaluation

In [None]:
# Compute every metric except BLEURT for the wnt23 ICL runs.
metric_names = [
    "rouge",
    "bleu",
    "sacrebleu",
    "chrf",
    "comet",
    "meteor",
    "bertscore",
]

# Use one dataset at a time: the prompt cache file name used by
# get_input_tgt_icl_fn_Instruct does not include the dataset name.
dataset_names = ["wnt23"]
reduce_sizes = [50]

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["llama3"], ["3B"]

for number_examples in range(1, 5):
    eval_metrics(
        metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes,
        get_input_targets_fn=get_input_tgt_icl_fn_Instruct(number_examples),
        translation_folder=f"evaluationsICL_{number_examples}examples",
        additionnal_name=f"ICL_{number_examples}examples",
    )

In [None]:
# BLEURT is computed in its own cell because it is not offloaded from the GPU
# afterwards; the kernel needs to be restarted...
metric_names = ["bleurt"]

# Use one dataset at a time: the prompt cache file name used by
# get_input_tgt_icl_fn_Instruct does not include the dataset name.
dataset_names = ["wnt23"]
reduce_sizes = [50]

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["llama3"], ["3B"]

for number_examples in range(1, 5):
    eval_metrics(
        metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes,
        get_input_targets_fn=get_input_tgt_icl_fn_Instruct(number_examples),
        translation_folder=f"evaluationsICL_{number_examples}examples",
        additionnal_name=f"ICL_{number_examples}examples",
    )

In [None]:
# Compute every metric except BLEURT for the flores ICL runs.
metric_names = [
    "rouge",
    "bleu",
    "sacrebleu",
    "chrf",
    "comet",
    "meteor",
    "bertscore",
]

# Use one dataset at a time: the prompt cache file name used by
# get_input_tgt_icl_fn_Instruct does not include the dataset name.
dataset_names = ["flores"]
reduce_sizes = [100]

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["llama3"], ["3B"]

for number_examples in range(1, 6):
    eval_metrics(
        metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes,
        get_input_targets_fn=get_input_tgt_icl_fn_Instruct(number_examples),
        translation_folder=f"evaluationsICL_{number_examples}examples",
        additionnal_name=f"ICL_{number_examples}examples",
    )

In [None]:
# BLEURT is computed in its own cell because it is not offloaded from the GPU
# afterwards; the kernel needs to be restarted...
metric_names = ["bleurt"]

# Use one dataset at a time: the prompt cache file name used by
# get_input_tgt_icl_fn_Instruct does not include the dataset name.
dataset_names = ["flores"]
reduce_sizes = [100]

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["llama3"], ["3B"]

for number_examples in range(1, 6):
    eval_metrics(
        metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes,
        get_input_targets_fn=get_input_tgt_icl_fn_Instruct(number_examples),
        translation_folder=f"evaluationsICL_{number_examples}examples",
        additionnal_name=f"ICL_{number_examples}examples",
    )

# Plot

## WNT23

In [None]:
# Parallel-coordinates plot on wnt23: baseline Llama3-3B versus 1 to 4 ICL examples.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum", "BLEU", "SacreBLEU",
    "chrF", "chrF++", "COMET", "BLEURT", "BERTscore", "METEOR",
]

dataset_names = ["wnt23"]
reduce_sizes = [50]

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# One entry per plotted run: the same model every time, only the suffix changes
# (None is the run without an additional name).
model_names = ["llama3"] * 5
model_sizes = ["3B"] * 5
additionnal_names = [None] + [f"ICL_{k}examples" for k in range(1, 5)]

make_parallel_plot(
    directions,
    model_names, model_sizes,
    dataset_names, reduce_sizes,
    metric_names,
    list_colors_per=["model"],
    additionnal_names=additionnal_names,
    savepath="./results/evaluations_figures/ICL_RAG_parallelplot/ICL_eval_parrallel_model_wnt",
)

In [None]:
# Bar plot on wnt23: baseline Llama3-3B versus 1 to 4 ICL examples.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum", "BLEU", "SacreBLEU",
    "chrF", "chrF++", "COMET", "BLEURT", "BERTscore", "METEOR",
]

dataset_name = "wnt23"
reduce_size = 50

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# One entry per plotted run: the same model every time, only the suffix changes
# (None is the run without an additional name).
model_names = ["llama3"] * 5
model_sizes = ["3B"] * 5
additionnal_names = [None] + [f"ICL_{k}examples" for k in range(1, 5)]

make_bar_plot(
    directions,
    model_names, model_sizes,
    dataset_name, reduce_size,
    metric_names,
    additionnal_names=additionnal_names,
    width=0.15,
    cmap="rainbow",
    savepath="./results/evaluations_figures/ICL_barplot/barplot_all_models_ICL_wnt",
)

In [None]:
## Bar plot per direction
# wnt23: baseline Llama3-3B versus 1 to 4 ICL examples.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum", "BLEU", "SacreBLEU",
    "chrF", "chrF++", "COMET", "BLEURT", "BERTscore", "METEOR",
]

dataset_name = "wnt23"
reduce_size = 50

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# One entry per plotted run: the same model every time, only the suffix changes
# (None is the run without an additional name).
model_names = ["llama3"] * 5
model_sizes = ["3B"] * 5
additionnal_names = [None] + [f"ICL_{k}examples" for k in range(1, 5)]

make_bar_plot_all_metrics(
    directions, model_names, model_sizes, dataset_name, reduce_size, metric_names, additionnal_names,
    cmap="rainbow",
    savepath="./results/evaluations_figures/ICL_barplot/barplot_all_metrics_ICL_wnt",
)

## FLORES

In [None]:
# Parallel-coordinates plot on flores: baseline Llama3-3B versus 1 to 5 ICL examples.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum", "BLEU", "SacreBLEU",
    "chrF", "chrF++", "COMET", "BLEURT", "BERTscore", "METEOR",
]

dataset_names = ["flores"]
reduce_sizes = [100]

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# One entry per plotted run: the same model every time, only the suffix changes
# (None is the run without an additional name).
model_names = ["llama3"] * 6
model_sizes = ["3B"] * 6
additionnal_names = [None] + [f"ICL_{k}examples" for k in range(1, 6)]

make_parallel_plot(
    directions,
    model_names, model_sizes,
    dataset_names, reduce_sizes,
    metric_names,
    list_colors_per=["model"],
    additionnal_names=additionnal_names,
    savepath="./results/evaluations_figures/ICL_RAG_parallelplot/ICL_eval_parrallel_model_flores",
)

In [None]:
# Bar plot on flores: baseline Llama3-3B versus 1 to 4 ICL examples.
# NOTE(review): the 5-example run is evaluated for flores and shown in the
# parallel plot, but it is not listed here - confirm whether that is intended.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum", "BLEU", "SacreBLEU",
    "chrF", "chrF++", "COMET", "BLEURT", "BERTscore", "METEOR",
]

dataset_name = "flores"
reduce_size = 100

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# One entry per plotted run: the same model every time, only the suffix changes
# (None is the run without an additional name).
model_names = ["llama3"] * 5
model_sizes = ["3B"] * 5
additionnal_names = [None] + [f"ICL_{k}examples" for k in range(1, 5)]

make_bar_plot(
    directions,
    model_names, model_sizes,
    dataset_name, reduce_size,
    metric_names,
    additionnal_names=additionnal_names,
    width=0.15,
    cmap="rainbow",
    savepath="./results/evaluations_figures/ICL_barplot/barplot_all_models_ICL_flores",
)

In [None]:
## Bar plot per direction
# flores: baseline Llama3-3B versus 1 to 4 ICL examples.
# NOTE(review): the 5-example run is evaluated for flores but not listed here,
# and unlike the wnt23 version of this cell no cmap is passed - confirm both.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum", "BLEU", "SacreBLEU",
    "chrF", "chrF++", "COMET", "BLEURT", "BERTscore", "METEOR",
]

dataset_name = "flores"
reduce_size = 100

directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# One entry per plotted run: the same model every time, only the suffix changes
# (None is the run without an additional name).
model_names = ["llama3"] * 5
model_sizes = ["3B"] * 5
additionnal_names = [None] + [f"ICL_{k}examples" for k in range(1, 5)]

make_bar_plot_all_metrics(
    directions, model_names, model_sizes, dataset_name, reduce_size, metric_names, additionnal_names,
    savepath="./results/evaluations_figures/ICL_barplot/barplot_all_metrics_ICL_flores",
)