In [None]:
import pickle
from tqdm import tqdm

import torch

from utils import (
    generate_sorted_affinity_index,
    generate_translation_several_models,
    load_model_benchmark,
    translate_batched_OPT,
    translate_batched_Llama3,
    get_sorted_affinity_index_path,
    get_closest_sentences,
    eval_metrics,
    make_parallel_plot,
    make_bar_plot,
    make_bar_plot_all_metrics
)

from utils.eval_params import num_beams, temperature, max_new_tokens, top_p
#num_beams = 5
#max_new_tokens = 512
#top_p = 0.9
#temperature = 0.6

# Use the first CUDA device when torch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Generate the sorted sentence-affinity indexes

In [None]:
# Run generate_sorted_affinity_index once per translation direction of the "wnt23" dataset:
# English paired with de / cs / is / zh / ru, in both directions.
dataset_name = "wnt23"
for direction in (
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
):
    print(f"Generating sorted indexes based on BERT embedding simmilarities for direction {direction} and dataset {dataset_name}")
    generate_sorted_affinity_index(direction, dataset_name)

In [None]:
# Same index generation as for the other dataset, this time for "flores":
# one call per direction, English paired with de / cs / is / zh / ru both ways.
dataset_name = "flores"
for direction in (
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
):
    print(f"Generating sorted indexes based on BERT embedding simmilarities for direction {direction} and dataset {dataset_name}")
    generate_sorted_affinity_index(direction, dataset_name)

# RAG for Causal LM

In [None]:
def get_input_tgt_rag_fn_CausalModel(number_examples, dataset_name):
    """Create the input/target builder for few-shot (RAG) prompts aimed at causal LMs.

    Args:
        number_examples: How many retrieved example pairs are placed in each prompt.
        dataset_name: Dataset identifier forwarded to ``get_sorted_affinity_index_path``
            to locate the pickled sorted affinity index.

    Returns:
        A function ``(dataset, source_lang, target_lang) -> (sources, inputs, targets)``.
    """
    def get_input_targets_rag_CausalModel(dataset, source_lang, target_lang):
        """Build one plain-text prompt per sentence of ``dataset[f"{source_lang}-{target_lang}"]``.

        Args:
            dataset: Object indexable by the direction string (e.g. ``"en-de"``); the value
                must support ``len``, integer indexing and iteration, and each item must be
                a mapping keyed by language code.
            source_lang: Source language code (a key of ``language_name`` below).
            target_lang: Target language code (a key of ``language_name`` below).

        Returns:
            ``(sources, inputs, targets)``: three lists of equal length holding the source
            sentences, the prompts and the reference translations.

        Raises:
            KeyError: If a language code is not in ``language_name``.
        """
        direction = f"{source_lang}-{target_lang}"
        sort_aff_idx_savepath = get_sorted_affinity_index_path(direction, dataset_name)
        # SECURITY: pickle.load can execute arbitrary code; only load index files
        # that were produced by this project.
        with open(sort_aff_idx_savepath, "rb") as f:
            sorted_affinity_index = pickle.load(f)

        # NOTE(review): "Islandic" is presumably meant to be "Icelandic". It is kept verbatim
        # because it is part of the prompt text; changing it alters the model inputs.
        language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Islandic", "zh": "Chinese", "cs": "Czech"}
        source_lang_name = language_name[source_lang]
        target_lang_name = language_name[target_lang]

        # Resolve the direction split once and reuse it everywhere.
        ds = dataset[direction]
        sources = [example[source_lang] for example in ds]

        # Prompt layout: a header, `number_examples` [START]...[END] example pairs,
        # then the sentence to translate with the target-language line left open.
        inputs = []
        # Iterate over the sentences of the direction split (not over `dataset` itself),
        # so that `inputs` always has the same length as `sources` and `targets`.
        for i in tqdm(range(len(ds))):
            examples = get_closest_sentences(number_examples, i, ds, sorted_affinity_index)
            inp = f"Here are examples of translations from {source_lang_name} to {target_lang_name}:"
            for n in range(number_examples):
                example_source, example_target = examples[n][source_lang], examples[n][target_lang]
                inp += f"[START]\n{source_lang_name}: {example_source} \n{target_lang_name}: {example_target}\n[END]"
            inp += f"\n Using the examples, translate from {source_lang_name} to {target_lang_name}:"
            input_source = ds[i][source_lang]
            inp += f"[START]\n{source_lang_name}: {input_source} \n{target_lang_name}:"
            inputs.append(inp)

        targets = [example[target_lang] for example in ds]
        return sources, inputs, targets
    return get_input_targets_rag_CausalModel


**Caution: the code runs with ALMA, but 16 GB of memory is not enough to run 7B models quantized to 8 bits, even with a single example.**

In [None]:
# Few-shot (RAG) translation run with the "opt-instruct" model on "wnt23".
dataset_name = "wnt23"
# English paired with de / cs / is / zh / ru, in both directions.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["opt-instruct"], [None]
batch_size, reduce_size = 1, 4

# Number of retrieved example pairs placed in each prompt.
number_examples = 1

# Use generate_translation_several_models here, not generate_translation_several_datasets:
# the RAG input generation does not support several datasets.
generate_translation_several_models(
    directions, dataset_name, model_names, model_sizes, batch_size, reduce_size,
    load_model_and_tokenizer_fn=load_model_benchmark,
    get_input_targets_fn=get_input_tgt_rag_fn_CausalModel(number_examples, dataset_name),
    tslt_fn=translate_batched_OPT,
    translation_folder=f"evaluationsRAG_{number_examples}examples",
)

# RAG for Instruct models

In [None]:
def get_input_tgt_rag_fn_Instruct(number_examples, dataset_name):
    """Create the input/target builder for few-shot (RAG) chat prompts aimed at instruct models.

    Args:
        number_examples: How many retrieved example pairs are placed in each prompt.
        dataset_name: Dataset identifier forwarded to ``get_sorted_affinity_index_path``
            to locate the pickled sorted affinity index.

    Returns:
        A function ``(dataset, source_lang, target_lang) -> (sources, inputs, targets)``.
    """
    def get_input_targets_rag_Instruct(dataset, source_lang, target_lang):
        """Build one chat conversation per sentence of ``dataset[f"{source_lang}-{target_lang}"]``.

        Args:
            dataset: Object indexable by the direction string (e.g. ``"en-de"``); the value
                must support ``len``, integer indexing and iteration, and each item must be
                a mapping keyed by language code.
            source_lang: Source language code (a key of ``language_name`` below).
            target_lang: Target language code (a key of ``language_name`` below).

        Returns:
            ``(sources, inputs, targets)``: three lists of equal length. Each entry of
            ``inputs`` is a two-message list (``system`` then ``user``) of
            ``{"role": ..., "content": ...}`` dicts.

        Raises:
            KeyError: If a language code is not in ``language_name``.
        """
        direction = f"{source_lang}-{target_lang}"
        sort_aff_idx_savepath = get_sorted_affinity_index_path(direction, dataset_name)
        # SECURITY: pickle.load can execute arbitrary code; only load index files
        # that were produced by this project.
        with open(sort_aff_idx_savepath, "rb") as f:
            sorted_affinity_index = pickle.load(f)

        # NOTE(review): "Islandic" is presumably meant to be "Icelandic". It is kept verbatim
        # because it is part of the prompt text; changing it alters the model inputs.
        language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Islandic", "zh": "Chinese", "cs": "Czech"}
        source_lang_name = language_name[source_lang]
        target_lang_name = language_name[target_lang]

        # Resolve the direction split once and reuse it everywhere.
        ds = dataset[direction]
        sources = [example[source_lang] for example in ds]

        # User-message layout: a header, `number_examples` numbered [EXAMPLE k] pairs,
        # then a [TASK] section with the target-language line left open.
        inputs = []
        # Iterate over the sentences of the direction split (not over `dataset` itself),
        # so that `inputs` always has the same length as `sources` and `targets`.
        for i in tqdm(range(len(ds))):
            examples = get_closest_sentences(number_examples, i, ds, sorted_affinity_index)
            inp = f"Here are examples of translations from {source_lang_name} to {target_lang_name}:"
            for n in range(number_examples):
                example_source, example_target = examples[n][source_lang], examples[n][target_lang]
                inp += f"\n[EXAMPLE {n+1}]\n{source_lang_name}: {example_source} \n{target_lang_name}: {example_target}"
            inp += f"\n Using the examples, translate from {source_lang_name} to {target_lang_name}:"
            input_source = ds[i][source_lang]
            inp += f"[TASK]\n{source_lang_name}: {input_source} \n{target_lang_name}:"
            inputs.append([
                {"role": "system", "content": "You are a translator, you output only the translation in the desired language."},
                {"role": "user", "content": inp},
            ])

        targets = [example[target_lang] for example in ds]
        return sources, inputs, targets
    return get_input_targets_rag_Instruct

In [None]:
# Few-shot (RAG) translation runs with llama3 (size "3B") on "wnt23".
dataset_name = "wnt23"
# English paired with de / cs / is / zh / ru, in both directions.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["llama3"], ["3B"]
batch_size, reduce_size = 1, 4

# One run per prompt size; more than 2 examples runs out of memory.
for number_examples in (1, 2):
    # Use generate_translation_several_models here, not generate_translation_several_datasets:
    # the RAG input generation does not support several datasets.
    generate_translation_several_models(
        directions, dataset_name, model_names, model_sizes, batch_size, reduce_size,
        load_model_and_tokenizer_fn=load_model_benchmark,
        get_input_targets_fn=get_input_tgt_rag_fn_Instruct(number_examples, dataset_name),
        tslt_fn=translate_batched_Llama3,
        translation_folder=f"evaluationsRAG_{number_examples}examples",
    )

In [None]:
# Sanity check: for each RAG run, open the saved en-de translation pickle and show its first entry.
for number_examples in (1, 2):
    with open(
        f"./generated_translations/evaluationsRAG_{number_examples}examples/wnt23_llama3-3B_en-de_red-50.pkl",
        "rb",
    ) as f:
        translations = pickle.load(f)
    print("First translation:", translations[0])

# Evaluation

In [None]:
# Evaluation settings for the metric computation below.
metric_names = [
    "rouge",
    "bleu",
    "sacrebleu",
    "chrf",
    "comet",
    "meteor",
    "bertscore",
]

# Keep a single dataset here: get_input_tgt_rag_fn_Instruct is built for one dataset
# name and would not be consistent across several.
dataset_names = ["wnt23"]
reduce_sizes = [50]

# English paired with de / cs / is / zh / ru, in both directions.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["llama3"], ["3B"]

In [None]:
# Score each RAG run (1 and 2 in-prompt examples) with the configured metrics.
for number_examples in (1, 2):
    eval_metrics(
        metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes,
        get_input_targets_fn=get_input_tgt_rag_fn_Instruct(number_examples, dataset_names[0]),
        translation_folder=f"evaluationsRAG_{number_examples}examples",
        additionnal_name=f"RAG_{number_examples}examples",
    )

In [None]:
# BLEURT is computed on its own: it is not offloaded from the GPU, so the kernel
# has to be restarted before running it.
metric_names = ["bleurt"]

# Keep a single dataset here: get_input_tgt_rag_fn_Instruct is built for one dataset
# name and would not be consistent across several.
dataset_names = ["wnt23"]
reduce_sizes = [50]

# English paired with de / cs / is / zh / ru, in both directions.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

model_names, model_sizes = ["llama3"], ["3B"]

In [None]:
# Score each RAG run (1 and 2 in-prompt examples) with the metrics currently in `metric_names`.
for number_examples in (1, 2):
    eval_metrics(
        metric_names, directions, dataset_names, model_names, model_sizes, reduce_sizes,
        get_input_targets_fn=get_input_tgt_rag_fn_Instruct(number_examples, dataset_names[0]),
        translation_folder=f"evaluationsRAG_{number_examples}examples",
        additionnal_name=f"RAG_{number_examples}examples",
    )

# Plot

In [None]:
# Parallel plot over all metrics for three llama3-3B result sets.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
    "BLEU", "SacreBLEU", "chrF", "chrF++",
    "COMET", "BLEURT", "BERTscore", "METEOR",
]

dataset_names = ["wnt23"]
reduce_sizes = [50]

# English paired with de / cs / is / zh / ru, in both directions.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# The three entries share model and size and differ only by `additionnal_names`.
# NOTE(review): None presumably selects the plain (non-RAG) results — confirm against utils.
model_names = ["llama3"] * 3
model_sizes = ["3B"] * 3
additionnal_names = [None] + [f"RAG_{k}examples" for k in (1, 2)]

make_parallel_plot(
    directions,
    model_names, model_sizes,
    dataset_names, reduce_sizes,
    metric_names,
    list_colors_per=["model"],
    additionnal_names=additionnal_names,
    savepath="./results/evaluations_figures/RAG_eval_parrallel_model",
)

In [None]:
# Bar plot over all metrics for three llama3-3B result sets.
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
    "BLEU", "SacreBLEU", "chrF", "chrF++",
    "COMET", "BLEURT", "BERTscore", "METEOR",
]

# This plot helper takes a single dataset name and a single reduce size (scalars, not lists).
dataset_name = "wnt23"
reduce_size = 50

# English paired with de / cs / is / zh / ru, in both directions.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# The three entries share model and size and differ only by `additionnal_names`.
# NOTE(review): None presumably selects the plain (non-RAG) results — confirm against utils.
model_names = ["llama3"] * 3
model_sizes = ["3B"] * 3
additionnal_names = [None] + [f"RAG_{k}examples" for k in (1, 2)]

make_bar_plot(
    directions,
    model_names, model_sizes,
    dataset_name, reduce_size,
    metric_names,
    additionnal_names=additionnal_names,
    width=0.2,
    cmap="rainbow",
    savepath="./results/evaluations_figures/barplot_all_models_RAG",
)

In [None]:
## Bar plot per direction
metric_names = [
    "ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum",
    "BLEU", "SacreBLEU", "chrF", "chrF++",
    "COMET", "BLEURT", "BERTscore", "METEOR",
]

# This plot helper takes a single dataset name and a single reduce size (scalars, not lists).
dataset_name = "wnt23"
reduce_size = 50

# English paired with de / cs / is / zh / ru, in both directions.
directions = [
    pair
    for lang in ("de", "cs", "is", "zh", "ru")
    for pair in (f"en-{lang}", f"{lang}-en")
]

# The three entries share model and size and differ only by `additionnal_names`.
# NOTE(review): None presumably selects the plain (non-RAG) results — confirm against utils.
model_names = ["llama3"] * 3
model_sizes = ["3B"] * 3
additionnal_names = [None] + [f"RAG_{k}examples" for k in (1, 2)]

make_bar_plot_all_metrics(
    directions, model_names, model_sizes, dataset_name, reduce_size, metric_names, additionnal_names,
    savepath="./results/evaluations_figures/RAG_barplot/barplot_all_metrics_ICL",
)