In [2]:
import torch
import numpy as np
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def load_bert(model_name='bert-base-multilingual-uncased'):
    """Load a pretrained BERT tokenizer together with its matching model.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls. Defaults to the multilingual uncased BERT checkpoint, so
            calling ``load_bert()`` with no argument behaves as before.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # A single name feeds both loaders so the tokenizer vocabulary can never
    # get out of sync with the model weights.
    tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
    model = transformers.BertModel.from_pretrained(model_name)
    return tokenizer, model

def predict_bert(tokenizer, model, sentence):
    """Tokenize ``sentence`` and run it through ``model`` without gradients.

    Args:
        tokenizer: Callable that turns text into keyword inputs for ``model``
            (called with ``return_tensors='pt'``).
        model: Callable invoked as ``model(**encoded_input)``.
        sentence: Text to encode.

    Returns:
        Whatever ``model`` returns for the encoded sentence.
    """
    with torch.no_grad():
        # truncation=True caps the input at the tokenizer's maximum length;
        # without it an over-long sentence is passed through untruncated and
        # fails inside the model instead of producing an embedding.
        encoded_input = tokenizer(sentence, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
    return output

def get_bert_embedding(tokenizer, model, sentence):
    """Return one embedding for ``sentence`` by averaging its token vectors.

    The first element of the ``predict_bert`` output is taken as the per-token
    states; the first and last positions (CLS and SEP) are dropped and the
    remaining positions are averaged.

    Returns:
        Tensor of shape [1, 768] (the last dimension follows the model).
        NOTE(review): a sentence that yields no tokens between CLS and SEP
        averages over an empty slice, which gives NaN - confirm callers never
        pass empty text.
    """
    token_states = predict_bert(tokenizer, model, sentence)[0]  # [1, ntokens, 768]
    inner_states = token_states[:, 1:-1, :]  # without CLS / SEP -> [1, ntokens-2, 768]
    return inner_states.mean(dim=1)  # [1, 768]

In [None]:
##  Use BERT to generate embeddings 
def get_sorted_affinity_index(list_sentence):
    """Rank, for every sentence, all sentences by embedding affinity.

    Each sentence is embedded with ``get_bert_embedding``; the affinity between
    two sentences is the dot product of their embeddings.

    Args:
        list_sentence: Sequence of sentences to compare with each other.

    Returns:
        A list with one entry per sentence. Entry ``i`` is the list of sentence
        indices ordered from highest to lowest affinity with sentence ``i``.
        The sentence's own index is always ranked last. An empty input gives
        an empty list.
    """
    if len(list_sentence) == 0:
        # torch.cat rejects an empty list; there is nothing to rank anyway,
        # so skip loading the model altogether.
        return []

    tokenizer, model = load_bert()
    bert_embeddings = torch.cat(
        [get_bert_embedding(tokenizer, model, sentence) for sentence in list_sentence],
        dim=0,
    )  # [n_sentences, emb_size]
    affinity = bert_embeddings @ bert_embeddings.T  # [n_sentences, n_sentences]

    # Suppress self-affinity. Writing -inf straight onto the diagonal avoids
    # the `eye * inf` formulation, whose off-diagonal entries are 0 * inf = NaN
    # and would corrupt every affinity value (torch.eye also needs an int size,
    # not a shape).
    affinity.fill_diagonal_(float("-inf"))

    # Row-wise descending sort: row i holds the neighbours of sentence i.
    return torch.argsort(affinity, dim=1, descending=True).tolist()

def get_closest_sentences(nb_sentences, sentence_idx, list_sentence, sorted_affinity_index):
    """Look up the first ``nb_sentences`` ranked neighbours of one sentence.

    Args:
        nb_sentences: Number of leading entries of the ranking to keep.
        sentence_idx: Row of ``sorted_affinity_index`` to read.
        list_sentence: Items to pick from; indexed by the ranked indices.
        sorted_affinity_index: Per-sentence lists of indices into
            ``list_sentence``.

    Returns:
        The selected items of ``list_sentence``, in ranking order. Fewer than
        ``nb_sentences`` items are returned when the ranking is shorter.
    """
    ranked_neighbours = sorted_affinity_index[sentence_idx]
    return [list_sentence[neighbour] for neighbour in ranked_neighbours[:nb_sentences]]

In [None]:
def reduce_dataset(inputs: list[str], sources: list[str], targets: list[str], final_nb: int) -> tuple[list[str], list[str], list[str]]:
    """
    Randomly select ``final_nb`` aligned samples of the evaluation corpus.

    The same positions are taken from ``inputs``, ``sources`` and ``targets``,
    so the three returned lists stay aligned with each other.

    Args:
        inputs: Model inputs; its length defines the population size.
        sources: Source sentences, indexed like ``inputs``.
        targets: Reference translations, indexed like ``inputs``.
        final_nb: Number of samples to return.

    Returns:
        ``(inputs, sources, targets)`` restricted to the sampled positions,
        each of length ``final_nb``.
    """
    # Fixed seed: the same subset is drawn on every call. Note that this
    # re-seeds NumPy's global random state.
    np.random.seed(42)
    population = len(inputs)
    # Sample without replacement so the reduced corpus has no duplicated
    # examples. Only when more samples are requested than exist is drawing
    # with replacement kept, since that is the only way to honour the size.
    idx = np.random.choice(population, final_nb, replace=final_nb > population)
    return [inputs[i] for i in idx], [sources[i] for i in idx], [targets[i] for i in idx]

def get_input_tgt_rag_fn(number_examples, sorted_affinity_index):
    """Create a prompt builder that prepends retrieved translation examples.

    Args:
        number_examples: Maximum number of retrieved example pairs per prompt.
        sorted_affinity_index: Per-sentence ranking of example indices, as
            consumed by ``get_closest_sentences``. Assumed to be computed over
            the same examples, in the same order, as the dataset later passed
            to the returned function - TODO confirm at the call site.

    Returns:
        A function ``(dataset, source_lang, target_lang)`` returning
        ``(sources, inputs, targets)``.
    """
    def get_input_targets_rag_ALMA(dataset, source_lang, target_lang):
        """Build one few-shot translation prompt per example of a language pair.

        Args:
            dataset: Object indexed by ``"<source_lang>-<target_lang>"`` that
                yields a sequence of dicts keyed by language code.
            source_lang: Source language code (e.g. ``"en"``).
            target_lang: Target language code (e.g. ``"de"``).

        Returns:
            ``(sources, inputs, targets)``: the source sentences, the prompts
            built for them and the reference translations, all aligned.

        Raises:
            KeyError: If a language code has no entry in the name table.
        """
        language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Icelandic", "zh": "Chinese", "cs": "Czech"}
        source_lang_name = language_name[source_lang]
        target_lang_name = language_name[target_lang]

        # Fetch the pair list once instead of re-indexing the dataset on every
        # loop iteration.
        pairs = list(dataset[f"{source_lang}-{target_lang}"])
        sources = [example[source_lang] for example in pairs]
        targets = [example[target_lang] for example in pairs]

        inputs = []
        # One prompt per example of the language pair (not per top-level entry
        # of `dataset`).
        for i, input_source in enumerate(sources):
            # Retrieve full pairs, not bare source strings: both sides of each
            # retrieved example are needed in the prompt.
            examples = get_closest_sentences(number_examples, i, pairs, sorted_affinity_index)
            inp = f"Here are examples of translations from {source_lang_name} to {target_lang_name}:"
            # Iterate over what was actually retrieved; there may be fewer
            # than `number_examples` neighbours available.
            for example in examples:
                example_source, example_target = example[source_lang], example[target_lang]
                inp += f"\n{source_lang_name}: {example_source} \n{target_lang_name}: {example_target} "
            inp += f"\n Using the examples, translate from {source_lang_name} to {target_lang_name}:"
            inp += f"\n{source_lang_name}: {input_source} \n{target_lang_name}:"
            inputs.append(inp)

        return sources, inputs, targets
    return get_input_targets_rag_ALMA
