In [1]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface


env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [2]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import pyonmttok
import ctranslate2
from metrics import *


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
model_id = "projecte-aina/aguila-7b"
#model_id = "tiiuae/falcon-7b"
model_name = model_id.split('/')[1]
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [4]:
## Lets Do the translation layer
from huggingface_hub import snapshot_download
print("Loading translator Models...")

ca_en_model_folder = snapshot_download(repo_id="projecte-aina/mt-aina-ca-en", revision="main")
tokenizer_ca_en = pyonmttok.Tokenizer(
    mode="none", sp_model_path=ca_en_model_folder + "/spm.model"
)
ca_en_model = ctranslate2.Translator(ca_en_model_folder, device="cuda")


Loading translator Models...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [11]:
def min_max_scaling(tensor):
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    scaled_tensor = (tensor - min_val) / (max_val - min_val)
    return scaled_tensor


def compute_probability(input_text, answer):
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_tokens = tokenizer(answer)['input_ids']
    answer_probability = 0
    with torch.no_grad():
        for token in answer_tokens:
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)

            # Logits are in the outputs, you can access the last token's logits like this:
            logits = outputs.logits[:, -1, :]
            log_probs = torch.log_softmax(logits, dim=-1).cpu()
            # log_probs = torch.log(min_max_scaling(logits))
            answer_probability += log_probs[0][token]

            # Prepare input_ids for the next token prediction
            new_token = torch.tensor([[token]]).to(model.device)
            inputs = {'input_ids': torch.cat([inputs['input_ids'], new_token], dim=1),
                    'attention_mask': torch.cat([inputs['attention_mask'], torch.tensor([[1]]).to(model.device)], dim=1)}
    return torch.exp(answer_probability).item()
    # return answer_probability.item()


def run_inference(txt, num_tokens=20, stop_text='\n'):
    # Tokenize the input text
    tokens = tokenizer(txt, return_tensors="pt").to(model.device)['input_ids']
    # Calculate the total length of the output (input length + number of tokens to generate)

    generated_text = None

    with torch.no_grad():
        # Generate tokens
        for _ in range(num_tokens):
            max_length = len(tokens[0]) + 1
            tokens = model.generate(tokens, do_sample=True, top_k=1, eos_token_id=tokenizer.eos_token_id, max_length=max_length)

            # Decode the generated tokens into text
            generated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

            # If a stop text is found, truncate the output at its first occurrence
            if stop_text is not None:
                if generated_text[-len(stop_text):] == stop_text:
                    break

        generated_only = generated_text.replace(txt, "").strip()
        return generated_only


def translate(sample):
    def translate_to_english(txt):
        lines = [l for l in txt.split("\n") if l.strip() != ""]

        toks, _ = tokenizer_ca_en.tokenize_batch(lines)
        translated = ca_en_model.translate_batch(toks)
        ts = []
        for t in translated:
            t_str = tokenizer_ca_en.detokenize(t.hypotheses[0])
            # That is a bug on the translation outputing twice the translation.
            if len(txt.split(" ")) == 1 and len(t_str.split(" ")) == 2:
                t_str = t_str.split(" ")[0]
            ts.append(t_str)

        return "\n".join(ts)
    en_prompt = translate_to_english(sample['prompt'])
    en_answer = translate_to_english(sample['answer'])
    return {"prompt": en_prompt, "answer": en_answer}


def compute_metrics(sample):
    prob = compute_probability(sample['prompt'], sample['answer'])
    prediction = run_inference(sample['prompt'])
    f1 = f1_score(prediction, sample['answer'])
    bleu = calculate_bleu_score(prediction, sample['answer'])
    return {"prediction": prediction, "prob": prob, "f1": f1, "bleu": bleu}


In [6]:
catalanqa = load_dataset("data", data_files="catalanqa.csv", split="train[:10]")
catalanqa_en = catalanqa.map(translate)


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [12]:
results_ca = catalanqa.map(compute_metrics)
results_ca.to_pandas()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,prompt,answer,prediction,prob,f1,bleu
0,Barcelona en Comú renunciarà a un regidor en t...,en totes les votacions del ple de l'Ajuntament...,un,4.500151e-06,0.0,0.0
1,"Els braços reial, militar i eclesiàstic lliura...",prohibició d'acudir a la justícia eclesiàstica,la prohibició d'acudir a la justícia eclesiàstica,3.655441e-08,0.909091,0.8091067
2,El grup municipal de la CUP-Capgirem a l'Ajunt...,des del 2007,El grup municipal de la CUP-Capgirem a l'Ajunt...,0.0004730225,0.023077,2.658409e-79
3,El 1899 s'erigí una estàtua de Lesseps de més ...,bronze,bronze,0.000295639,1.0,1.821832e-231
4,El 25 de juliol de 1713 l'exèrcit borbònic —un...,franco-espanyol,el franco-espanyol,8.866191e-07,0.666667,1.531972e-231
5,"El 1972, el físic francès Francis Perrin desco...",235,urani 235,0.0006866455,0.666667,1.531972e-231
6,"Hi ha molts tipus de virus de les plantes, per...",en cèl·lules vegetals vives,en cèl·lules vegetals vives,0.0001792908,1.0,1.0
7,Confirmades les quatre primeres morts per coro...,gitana,"una era gitana, una altra era magribina, una a...",2.145767e-05,0.142857,9.257325e-232
8,El bloc independentista superaria el 50% dels ...,Vox,Ciutadans,0.0001688004,0.0,0.0
9,El retaule consta d'una predel·la amb quatre r...,amb pa d'or,amb pa d'or,0.003387451,1.0,1.221339e-77


In [13]:
results_en = catalanqa_en.map(compute_metrics)
results_en.to_pandas()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,prompt,answer,prediction,prob,f1,bleu
0,Barcelona en Comú will resign a councillor in ...,in all the votes of the plenary session of the...,when Joaquim Forn is not present in the plenary,1.349959e-20,0.333333,1.441453e-155
1,"The royal, military and ecclesiastical arms ha...",ban on going to ecclesiastical justice,The prohibition of going to ecclesiastical jus...,4.602043e-10,0.666667,0.4111336
2,The municipal group of the CUP-Capgirem in the...,since 2007,2007,2.43187e-05,0.666667,6.702145e-232
3,In 1899 a statue of Lesseps of more than 10 me...,bronze,bronze,3.314018e-05,1.0,1.821832e-231
4,"On July 25, 1713, the Bourbon army—about 25,00...",French-Spanish,the army of the Duke of Bourbon,1.722947e-08,0.0,0.0
5,"In 1972, French physicist Francis Perrin disco...",235,235U,0.001167297,0.0,0.0
6,"There are many types of plant viruses, but the...",in living plant cells,in living plant cells,0.0001316071,1.0,1.0
7,The first four deaths from coronavirus in Nort...,gypsy,Roma community,3.46452e-07,0.0,0.0
8,The pro-independence bloc would exceed 50% of ...,Vox & Vox,The independence blog,1.463718e-12,0.0,0.0
9,The altarpiece consists of a predella with fou...,with gold leaf,with gilded pacarriers,2.840534e-08,0.333333,1.384293e-231


In [None]:
results_ca.to_csv(f"results/{model_name}-viquiquad-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-viquiquad-en.csv", index=False)


In [10]:
results_ca_mean = results_ca.to_pandas()[['prob', 'f1', 'bleu']].mean()
results_en_mean = results_en.to_pandas()[['prob', 'f1', 'bleu']].mean()
print("==== CA =====")
print(results_ca_mean)
print("==== EN =====")
print(results_en_mean)


==== CA =====
prob    0.700586
f1      0.540836
bleu    0.180911
dtype: float64
==== EN =====
prob    0.532715
f1      0.400000
bleu    0.141113
dtype: float64


In [14]:
results_ca_mean = results_ca.to_pandas()[['prob', 'f1', 'bleu']].mean()
results_en_mean = results_en.to_pandas()[['prob', 'f1', 'bleu']].mean()
print("==== CA =====")
print(results_ca_mean)
print("==== EN =====")
print(results_en_mean)


==== CA =====
prob    0.000522
f1      0.540836
bleu    0.180911
dtype: float64
==== EN =====
prob    0.000136
f1      0.400000
bleu    0.141113
dtype: float64
