In [1]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface


env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [2]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from metrics import *


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
model_id = "projecte-aina/aguila-7b"
#model_id = "tiiuae/falcon-7b"
model_name = model_id.split('/')[1]
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [43]:
def min_max_scaling(tensor):
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    scaled_tensor = (tensor - min_val) / (max_val - min_val)
    return scaled_tensor


def compute_probability(input_text, answer):
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_tokens = tokenizer(answer)['input_ids']
    answer_probability = 0
    with torch.no_grad():
        for token in answer_tokens:
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)

            # Logits are in the outputs, you can access the last token's logits like this:
            logits = outputs.logits[:, -1, :]
            #log_probs = torch.log_softmax(logits, dim=-1).cpu()
            log_probs = torch.log(min_max_scaling(logits))
            answer_probability += log_probs[0][token]

            # Prepare input_ids for the next token prediction
            new_token = torch.tensor([[token]]).to(model.device)
            inputs = {'input_ids': torch.cat([inputs['input_ids'], new_token], dim=1),
                    'attention_mask': torch.cat([inputs['attention_mask'], torch.tensor([[1]]).to(model.device)], dim=1)}
    return torch.exp(answer_probability).item()
    # return answer_probability.item()


def run_inference(txt, num_tokens=20, stop_text='\n'):
    # Tokenize the input text
    tokens = tokenizer(txt, return_tensors="pt").to(model.device)['input_ids']
    # Calculate the total length of the output (input length + number of tokens to generate)

    generated_text = None

    with torch.no_grad():
        # Generate tokens
        for _ in range(num_tokens):
            max_length = len(tokens[0]) + 1
            tokens = model.generate(tokens, do_sample=True, top_k=1, eos_token_id=tokenizer.eos_token_id, max_length=max_length)

            # Decode the generated tokens into text
            generated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

            # If a stop text is found, truncate the output at its first occurrence
            if stop_text is not None:
                if generated_text[-len(stop_text):] == stop_text:
                    break

        generated_only = generated_text.replace(txt, "").strip()
        return generated_only


def translate(sample):
    def translate_to_english(txt):
        lines = txt.split("\n")
        toks, _ = tokenizer_ca_en.tokenize_batch(lines)
        translated = ca_en_model.translate_batch(toks)
        ts = []
        for t in translated:
            ts.append(tokenizer_ca_en.detokenize(t.hypotheses[0]))

        return "\n".join(ts)
    en_prompt = translate_to_english(sample['prompt'])
    en_answer = translate_to_english(sample['answer'])
    return {"prompt": en_prompt, "answer": en_answer}


def compute_metrics(sample):
    prob = compute_probability(sample['prompt'], sample['answer'])
    prediction = run_inference(sample['prompt'])
    f1 = f1_score(prediction, sample['answer'])
    bleu = calculate_bleu_score(prediction, sample['answer'])
    return {"prediction": prediction, "prob": prob, "f1": f1, "bleu": bleu}


In [57]:
xquad_ca = load_dataset("data", data_files="xquad_ca.csv", split="train")
xquad_en = load_dataset("data", data_files="xquad_en.csv", split="train")


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [49]:
results_ca = xquad_ca.map(compute_metrics)
results_ca.to_pandas()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,context,prompt,answer,prediction,prob,f1,bleu
0,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",militant extremista gihadista wahhabita/salafista,gihadista,0.578125,0.4,9.070367e-233
1,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",àrabs sunnites,Abu Bakr al-Baghdadi,0.558594,0.0,0.0
2,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",deu milions,10 milions de persones,0.644531,0.333333,1.28823e-231
3,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",reconeixement,Res,0.550781,0.0,0.0
4,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",califat,el califat,0.765625,0.666667,1.531972e-231
5,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",1756 fins a la signatura del tractat de pau el...,des de 1754 fins a 1760,0.636719,0.266667,3.6455259999999998e-155
6,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",sis anys,des de 1756 fins a 1763,0.652344,0.0,0.0
7,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",el 1760,1760,0.757812,0.666667,6.702145e-232
8,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",batalla de Jumonville Glen,La batalla de Jumonville Glen,0.730469,0.888889,0.6687403
9,"A causa de tenir cossos tous i gelatinosos, el...","A causa de tenir cossos tous i gelatinosos, el...",tenir cossos tous i gelatinosos,perquè els ctenòfors són extremament fràgils i...,0.511719,0.133333,1.0244910000000001e-231


In [None]:
results_ca.to_csv(f"results/{model_name}-xquad-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-xquad-en.csv", index=False)


In [62]:
results_ca_mean = results_ca.to_pandas()[['prob', 'f1', 'bleu']].mean()
results_en_mean = results_en.to_pandas()[['prob', 'f1', 'bleu']].mean()
print("==== CA =====")
print(results_ca_mean)
print("==== EN =====")
print(results_en_mean)


==== CA =====
prob    0.638672
f1      0.335556
bleu    0.066874
dtype: float64
==== EN =====
prob    0.705859
f1      0.516364
bleu    0.100000
dtype: float64
