In [1]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface


env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [2]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from metrics import *


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
model_id = "projecte-aina/aguila-7b"
#model_id = "tiiuae/falcon-7b"
model_name = model_id.split('/')[1]
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [4]:
def min_max_scaling(tensor):
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    scaled_tensor = (tensor - min_val) / (max_val - min_val)
    return scaled_tensor


def compute_probability(input_text, answer):
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_tokens = tokenizer(answer)['input_ids']
    answer_probability = 0
    with torch.no_grad():
        for token in answer_tokens:
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)

            # Logits are in the outputs, you can access the last token's logits like this:
            logits = outputs.logits[:, -1, :]
            log_probs = torch.log(min_max_scaling(logits))
            answer_probability += log_probs[0][token]

            # Prepare input_ids for the next token prediction
            new_token = torch.tensor([[token]]).to(model.device)
            inputs = {'input_ids': torch.cat([inputs['input_ids'], new_token], dim=1),
                    'attention_mask': torch.cat([inputs['attention_mask'], torch.tensor([[1]]).to(model.device)], dim=1)}
    return torch.exp(answer_probability).item()
    # return answer_probability.item()


def run_inference(txt, num_tokens=20, stop_text='\n'):
    # Tokenize the input text
    tokens = tokenizer(txt, return_tensors="pt").to(model.device)['input_ids']
    # Calculate the total length of the output (input length + number of tokens to generate)

    generated_text = None

    with torch.no_grad():
        # Generate tokens
        for _ in range(num_tokens):
            max_length = len(tokens[0]) + 1
            tokens = model.generate(tokens, do_sample=True, top_k=1, eos_token_id=tokenizer.eos_token_id, max_length=max_length)

            # Decode the generated tokens into text
            generated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

            # If a stop text is found, truncate the output at its first occurrence
            if stop_text is not None:
                if generated_text[-len(stop_text):] == stop_text:
                    break

        generated_only = generated_text.replace(txt, "").strip()
        return generated_only

def compute_metrics(sample):
    prob = compute_probability(sample['prompt'], sample['answer'])
    prediction = run_inference(sample['prompt'])
    f1 = f1_score(prediction, sample['answer'])
    bleu = calculate_bleu_score(prediction, sample['answer'])
    return {"prediction": prediction, "prob": prob, "f1": f1, "bleu": bleu}


In [5]:
xquad_ca = load_dataset("data", data_files="xquad_ca.csv", split="train[:10]")
xquad_en = load_dataset("data", data_files="xquad_en.csv", split="train[:10]")


In [6]:
results_ca = xquad_ca.map(compute_metrics)
results_ca.to_pandas()




Map:   0%|          | 0/10 [00:00<?, ? examples/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,context,prompt,answer,prediction,prob,f1,bleu
0,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",militant extremista gihadista wahhabita/salafista,gihadista,0.578125,0.4,9.070367e-233
1,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",àrabs sunnites,Abu Bakr al-Baghdadi,0.558594,0.0,0.0
2,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",deu milions,10 milions de persones,0.644531,0.333333,1.28823e-231
3,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",reconeixement,Res,0.550781,0.0,0.0
4,"""L'Estat Islàmic"", anteriorment conegut com a ...","""L'Estat Islàmic"", anteriorment conegut com a ...",califat,el califat,0.765625,0.666667,1.531972e-231
5,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",1756 fins a la signatura del tractat de pau el...,des de 1754 fins a 1760,0.636719,0.266667,3.6455259999999998e-155
6,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",sis anys,des de 1756 fins a 1763,0.652344,0.0,0.0
7,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",el 1760,1760,0.757812,0.666667,6.702145e-232
8,"A Europa, el teatre nord-americà de la guerra ...","A Europa, el teatre nord-americà de la guerra ...",batalla de Jumonville Glen,La batalla de Jumonville Glen,0.730469,0.888889,0.6687403
9,"A causa de tenir cossos tous i gelatinosos, el...","A causa de tenir cossos tous i gelatinosos, el...",tenir cossos tous i gelatinosos,perquè els ctenòfors són extremament fràgils i...,0.511719,0.133333,1.0244910000000001e-231


In [7]:
results_en = xquad_en.map(compute_metrics)
results_en.to_pandas()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,context,prompt,answer,prediction,prob,f1,bleu
0,"""The Islamic State"", formerly known as the ""Is...","""The Islamic State"", formerly known as the ""Is...",Wahhabi/Salafi jihadist extremist militant,a jihadist group,0.699219,0.333333,9.918892e-232
1,"""The Islamic State"", formerly known as the ""Is...","""The Islamic State"", formerly known as the ""Is...",Sunni Arabs,Abu Bakr al-Baghdadi,0.683594,0.0,0.0
2,"""The Islamic State"", formerly known as the ""Is...","""The Islamic State"", formerly known as the ""Is...",ten million,10 million people,0.707031,0.4,1.384293e-231
3,"""The Islamic State"", formerly known as the ""Is...","""The Islamic State"", formerly known as the ""Is...",recognition,international recognition,0.679688,0.666667,1.531972e-231
4,"""The Islamic State"", formerly known as the ""Is...","""The Islamic State"", formerly known as the ""Is...",a caliphate,a caliphate,0.761719,1.0,1.491668e-154
5,"In Europe, the North American theater of the S...","In Europe, the North American theater of the S...",1756 to the signing of the peace treaty in 1763,1754 to 1760,0.609375,0.181818,1.342376e-232
6,"In Europe, the North American theater of the S...","In Europe, the North American theater of the S...",six years,1754 to 1760,0.644531,0.0,0.0
7,"In Europe, the North American theater of the S...","In Europe, the North American theater of the S...",1760,1760,0.789062,1.0,1.821832e-231
8,"In Europe, the North American theater of the S...","In Europe, the North American theater of the S...",Battle of Jumonville Glen,Battle of Jumonville Glen,0.75,1.0,1.0
9,"Because of their soft, gelatinous bodies, cten...","Because of their soft, gelatinous bodies, cten...","their soft, gelatinous bodies",because they are soft and gelatinous,0.734375,0.4,1.384293e-231


In [8]:
from pathlib import Path
Path("results").mkdir(parents=True, exist_ok=True)

results_ca.to_csv(f"results/{model_name}-xquad-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-xquad-en.csv", index=False)


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

18475

In [9]:
results_ca_mean = results_ca.to_pandas()[['prob', 'f1', 'bleu']].mean()
results_en_mean = results_en.to_pandas()[['prob', 'f1', 'bleu']].mean()
print("==== CA =====")
print(results_ca_mean)
print("==== EN =====")
print(results_en_mean)


==== CA =====
prob    0.638672
f1      0.335556
bleu    0.066874
dtype: float64
==== EN =====
prob    0.705859
f1      0.498182
bleu    0.100000
dtype: float64


In [10]:
del model
del tokenizer
torch.cuda.empty_cache()

model_id = "tiiuae/falcon-7b"
model_name = model_id.split('/')[1]
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")






Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
results_ca = xquad_ca.map(compute_metrics)
results_en = xquad_en.map(compute_metrics)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The atte

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The atte

In [12]:
from pathlib import Path
Path("results").mkdir(parents=True, exist_ok=True)

results_ca.to_csv(f"results/{model_name}-xquad-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-xquad-en.csv", index=False)


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

18429

In [13]:
results_ca_mean = results_ca.to_pandas()[['prob', 'f1', 'bleu']].mean()
results_en_mean = results_en.to_pandas()[['prob', 'f1', 'bleu']].mean()
print("==== CA =====")
print(results_ca_mean)
print("==== EN =====")
print(results_en_mean)


==== CA =====
prob     5.625000e-01
f1       3.333333e-01
bleu    5.487540e-156
dtype: float64
==== EN =====
prob    0.678320
f1      0.513333
bleu    0.051697
dtype: float64
