In [1]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface

env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [2]:
from datasets import load_dataset
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import pyonmttok
import ctranslate2
from metrics import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Hub id of the causal LM under evaluation; the commented line is an alternative model.
model_id = "projecte-aina/aguila-7b"
#model_id = "tiiuae/falcon-7b"

# The part after the organisation prefix is reused later to name the result files.
model_name = model_id.split('/')[1]

tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository - only keep it for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)


Loading checkpoint shards: 100%|██████████| 15/15 [00:10<00:00,  1.42it/s]


In [4]:
## Set up the Catalan -> English translation layer
from huggingface_hub import snapshot_download
print("Loading translator Models...")

# Download the translation model snapshot (or reuse the locally cached copy).
ca_en_model_folder = snapshot_download(repo_id="projecte-aina/mt-aina-ca-en", revision="main")
# Subword tokenizer backed by the SentencePiece model shipped in the snapshot.
# NOTE(review): mode="none" presumably leaves all segmentation to SentencePiece - confirm.
tokenizer_ca_en = pyonmttok.Tokenizer(
    mode="none", sp_model_path=ca_en_model_folder + "/spm.model"
)
# device="auto" lets CTranslate2 use a GPU when one is available and fall back to
# the CPU otherwise, instead of failing on machines without CUDA.
ca_en_model = ctranslate2.Translator(ca_en_model_folder, device="auto")

Loading translator Models...


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 53635.60it/s]


In [12]:
def min_max_scaling(tensor):
    """Linearly rescale ``tensor`` so its minimum maps to 0 and its maximum to 1.

    The minimum and maximum are taken over all elements of ``tensor``, not per
    row or per column.

    Args:
        tensor (torch.Tensor): Non-empty tensor to rescale.

    Returns:
        torch.Tensor: Tensor of the same shape with values in ``[0, 1]``. When
        every element is equal the result is all zeros rather than the NaNs
        that dividing 0 by 0 would produce.
    """
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    value_range = max_val - min_val
    # A constant input has zero range; divide by 1 instead so the result is 0, not NaN.
    safe_range = torch.where(value_range == 0, torch.ones_like(value_range), value_range)
    return (tensor - min_val) / safe_range


def compute_probability(input_text, answer):
    """Score how strongly the model prefers ``answer`` as a continuation of ``input_text``.

    The answer tokens are appended to the prompt and the model is run once over
    the whole sequence. For every answer token, the next-token logits at the
    preceding position are min-max scaled to ``[0, 1]`` and the scaled value of
    that token is multiplied into a running product.

    Note:
        Despite the name, the result is a heuristic score and not a probability:
        min-max scaled logits are not normalised to sum to 1.

    Args:
        input_text (str): Prompt text.
        answer (str): Reference answer whose tokens are scored.

    Returns:
        float: Product of the per-token scaled scores; ``1.0`` (the empty
        product) when ``answer`` tokenizes to nothing.

    Raises:
        ValueError: If ``input_text`` tokenizes to nothing, because there is
            then no position from which to predict the first answer token.
    """
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_tokens = tokenizer(answer)['input_ids']
    if not answer_tokens:
        # Nothing to score. Previously this path called .item() on the int 1 and crashed.
        return 1.0

    prompt_len = inputs['input_ids'].shape[1]
    if prompt_len == 0:
        raise ValueError("input_text produced no tokens; cannot score an answer without a prompt")

    answer_ids = torch.tensor([answer_tokens], device=model.device)
    input_ids = torch.cat([inputs['input_ids'], answer_ids], dim=1)
    attention_mask = torch.cat([inputs['attention_mask'], torch.ones_like(answer_ids)], dim=1)

    # In a causal LM the logits at position p depend only on tokens <= p, so one
    # pass over prompt + answer yields the same per-step logits (up to
    # floating-point rounding) as re-running the model after each appended token.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True).logits

    answer_probability = 1
    for offset, token in enumerate(answer_tokens):
        # The logits that predict answer token `offset` sit one position before it.
        step_logits = logits[:, prompt_len - 1 + offset, :]
        answer_probability *= min_max_scaling(step_logits)[0][token]
    return answer_probability.item()


def run_inference(txt, num_tokens=20, stop_text='\n'):
    """Generate a short greedy continuation of ``txt`` and return only the new text.

    Args:
        txt (str): Prompt passed to the model.
        num_tokens (int): Maximum number of new tokens to generate.
        stop_text (str | None): The continuation is cut at the first occurrence
            of this string. An empty string or ``None`` disables truncation.

    Returns:
        str: The generated continuation, truncated at ``stop_text`` when it is
        present, with surrounding whitespace stripped.
    """
    inputs = tokenizer(txt, return_tensors="pt").to(model.device)
    prompt_len = inputs['input_ids'].shape[1]

    with torch.no_grad():
        # Greedy decoding: always picks the single most likely token, which is
        # what sampling with top_k=1 amounted to. max_new_tokens bounds the
        # continuation directly instead of via prompt length + num_tokens.
        tokens = model.generate(
            **inputs,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=num_tokens,
        )

    # Drop the prompt by token count, so the cut does not depend on the decoded
    # prompt text having a particular character length.
    generated_only = tokenizer.decode(tokens[0][prompt_len:], skip_special_tokens=True)

    # Guard against ""/None: str.split("") raises ValueError and `None in str` raises TypeError.
    if stop_text and stop_text in generated_only:
        generated_only = generated_only.split(stop_text)[0]

    return generated_only.strip()


def translate(sample):
    """Translate the ``prompt`` and ``answer`` fields of ``sample`` with the ca->en model.

    Args:
        sample: Mapping with string values under ``'prompt'`` and ``'answer'``.

    Returns:
        dict: ``{"prompt": ..., "answer": ...}`` holding the translated texts.
    """
    def translate_to_english(txt):
        # The translator takes a batch of lines: split on newlines, translate
        # every line, then stitch the best hypothesis of each back together.
        source_lines = txt.split("\n")
        token_batches, _ = tokenizer_ca_en.tokenize_batch(source_lines)
        results = ca_en_model.translate_batch(token_batches)
        return "\n".join(
            tokenizer_ca_en.detokenize(result.hypotheses[0]) for result in results
        )

    return {
        "prompt": translate_to_english(sample['prompt']),
        "answer": translate_to_english(sample['answer']),
    }


def compute_metrics(sample):
    """Evaluate the model on one example and return the new columns for it.

    Scores the reference answer with ``compute_probability``, generates a
    prediction with ``run_inference``, and compares that prediction to the
    reference with ``f1_score`` and ``calculate_bleu_score``.

    Args:
        sample: Mapping with ``'prompt'`` and ``'answer'`` entries.

    Returns:
        dict: Keys ``prediction``, ``prob``, ``f1`` and ``bleu``.
    """
    prompt, reference = sample['prompt'], sample['answer']

    prob = compute_probability(prompt, reference)
    prediction = run_inference(prompt)

    return {
        "prediction": prediction,
        "prob": prob,
        "f1": f1_score(prediction, reference),
        "bleu": calculate_bleu_score(prediction, reference),
    }


In [13]:
# Load only the first 10 rows of each language's CSV from the local "data" directory.
# NOTE(review): in the result tables shown further down, the `answer` on a row
# belongs to a different passage than that row's `prompt` (e.g. a Tesla prompt
# paired with an Islamic State answer), and every f1/bleu is 0. Verify that the
# CSV columns are row-aligned before trusting these metrics.
xquad_ca = load_dataset(
    "data",
    data_files="xquad_ca.csv",
    split="train[:10]",
)
xquad_en = load_dataset(
    "data",
    data_files="xquad_en.csv",
    split="train[:10]",
)

In [14]:
# Run compute_metrics on every Catalan example; the keys it returns
# (prediction, prob, f1, bleu) become new columns next to the original ones.
results_ca = xquad_ca.map(compute_metrics)
# Trailing bare expression so the notebook renders the result table.
results_ca.to_pandas()

Map: 100%|██████████| 10/10 [00:18<00:00,  1.81s/ examples]


Unnamed: 0,context,prompt,answer,prediction,prob,f1,bleu
0,"""L'Estat Islàmic"", anteriorment conegut com a ...","Després de deixar la companyia d'Edison, Tesla...",militant extremista gihadista wahhabita/salafista,----,0.207031,0,0
1,"""L'Estat Islàmic"", anteriorment conegut com a ...",Els himnes de Luter eren sovint evocats per co...,àrabs sunnites,----,0.535156,0,0
2,"""L'Estat Islàmic"", anteriorment conegut com a ...","L'emperador Gegeen Kan, fill i successor d'Ayu...",deu milions,----,0.400391,0,0
3,"""L'Estat Islàmic"", anteriorment conegut com a ...",La defensa dels Panthers va cedir només 308 pu...,reconeixement,----,0.492188,0,0
4,"""L'Estat Islàmic"", anteriorment conegut com a ...","Posteriorment, es va trobar un tros de paper e...",califat,----,0.419922,0,0
5,"A Europa, el teatre nord-americà de la guerra ...",Els paleoclimatòlegs mesuren la proporció d’ox...,1756 fins a la signatura del tractat de pau el...,----,0.361328,0,0
6,"A Europa, el teatre nord-americà de la guerra ...",La defensa dels Panthers va cedir només 308 pu...,sis anys,----,0.382812,0,0
7,"A Europa, el teatre nord-americà de la guerra ...","El 1466, potser 40.000 persones van morir a Pa...",el 1760,----,0.488281,0,0
8,"A Europa, el teatre nord-americà de la guerra ...",Peyton Manning es va convertir en el primer qu...,batalla de Jumonville Glen,----,0.462891,0,0
9,"A causa de tenir cossos tous i gelatinosos, el...","El 1891, el químic escocès James Dewar va acon...",tenir cossos tous i gelatinosos,----,0.304688,0,0


In [15]:
# Run compute_metrics on every English example; the keys it returns
# (prediction, prob, f1, bleu) become new columns next to the original ones.
results_en = xquad_en.map(compute_metrics)
# Trailing bare expression so the notebook renders the result table.
results_en.to_pandas()

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map: 100%|██████████| 10/10 [00:19<00:00,  1.92s/ examples]


Unnamed: 0,context,prompt,answer,prediction,prob,f1,bleu
0,After leaving Edison's company Tesla partnered...,After leaving Edison's company Tesla partnered...,Wahhabi/Salafi jihadist extremist militant,----,0.251953,0,0
1,Luther's hymns were frequently evoked by parti...,Luther's hymns were frequently evoked by parti...,Sunni Arabs,----,0.390625,0,0
2,"Emperor Gegeen Khan, Ayurbarwada's son and suc...","Emperor Gegeen Khan, Ayurbarwada's son and suc...",ten million,----,0.386719,0,0
3,"The Panthers defense gave up just 308 points, ...","The Panthers defense gave up just 308 points, ...",recognition,----,0.419922,0,0
4,A piece of paper was later found on which Luth...,A piece of paper was later found on which Luth...,a caliphate,----,0.447266,0,0
5,Paleoclimatologists measure the ratio of oxyge...,Paleoclimatologists measure the ratio of oxyge...,1756 to the signing of the peace treaty in 1763,----,0.235352,0,0
6,"The Panthers defense gave up just 308 points, ...","The Panthers defense gave up just 308 points, ...",six years,----,0.414062,0,0
7,"In 1466, perhaps 40,000 people died of the pla...","In 1466, perhaps 40,000 people died of the pla...",1760,----,0.53125,0,0
8,Peyton Manning became the first quarterback ev...,Peyton Manning became the first quarterback ev...,Battle of Jumonville Glen,----,0.582031,0,0
9,In 1891 Scottish chemist James Dewar was able ...,In 1891 Scottish chemist James Dewar was able ...,"their soft, gelatinous bodies",----,0.339844,0,0


In [16]:
# Write the per-example results to one CSV per language, named after the model.
# Create the output folder first so a fresh run does not fail on a missing "results/".
import os
os.makedirs("results", exist_ok=True)
results_ca.to_csv(f"results/{model_name}-xquad-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-xquad-en.csv", index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 234.94ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 294.46ba/s]


18607

In [17]:
# Average each metric over the evaluated examples, separately for each language.
results_ca_mean = results_ca.to_pandas().loc[:, ['prob', 'f1', 'bleu']].mean()
results_en_mean = results_en.to_pandas().loc[:, ['prob', 'f1', 'bleu']].mean()
# Catalan summary first, then English, one after the other.
print(results_ca_mean, results_en_mean, sep="\n")

prob    0.405469
f1      0.000000
bleu    0.000000
dtype: float64
prob    0.399902
f1      0.000000
bleu    0.000000
dtype: float64
