In [114]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface

env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [115]:
from datasets import load_dataset
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import pyonmttok
import ctranslate2
from metrics import *

In [116]:
model_id = "projecte-aina/aguila-7b"
#model_id = "tiiuae/falcon-7b"
model_name = model_id.split('/')[1]
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")


Loading checkpoint shards: 100%|██████████| 15/15 [00:15<00:00,  1.05s/it]


In [117]:
## Lets Do the translation layer
from huggingface_hub import snapshot_download
print("Loading translator Models...")

ca_en_model_folder = snapshot_download(repo_id="projecte-aina/mt-aina-ca-en", revision="main")
tokenizer_ca_en = pyonmttok.Tokenizer(
    mode="none", sp_model_path=ca_en_model_folder + "/spm.model"
)
ca_en_model = ctranslate2.Translator(ca_en_model_folder, device="cuda")

Loading translator Models...


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 16539.05it/s]


In [118]:
def min_max_scaling(tensor):
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    scaled_tensor = (tensor - min_val) / (max_val - min_val)
    return scaled_tensor


def compute_probability(input_text, answer):
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_tokens = tokenizer(answer)['input_ids']
    answer_probability = 1
    with torch.no_grad():
        for token in answer_tokens:
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)

            # Logits are in the outputs, you can access the last token's logits like this:
            logits = outputs.logits[:, -1, :]
            #probabilities = min_max_scaling(logits)
            probabilities = torch.softmax(logits, dim=1)
            answer_probability *= probabilities[0][token]

            # Prepare input_ids for the next token prediction
            new_token = torch.tensor([[token]]).to(model.device)
            inputs = {'input_ids': torch.cat([inputs['input_ids'], new_token], dim=1),
                    'attention_mask': torch.cat([inputs['attention_mask'], torch.tensor([[1]]).to(model.device)], dim=1)}
            del new_token
        del inputs
    return answer_probability.item()


def run_inference(txt, num_tokens=20, stop_text='\n'):
    # Tokenize the input text
    inputs = tokenizer(txt, return_tensors="pt").to(model.device)
    # Calculate the total length of the output (input length + number of tokens to generate)
    max_length = len(inputs['input_ids'][0]) + num_tokens

    with torch.no_grad():
        # Generate tokens
        tokens = model.generate(**inputs, do_sample=True, top_k=50, eos_token_id=tokenizer.eos_token_id, max_length=max_length)

        # Decode the generated tokens into text
        generated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

        # Slice the generated text to exclude the input prompt
        generated_only = generated_text[len(tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)):]

        # If a stop text is found, truncate the output at its first occurrence
        if stop_text in generated_only:
            generated_only = generated_only.split(stop_text)[0]

        return generated_only.strip()


def translate(sample):
    def translate_to_english(txt):
        lines = txt.split("\n")
        toks, _ = tokenizer_ca_en.tokenize_batch(lines)
        translated = ca_en_model.translate_batch(toks)
        ts = []
        for t in translated:
            ts.append(tokenizer_ca_en.detokenize(t.hypotheses[0]))

        return "\n".join(ts)
    en_prompt = translate_to_english(sample['prompt'])
    en_answer = translate_to_english(sample['answer'])
    return {"prompt": en_prompt, "answer": en_answer}


def compute_metrics(sample):
    prob = compute_probability(sample['prompt'], sample['answer'])
    prediction = run_inference(sample['prompt'])
    f1 = f1_score(prediction, sample['answer'])
    return {"prediction": prediction, "prob": prob, "f1": f1}


In [119]:
viquiquad = load_dataset("data", data_files="viquiquad.csv", split="train")
viquiquad_en = viquiquad.map(translate)

Map:   0%|          | 0/2475 [00:00<?, ? examples/s]

Map: 100%|██████████| 2475/2475 [15:23<00:00,  2.68 examples/s]


In [120]:
results_ca = viquiquad.map(compute_metrics)
results_ca.to_pandas()

Map:  14%|█▎        | 337/2475 [11:20<1:09:01,  1.94s/ examples]

Map:  68%|██████▊   | 1686/2475 [55:11<25:03,  1.91s/ examples]  

In [None]:
results_en = viquiquad_en.map(compute_metrics)
results_en.to_pandas()

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  50%|█████     | 1/2 [00:07<00:07,  7.18s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map: 100%|██████████| 2/2 [00:19<00:00,  9.99s/ examples]


Unnamed: 0,prompt,answer,prediction,prob,f1
0,"During this period, the modern concept of phot...","Life, Paris-Match, Stern or Época","Epoclendor, Paris",2.733924e-15,0.0
1,"After his death, several events and exhibition...",in a black and white photography contest about...,photography about Barceló,6.810296e-09,0.363636


In [None]:
results_ca.to_csv(f"results/{model_name}-viquiquad-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-viquiquad-en.csv", index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 399.31ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 550.51ba/s]


2265