In [1]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface

env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [2]:
from datasets import load_dataset
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import pyonmttok
import ctranslate2
from metrics import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer and the causal language model identified by `model_id`.
# Both objects are used as notebook-level globals by the helper functions below.
model_id = "projecte-aina/aguila-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Weights are loaded in bfloat16 and placed automatically via device_map="auto".
# NOTE(review): trust_remote_code=True allows Python code shipped with the model
# repository to run locally -- consider pinning a `revision` that has been audited.
model = AutoModelForCausalLM.from_pretrained(model_id, 
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")



Loading checkpoint shards: 100%|██████████| 15/15 [00:05<00:00,  2.59it/s]


In [4]:
## Let's set up the Catalan -> English translation layer
from huggingface_hub import snapshot_download
print("Loading translator Models...")

# Fetch the translation model repository and get the path of the local copy.
ca_en_model_folder = snapshot_download(repo_id="projecte-aina/mt-aina-ca-en", revision="main")
# The SentencePiece model shipped in the repository does the segmentation;
# mode="none" asks pyonmttok not to apply its own tokenization rules on top.
tokenizer_ca_en = pyonmttok.Tokenizer(
    mode="none", sp_model_path=ca_en_model_folder + "/spm.model"
)
# device="auto" selects CUDA when it is available and falls back to CPU
# otherwise, instead of failing outright on a machine without a GPU.
ca_en_model = ctranslate2.Translator(ca_en_model_folder, device="auto")

Loading translator Models...


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 36856.80it/s]


In [13]:
def min_max_scaling(tensor):
    """Linearly rescale ``tensor`` so its minimum maps to 0 and its maximum to 1.

    The minimum and maximum are taken over the whole tensor, not per row.

    Args:
        tensor: A non-empty ``torch.Tensor``.

    Returns:
        torch.Tensor: Tensor of the same shape with values in ``[0, 1]``. When
        every element is identical (zero range) the result is all zeros
        instead of the NaNs a plain ``0 / 0`` division would produce.
    """
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    value_range = max_val - min_val
    # With a zero range the numerator is already all zeros, so dividing by 1
    # yields zeros and avoids 0/0 -> NaN without a data-dependent branch.
    safe_range = torch.where(value_range == 0, torch.ones_like(value_range), value_range)
    return (tensor - min_val) / safe_range
    
def compute_probability(input_text, answer):
    """Score how strongly the model favours ``answer`` as a continuation of ``input_text``.

    The answer is tokenized and fed to the model one token at a time (teacher
    forcing). At each step the next-token logits are min-max scaled to
    ``[0, 1]`` and the scaled value of the actual answer token is multiplied
    into a running product.

    NOTE(review): the per-token factor is a min-max-scaled logit, not a softmax
    probability, so the result is a relative score rather than a calibrated
    likelihood -- confirm this is the metric that is wanted.

    Args:
        input_text: Prompt the answer is conditioned on.
        answer: Reference answer text to score.

    Returns:
        float: Product of the per-token scaled scores. ``1.0`` (the empty
        product) when ``answer`` tokenizes to no tokens; previously that case
        raised ``AttributeError`` because ``.item()`` was called on the int 1.
    """
    # NOTE(review): assumes the tokenizer adds no special tokens (BOS/EOS) to
    # the answer ids -- verify, otherwise those tokens are scored as well.
    answer_tokens = tokenizer(answer)['input_ids']
    if not answer_tokens:
        return 1.0

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_probability = 1
    with torch.no_grad():
        for token in answer_tokens:
            # Only the logits are needed; hidden states are not requested
            # because they were never used and only cost memory.
            outputs = model(**inputs, return_dict=True)

            # Logits for the position right after the current prefix.
            logits = outputs.logits[:, -1, :]
            probabilities = min_max_scaling(logits)
            answer_probability *= probabilities[0][token]

            # Append the reference token so the next step is conditioned on it.
            new_token = torch.tensor([[token]], device=inputs['input_ids'].device)
            inputs = {
                'input_ids': torch.cat([inputs['input_ids'], new_token], dim=1),
                'attention_mask': torch.cat(
                    [inputs['attention_mask'], torch.ones_like(new_token)], dim=1
                ),
            }
    return answer_probability.item()

def run_inference(txt, num_tokens=20, stop_text='\n'):
    """Generate a continuation of ``txt`` and return only the newly generated text.

    Tokens are produced one at a time so the stop condition can be checked
    after each step. Compared with the previous implementation:

    * the running token ids are kept instead of decoding and re-tokenizing the
      text every step (that round trip is not guaranteed to be lossless);
    * generation stops when the model emits the EOS token, which is dropped
      from the result (previously it was ignored and generation continued);
    * the prompt is removed by slicing off its token positions rather than by
      ``str.replace``, which fails if decoding does not reproduce ``txt``
      exactly and also deletes repeats of the prompt inside the continuation;
    * ``num_tokens <= 0`` returns ``""`` instead of hitting an unbound variable.

    Args:
        txt: Prompt text.
        num_tokens: Maximum number of tokens to generate.
        stop_text: Generation stops once the decoded text ends with this
            string (the string itself stays in the result). An empty string
            disables the check.

    Returns:
        str: The decoded continuation, without the prompt.
    """
    if num_tokens <= 0:
        return ""

    encoded = tokenizer(txt, return_tensors="pt").to(model.device)
    sequence = encoded["input_ids"]
    prompt_len = sequence.shape[1]
    eos_id = tokenizer.eos_token_id

    with torch.no_grad():
        for _ in range(num_tokens):
            # top_k=1 restricts sampling to the single highest-scoring token,
            # so this is effectively greedy decoding.
            sequence = model.generate(
                input_ids=sequence,
                # Single unpadded sequence: every position is attended to.
                attention_mask=torch.ones_like(sequence),
                do_sample=True,
                top_k=1,
                eos_token_id=eos_id,
                max_new_tokens=1,
            )
            if eos_id is not None and sequence[0, -1].item() == eos_id:
                # Drop the EOS marker so it does not leak into the prediction.
                sequence = sequence[:, :-1]
                break
            # Guard the empty stop string: ``str.endswith("")`` is always True.
            if stop_text and tokenizer.decode(sequence[0]).endswith(stop_text):
                break

    return tokenizer.decode(sequence[0, prompt_len:])

def translate_to_english(txt):
    """Translate ``txt`` line by line with the CA->EN model and rejoin the lines.

    The text is split on newlines, each line is tokenized with
    ``tokenizer_ca_en``, the batch is translated by ``ca_en_model``, and the
    first hypothesis of every line is detokenized.

    Args:
        txt: Source text; may span several lines.

    Returns:
        str: Translated lines joined with newline characters.
    """
    source_lines = txt.split("\n")
    token_batches, _features = tokenizer_ca_en.tokenize_batch(source_lines)
    results = ca_en_model.translate_batch(token_batches)
    # Keep only the top hypothesis for each input line.
    english_lines = [
        tokenizer_ca_en.detokenize(result.hypotheses[0]) for result in results
    ]
    return "\n".join(english_lines)

In [36]:
# Load the benchmark rows from viquiquad.csv as a single "train" split.
# The evaluation code below reads the 'prompt' and 'answer' columns of each row.
# NOTE(review): "benchmarks" is presumably a local directory -- confirm the path.
viquiquad = load_dataset("benchmarks", data_files="viquiquad.csv", split="train")

In [37]:
def compute_metrics(sample):
    """Evaluate the model on one benchmark row.

    Args:
        sample: Row with a 'prompt' and an 'answer' field.

    Returns:
        dict: ``prediction`` (generated continuation), ``acc`` (score of the
        reference answer from ``compute_probability``) and ``f1`` (result of
        ``f1_score`` on the prediction and the reference answer).
    """
    prompt, reference = sample['prompt'], sample['answer']
    reference_score = compute_probability(prompt, reference)
    generated = run_inference(prompt)
    return {
        "prediction": generated,
        "acc": reference_score,
        "f1": f1_score(generated, reference),
    }

results = viquiquad.map(compute_metrics)

Map: 100%|██████████| 2/2 [00:03<00:00,  1.60s/ examples]


In [38]:
def translate(sample):
    """Return the English versions of a row's 'prompt' and 'answer' fields.

    Args:
        sample: Row with a 'prompt' and an 'answer' field.

    Returns:
        dict: The translated text under the same two keys, 'prompt' and 'answer'.
    """
    translated_fields = {}
    for field in ("prompt", "answer"):
        translated_fields[field] = translate_to_english(sample[field])
    return translated_fields

viquiquad_en = viquiquad.map(translate)

Map: 100%|██████████| 2/2 [00:00<00:00,  3.21 examples/s]


In [39]:
# Run the same evaluation on the English-translated prompts and answers.
results_en = viquiquad_en.map(compute_metrics)

Map: 100%|██████████| 2/2 [00:03<00:00,  1.81s/ examples]


In [41]:
# Write both result sets (original and English-translated) to CSV files.
results.to_csv("viquiquad_results.csv", index=False)
results_en.to_csv("viquiquad_results_en.csv", index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 294.75ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 296.73ba/s]


2289