In [1]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface


env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [2]:
from datasets import load_dataset
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import pyonmttok
import ctranslate2
from metrics import *


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
model_id = "projecte-aina/aguila-7b"
#model_id = "tiiuae/falcon-7b"
model_name = model_id.split('/')[1]

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")

## Lets Do the translation layer
from huggingface_hub import snapshot_download
print("Loading translator Models...")

ca_en_model_folder = snapshot_download(repo_id="projecte-aina/mt-aina-ca-en", revision="main")
tokenizer_ca_en = pyonmttok.Tokenizer(
    mode="none", sp_model_path=ca_en_model_folder + "/spm.model"
)
ca_en_model = ctranslate2.Translator(ca_en_model_folder, device="cuda")


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Loading translator Models...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
def min_max_scaling(tensor):
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    scaled_tensor = (tensor - min_val) / (max_val - min_val)
    return scaled_tensor


def compute_probability(input_text, answer):
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_tokens = tokenizer(answer)['input_ids']
    answer_probability = 0
    with torch.no_grad():
        for token in answer_tokens:
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)

            # Logits are in the outputs, you can access the last token's logits like this:
            logits = outputs.logits[:, -1, :]
            log_probs = torch.log(min_max_scaling(logits))
            answer_probability += log_probs[0][token]

            # Prepare input_ids for the next token prediction
            new_token = torch.tensor([[token]]).to(model.device)
            inputs = {'input_ids': torch.cat([inputs['input_ids'], new_token], dim=1),
                    'attention_mask': torch.cat([inputs['attention_mask'], torch.tensor([[1]]).to(model.device)], dim=1)}
    return torch.exp(answer_probability).item()
    # return answer_probability.item()


def run_inference(txt, num_tokens=20, stop_text='\n'):
    # Tokenize the input text
    tokens = tokenizer(txt, return_tensors="pt").to(model.device)['input_ids']
    # Calculate the total length of the output (input length + number of tokens to generate)

    generated_text = None

    with torch.no_grad():
        # Generate tokens
        for _ in range(num_tokens):
            max_length = len(tokens[0]) + 1
            tokens = model.generate(tokens, do_sample=True, top_k=1, eos_token_id=tokenizer.eos_token_id, max_length=max_length)

            # Decode the generated tokens into text
            generated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

            # If a stop text is found, truncate the output at its first occurrence
            if stop_text is not None:
                if generated_text[-len(stop_text):] == stop_text:
                    break

        generated_only = generated_text.replace(txt, "").strip()
        return generated_only


def translate(sample):
    def translate_to_english(txt):
        lines = [l for l in txt.split("\n") if l.strip() != ""]

        toks, _ = tokenizer_ca_en.tokenize_batch(lines)
        translated = ca_en_model.translate_batch(toks)
        ts = []
        for t in translated:
            t_str = tokenizer_ca_en.detokenize(t.hypotheses[0])
            # That is a bug on the translation outputing twice the translation.
            if len(txt.split(" ")) == 1 and len(t_str.split(" ")) == 2:
                t_str = t_str.split(" ")[0]
            ts.append(t_str)

        return "\n".join(ts)
    en_prompt = translate_to_english(sample['prompt'])
    en_answer = translate_to_english(sample['answer'])
    return {"prompt": en_prompt, "answer": en_answer}


def compute_metrics(sample):
    prob = compute_probability(sample['prompt'], sample['answer'])
    prediction = run_inference(sample['prompt'])
    f1 = f1_score(prediction, sample['answer'])
    bleu = calculate_bleu_score(prediction, sample['answer'])
    return {"prediction": prediction, "prob": prob, "f1": f1, "bleu": bleu}


In [6]:
teca = load_dataset("data", data_files="teca.csv", split="train[:10]")
teca_en = teca.map(translate)


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [17]:
def eval(entry):
    prediction = run_inference(entry['prompt'], num_tokens=20)
    return { 'prediction': prediction, 'results': prediction == str(entry['answer']) }

results_ca = teca.map(eval)
results_ca.to_pandas()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Unnamed: 0,prompt,answer,numeric_label,prediction,results
0,"premissa: Nosaltres, per exemple, no estem d'a...",implicació,0,realitat,False
1,premissa: Van rebre la visita d'un altre centr...,implicació,0,neutre,False
2,premissa: Els CDR convoquen una protesta davan...,implicació,0,contradicció,False
3,premissa: L’Espai LGTBI de Girona denuncia un ...,neutre,1,contradicció,False
4,"premissa: En aquesta ocasió, compten amb els m...",neutre,1,contradicció,False
5,premissa: Senda ofereix pràctiques d'un períod...,implicació,0,contradicció,False
6,premissa: El TC admet la impugnació del govern...,neutre,1,victòria,False
7,"premissa: Si descrivim i compararem, els parla...",neutre,1,contradicció,False
8,premissa: El doctor Josep Morata Socias i la p...,neutre,1,llengua,False
9,premissa: Això és el que passa quan la teva ge...,neutre,1,afirmació,False


In [19]:
results_en = teca_en.map(eval)
results_en.to_pandas()


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Unnamed: 0,prompt,answer,numeric_label,prediction,results
0,"Premise: We, for example, do not agree to be t...",involvement,0,,False
1,Premise: They received the visit of another ce...,involvement,0,Colau,False
2,premise: The CDRs call for a protest in front ...,involvement,0,21-D,False
3,premise: Espai LGTBI de Girona denounces a fas...,neutral,1,independentisme,False
4,"Premise: On this occasion, they have the music...",neutral,1,,False
5,Premise: Senda offers internships for a minimu...,involvement,0,,False
6,Premise: The TC admits the challenge of the go...,neutral,1,Uruguay,False
7,"Premise: If we describe and compare, the diale...",neutral,1,,False
8,premise: Dr. Josep Morata Socias and Professor...,neutral,1,,False
9,premise: This is what happens when your sister...,neutral,1,Bulgarian music,False


In [9]:
results_ca.to_csv(f"results/{model_name}-teca-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-teca-en.csv", index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'results/aguila-7b-teca-ca.csv'

In [None]:
print("Percentage correct ca:", sum(results_ca['results']) / len(results_ca))
print("Percentage correct en:", sum(results_en['results']) / len(results_en))


Percentage correct ca: 0.0
Percentage correct en: 0.0
