In [45]:
%env XDG_CACHE=/workspace/.cache
%env HF_HOME=/workspace/.cache/huggingface

env: XDG_CACHE=/workspace/.cache
env: HF_HOME=/workspace/.cache/huggingface


In [46]:
from datasets import load_dataset
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import pyonmttok
import ctranslate2
from metrics import *

In [47]:
#model_id = "projecte-aina/aguila-7b"
model_id = "tiiuae/falcon-7b"
model_name = model_id.split('/')[1]

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             trust_remote_code=True,
                                             device_map="auto")

## Lets Do the translation layer
from huggingface_hub import snapshot_download
print("Loading translator Models...")

ca_en_model_folder = snapshot_download(repo_id="projecte-aina/mt-aina-ca-en", revision="main")
tokenizer_ca_en = pyonmttok.Tokenizer(
    mode="none", sp_model_path=ca_en_model_folder + "/spm.model"
)
ca_en_model = ctranslate2.Translator(ca_en_model_folder, device="cuda")

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.35s/it]


Loading translator Models...


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 26214.40it/s]


In [48]:
def min_max_scaling(tensor):
    min_val = torch.min(tensor)
    max_val = torch.max(tensor)
    scaled_tensor = (tensor - min_val) / (max_val - min_val)
    return scaled_tensor


def compute_probability(input_text, answer):
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    answer_tokens = tokenizer(answer)['input_ids']
    answer_probability = 1
    with torch.no_grad():
        for token in answer_tokens:
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)

            # Logits are in the outputs, you can access the last token's logits like this:
            logits = outputs.logits[:, -1, :]
            probabilities = min_max_scaling(logits)
            answer_probability *= probabilities[0][token]

            # Prepare input_ids for the next token prediction
            new_token = torch.tensor([[token]]).to(model.device)
            inputs = {'input_ids': torch.cat([inputs['input_ids'], new_token], dim=1),
                    'attention_mask': torch.cat([inputs['attention_mask'], torch.tensor([[1]]).to(model.device)], dim=1)}
            del new_token
        del inputs
    return answer_probability.item()


def run_inference(txt, num_tokens=20, stop_text='\n'):
    # Tokenize the input text
    inputs = tokenizer(txt, return_tensors="pt").to(model.device)
    # Calculate the total length of the output (input length + number of tokens to generate)
    max_length = len(inputs['input_ids'][0]) + num_tokens

    with torch.no_grad():
        # Generate tokens
        tokens = model.generate(**inputs, do_sample=True, top_k=50, eos_token_id=tokenizer.eos_token_id, max_length=max_length)

        # Decode the generated tokens into text
        generated_text = tokenizer.decode(tokens[0], skip_special_tokens=True)

        # Slice the generated text to exclude the input prompt
        generated_only = generated_text[len(tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)):]

        # If a stop text is found, truncate the output at its first occurrence
        if stop_text in generated_only:
            generated_only = generated_only.split(stop_text)[0]

        return generated_only.strip()


def translate(sample):
    def translate_to_english(txt):
        lines = txt.split("\n")
        toks, _ = tokenizer_ca_en.tokenize_batch(lines)
        translated = ca_en_model.translate_batch(toks)
        ts = []
        for t in translated:
            ts.append(tokenizer_ca_en.detokenize(t.hypotheses[0]))

        return "\n".join(ts)
    en_prompt = translate_to_english(sample['prompt'])
    en_answer = translate_to_english(sample['answer'])
    return {"prompt": en_prompt, "answer": en_answer}


def compute_metrics(sample):
    prob = compute_probability(sample['prompt'], sample['answer'])
    prediction = run_inference(sample['prompt'])
    f1 = f1_score(prediction, sample['answer'])
    return {"prediction": prediction, "prob": prob, "f1": f1}


In [49]:
teca = load_dataset("data", data_files="teca.csv", split="train[:10]")
teca_en = teca.map(translate)

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4228.13it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 790.78it/s]
Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 2820 examples [00:00, 47392.51 examples/s]
Map: 100%|██████████| 10/10 [00:01<00:00,  8.02 examples/s]


In [50]:
def eval(entry):
    prediction = run_inference(entry['prompt'], num_tokens=20, stop_text=' ')
    return { 'prediction': prediction, 'results': prediction == str(entry['answer']) }

results_ca = teca.map(eval)
results_ca.to_pandas()

Map:   0%|          | 0/10 [00:00<?, ? examples/s]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Map:  10%|█         | 1/10 [00:14<02:14, 14.95s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  20%|██        | 2/10 [00:30<02:03, 15.46s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  30%|███       | 3/10 [00:46<01:48, 15.47s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  40%|████      | 4/10 [01:02<01:33, 15.61s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  50%|█████     | 5/10 [01:17<01:17, 15.45s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  60%|██████    | 6/10 [01:33<01:02, 15.56s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  70%|███████   | 7/10 [01:48<00:46, 15.65s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  80%|████████  | 8/10 [02:04<00:31, 15.59s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-

Unnamed: 0,prompt,answer,numeric_label,prediction,results
0,"premissa: Nosaltres, per exemple, no estem d'a...",implicació,0,??\n----\npremissa:,False
1,premissa: Els CDR convoquen una protesta davan...,contradicció,2,2\n----\npremissa:,False
2,premissa: Vaig incorporar me a la feina quan e...,contradicció,2,(complement)\n----\npremissa:,False
3,premissa: La tradició i la modernitat conviuen...,neutre,1,¿que?\n----\npremissa:,False
4,premissa: Senda ofereix pràctiques d'un períod...,implicació,0,/,False
5,premissa: El doctor Josep Morata Socias i la p...,contradicció,2,(contradicció,False
6,premissa: Això és el que passa quan la teva ge...,neutre,1,¿s'ha,False
7,premissa: M'ha sabut molt greu però no hi he p...,neutre,1,(,False
8,premissa: El Concurs ha aconseguit gran presti...,neutre,1,0\n----\npremissa:,False
9,premissa: Però l'altre dia llegint a la biblio...,contradicció,2,"""falsification""\n----\npremissa:",False


In [51]:
results_en = teca_en.map(eval)
results_en.to_pandas()

Map:   0%|          | 0/10 [00:00<?, ? examples/s]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Map:  10%|█         | 1/10 [00:16<02:24, 16.00s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  20%|██        | 2/10 [00:31<02:07, 16.00s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  30%|███       | 3/10 [00:46<01:48, 15.48s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  40%|████      | 4/10 [01:02<01:33, 15.59s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  50%|█████     | 5/10 [01:04<00:52, 10.55s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  60%|██████    | 6/10 [01:19<00:49, 12.26s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  70%|███████   | 7/10 [01:35<00:40, 13.35s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Map:  80%|████████  | 8/10 [01:51<00:28, 14.16s/ examples]Setting `pad_token_id` to `eos_token_id`:11 for open-

Unnamed: 0,prompt,answer,numeric_label,prediction,results
0,"Premise: We, for example, do not agree to be t...",involvement in,0,,False
1,premise: The CDRs call for a protest in front ...,contradiction in the law,2,hypothesis\nThis,False
2,Premise: I joined the work when it was decided...,contradiction in the law,2,,False
3,premise: Tradition and modernity coexist in th...,neutral neutral,1,,False
4,Premise: Senda offers internships for a minimu...,involvement in,0,,False
5,premise: Dr. Josep Morata Socias and Professor...,contradiction in the law,2,,False
6,premise: This is what happens when your sister...,neutral neutral,1,,False
7,Premise: I was very sorry but I could not do a...,neutral neutral,1,,False
8,Premise: The Contest has achieved great presti...,neutral neutral,1,,False
9,Premise: But the other day reading in the libr...,contradiction in the law,2,,False


In [52]:
results_ca.to_csv(f"results/{model_name}-teca-ca.csv", index=False)
results_en.to_csv(f"results/{model_name}-teca-en.csv", index=False)

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 430.80ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 405.09ba/s]


12102

In [53]:
print("Percentage correct ca:", sum(results_ca['results']) / len(results_ca))
print("Percentage correct en:", sum(results_en['results']) / len(results_en))

Percentage correct ca: 0.0
Percentage correct en: 0.0
