In [1]:
%load_ext autoreload
%autoreload 2
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel

# Hugging Face Hub checkpoint shared by the tokenizer and the model.
model_id = "CohereForAI/aya-101"

# Seq2seq model loaded with fp16 weights; `device_map="auto"` leaves the
# placement of the weights to the library.
aya_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)



Loading checkpoint shards:   0%|          | 0/11 [00:00<?, ?it/s]

In [2]:
from IPython.display import display, clear_output, HTML
import textwrap

import html

# Few-shot prompt: two solved Iberian -> Rioplatense examples followed by the
# sentence the model is expected to translate.
prompt = """Traducir del español ibérico al español rioplatense.

###
español: Hola hombre, ¿cómo estás?
rioplatense:¿Qué onda, chabón? ¿Cómo va?
###
Traducir del español al rioplatense

español: Vaya tío, eres un cabrón
rioplatense: Che pedazo de pelotudo, sos un forro
###
Traducir del español al rioplatense

español: ¡Tú puedes hacerlo!
"""
inputs = tokenizer(prompt, return_tensors="pt")
# Move every input tensor to the device the model reports.
inputs = {k: v.to(aya_model.device) for k, v in inputs.items()}

# Nucleus sampling: the output changes between runs because no seed is set here.
output = aya_model.generate(
    **inputs,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_p=0.95,
)

text = tokenizer.decode(output[0], skip_special_tokens=True)

clear_output(wait=True)
# The generation is arbitrary text, so escape it before handing it to the HTML
# renderer; otherwise "<", ">" or "&" in the output would be parsed as markup.
display(HTML(html.escape(text)))


In [34]:
from rioplatense_hs.prompting import build_prompt, build_base_prompt

# Few-shot preamble built from 5 examples, reused for the prompt below.
base_prompt = build_base_prompt(num_examples=5)

# One (context, comment) pair to classify.
prompt = build_prompt(
    contexto="China reabre sus fronteras trás el COVID-19",
    texto="Genial, ahora mandémosles de nuevo a todos sus chinos infectados con COVID-19",
    base_prompt=base_prompt,
)

inputs = tokenizer(prompt, return_tensors="pt")
# Move every input tensor to the device the model reports.
inputs = {k: v.to(aya_model.device) for k, v in inputs.items()}

# Greedy decoding (`do_sample=False`). `top_p` only affects sampling, so it is
# not passed here: combined with `do_sample=False` it had no effect on the
# output and only triggered a warning about an unused generation flag.
outs = aya_model.generate(
    **inputs,
    max_new_tokens=1000,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

# Special tokens are kept on purpose so that <pad> / </s> stay visible.
text = tokenizer.decode(outs[0], skip_special_tokens=False)

print(text)




<pad> El texto hace referencia a los chinos como posibles culpables del COVID-19. La respuesta final es "racismo"</s>


<pad> El comentario alude a los chinos, que serían los supuestos culpables de propagar la pandemia. La respuesta final es "racismo"</s>


In [None]:
from datasets import load_dataset

# Hub identifier of the dataset used for evaluation.
dataset_name = "piuba-bigdata/contextualized_hate_speech"

# Load the dataset with its default configuration.
ds = load_dataset(path=dataset_name)

In [None]:
# Shuffle the test split with a fixed seed so that a kernel restart reproduces
# the same example order (an unseeded shuffle differs on every run).
test_ds = ds["test"].shuffle(seed=42)

In [None]:
from pysentimiento.preprocessing import preprocess_tweet
import re

# Matches the standalone token "url"; presumably the placeholder that
# `preprocess_tweet` leaves in place of links (TODO confirm).
url_regex = r"\burl\b"


def text_preprocess(text):
    """Clean a tweet and return it as a single whitespace-normalised line.

    The text is passed through `preprocess_tweet` (hashtag and emoji handling
    disabled), every literal "@usuario" and every standalone "url" token is
    removed, and all whitespace runs are collapsed to one space.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text with no leading, trailing or repeated whitespace.
    """
    normalized = preprocess_tweet(text, preprocess_hashtags=False, demoji=False)
    without_placeholders = re.sub(url_regex, "", normalized.replace("@usuario", ""))
    # str.split() with no arguments splits on every whitespace run, newlines
    # included, so re-joining with single spaces leaves no "\n" behind.
    return " ".join(without_placeholders.split())


In [None]:
# Pad with the end-of-sequence token id.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Length limit applied when tokenizing with truncation enabled.
tokenizer.model_max_length = 6400

In [None]:
from rioplatense_hs.mixtral import get_prompt

def tokenize(example):
    """Build the model prompt for one dataset row and tokenize it.

    Args:
        example: Mapping with a "text" entry (the comment) and a
            "context_tweet" entry (the tweet it replies to).

    Returns:
        Whatever `tokenizer` returns for the prompt, with truncation enabled.
    """
    # The comment gets the full clean-up from `text_preprocess`.
    comment = text_preprocess(example["text"])

    # The context tweet keeps its handles, hashtags and emojis untouched.
    context_options = {
        "preprocess_hashtags": False,
        "demoji": False,
        "preprocess_handles": False,
    }
    context = preprocess_tweet(example["context_tweet"], **context_options)

    return tokenizer(get_prompt(context=context, text=comment), truncation=True)

# Tokenize the test split one example at a time.
tokenized_ds = test_ds.map(function=tokenize, batched=False)

In [None]:
# Record each example's token count in a "len" column, then sort the split by
# that column.
tokenized_ds = tokenized_ds.map(
    lambda example: {"len": len(example["input_ids"])},
    batched=False,
)

sorted_tokenized_ds = tokenized_ds.sort("len")

In [None]:
# Sanity check: decode the first example of the length-sorted split and print it.
inputs = sorted_tokenized_ds[0]["input_ids"]

# Put each "[INST]" marker on a new line and break the line after "[/INST]"
# so the printed prompt is easier to read.
text = (
    tokenizer.decode(inputs, skip_special_tokens=True)
    .replace("[INST]", "\n[INST]")
    .replace("[/INST]", "[/INST]\n")
)

print(text)

In [None]:
# prompt: Build a dataloader with a Causal LLM collator

from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
# NOTE(review): `collator` is not referenced by the DataLoader built in this
# cell, which uses the custom `collate` function; confirm whether it is needed.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# Number of examples per generation batch.
batch_size = 16

def collate(inputs):
    """Turn a list of tokenized examples into one padded batch.

    Args:
        inputs: Sequence of examples, each providing "attention_mask",
            "input_ids" and "id".

    Returns:
        A pair `(ids, batch)`: `ids` lists the "id" of every example in order,
        and `batch` is the result of `tokenizer.pad` (PyTorch tensors) over the
        examples' input ids and attention masks.
    """
    masks = [example["attention_mask"] for example in inputs]
    token_ids = [example["input_ids"] for example in inputs]
    example_ids = [example["id"] for example in inputs]

    features = {"input_ids": token_ids, "attention_mask": masks}
    return example_ids, tokenizer.pad(features, return_tensors="pt")


# Iterate over the length-sorted split in order (no shuffling), padding each
# batch through `collate`.
dataloader = DataLoader(
    dataset=sorted_tokenized_ds,
    collate_fn=collate,
    batch_size=batch_size,
    num_workers=16,
    pin_memory=True,
    shuffle=False,
)


In [None]:
from tqdm.auto import tqdm


# Generate a completion for every example in the dataloader and keep the
# decoded text keyed by example id.
# NOTE: nothing is written to disk; results only live in `generations`.
generations = {}

for ids, inputs in tqdm(dataloader):
    # The collate function returns CPU tensors; move them to the model's device
    # the same way the single-prompt cells do.
    inputs = {k: v.to(aya_model.device) for k, v in inputs.items()}

    # `aya_model` is the model loaded at the top of the notebook (the name
    # `model` is never defined in the visible cells).
    output = aya_model.generate(**inputs, max_new_tokens=150)

    # Row k of `output` corresponds to the k-th id of the batch.
    for k, example_id in enumerate(ids):
        output_text = tokenizer.decode(output[k], skip_special_tokens=True)
        generations[example_id] = output_text
