In [2]:
# Sanity check: is CUDA usable from this kernel, and how many GPUs does
# PyTorch see?
import torch

cuda = torch.cuda
print(cuda.is_available())

print(cuda.device_count())

True
2


In [2]:
%load_ext autoreload
%autoreload 2
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# custom device map
device_map = {
    'model.norm': 'cpu',
    'lm_head': 'cpu',
    'model.embed_tokens': 0,
}


for i in range(0, 12):
    device_map[f"model.layers.{i}"] = 0

for i in range(12, 24):
    device_map[f"model.layers.{i}"] = 1

for i in range(24, 36):
    device_map[f"model.layers.{i}"] = "cpu"

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    #bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    quantization_config=quantization_config,
    #device_map=device_map,
    #llm_int8_enable_fp32_cpu_offload=True,
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]



In [3]:
# Display the module -> device placement recorded on the loaded model, to
# confirm it matches the custom device_map passed to from_pretrained.
model.hf_device_map

{'model.norm': 'cpu',
 'lm_head': 'cpu',
 'model.embed_tokens': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 1,
 'model.layers.13': 1,
 'model.layers.14': 1,
 'model.layers.15': 1,
 'model.layers.16': 1,
 'model.layers.17': 1,
 'model.layers.18': 1,
 'model.layers.19': 1,
 'model.layers.20': 1,
 'model.layers.21': 1,
 'model.layers.22': 1,
 'model.layers.23': 1,
 'model.layers.24': 'cpu',
 'model.layers.25': 'cpu',
 'model.layers.26': 'cpu',
 'model.layers.27': 'cpu',
 'model.layers.28': 'cpu',
 'model.layers.29': 'cpu',
 'model.layers.30': 'cpu',
 'model.layers.31': 'cpu',
 'model.layers.32': 'cpu',
 'model.layers.33': 'cpu',
 'model.layers.34': 'cpu',
 'model.layers.35': 'cpu'}

In [4]:
import html
from IPython.display import display, clear_output, HTML
import textwrap

# Few-shot prompt in the Mistral instruction format: two solved examples
# followed by the sentence we want the model to rewrite.
prompt = """<s>[INST] Traducir del español al español rioplatense.

español: Hola hombre, ¿cómo estás?
[/INST]
¿Qué onda, chabón? ¿Cómo va?
</s>
[INST]
Traducir del español al rioplatense

español: Vaya tío, eres un cabrón
[/INST]
Che pedazo de pelotudo, sos un forro
</s>
[INST]
Traducir del español al rioplatense

español: Tu puedes hacerlo!
[/INST]
"""
# The prompt already spells out <s>/</s>; disabling automatic special tokens
# keeps the tokenizer from prepending a second BOS.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)

# Hard cap so the loop terminates even if the model never emits EOS.
max_stream_tokens = 256

next_token = None
new_tokens = []
while next_token != tokenizer.eos_token_id and len(new_tokens) < max_stream_tokens:
    # One token per call so the partial answer can be redrawn after each step.
    output = model.generate(**inputs, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)

    next_token = output[0][-1].item()
    # Feed the extended sequence back in as the next step's input.
    inputs = {
        "input_ids": output,
        "attention_mask": torch.ones_like(output),
    }
    new_tokens.append(next_token)
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    clear_output(wait=True)
    # The model output is plain text: escape it so characters such as "<" or
    # "&" are shown literally instead of being interpreted as markup.
    display(HTML(html.escape(text)))


In [None]:
from datasets import load_dataset

# Hub identifier of the corpus used in the cells below.
dataset_name = "piuba-bigdata/contextualized_hate_speech"

# Load every available split; later cells index the result by split name.
ds = load_dataset(path=dataset_name)

In [None]:
# Fixed seed so the shuffled order (and anything derived from it, e.g. the
# order of equal-length examples after sorting) is reproducible across runs.
test_ds = ds["test"].shuffle(seed=42)

In [None]:
from pysentimiento.preprocessing import preprocess_tweet
import re

# Matches the standalone token "url".
# NOTE(review): presumably the placeholder preprocess_tweet substitutes for
# links -- confirm against pysentimiento.
url_regex = r"\burl\b"


def text_preprocess(text):
    """Normalize a tweet and strip user/URL placeholders.

    Runs ``preprocess_tweet`` (hashtag and emoji preprocessing disabled),
    removes every "@usuario" occurrence and every standalone "url" token,
    then collapses all whitespace into single spaces.

    Args:
        text: raw tweet text.

    Returns:
        The cleaned, single-line string.
    """
    cleaned = preprocess_tweet(text, preprocess_hashtags=False, demoji=False)
    without_handles = cleaned.replace("@usuario", "")
    without_urls = re.sub(url_regex, "", without_handles)

    # str.split() with no separator splits on any whitespace run (newlines
    # included), so the re-joined text has no line breaks or repeated spaces.
    return " ".join(without_urls.split())


In [None]:
# Upper bound used when tokenizing with truncation=True.
tokenizer.model_max_length = 6400
# Reuse EOS as the padding token so that batches can be padded.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Batched generation with a decoder-only model must be left-padded so every
# prompt ends right where generation starts; set it explicitly rather than
# relying on the checkpoint's default.
tokenizer.padding_side = "left"

In [None]:
from rioplatense_hs.mixtral import get_prompt

def tokenize(example):
    """Build the prompt for one dataset row and tokenize it.

    The tweet text goes through ``text_preprocess``; the context tweet goes
    through ``preprocess_tweet`` with hashtag, emoji and handle preprocessing
    all disabled. Both are combined by ``get_prompt`` and tokenized with
    truncation enabled.

    Args:
        example: a row exposing the "text" and "context_tweet" fields.

    Returns:
        The tokenizer output for the assembled prompt.
    """
    tweet_text = text_preprocess(example["text"])
    context_text = preprocess_tweet(
        example["context_tweet"],
        demoji=False,
        preprocess_handles=False,
        preprocess_hashtags=False,
    )

    prompt_text = get_prompt(context=context_text, text=tweet_text)
    encoded = tokenizer(prompt_text, truncation=True)
    return encoded


# Row-by-row mapping: every example gets its own prompt.
tokenized_ds = test_ds.map(tokenize, batched=False)

In [None]:
# Attach the tokenized prompt length to each row, then order the dataset by
# it (ascending) so neighbouring examples have similar lengths.
def _with_length(example):
    """Return the extra "len" column: number of tokens in ``input_ids``."""
    return {"len": len(example["input_ids"])}


tokenized_ds = tokenized_ds.map(_with_length, batched=False)

sorted_tokenized_ds = tokenized_ds.sort("len")

In [None]:
# Inspect the shortest prompt: decode its token ids back to text and put the
# instruction markers on their own lines for readability.

inputs = sorted_tokenized_ds[0]["input_ids"]

text = (
    tokenizer.decode(inputs, skip_special_tokens=True)
    .replace("[INST]", "\n[INST]")
    .replace("[/INST]", "[/INST]\n")
)

print(text)

In [None]:
# Batched loading of the length-sorted prompts for generation.

from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

batch_size = 16
# NOTE(review): this collator is not used by the DataLoader below, which
# relies on the custom `collate` function instead.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


def collate(inputs):
    """Turn a list of tokenized rows into one padded batch.

    Args:
        inputs: rows exposing "id", "input_ids" and "attention_mask".

    Returns:
        A ``(ids, batch)`` pair: the list of row ids and the result of
        ``tokenizer.pad`` on the rows' input ids and attention masks,
        returned as PyTorch tensors.
    """
    features = {
        "input_ids": [row["input_ids"] for row in inputs],
        "attention_mask": [row["attention_mask"] for row in inputs],
    }
    row_ids = [row["id"] for row in inputs]
    padded = tokenizer.pad(features, return_tensors="pt")
    return row_ids, padded


# shuffle=False keeps the length-sorted order produced earlier.
dataloader = DataLoader(
    sorted_tokenized_ds,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate,
    num_workers=16,
    pin_memory=True,
)


In [None]:
from tqdm.auto import tqdm

# Decoded output for every example, keyed by its dataset id. Keeping the
# results here means the generation work is not thrown away at the end of
# each iteration; persist this dict to disk afterwards if needed.
generations = {}

for ids, inputs in tqdm(dataloader):
    # pad_token_id is passed explicitly, as in the single-prompt demo, so
    # generate() does not have to fall back to a default for every batch.
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        pad_token_id=tokenizer.eos_token_id,
    )

    # `example_id` rather than `id`, to avoid shadowing the builtin.
    for k, example_id in enumerate(ids):
        # output[k] is the full sequence returned for the k-th prompt of the
        # batch; special tokens are dropped when decoding.
        output_text = tokenizer.decode(output[k], skip_special_tokens=True)
        generations[example_id] = output_text
