<a href="https://colab.research.google.com/github/ferdinandrafols/IA_LLMs/blob/main/aula02_tokenizacao_pratica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aula 2 - Tokenização

## Parte 1 - Pré-tokenização


In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers

# Whitespace pre-tokenizer demo. The BPE model is left untrained; only the
# pre-tokenizer attached to it is exercised in this cell.
tok_ws = Tokenizer(models.BPE())
tok_ws.pre_tokenizer = pre_tokenizers.Whitespace()
# Sample sentence reused by the following pre-tokenizer cells. The literal was
# mis-encoded (UTF-8 bytes decoded as Mac Roman); restored to the intended
# Portuguese text so the demo splits real words instead of stray symbols.
frase = "Não, será punido o criminoso."

# Print what the pre-tokenizer produces for the sample sentence.
print(tok_ws.pre_tokenizer.pre_tokenize_str(frase))


## *Punctuation + Whitespace*

In [None]:
# Two pre-tokenizers applied in order: Whitespace first, then Punctuation.
whitespace_then_punct = [
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Punctuation(),
]
tok_punc = Tokenizer(models.BPE())
tok_punc.pre_tokenizer = pre_tokenizers.Sequence(whitespace_then_punct)

# `frase` is the sample sentence defined in the first cell.
pieces_punc = tok_punc.pre_tokenizer.pre_tokenize_str(frase)
print(pieces_punc)

**Pretokenizer: ByteLevel - estilo GPT-2**

In [None]:
# ByteLevel pre-tokenizer attached to an untrained BPE model.
byte_level = pre_tokenizers.ByteLevel()
tok_byte = Tokenizer(models.BPE())
tok_byte.pre_tokenizer = byte_level

# `frase` is the sample sentence defined in the first cell.
pieces_byte = tok_byte.pre_tokenizer.pre_tokenize_str(frase)
print(pieces_byte)

# Metaspace (SentencePiece style)

In [None]:
# Metaspace pre-tokenizer attached to an untrained BPE model.
metaspace = pre_tokenizers.Metaspace()
tok_meta = Tokenizer(models.BPE())
tok_meta.pre_tokenizer = metaspace

# `frase` is the sample sentence defined in the first cell.
pieces_meta = tok_meta.pre_tokenizer.pre_tokenize_str(frase)
print(pieces_meta)

# Treinamento


In [None]:
# 02_tokenizer_train.ipynb

from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# 1. Overview of the BPE algorithm (printed for the reader).
#    The printed strings were mis-encoded (UTF-8 read as Mac Roman) and are
#    restored here so the output is readable Portuguese.
print("Treinar o tokenizador (BPE):\n")
print("1. Comece com todos os caracteres presentes no corpus como tokens.")
print("2. Encontre e una o par de tokens mais frequente em um novo token.")
print("3. Repita até atingir o tamanho de vocabulário desejado.\n")

# 2. Training corpus
corpus = ["Hello, world!", "Hello there", "World of BPE"]
print("Corpus de treino:", corpus, "\n")

# 3. Tokenizer configuration: BPE model with "<unk>" as the unknown token and
#    Whitespace pre-tokenization.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

bpe_trainer = trainers.BpeTrainer(
    vocab_size=50,
    special_tokens=["<pad>", "<unk>", "<s>", "</s>"]
)

# 4. Training
tokenizer.train_from_iterator(corpus, trainer=bpe_trainer)
vocab = tokenizer.get_vocab()
print(f"Tamanho do vocabulário: {len(vocab)}\n")

# 5. Show the 20 vocabulary entries with the lowest ids
sorted_vocab = sorted(vocab.items(), key=lambda kv: kv[1])[:20]
for token, idx in sorted_vocab:
    print(f"{idx:>3} → {repr(token)}")

# 6. Save to disk and reload; later cells read this same file
tokenizer.save("bpe_tokenizer.json")
tokenizer_new = Tokenizer.from_file("bpe_tokenizer.json")

# 7. Try the reloaded tokenizer on new sentences
textos = ["O rato roeu a roupa do rei de Roma", "Hello, world."]

print("\nTokenização de exemplos:")
for texto in textos:
    out = tokenizer_new.encode(texto)
    print(f"Texto: {texto}")
    print(f"Tokens: {out.tokens}")
    print(f"IDs: {out.ids}\n")


# Encode

In [None]:
# 03_tokenizer_encode.ipynb
# Pipeline de tokenização: normalização → pré-tokenização → modelo → pós-processamento

from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tokenizers.normalizers import NFD, StripAccents, Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.processors import TemplateProcessing

print("### Pipeline de tokenização ###")
print(" Normalization")
print(" Pre-tokenization")
print(" Model")
print(" Post-processing\n")

# Load the trained BPE tokenizer from the JSON file on disk
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

# -----------------------------------------------------------
# Normalization
# -----------------------------------------------------------
print("# Normalization")
normalizer = normalizers.Sequence([
    NFD(),          # decompose accented characters
    Lowercase(),    # lowercase everything
    StripAccents()  # drop the accents
])
# Accented sample; the literal was mis-encoded and is restored here so the
# normalizer actually has accents to strip.
texto = "Héllò hôw are ü?"
print("Antes:", texto)
print("Depois:", normalizer.normalize_str(texto), "\n")
tokenizer.normalizer = normalizer

# -----------------------------------------------------------
# Pre-tokenization
# -----------------------------------------------------------
print("# Pre-tokenization")
pre_tok = Sequence([
    Whitespace(),
    Digits(individual_digits=True)
])
texto2 = "Hello! How are you? Tenho R$ 213,12."
print("Pré-tokenização:", pre_tok.pre_tokenize_str(texto2), "\n")
tokenizer.pre_tokenizer = pre_tok

# -----------------------------------------------------------
# Model
# -----------------------------------------------------------
print("# Model: BPE (Byte Pair Encoding)")
# already loaded from bpe_tokenizer.json

# -----------------------------------------------------------
# Post-processing
# -----------------------------------------------------------
print("# Post-processing (TemplateProcessing)")
# Register the template markers as special tokens and look their ids up,
# instead of hard-coding 1 and 2: "[CLS]" / "[SEP]" are not in the trained
# vocabulary, and ids 1 and 2 collide with the special tokens the training
# cell declared ("<unk>" and "<s>").
tokenizer.add_special_tokens(["[CLS]", "[SEP]"])
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

# -----------------------------------------------------------
# Run the full pipeline
# -----------------------------------------------------------
encoded = tokenizer.encode("olá mundo")
print("Tokens IDs:", encoded.ids)
print("Tokens:", encoded.tokens)


## Bytelevel vs SentencePiece

In [None]:
# 03_bytelevel_vs_sentencepiece.ipynb
# Comparando ByteLevel (GPT-2) vs SentencePiece (mT5)

from transformers import AutoTokenizer
import unicodedata

# -----------------------------
# 1) Models
# -----------------------------
BYTELEVEL_MODEL = "openai-community/gpt2"
SENTPIECE_MODEL = "google/mt5-small"

tok_byte = AutoTokenizer.from_pretrained(BYTELEVEL_MODEL)
tok_spm  = AutoTokenizer.from_pretrained(SENTPIECE_MODEL)

# Make sure a pad_token exists: reuse eos_token when none is set
if tok_byte.pad_token is None and hasattr(tok_byte, "eos_token"):
    tok_byte.pad_token = tok_byte.eos_token

# -----------------------------
# 2) Sample text
# -----------------------------
# The literal was mis-encoded (UTF-8 bytes decoded as Mac Roman); restored to
# the intended sentence with an accented letter and an emoji, which is what
# the tokenizer comparison below is meant to exercise.
text = "Vamos comer, vovó! 🙂"
print(f"Texto: {text}\n")

# -----------------------------
# 3) Tokenization
# -----------------------------
def encode_details(tokenizer, name, sample_text=None):
    """Tokenize a text with ``tokenizer`` and print a labelled report.

    The report lists the tokens, their ids, the token count, the decoded
    string and the offset mapping.

    Args:
        tokenizer: Callable tokenizer. It is invoked with
            ``add_special_tokens=True`` and ``return_offsets_mapping=True``
            and must also provide ``convert_ids_to_tokens`` and ``decode``.
        name: Label printed in the report header.
        sample_text: Text to tokenize. When omitted, the notebook-level
            variable ``text`` is used, which keeps existing two-argument
            calls working unchanged.
    """
    if sample_text is None:
        # Backward-compatible fallback to the notebook's global sample.
        sample_text = text
    enc = tokenizer(sample_text, add_special_tokens=True, return_offsets_mapping=True)
    ids = enc["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    offsets = enc["offset_mapping"]
    print(f"=== {name} ===")
    print("Tokens:", tokens)
    print("IDs:", ids)
    print("Qtd tokens:", len(tokens))
    print("Decoded:", tokenizer.decode(ids))
    print("Offsets:", offsets)
    print()

# Print the report for each tokenizer, GPT-2 first and mT5 second.
for report_label, report_tokenizer in (
    ("ByteLevel (GPT-2)", tok_byte),
    ("SentencePiece (mT5)", tok_spm),
):
    encode_details(report_tokenizer, report_label)

# -----------------------------
# 4) Unicode inspection (optional)
# -----------------------------
def show_unicode_chars(s):
    """Print one line per character of ``s``: its repr and its Unicode name.

    Characters without a name in the Unicode database are reported as
    ``UNKNOWN``.
    """
    described = (f"{ch!r} -> {unicodedata.name(ch, 'UNKNOWN')}" for ch in s)
    for line in described:
        print(line)

# Heading, then the per-character listing for the sample text.
unicode_heading = "\nCaracteres Unicode do texto:"
print(unicode_heading)
show_unicode_chars(text)


# Avaliação

In [None]:
from tokenizers import Tokenizer
import numpy as np

# Load the trained tokenizer
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

# Test corpus (could be a slice of your real corpus).
# The literals were mis-encoded (UTF-8 bytes decoded as Mac Roman), which
# inflated the character counts and fed stray symbols to the tokenizer;
# restored to the intended sentences.
test_texts = [
    "O rato roeu a roupa do rei de Roma.",
    "Aprender tokenização é divertido!",
    "GPT-2 e mT5 usam abordagens diferentes.",
    "Python é ótimo para NLP 😄",
]

# Helper functions
def count_chars(text):
    """Return the length of ``text`` as reported by ``len``."""
    n_chars = len(text)
    return n_chars

def count_words(text):
    """Return how many whitespace-separated chunks ``text`` contains."""
    chunks = text.split()
    return len(chunks)

def evaluate_tokenizer(tokenizer, texts, unk_token="<unk>"):
    """Encode each text and collect per-text statistics.

    Args:
        tokenizer: Object providing ``encode(text)`` (whose result exposes
            ``tokens`` and ``ids``) and ``decode(ids)``.
        texts: Iterable of strings to evaluate.
        unk_token: Token string counted as "unknown". Defaults to ``"<unk>"``,
            the value that was previously hard-coded.

    Returns:
        A list with one dict per text, holding the keys ``text``, ``chars``,
        ``words``, ``tokens`` (token count), ``unk`` (occurrences of
        ``unk_token``) and ``decoded_ok`` (whether decoding the ids gives
        back exactly the input text).
    """
    stats = []
    for t in texts:
        enc = tokenizer.encode(t)
        stats.append({
            "text": t,
            "chars": count_chars(t),
            "words": count_words(t),
            "tokens": len(enc.tokens),
            "unk": enc.tokens.count(unk_token),
            # Strict round-trip check: any difference counts as a failure.
            "decoded_ok": (tokenizer.decode(enc.ids) == t)
        })
    return stats

stats = evaluate_tokenizer(tokenizer, test_texts)

# Aggregate the per-text statistics into summary metrics
import pandas as pd
df = pd.DataFrame(stats)

# tpc / tpw are means of the per-text ratios (not ratios of the totals).
tpc = (df["tokens"] / df["chars"]).mean()
tpw = (df["tokens"] / df["words"]).mean()
# Share of all produced tokens that are "<unk>", as a percentage.
unk_rate = (df["unk"].sum() / df["tokens"].sum()) * 100
# Share of texts whose decode() output equals the input, as a percentage.
decode_acc = (df["decoded_ok"].mean()) * 100

# The printed labels were mis-encoded (UTF-8 read as Mac Roman); restored.
print("=== Métricas de eficiência ===")
print(f"Tokens por caractere (TPC): {tpc:.3f}")
print(f"Tokens por palavra (TPW): {tpw:.3f}")
print(f"Percentual de <unk>: {unk_rate:.2f}%")
print(f"Reversibilidade (decode == original): {decode_acc:.1f}%")
print(f"Tamanho médio da sequência: {df['tokens'].mean():.1f} tokens/frase")
