In [1]:
# =========================
# 1) Setup: instalar libs
# =========================
# Observação: em Kaggle, muitas vezes já existe parte disso instalado.
# O -q deixa a saída mais limpa. Remova se quiser ver logs.
!pip -q install -U transformers datasets seqeval scikit-learn accelerate


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# =========================
# 2) Imports e configurações
# =========================
import os
import json
import re
import random
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional
from collections import Counter

import numpy as np
import pandas as pd
import torch

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    set_seed,
)

from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score as seqeval_accuracy
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score as sk_accuracy,
    precision_recall_fscore_support,
)

import matplotlib.pyplot as plt

# Seed shared by the RNG setup below and by the dataset shuffle/split further down.
SEED = 42
# NOTE(review): transformers.set_seed is documented to seed random, numpy and torch;
# if so, the two explicit calls that follow are redundant but harmless — confirm.
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# "cuda" when torch can see a GPU, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# 0 with a GPU, -1 without. Not used in the cells visible here — presumably the
# `device` argument of a later `pipeline(...)` call (TODO confirm).
DEVICE_ID = 0 if torch.cuda.is_available() else -1

print("Torch:", torch.__version__)
print("Device:", DEVICE)

Torch: 2.10.0+cpu
Device: cpu


In [3]:
# =========================
# 3) Utilitários: localizar arquivos e preparar dados NER
# =========================
def find_file_exact_or_pattern(
    filename: str,
    patterns: List[str],
    base_dirs: List[Path],
) -> Path:
    """Locate a file by exact name first, then by glob patterns, searching recursively.

    Search order:
      1. ``base_dir / filename`` for each base dir, in the order given.
      2. Recursive search for ``filename`` under each existing base dir.
      3. Recursive search for each pattern (outer loop) under each base dir (inner loop).

    Only regular files are returned. When a recursive search yields several
    candidates, the smallest path in sorted order is chosen, so the result does
    not depend on the directory listing order of the file system.

    Args:
        filename: Exact file name to look for.
        patterns: Fallback glob patterns, tried in order.
        base_dirs: Directories to search; missing ones are skipped.

    Returns:
        Path of the first match.

    Raises:
        FileNotFoundError: If nothing matches in any base dir.
    """
    # 1) Direct check: the file sits right inside one of the base dirs.
    for d in base_dirs:
        p = d / filename
        if p.is_file():
            return p

    existing_dirs = [d for d in base_dirs if d.is_dir()]

    # 2) Recursive search by exact name, then 3) by each glob pattern.
    for pat in [filename, *patterns]:
        for d in existing_dirs:
            # sorted(): rglob follows directory listing order, which is arbitrary.
            hits = sorted(h for h in d.rglob(pat) if h.is_file())
            if hits:
                return hits[0]

    existing = [str(d) for d in existing_dirs]
    raise FileNotFoundError(
        f"Não encontrei '{filename}' (nem padrões {patterns}) nos diretórios: {existing}.\n"
        f"Dica: no Kaggle, os arquivos costumam estar em /kaggle/input/<dataset>/..."
    )

def tokenize_with_spans(text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
    """Split ``text`` on whitespace and keep each token's character offsets.

    Returns:
        ``(tokens, spans)`` where ``spans[i]`` is the ``(start, end)`` slice of
        ``text`` that produced ``tokens[i]`` (end exclusive).
    """
    matches = list(re.finditer(r"\S+", text))
    words = [m.group() for m in matches]
    offsets = [(m.start(), m.end()) for m in matches]
    return words, offsets

def ensure_bio(tags: List[Any]) -> List[str]:
    """Normalise a sequence of labels to a valid BIO sequence.

    Accepted inputs:
    - ``None``, ``''`` or ``'O'`` (any case)  -> ``'O'``
    - a bare type such as ``'PER'``           -> ``'B-PER'``, or ``'I-PER'`` when the
      previous tag has the same entity type
    - ``'B-PER'`` / ``'I-PER'``               -> kept, except that an ``I-`` tag which
      does not follow a tag of the same entity type is rewritten to ``B-``

    Continuity is decided on the exact entity type of the previous tag, so a bare
    ``'PER'`` after ``'B-SUB-PER'`` starts a new entity instead of continuing it.

    Args:
        tags: Labels of one sentence; each item is converted with ``str``.

    Returns:
        A new list of BIO tags with the same length as ``tags``.
    """
    out: List[str] = []
    # Entity type of the previous emitted tag; None after an 'O'.
    prev_entity: Optional[str] = None
    for raw in tags:
        tag = "" if raw is None else str(raw).strip()
        if tag == "" or tag.upper() == "O":
            out.append("O")
            prev_entity = None
            continue

        if tag.startswith(("B-", "I-")):
            # Already BIO: split into prefix letter and entity type.
            prefix, entity = tag[0], tag[2:]
        else:
            # Bare type: treated as a continuation candidate.
            prefix, entity = "I", tag

        # An I- tag is only valid right after a tag of the same entity type.
        if prefix == "I" and prev_entity != entity:
            prefix = "B"

        out.append(f"{prefix}-{entity}")
        prev_entity = entity
    return out

def extract_records(raw: Any) -> List[Dict[str, Any]]:
    """Turn the decoded JSON payload into a list of records (examples).

    Accepts a list (returned as is), a dict holding a list under one of the
    keys ``data``/``examples``/``items``/``records``/``annotations`` (first match
    wins), or a dict whose values are all dicts (the values are returned).

    Raises:
        ValueError: If ``raw`` has none of these shapes.
    """
    if isinstance(raw, list):
        return raw

    if isinstance(raw, dict):
        # Well-known container keys, in priority order.
        for key in ("data", "examples", "items", "records", "annotations"):
            candidate = raw.get(key)
            if isinstance(candidate, list):
                return candidate

        # Mapping of id -> record.
        values = list(raw.values())
        if all(isinstance(item, dict) for item in values):
            return values

    raise ValueError("Formato de JSON não reconhecido. Esperava lista ou dict com lista interna.")

def parse_ner_json(json_path: Path) -> List[Dict[str, Any]]:
    """Read an NER JSON file and return examples as {'tokens': [...], 'ner_tags': [...]}.

    Each record (see ``extract_records``) is handled by the first layout that fits:

    1. Pre-tokenised: a ``tokens`` list plus tags under ``ner_tags``, ``labels`` or
       ``tags``. Integer tags are translated through a list of label names found
       under ``id2label``/``labels``/``tag_names``/``ner_tags_names`` (first key
       that holds a list wins) — looked up in the record first, then at the top
       level of the file. The key the tags themselves were read from is never
       used as the list of label names.
    2. Raw text: a ``text`` string plus character spans under ``entities``,
       ``spans`` or ``annotations``. The text is whitespace-tokenised and every
       token that overlaps a span receives that span's label.

    Records that are not dicts, or that fit neither layout, are skipped silently.
    Every tag sequence goes through ``ensure_bio``.

    Args:
        json_path: Path of a UTF-8 encoded JSON file.

    Returns:
        List of ``{"tokens": List[str], "ner_tags": List[str]}`` dicts.

    Raises:
        ValueError: Integer tags without any id->label list; token/tag length
            mismatch; or no example could be extracted.
        KeyError: An integer tag has no entry in the id->label list.
    """
    tag_keys = ("ner_tags", "labels", "tags")
    name_keys = ("id2label", "labels", "tag_names", "ner_tags_names")

    def _label_names(container: Dict[str, Any], skip: Optional[str] = None) -> Optional[Dict[int, str]]:
        """Build {index: name} from the first list-valued key of ``name_keys``, ignoring ``skip``."""
        for key in name_keys:
            if key == skip:
                continue
            names = container.get(key)
            if isinstance(names, list):
                return {i: str(name) for i, name in enumerate(names)}
        return None

    def _start(ent: Dict[str, Any]) -> int:
        """Sort key for entity spans: their start offset (0 when absent)."""
        for k in ["start", "begin", "start_offset", "inicio"]:
            if k in ent:
                return int(ent[k])
        return 0

    with open(json_path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    records = extract_records(raw)
    examples = []

    # File-level id->label list, used when a record carries none of its own.
    global_id2label = _label_names(raw) if isinstance(raw, dict) else None

    for idx, rec in enumerate(records):
        if not isinstance(rec, dict):
            continue

        # Case 1: already tokenised
        if "tokens" in rec and any(k in rec for k in tag_keys):
            tokens = rec["tokens"]
            tags = rec.get("ner_tags", None) or rec.get("labels", None) or rec.get("tags", None)

            if not isinstance(tokens, list) or not isinstance(tags, list):
                continue

            tokens = [str(t) for t in tokens]

            # Numeric tags?
            if len(tags) > 0 and isinstance(tags[0], int):
                # Key the tags were actually read from (first truthy one, matching
                # the `or` chain above). It must not double as the label-name list:
                # with tags under "labels", that list holds the ids themselves.
                tag_key = next((k for k in tag_keys if rec.get(k)), None)
                id2label = _label_names(rec, skip=tag_key)
                if id2label is None:
                    id2label = global_id2label
                if id2label is None:
                    raise ValueError(
                        "Achei tags numéricas, mas não encontrei um mapeamento id->label no JSON."
                    )
                tags = [id2label[int(t)] for t in tags]
            else:
                tags = [str(t) for t in tags]

            tags = ensure_bio(tags)

            if len(tokens) != len(tags):
                raise ValueError(
                    f"Registro {idx}: len(tokens)={len(tokens)} != len(tags)={len(tags)}"
                )

            examples.append({"tokens": tokens, "ner_tags": tags})
            continue

        # Case 2: text + entity spans
        if "text" in rec and ("entities" in rec or "spans" in rec or "annotations" in rec):
            text = str(rec["text"])
            ents = rec.get("entities", None) or rec.get("spans", None) or rec.get("annotations", None)
            if not isinstance(ents, list):
                ents = []

            tokens, spans = tokenize_with_spans(text)
            tags = ["O"] * len(tokens)

            # Process spans in order of their start offset.
            ents_sorted = sorted([e for e in ents if isinstance(e, dict)], key=_start)

            for ent in ents_sorted:
                start = ent.get("start", ent.get("begin", ent.get("start_offset", ent.get("inicio", None))))
                end = ent.get("end", ent.get("stop", ent.get("end_offset", ent.get("fim", None))))
                label = ent.get("label", ent.get("entity", ent.get("type", ent.get("tipo", None))))

                if start is None or end is None or label is None:
                    continue
                start = int(start)
                end = int(end)
                label = str(label).strip()

                # Drop a BIO prefix if the span label already carries one.
                base = re.sub(r"^(B-|I-)", "", label)

                # Tokens whose character range intersects [start, end).
                idxs = [i for i, (s, e) in enumerate(spans) if not (e <= start or s >= end)]
                if not idxs:
                    continue

                for j, i_tok in enumerate(idxs):
                    pref = "B" if j == 0 else "I"
                    tags[i_tok] = f"{pref}-{base}"

            tags = ensure_bio(tags)
            examples.append({"tokens": tokens, "ner_tags": tags})
            continue

        # Unrecognised record layout: ignored on purpose (switch to `raise` to be strict).

    if len(examples) == 0:
        raise ValueError(
            "Não consegui extrair nenhum exemplo de NER do JSON.\n"
            "Verifique o formato do arquivo e ajuste o parser em parse_ner_json()."
        )

    return examples

def build_label_list(examples: List[Dict[str, Any]]) -> List[str]:
    """Collect every tag used in ``examples`` into a deterministically ordered list.

    Order: ``'O'`` first, then by entity type, then ``B-`` before ``I-`` before any
    other prefix. A label without ``-`` is ranked like a ``B-`` tag of that type.
    Remaining ties are broken by the label text itself, so the resulting
    label -> id assignment is the same on every run.

    Args:
        examples: Dicts with a ``"ner_tags"`` sequence; tags are converted with ``str``.

    Returns:
        Sorted list of distinct labels, always containing ``'O'``.
    """
    labels = {str(tag) for ex in examples for tag in ex["ner_tags"]}
    labels.add("O")

    def sort_key(lab: str):
        if lab == "O":
            return (0, "", 0, lab)
        prefix, sep, entity = lab.partition("-")
        if not sep:
            # Bare type: rank it alongside the B- tag of the same type.
            prefix, entity = "B", lab
        # The trailing `lab` breaks ties that would otherwise fall back to the
        # arbitrary iteration order of the set.
        return (1, entity, {"B": 0, "I": 1}.get(prefix, 2), lab)

    return sorted(labels, key=sort_key)


# =========================
# AUTO-LABEL (fallback) — used when the JSON comes with ner_tags = 'O' everywhere
# =========================
# Why does this exist?
# - If the JSON has no annotated entity at all, label_list collapses to ['O'].
# - That gives num_labels=1 and a loss that is always 0 (a degenerate run that learns nothing).
# - This fallback builds pseudo-labels from rules (regex) so a working baseline can be trained.
#
# Improvements (v4):
# - Unifies the TELEFONE/CELULAR/FONE/TEL placeholders into PHONE (avoids duplicate labels)
# - Stronger address heuristic (ADDR), including common DF/Brasília patterns (SQS, SQN, SHDF, CRN, etc.)
# - More tolerant phone regex (accepts masking with X/*/#)
# - These changes help both training and the reduction of false positives at evaluation time (see section 9)

# Characters stripped from both ends of a token before any matching (see _strip_punct).
_PUNCT_STRIP = " \t\n\r.,;:!?\"'()[]{}<>"

# Masking-tolerant regexes: each digit position also accepts X, x, * or #, the format is kept.
CPF_RE  = re.compile(r"^(?:[\dXx\*#]{3})\.(?:[\dXx\*#]{3})\.(?:[\dXx\*#]{3})-(?:[\dXx\*#]{2})$")
CNPJ_RE = re.compile(r"^(?:[\dXx\*#]{2})\.(?:[\dXx\*#]{3})\.(?:[\dXx\*#]{3})/(?:[\dXx\*#]{4})-(?:[\dXx\*#]{2})$")
# Two alternatives: 5+3 with a hyphen, or any 8 characters of the digit/mask set with no hyphen.
CEP_RE  = re.compile(r"^(?:[\dXx\*#]{5})-(?:[\dXx\*#]{3})$|^(?:[\dXx\*#]{8})$")
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$")

# Common Brazilian phone formats (with/without area code, with/without +55, with/without hyphen)
PHONE_RE = re.compile(r"^(?:\+?55\s*)?(?:\(?\d{2}\)?\s*)?(?:\d{4,5})[-\s]?\d{4}$")
# Masking-tolerant variant (X/*/#)
PHONE_MASK_RE = re.compile(r"^(?:\+?55\s*)?(?:\(?[\dXx\*#]{2}\)?\s*)?(?:[\dXx\*#]{4,5})[-\s]?[\dXx\*#]{4}$")

# Placeholders commonly found in masked text.
# The surrounding brackets are optional, so the bare keyword (e.g. "cpf", "nome") matches too.
PLACEHOLDER_RE = re.compile(
    r"^[\[\(<]?\s*(cpf|cnpj|email|e-mail|telefone|celular|fone|tel|nome|rg|cep|endereco|endereço)\s*[\]\)>]?$",
    re.IGNORECASE
)

# Connectors commonly found inside names
NAME_CONNECTORS = {"de", "da", "do", "dos", "das", "e"}

# Address heuristics (focused on DF/Brasília, plus generic terms).
# NOTE(review): this set contains very common short words ("no", "casa", "r", "q");
# auto_label_tokens starts an address at every token found here — confirm that is intended.
ADDR_STARTERS = {
    # DF/Brasília (very common in reports)
    "sqs", "sqn", "scs", "scln", "sclrn", "sgan", "sgas", "shdf", "shis",
    "crn", "cln", "cls", "cl", "qi", "q", "qe", "qna", "qnb", "qnc", "qnd", "qne", "qnf",
    # generic
    "rua", "r", "avenida", "av", "travessa", "alameda", "rodovia", "br", "km",
    "quadra", "qd", "lote", "lt", "bloco", "bl", "conjunto", "cj", "setor", "st",
    "ap", "apt", "apartamento", "casa", "loja", "nº", "no", "numero", "número",
    "bairro"
}
ADDR_PARTS = {
    # frequent address components
    "bloco", "bl", "lote", "lt", "quadra", "qd", "conjunto", "cj", "setor", "st",
    "ap", "apt", "apartamento", "casa", "loja", "sul", "norte", "leste", "oeste",
    "asa", "l3", "l2", "l1", "w3", "w2", "w1"
}
# One to four roman-numeral letters, case-insensitive.
# NOTE(review): with IGNORECASE this also matches ordinary short words made only of
# these letters (e.g. "mil", "vi") — confirm that is acceptable.
ROMAN_RE = re.compile(r"^(?=[IVXLCDM]+$)[IVXLCDM]{1,4}$", re.IGNORECASE)

def _strip_punct(tok: str) -> str:
    """Return ``tok`` as a string without the leading/trailing characters listed in ``_PUNCT_STRIP``."""
    text = str(tok)
    return text.strip(_PUNCT_STRIP)

def _is_upper_short(tok: str) -> bool:
    """True when the punctuation-stripped token is all uppercase and 1 to 4 characters long."""
    core = _strip_punct(tok)
    if not core.isupper():
        return False
    return 1 <= len(core) <= 4

def _looks_like_name_token(tok: str) -> bool:
    """True when the token could be part of a person name.

    After stripping punctuation the token must be non-empty, must not be a short
    uppercase acronym (e.g. CPF, DF, SQS), must start with an uppercase character
    and must contain at least one letter.
    """
    core = _strip_punct(tok)
    # Empty tokens and short acronyms are never names.
    if not core or _is_upper_short(core):
        return False
    starts_upper = core[0].isupper()
    return starts_upper and any(ch.isalpha() for ch in core)

def _looks_like_addr_token(tok: str) -> bool:
    """True when the token looks like a component of a street address.

    After stripping punctuation, a non-empty token qualifies when any of these holds:
    - its lowercase form is in ``ADDR_PARTS`` or ``ADDR_STARTERS``;
    - it matches ``ROMAN_RE``;
    - it is a short uppercase acronym (``_is_upper_short``);
    - it contains at least one digit (e.g. 104, 602-607, 308).

    The digit rule already covers letter+digit codes such as "QNL23", so no
    separate pattern is needed for them.
    """
    core = _strip_punct(tok)
    if not core:
        return False

    lowered = core.lower()
    if lowered in ADDR_PARTS or lowered in ADDR_STARTERS:
        return True
    if ROMAN_RE.match(core) or _is_upper_short(core):
        return True
    return any(ch.isdigit() for ch in core)

def _detect_pii_type(tok: str) -> Optional[str]:
    """Classify a single token as a PII type, or return ``None``.

    The token is whitespace- and punctuation-stripped, then checked in this order:
    placeholder keywords (``PLACEHOLDER_RE``), CPF, CNPJ, CEP, e-mail, and finally
    phone numbers (plain or masked) after removing parentheses and whitespace.

    Returns:
        One of the labels produced below (e.g. ``"PER"``, ``"ADDR"``, ``"PHONE"``,
        ``"EMAIL"``, ``"CPF"``, ``"CNPJ"``, ``"CEP"``, ``"RG"``), or ``None``.
    """
    cleaned = _strip_punct(str(tok).strip())
    if not cleaned:
        return None

    # Placeholder keywords; "e-mail" is folded into "email" by dropping the hyphen.
    placeholder = PLACEHOLDER_RE.match(cleaned)
    if placeholder:
        keyword = placeholder.group(1).lower().replace("-", "")
        label_by_keyword = {
            "nome": "PER",
            "endereco": "ADDR",
            "endereço": "ADDR",
            "telefone": "PHONE",
            "celular": "PHONE",
            "fone": "PHONE",
            "tel": "PHONE",
            "email": "EMAIL",
            "cpf": "CPF",
            "cnpj": "CNPJ",
            "cep": "CEP",
            "rg": "RG",
        }
        # Fallback for a keyword without an explicit entry: its uppercase form.
        return label_by_keyword.get(keyword, keyword.upper())

    # Strict-format patterns, first match wins.
    for pattern, label in (
        (CPF_RE, "CPF"),
        (CNPJ_RE, "CNPJ"),
        (CEP_RE, "CEP"),
        (EMAIL_RE, "EMAIL"),
    ):
        if pattern.match(cleaned):
            return label

    # Phone: drop parentheses and whitespace, then try the plain and masked forms.
    compact = re.sub(r"[()\s]", "", cleaned)
    if PHONE_RE.match(compact) or PHONE_MASK_RE.match(compact):
        return "PHONE"

    return None

def auto_label_tokens(tokens: List[str]) -> List[str]:
    """Generate BIO tags for ``tokens`` from simple rules (favouring precision).

    Passes, in order — a later pass never overwrites a tag set by an earlier one:
      1. Per-token patterns via ``_detect_pii_type`` (CPF, CNPJ, EMAIL, PHONE, CEP,
         placeholders), each tagged ``B-<type>``.
      2. Person names (PER) after the cues "nome é" / "nome:" / "nome -",
         "me chamo" and "sou".
      3. Addresses (ADDR) starting at a token found in ``ADDR_STARTERS``.
      4. ``ensure_bio`` to repair invalid ``I-`` tags.

    Args:
        tokens: Whitespace tokens of one sentence.

    Returns:
        A list of BIO tags with the same length as ``tokens``.
    """
    n = len(tokens)
    tags = ["O"] * n

    # 1) Direct per-token patterns (CPF, CNPJ, EMAIL, PHONE, CEP, placeholders)
    for i, tok in enumerate(tokens):
        typ = _detect_pii_type(tok)
        if typ:
            tags[i] = f"B-{typ}"

    # 2) Name heuristic (PER) driven by context
    #    e.g. "meu nome é Aline Souza" / "nome: Aline Souza"
    lower = [_strip_punct(t).lower() for t in tokens]

    def label_name_from(start_idx: int):
        """Try to tag a name sequence that starts at ``start_idx``."""
        idxs = []
        cap_count = 0
        j = start_idx
        # Take at most 6 tokens (room for names with connectors).
        while j < n and len(idxs) < 6:
            tok_j = tokens[j]
            lj = lower[j]

            if _looks_like_name_token(tok_j):
                idxs.append(j)
                cap_count += 1
                j += 1
                continue

            # Connectors inside the name (de/da/do/dos/das/e), only when followed by a name-like token.
            if lj in NAME_CONNECTORS and idxs and (j + 1) < n and _looks_like_name_token(tokens[j + 1]):
                idxs.append(j)
                j += 1
                continue

            break

        if cap_count >= 1 and idxs:
            # Never overwrite a token already tagged by a stronger rule.
            if tags[idxs[0]] == "O":
                tags[idxs[0]] = "B-PER"
            for k in idxs[1:]:
                if tags[k] == "O":
                    tags[k] = "I-PER"

    def is_name_separator(idx: int) -> bool:
        """True when token ``idx`` is one of the separators 'é', 'eh', '-' or ':'."""
        # ':' is one of the characters in _PUNCT_STRIP, so it never survives into
        # `lower`; it has to be tested on the raw token.
        return lower[idx] in {"é", "eh", "-"} or str(tokens[idx]).strip() == ":"

    for i in range(n):
        if lower[i] == "nome":
            if (i + 1) < n and is_name_separator(i + 1):
                # "nome é" / "nome :" / "nome -" (this also covers "meu nome é")
                if (i + 2) < n:
                    label_name_from(i + 2)
            elif str(tokens[i]).strip().endswith(":") and (i + 1) < n:
                # "nome:" with the colon attached to the keyword
                label_name_from(i + 1)

        # "me chamo"
        if lower[i] == "me" and (i + 1) < n and lower[i + 1] == "chamo":
            if (i + 2) < n:
                label_name_from(i + 2)

        # "sou Fulano"
        if lower[i] == "sou":
            if (i + 1) < n:
                label_name_from(i + 1)

    # 3) Address heuristic (ADDR)
    #    - Detects common starters (SQS, SQN, SHDF, CRN, Rua, Av, Quadra, etc.)
    #    - Tags a short window of following tokens that look like address parts
    # NOTE(review): ADDR_STARTERS includes very common words such as "no" and "casa",
    # so every occurrence of them opens an address here — confirm that is intended.
    def label_addr_from(start_idx: int):
        """Tag up to 12 tokens starting at ``start_idx`` as one address."""
        idxs = []
        j = start_idx
        while j < n and len(idxs) < 12:
            tok_j = tokens[j]
            clean = _strip_punct(tok_j)
            if not clean:
                break
            lj = clean.lower()

            # The starter itself is always part of the address.
            if j == start_idx:
                idxs.append(j)
                j += 1
                continue

            # Accept typical address parts, name connectors and dash/slash separators.
            if _looks_like_addr_token(tok_j) or lj in NAME_CONNECTORS or lj in {"-", "/", "–"}:
                idxs.append(j)
                j += 1
                continue

            break

        # Apply BIO without overwriting tokens tagged by a stronger rule.
        if idxs:
            if tags[idxs[0]] == "O":
                tags[idxs[0]] = "B-ADDR"
            for k in idxs[1:]:
                if tags[k] == "O":
                    tags[k] = "I-ADDR"

    for i in range(n):
        # Direct trigger: the token itself is a starter.
        if tags[i] == "O" and lower[i] in ADDR_STARTERS:
            label_addr_from(i)

        # "na/no/em <starter>"
        if lower[i] in {"na", "no", "em"} and (i + 1) < n and tags[i + 1] == "O" and lower[i + 1] in ADDR_STARTERS:
            label_addr_from(i + 1)

    # 4) Normalise BIO (repairs invalid I- tags, etc.)
    tags = ensure_bio(tags)
    return tags

def auto_label_examples(examples: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Re-tag every example with ``auto_label_tokens``.

    Tokens are converted to ``str``; any existing ``ner_tags`` are ignored and
    replaced. The input list and its dicts are left untouched.
    """
    def _relabel(example: Dict[str, Any]) -> Dict[str, Any]:
        words = list(map(str, example["tokens"]))
        return {"tokens": words, "ner_tags": auto_label_tokens(words)}

    return [_relabel(example) for example in examples]


In [4]:
# =========================
# 4) Carregar dados de treino (JSON) e preparar Dataset HF
# =========================
# Directories searched for the input files, in priority order.
BASE_DIRS = [
    Path("/data"),           # as stated in the assignment
    Path("./data"),          # common alternative
    Path("."),               # current directory
    Path("/kaggle/input"),   # Kaggle inputs
    Path("/kaggle/working"), # Kaggle working
    Path("/mnt/data"),       # sandbox/local
]

JSON_NAME = "dados_treino_ner_250.json"
CSV_NAME = "amostra_com_labels_1 - Página1.csv"

# Exact name first; the glob patterns are fallbacks for renamed copies.
json_path = find_file_exact_or_pattern(
    filename=JSON_NAME,
    patterns=["*treino*ner*250*.json", "*dados*treino*ner*.json"],
    base_dirs=BASE_DIRS,
)

# The CSV is only located here; it is not read in this cell.
csv_path = find_file_exact_or_pattern(
    filename=CSV_NAME,
    patterns=[
        "*amostra_com_labels_1*Página1*.csv",
        "*amostra_com_labels_1*Pagina1*.csv",
        "*amostra*labels*Página1*.csv",
        "*amostra*labels*Pagina1*.csv",
    ],
    base_dirs=BASE_DIRS,
)

print("JSON:", json_path)
print("CSV :", csv_path)

examples = parse_ner_json(json_path)
print("N exemplos:", len(examples))
print("Exemplo[0] keys:", examples[0].keys())
print("Tokens (primeiros 20):", examples[0]["tokens"][:20])
print("Tags   (primeiros 20):", examples[0]["ner_tags"][:20])


# Sanity check: tag distribution
tag_counts = Counter(t for ex in examples for t in ex["ner_tags"])
non_o = sum(c for t, c in tag_counts.items() if t != "O")
print("\nTag distribution (top 20):", tag_counts.most_common(20))
print("Total tags:", sum(tag_counts.values()), "| Non-O:", non_o)

# If the JSON is ALL 'O', training is degenerate (num_labels=1 => loss is always 0).
# When that happens there are two options:
#  (A) Fix the JSON so that it contains annotated entities (recommended).
#  (B) Use the AUTO-LABEL fallback below (quick baseline).
AUTO_LABEL_IF_ONLY_O = True

if non_o == 0:
    # NOTE(review): this message announces that AUTO-LABEL will be applied and is
    # printed before the flag is checked, so it is also shown when the flag is False.
    msg = (
        "\n⚠️ ALERTA: Seu JSON não contém nenhuma entidade anotada (só 'O').\n"
        "Isso faz num_labels=1 e o treino NÃO aprende nada (loss=0 sempre).\n"
        "Vou aplicar AUTO-LABEL por regras (regex) para criar pseudo-labels e permitir treinar um baseline.\n"
        "Se você preferir corrigir o dataset manualmente, defina AUTO_LABEL_IF_ONLY_O=False e rode de novo.\n"
    )
    print(msg)
    if AUTO_LABEL_IF_ONLY_O:
        # Replace the all-'O' examples with rule-based pseudo-labels and recount.
        examples = auto_label_examples(examples)
        tag_counts = Counter(t for ex in examples for t in ex["ner_tags"])
        non_o = sum(c for t, c in tag_counts.items() if t != "O")
        print("Após AUTO-LABEL — Tag distribution (top 20):", tag_counts.most_common(20))
        print("Após AUTO-LABEL — Total tags:", sum(tag_counts.values()), "| Non-O:", non_o)
    else:
        raise ValueError("JSON sem entidades (apenas 'O'). Corrija o dataset ou ative AUTO_LABEL_IF_ONLY_O.")

# Label vocabulary and the two lookup tables handed to the model config.
label_list = build_label_list(examples)
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

print("\nLabels (num_labels=%d):" % len(label_list))
print(label_list)


JSON: data\dados_treino_ner_250.json
CSV : data\amostra_com_labels_1 - Página1.csv
N exemplos: 250
Exemplo[0] keys: dict_keys(['tokens', 'ner_tags'])
Tokens (primeiros 20): ['oi', 'na', 'fila', 'tinha', '42', 'pessoas', 'e', 'o', 'painel', 'ficou', 'travado', 'em', '19', 'pq', 'ninguém', 'responde', 'tá', 'complicado', 'demais']
Tags   (primeiros 20): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Tag distribution (top 20): [('O', 5455)]
Total tags: 5455 | Non-O: 0

⚠️ ALERTA: Seu JSON não contém nenhuma entidade anotada (só 'O').
Isso faz num_labels=1 e o treino NÃO aprende nada (loss=0 sempre).
Vou aplicar AUTO-LABEL por regras (regex) para criar pseudo-labels e permitir treinar um baseline.
Se você preferir corrigir o dataset manualmente, defina AUTO_LABEL_IF_ONLY_O=False e rode de novo.

Após AUTO-LABEL — Tag distribution (top 20): [('O', 4460), ('I-ADDR', 254), ('B-PER', 177), ('B-ADDR', 176), ('B-EMAIL', 110), ('B-PHONE', 104), ('

In [5]:
# =========================
# 5) Tokenização + alinhamento de labels (BIO) e split treino/val
# =========================
# Checkpoint used for both the tokenizer and the token-classification model.
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): the explicit shuffle may be redundant if train_test_split already
# shuffles — confirm against the `datasets` documentation.
dataset = Dataset.from_list(examples).shuffle(seed=SEED)
# Small split, only to monitor loss/metrics during training
dataset = dataset.train_test_split(test_size=0.1, seed=SEED)
# Rename the "test" part to "validation".
dataset = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

print(dataset)

def tokenize_and_align_labels(batch):
    """Tokenize a batch of word lists and align the word-level tags to sub-tokens.

    Reads ``batch["tokens"]`` and ``batch["ner_tags"]`` and relies on the
    module-level ``tokenizer`` and ``label2id``. For each example, the first
    sub-token of every word receives the word's label id; special tokens
    (word id ``None``) and the remaining sub-tokens of a word receive -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry (one list per example).
    """
    encoded = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        # padding is left to the data collator
    )

    aligned_batch = []
    for row in range(len(batch["tokens"])):
        word_ids = encoded.word_ids(batch_index=row)
        label_ids = [label2id[str(tag)] for tag in batch["ner_tags"][row]]

        # Pair each word id with the one before it (None before the first position);
        # a label is emitted only where a new word starts.
        previous_ids = [None, *word_ids[:-1]]
        aligned_batch.append([
            label_ids[wid] if (wid is not None and wid != prev) else -100
            for prev, wid in zip(previous_ids, word_ids)
        ])

    encoded["labels"] = aligned_batch
    return encoded

# Tokenize both splits in batches; the original columns are dropped so that only
# the columns returned by tokenize_and_align_labels remain.
tokenized_ds = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

# Collator that builds the training batches (padding is deferred to it, see the tokenizer call).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

print(tokenized_ds)


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 225
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 25
    })
})


Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 225
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


In [6]:
# =========================
# 6) Modelo + Trainer
# =========================
# Load the pretrained encoder with a token-classification head sized to label_list,
# pass both label lookup tables to the model, then move it to DEVICE.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
).to(DEVICE)

def compute_metrics(eval_pred):
    """Compute seqeval precision/recall/F1/accuracy from ``(logits, labels)``.

    Predictions are the argmax over the last axis of the logits. Positions whose
    gold label is -100 are dropped; the remaining ids are mapped to tag strings
    with the module-level ``id2label``.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    logits, labels = eval_pred
    pred_ids = np.argmax(logits, axis=-1)

    gold_seqs = []
    pred_seqs = []
    for pred_row, gold_row in zip(pred_ids, labels):
        kept = [(int(g), int(p)) for p, g in zip(pred_row, gold_row) if g != -100]
        gold_seqs.append([id2label[g] for g, _ in kept])
        pred_seqs.append([id2label[p] for _, p in kept])

    # seqeval may warn when there are no positive samples; zero_division=0 keeps this
    # robust. If the installed version rejects the keyword, retry without it.
    scorers = (precision_score, recall_score, f1_score)
    try:
        prec, rec, f1v = (score(gold_seqs, pred_seqs, zero_division=0) for score in scorers)
    except TypeError:
        prec, rec, f1v = (score(gold_seqs, pred_seqs) for score in scorers)

    metrics = {
        "precision": prec,
        "recall": rec,
        "f1": f1v,
        "accuracy": seqeval_accuracy(gold_seqs, pred_seqs),
    }
    return metrics

# Training configuration: 5 epochs, batch size 8, evaluation and checkpoint every epoch.
training_args = TrainingArguments(
    output_dir="./pii_ner_bertpt",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    # Per the TrainingArguments docs, these three restore the checkpoint with the
    # highest eval F1 (the "f1" key returned by compute_metrics) once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Bare expression: the notebook shows the Trainer's repr as the cell output.
trainer


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: neuralmind/bert-base-portuguese-cased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.bias                       | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok 

<transformers.trainer.Trainer at 0x1e57b511370>

In [7]:
# =========================
# 7) Treinamento
# =========================
# Run the training loop configured in `trainer` and print its summary.
train_result = trainer.train()
print(train_result)

# Evaluate on the validation split carved out of the training JSON.
print("\nAvaliação no split de validação (do JSON):")
eval_result = trainer.evaluate()
print(eval_result)


  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.722163,0.51578,0.421053,0.111111,0.175824,0.82548
2,0.282031,0.196516,0.791045,0.736111,0.76259,0.95288
3,0.119191,0.08669,0.941176,0.888889,0.914286,0.977312
4,0.06758,0.052428,0.946667,0.986111,0.965986,0.989529
5,0.044954,0.045086,0.959459,0.986111,0.972603,0.993019


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  super().__init__(loader)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  super().__init__(loader)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  super().__init__(loader)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  super().__init__(loader)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=145, training_loss=0.2900570425493964, metrics={'train_runtime': 248.6754, 'train_samples_per_second': 4.524, 'train_steps_per_second': 0.583, 'total_flos': 38643395404992.0, 'train_loss': 0.2900570425493964, 'epoch': 5.0})

Avaliação no split de validação (do JSON):


{'eval_loss': 0.04508648067712784, 'eval_precision': 0.9594594594594594, 'eval_recall': 0.9861111111111112, 'eval_f1': 0.9726027397260274, 'eval_accuracy': 0.9930191972076788, 'eval_runtime': 0.9197, 'eval_samples_per_second': 27.182, 'eval_steps_per_second': 4.349, 'epoch': 5.0}


In [8]:
# =========================
# 8) Salvar modelo treinado (para pipeline)
# =========================

# Change the path name if it already exists.
# NOTE(review): with exist_ok=True an existing directory is reused rather than
# rejected; presumably files already in it are overwritten — confirm.

SAVE_DIR = Path("./trained_ner_model")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Save model and tokenizer side by side in the same directory.
trainer.save_model(str(SAVE_DIR))
tokenizer.save_pretrained(str(SAVE_DIR))

print("Modelo salvo em:", SAVE_DIR.resolve())


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Modelo salvo em: D:\GithubHD\nlp-acesso-a-informacao\trained_ner_model
