In [8]:
# Imports, paths e seed

import os, json, math, platform, warnings, time
from pathlib import Path
from datetime import datetime
from datetime import timezone
from itertools import product
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from gensim.models.coherencemodel import CoherenceModel

BASE = Path("../../data")
IN_CSV = BASE/"interim"/"bertopic"/"prep.csv"
OUT_DIR = BASE/"processed"/"bertopic"
FIGS = Path("../../reports/figs")

OUT_DIR.mkdir(parents=True, exist_ok=True)
FIGS.mkdir(parents=True, exist_ok=True)

SEED = 42
np.random.seed(SEED)

def _get_ver(pkg):
    try:
        import importlib.metadata as im
        return im.version(pkg)
    except Exception:
        return "NA"

versions = {
    "python": platform.python_version(),
    "bertopic": _get_ver("bertopic"),
    "sentence-transformers": _get_ver("sentence-transformers"),
    "umap-learn": _get_ver("umap-learn"),
    "hdbscan": _get_ver("hdbscan"),
    "scikit-learn": _get_ver("scikit-learn"),
    "gensim": _get_ver("gensim"),
    "numpy": _get_ver("numpy"),
    "pandas": _get_ver("pandas")
}
print("VERSIONS:", json.dumps(versions, indent=2, ensure_ascii=False))

VERSIONS: {
  "python": "3.12.2",
  "bertopic": "0.16.0",
  "sentence-transformers": "2.6.1",
  "umap-learn": "0.5.6",
  "hdbscan": "0.8.33",
  "scikit-learn": "1.4.2",
  "gensim": "4.3.3",
  "numpy": "1.26.4",
  "pandas": "2.2.2"
}


In [9]:
# Stopwords em português (para CountVectorizer)
try:
    from nltk.corpus import stopwords as nltk_stop
    STOP_WORDS_PT = sorted(set(nltk_stop.words('portuguese')))
    print(f"STOP_WORDS_PT carregadas: {len(STOP_WORDS_PT)} termos")
except Exception as e:
    # fallback mínimo para não travar caso NLTK falhe (mantém pipeline funcionando)
    print("[WARN] NLTK indisponível; usando fallback reduzido:", e)
    STOP_WORDS_PT = sorted(set("""
a à acerca agora ai ainda além algo alguem alguns algumas algum alguma ambos ambas ante antes ao aos após aquela aquelas aquele aqueles aquilo as assim até através cada quase com como contra contudo cujo cuja cujos cujas da das de dela delas dele deles depois desde desta deste disso disto do dos e é ela elas ele eles em entre era eram essa essas esse esses esta estas este estes estou eu foi foram fosse fossem fui há isso isto já la lá lhe lhes mais mas me mesmo mesmoa mesmos mesmas minha minhas meu meus muito muita muitas muitos não na nas nem no nos nós o os ou para pela pelas pelo pelos pouca poucas pouco poucos por porque porém pra qual quais quando que quem se sem sempre sendo ser seu seus sob sobre sua suas também tão tão tem tenho ter teu teus tua tuas tudo um uma umas uns
""".split()))

STOP_WORDS_PT carregadas: 207 termos


In [10]:
# Leitura e preparação dos textos

df = pd.read_csv(IN_CSV, encoding="utf-8")

required_cols = {"DOC_ID", "resumo", "RESUMO_PREP_BERTOPIC"}
missing = required_cols - set(df.columns)
assert not missing, f"Colunas ausentes: {missing}"

docs_for_embeddings = (
    df["resumo"].fillna(df["RESUMO_PREP_BERTOPIC"]).astype(str).tolist()
)
docs_for_vectorizer = (
    df["RESUMO_PREP_BERTOPIC"].fillna("").astype(str).tolist()
)

lens = pd.Series([len(x) for x in docs_for_vectorizer])
print("N documentos:", len(df))
print("Docs vazios (vectorizer):", sum([len(x.strip()) == 0 for x in docs_for_vectorizer]))
print("Comprimento (chars) — quantis:", lens.quantile([0.0, 0.25, 0.5, 0.75, 0.95, 1.0]).to_dict())


N documentos: 423
Docs vazios (vectorizer): 14
Comprimento (chars) — quantis: {0.0: 0.0, 0.25: 894.5, 0.5: 1123.0, 0.75: 1322.5, 0.95: 1729.2999999999997, 1.0: 2309.0}


In [11]:
# Embeddings (SBERT)

# Modelo multilíngue leve que preserva semântica de sentenças (SBERT)
# BERT → base dos embeddings contextuais [Devlin et al.] usados em SBERT
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
sbert_name = "paraphrase-multilingual-MiniLM-L12-v2"

start = time.time()
sbert = SentenceTransformer(sbert_name, device=device)
embeddings = sbert.encode(
    docs_for_embeddings,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True
)
elapsed = time.time() - start
print(f"Embeddings shape={embeddings.shape} | device={device} | time={elapsed:.1f}s")


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Embeddings shape=(423, 384) | device=cpu | time=11.3s


In [12]:
# Grade de hiperparâmetros (mini-sweep)

from math import floor
N_DOCS = len(docs_for_vectorizer)

# === EXPANDE A GRADE PARA MAIS GRANULARIDADE ===
grid_umap = {
    "n_neighbors": [5, 10, 15, 30],
    "n_components": [5, 10, 15],
    "min_dist": [0.0],
    "metric": ["cosine"],
    "random_state": [SEED],
}

grid_hdb = {
    "min_cluster_size": [5, 8, 10, 15],
    "min_samples": [1, 5, None],
    "metric": ["euclidean"],
    "cluster_selection_method": ["eom", "leaf"],  # incluímos 'leaf' para granularidade
    "prediction_data": [True],
}

grid_vec = {
    "ngram_range": [(1,1), (1,2), (1,3)],
    "stop_words": [STOP_WORDS_PT],
    "min_df": [1, 2, 3],
    "max_df": [0.95, 0.99],
}

# Produto cartesiano (com UMAP)
from itertools import product
combos_all = []
for u, h, v in product(
    product(*grid_umap.values()),
    product(*grid_hdb.values()),
    product(*grid_vec.values())
):
    umap_kwargs = dict(zip(grid_umap.keys(), u))
    hdb_kwargs = dict(zip(grid_hdb.keys(), h))
    vec_kwargs = dict(zip(grid_vec.keys(), v))
    combos_all.append({"umap": umap_kwargs, "hdb": hdb_kwargs, "vec": vec_kwargs})

# Baseline (a mesma da sua versão anterior)
baseline = {
    "umap": {"n_neighbors": 10, "n_components": 5, "min_dist": 0.0, "metric": "cosine", "random_state": SEED},
    "hdb": {"min_cluster_size": 10, "min_samples": None, "metric": "euclidean",
            "cluster_selection_method": "eom", "prediction_data": True},
    "vec": {"ngram_range": (1,1), "stop_words": STOP_WORDS_PT, "min_df": 2, "max_df": 0.9},
}

def _same(a, b): 
    return json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True)

combos = [baseline] + [c for c in combos_all if not _same(c, baseline)]

# ---- filtro para evitar "max_df corresponds to < documents than min_df"
def _valid_vec(v, n_docs: int) -> bool:
    min_df = v["min_df"]
    max_df = v["max_df"]
    max_docs = floor(max_df * n_docs) if isinstance(max_df, float) else int(max_df)
    return max_docs >= int(min_df)

combos = [c for c in combos if _valid_vec(c["vec"], N_DOCS)]

# === Amostragem reprodutível para conter o tamanho do sweep ===
# Mantemos baseline + 39 aleatórios (total 40)
N_MAX = 40
rng = np.random.RandomState(SEED)
if len(combos) > N_MAX:
    others = combos[1:]
    idx = rng.choice(len(others), size=N_MAX-1, replace=False)
    combos = [combos[0]] + [others[i] for i in sorted(idx)]

# === LOTE DE CONTROLE "SEM UMAP" (opcional, pequeno) ===
# Gera combos onde 'umap' é None (HDBSCAN direto nos embeddings)
combos_noumap_all = []
for h, v in product(
    product(*grid_hdb.values()),
    product(*grid_vec.values())
):
    hdb_kwargs = dict(zip(grid_hdb.keys(), h))
    vec_kwargs = dict(zip(grid_vec.keys(), v))
    combos_noumap_all.append({"umap": None, "hdb": hdb_kwargs, "vec": vec_kwargs})

# Aplica o mesmo filtro de min_df/max_df
combos_noumap_all = [c for c in combos_noumap_all if _valid_vec(c["vec"], N_DOCS)]

# Mescla uma pequena amostra (até 10) de combos sem UMAP
take = min(10, len(combos_noumap_all))
idx_nu = rng.choice(len(combos_noumap_all), size=take, replace=False) if take > 0 else []
combos = combos + [combos_noumap_all[i] for i in sorted(idx_nu)]

# Se passou de N_MAX, reamostra mantendo baseline na frente
if len(combos) > N_MAX:
    others = combos[1:]
    # escolhe N_MAX-1 dentre o restante (com UMAP e sem UMAP misturados)
    idx = rng.choice(len(others), size=N_MAX-1, replace=False)
    combos = [combos[0]] + [others[i] for i in sorted(idx)]

print(f"Combos válidos após filtro/amostragem (incluindo sem UMAP): {len(combos)}")
print("Exemplo (0):", combos[0])

Combos válidos após filtro/amostragem (incluindo sem UMAP): 40
Exemplo (0): {'umap': {'n_neighbors': 10, 'n_components': 5, 'min_dist': 0.0, 'metric': 'cosine', 'random_state': 42}, 'hdb': {'min_cluster_size': 10, 'min_samples': None, 'metric': 'euclidean', 'cluster_selection_method': 'eom', 'prediction_data': True}, 'vec': {'ngram_range': (1, 1), 'stop_words': ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'até', 'com', 'como', 'da', 'das', 'de', 'dela', 'delas', 'dele', 'deles', 'depois', 'do', 'dos', 'e', 'ela', 'elas', 'ele', 'eles', 'em', 'entre', 'era', 'eram', 'essa', 'essas', 'esse', 'esses', 'esta', 'estamos', 'estar', 'estas', 'estava', 'estavam', 'este', 'esteja', 'estejam', 'estejamos', 'estes', 'esteve', 'estive', 'estivemos', 'estiver', 'estivera', 'estiveram', 'estiverem', 'estivermos', 'estivesse', 'estivessem', 'estivéramos', 'estivéssemos', 'estou', 'está', 'estávamos', 'estão', 'eu', 'foi', 'fomos', 'for', 'fora', 'foram', 'forem', 'form

In [13]:
# Funções auxiliares

def write_json(path: Path, data: Dict[str, Any]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

def extract_topic_words(topic_model: BERTopic, topk: int = 10) -> List[List[str]]:
    topics_dict = topic_model.get_topics()
    topic_words: List[List[str]] = []
    for tid, pairs in topics_dict.items():
        if tid == -1:
            continue
        # BERTopic pode retornar None/[] para tópicos degenerados
        if not pairs:
            continue
        # Garante apenas strings não vazias; respeita topk
        toks = [str(w).strip() for (w, _) in pairs[:topk] if isinstance(w, (str, bytes)) and str(w).strip()]
        if toks:
            topic_words.append(toks)
    return topic_words

def simple_tokenize_docs(docs: List[str]) -> List[List[str]]:
    # Tokenização simples coerente com o CountVectorizer (lower/space)
    return [str(d).lower().split() for d in docs]

def topic_diversity(topic_words: List[List[str]], topk: int = 10) -> float:
    if not topic_words:
        return float("nan")
    uniq = len(set([w for tw in topic_words for w in tw[:topk]]))
    return uniq / (topk * len(topic_words))

def outlier_rate(topics: List[int]) -> float:
    topics = np.array(topics)
    return float(np.mean(topics == -1))

def _sanitize_topics_for_gensim(topic_words: List[List[str]]) -> List[List[str]]:
    clean: List[List[str]] = []
    for tw in topic_words:
        if not isinstance(tw, (list, tuple)):
            continue
        toks = [str(t).strip() for t in tw if isinstance(t, (str, bytes)) and str(t).strip()]
        if toks:
            clean.append(toks)
    return clean

def compute_coherences(topic_words, tokenized_docs):
    # c_npmi e c_uci com 'texts' (ambas aceitam texts); sanear antes de passar ao gensim
    from gensim.corpora import Dictionary
    from gensim.models.coherencemodel import CoherenceModel

    cleaned = _sanitize_topics_for_gensim(topic_words)
    if not cleaned:
        return float("nan"), float("nan")

    dictionary = Dictionary(tokenized_docs)

    c_npmi = CoherenceModel(
        topics=cleaned, texts=tokenized_docs, dictionary=dictionary, coherence="c_npmi"
    ).get_coherence()

    c_uci = CoherenceModel(
        topics=cleaned, texts=tokenized_docs, dictionary=dictionary, coherence="c_uci"
    ).get_coherence()

    return float(c_npmi), float(c_uci)

In [14]:
# Loop do sweep: treino, avaliação e salvamento por trial

# === Isolamento por execução: salva em subpasta RUN_ID ===
RUN_ID = datetime.now(timezone.utc).strftime("run_%Y%m%dT%H%M%SZ")
RUN_DIR = OUT_DIR / RUN_ID
RUN_DIR.mkdir(parents=True, exist_ok=True)
write_json(RUN_DIR/"run_info.json", {
    "run_id": RUN_ID, "seed": SEED, "versions": versions,
    "n_docs": len(docs_for_vectorizer), "timestamp_utc": datetime.now(timezone.utc).isoformat()
})

tokenized_docs = simple_tokenize_docs(docs_for_vectorizer)
trials_summary = []
errors_log = []

N_DOCS_EFF = len(docs_for_vectorizer)  # usado para sanear min_df/max_df

for i, cfg in enumerate(tqdm(combos, desc="Sweep")):
    trial_id = f"trial_{i:02d}"
    tdir = RUN_DIR / trial_id
    tdir.mkdir(parents=True, exist_ok=True)

    # --- SANEAMENTO do Vectorizer (evita "max_df corresponds to < documents than min_df" e string 'portuguese')
    vec_kwargs = cfg["vec"].copy()
    sw = vec_kwargs.get("stop_words", None)
    if isinstance(sw, str):
        vec_kwargs["stop_words"] = STOP_WORDS_PT

    min_df = vec_kwargs.get("min_df", 1)
    max_df = vec_kwargs.get("max_df", 1.0)
    min_df_prop = (min_df / max(N_DOCS_EFF, 1)) if isinstance(min_df, int) else float(min_df)
    max_df_prop = (max_df / max(N_DOCS_EFF, 1)) if isinstance(max_df, int) else float(max_df)
    if max_df_prop < min_df_prop:
        max_df_prop = min(1.0, max(min_df_prop + 1e-9, 0.999))
    vec_kwargs["min_df"] = min_df_prop
    vec_kwargs["max_df"] = max_df_prop

    # Instâncias dos componentes
    vectorizer_model = CountVectorizer(**vec_kwargs)
    # >>> CORREÇÃO: aceitar combos "sem UMAP"
    umap_model = None if (cfg["umap"] is None) else UMAP(**cfg["umap"])
    hdbscan_model = HDBSCAN(**cfg["hdb"])

    topic_model = BERTopic(
        embedding_model=None,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,          # pode ser None
        hdbscan_model=hdbscan_model,
        calculate_probabilities=True,
        verbose=False
    )

    started = datetime.now(timezone.utc).isoformat()
    t0 = time.time()
    try:
        topics, probs = topic_model.fit_transform(
            documents=docs_for_vectorizer,
            embeddings=embeddings
        )
        fit_time = time.time() - t0

        # Artefatos primários
        topic_info = topic_model.get_topic_info()
        topic_info.to_csv(tdir/"topic_info.csv", index=False, encoding="utf-8")

        # Doc→tópico
        if probs is not None and hasattr(probs, "__array__"):
            rowmax = np.nanmax(probs, axis=1)
            doc_prob = [float(x) if np.isfinite(x) else np.nan for x in rowmax]
        else:
            doc_prob = [np.nan] * len(topics)

        doc_topics = pd.DataFrame({
            "DOC_ID": df["DOC_ID"].values,
            "topic": topics,
            "prob": doc_prob
        })
        # mantém alt e compatível com o comparador
        doc_topics.to_csv(tdir/"doc_topics_alt.csv", index=False, encoding="utf-8")
        doc_topics.to_csv(tdir/"doc_topics.csv", index=False, encoding="utf-8")

        # c-TF-IDF e vocabulário
        ctf = topic_model.c_tf_idf_
        arr = ctf.toarray() if hasattr(ctf, "toarray") else np.asarray(ctf)
        np.save(tdir/"c_tf_idf.npy", arr)

        vocab = vectorizer_model.get_feature_names_out()
        with open(tdir/"vocab.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(map(str, vocab)))

        # === Métricas (com blindagens) ===
        topic_words = extract_topic_words(topic_model, topk=10)
        div = topic_diversity(topic_words, topk=10)

        # usa a função já definida no notebook (não 'coherence_npmi', que não existe aqui)
        try:
            c_npmi, c_uci = compute_coherences(topic_words, tokenized_docs)
        except Exception as e:
            warnings.warn(f"Coerência falhou em {trial_id}: {e}")
            c_npmi, c_uci = np.nan, np.nan

        # >>> CORREÇÃO: converte para array antes de comparar com -1
        topics_arr = np.asarray(topics)
        out_pct = float(np.mean(topics_arr == -1))

        n_topics_no_outlier = int((topic_info["Topic"] != -1).sum())

        metrics = {
            "c_npmi": c_npmi,
            "c_uci": c_uci,
            "topic_diversity@10": div,
            "outlier_rate": out_pct,
            "n_topics_excl_-1": n_topics_no_outlier,
            "fit_time_sec": fit_time
        }
        write_json(tdir/"metrics.json", metrics)

        # Metadados
        run_md = {
            "trial_id": trial_id,
            "started_utc": started,
            "finished_utc": datetime.now(timezone.utc).isoformat(),
            "seed": SEED,
            "versions": versions,
            "paths": {
                "input_csv": str(IN_CSV.resolve()),
                "trial_dir": str(tdir.resolve())
            },
            "params": {
                "umap": cfg["umap"],
                "hdbscan": cfg["hdb"],
                "vectorizer": {
                    **{k: (list(v) if k=="ngram_range" else v) for k, v in cfg["vec"].items()},
                    "_effective_min_df": vec_kwargs["min_df"],
                    "_effective_max_df": vec_kwargs["max_df"],
                },
                "sbert_model": sbert_name,
            },
            "sizes": {
                "n_docs": len(df),
                "emb_dim": int(embeddings.shape[1])
            }
        }
        write_json(tdir/"run_metadata.json", run_md)

        trials_summary.append({
            "trial_id": trial_id,
            "metrics": metrics,
            "cfg": cfg
        })

    except Exception as e:
        err = {"trial_id": trial_id, "error": repr(e)}
        errors_log.append(err)
        write_json(tdir/"error.json", err)
        print(f"[WARN] {trial_id} falhou: {e}")

Sweep:   0%|          | 0/40 [00:00<?, ?it/s]

In [15]:
# depois do sweep:
def _get(d: dict, path, default=np.nan):
    """Acesso seguro a d[k1][k2]...; retorna default se faltar algo ou se algum nível for None."""
    cur = d
    for k in path:
        if cur is None:
            return default
        if isinstance(cur, dict):
            cur = cur.get(k, default)
        else:
            return default
    return cur

rows = []
for t in trials_summary:
    cfg = t["cfg"]
    rows.append(dict(
        trial_id=t["trial_id"],
        n_topics=t["metrics"]["n_topics_excl_-1"],
        out_rate=t["metrics"]["outlier_rate"],
        c_npmi=t["metrics"]["c_npmi"],
        c_uci=t["metrics"]["c_uci"],

        # UMAP pode ser None nos combos "sem UMAP"
        umap_used = cfg["umap"] is not None,
        umap_n_neighbors = _get(cfg, ["umap", "n_neighbors"]),
        umap_n_components = _get(cfg, ["umap", "n_components"]),
        umap_min_dist = _get(cfg, ["umap", "min_dist"]),

        # HDBSCAN
        hdb_min_cluster_size = cfg["hdb"]["min_cluster_size"],
        hdb_min_samples = cfg["hdb"]["min_samples"],
        hdb_method = cfg["hdb"]["cluster_selection_method"],

        # Vectorizer (valores conforme definidos na grade original)
        vec_ngr = cfg["vec"]["ngram_range"],
        vec_min_df = cfg["vec"]["min_df"],
        vec_max_df = cfg["vec"]["max_df"],
    ))

df_sweep = pd.DataFrame(rows)

# Salva um resumo desta execução
df_sweep.to_csv(RUN_DIR/"sweep_summary_alt.csv", index=False, encoding="utf-8")

# Top em nº de tópicos
display(df_sweep.sort_values("n_topics", ascending=False).head(10))

# Efeito de parâmetros-chave (inclui flag umap_used para comparar com/sem UMAP)
display(
    df_sweep
      .groupby(["umap_used", "hdb_method","hdb_min_cluster_size","hdb_min_samples"])
      .agg(n_topics_mean=("n_topics","mean"),
           n_topics_max=("n_topics","max"),
           trials=("trial_id","count"))
      .sort_values(["n_topics_max","n_topics_mean","trials"], ascending=False)
      .head(20)
)

Unnamed: 0,trial_id,n_topics,out_rate,c_npmi,c_uci,umap_used,umap_n_neighbors,umap_n_components,umap_min_dist,hdb_min_cluster_size,hdb_min_samples,hdb_method,vec_ngr,vec_min_df,vec_max_df
32,trial_32,39,0.208038,-0.229013,-8.33127,False,,,,5,1.0,leaf,"(1, 3)",2,0.95
3,trial_03,31,0.229314,-0.263635,-8.896085,True,5.0,10.0,0.0,5,5.0,leaf,"(1, 1)",1,0.95
4,trial_04,31,0.229314,-0.264518,-8.878906,True,5.0,10.0,0.0,5,5.0,leaf,"(1, 1)",3,0.99
7,trial_07,27,0.170213,-0.227258,-8.13898,True,10.0,5.0,0.0,8,1.0,leaf,"(1, 2)",3,0.95
15,trial_15,25,0.196217,-0.267536,-9.034093,True,15.0,5.0,0.0,8,1.0,leaf,"(1, 1)",3,0.95
35,trial_35,23,0.333333,-0.203839,-7.658139,False,,,,5,,leaf,"(1, 3)",2,0.95
5,trial_05,23,0.146572,-0.212162,-7.861008,True,5.0,15.0,0.0,5,,eom,"(1, 2)",2,0.95
34,trial_34,23,0.3026,-0.201547,-7.457811,False,,,,5,,leaf,"(1, 3)",1,0.99
19,trial_19,22,0.392435,-0.200098,-7.654399,True,30.0,5.0,0.0,5,,leaf,"(1, 2)",3,0.99
6,trial_06,18,0.113475,-0.184597,-6.23382,True,5.0,15.0,0.0,10,1.0,eom,"(1, 3)",2,0.95


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n_topics_mean,n_topics_max,trials
umap_used,hdb_method,hdb_min_cluster_size,hdb_min_samples,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,leaf,5,1.0,39.0,39,1
True,leaf,5,5.0,26.666667,31,3
True,leaf,8,1.0,26.0,27,2
True,eom,10,1.0,18.0,18,1
False,leaf,8,5.0,16.0,16,1
True,leaf,8,5.0,15.0,15,2
False,leaf,15,1.0,11.0,11,1
True,leaf,10,5.0,11.0,11,1
True,leaf,15,1.0,10.0,11,3
True,eom,15,1.0,10.0,10,1


In [16]:
# Ranking e escolha da melhor configuração

# Ranking composto (ordenação por: c_npmi desc, topic_diversity desc, outlier_rate asc + penalização por n_tópicos)
if not trials_summary:
    raise RuntimeError("Nenhum trial concluído. Verifique errors_log e configurações.")

df_rank = pd.DataFrame([{
    "trial_id": t["trial_id"],
    "c_npmi": t["metrics"]["c_npmi"],
    "topic_diversity": t["metrics"]["topic_diversity@10"],
    "outlier_rate": t["metrics"]["outlier_rate"],
    "n_topics": t["metrics"]["n_topics_excl_-1"]
} for t in trials_summary])

# Garante numéricos para evitar NaNs silenciosos em rank
for col in ["c_npmi", "topic_diversity", "outlier_rate", "n_topics"]:
    df_rank[col] = pd.to_numeric(df_rank[col], errors="coerce")

# Ranks principais
df_rank["r1"] = df_rank["c_npmi"].rank(ascending=False, method="min")
df_rank["r2"] = df_rank["topic_diversity"].rank(ascending=False, method="min")
df_rank["r3"] = df_rank["outlier_rate"].rank(ascending=True, method="min")

# Penalização por distância a uma faixa alvo (ajuste conforme o domínio)
LOW, HIGH = 10, 40
def penalty(n):
    if pd.isna(n):
        return 1.0
    n = float(n)
    if n < LOW:
        return (LOW - n) / LOW
    if n > HIGH:
        return (n - HIGH) / HIGH
    return 0.0

df_rank["n_topics_penalty"] = df_rank["n_topics"].apply(penalty)

# Peso da penalização (ajustável)
PENALTY_WEIGHT = 5.0
df_rank["rank_sum"] = df_rank[["r1", "r2", "r3"]].sum(axis=1) + PENALTY_WEIGHT * df_rank["n_topics_penalty"]

# Ordena, escolhe vencedora e persiste
df_rank = df_rank.sort_values(["rank_sum", "r1", "r2", "r3"]).reset_index(drop=True)
best_trial_id = df_rank.loc[0, "trial_id"]
best_row = df_rank.loc[0].to_dict()

print("TOP-10 trials por critério composto:")
display(df_rank.head(10))
print("Vencedora:", best_trial_id)

# Persistência no RUN_DIR
df_rank.to_csv(RUN_DIR/"ranking_alt.csv", index=False, encoding="utf-8")
write_json(RUN_DIR/"winner_alt.json", {
    "run_id": RUN_ID,
    "best_trial_id": best_trial_id,
    "best_row": best_row,
})

TOP-10 trials por critério composto:


Unnamed: 0,trial_id,c_npmi,topic_diversity,outlier_rate,n_topics,r1,r2,r3,n_topics_penalty,rank_sum
0,trial_00,-0.05465,1.0,0.0,2,1.0,1.0,1.0,0.8,7.0
1,trial_01,-0.05465,1.0,0.0,2,1.0,1.0,1.0,0.8,7.0
2,trial_10,-0.05465,1.0,0.0,2,1.0,1.0,1.0,0.8,7.0
3,trial_11,-0.05465,1.0,0.0,2,1.0,1.0,1.0,0.8,7.0
4,trial_13,-0.05465,1.0,0.0,2,1.0,1.0,1.0,0.8,7.0
5,trial_16,-0.131737,1.0,0.0,2,6.0,1.0,1.0,0.8,12.0
6,trial_29,-0.131737,1.0,0.0,2,6.0,1.0,1.0,0.8,12.0
7,trial_37,-0.131737,1.0,0.0,2,6.0,1.0,1.0,0.8,12.0
8,trial_38,-0.131737,1.0,0.0,2,6.0,1.0,1.0,0.8,12.0
9,trial_20,-0.131737,0.7,0.0,2,6.0,38.0,1.0,0.8,49.0


Vencedora: trial_00
