# Comparação de Modelos de Tópicos (LDA × BERTopic × BERTopic-alt)

Este notebook percorre os *runs/trials* salvos em `data/processed/{lda,bertopic}/run_*/trial_*`,
recalcula métricas (C_npmi, C_v, diversidade, separação JSD), lê *outliers* (BERTopic) e consolida tudo
em tabelas e gráficos prontos para o TCC.

**Atenção:** defina o caminho do **corpus de referência tokenizado** (`REF_TEXTS_PATH`): um arquivo TXT com um documento por linha,
em que cada documento é uma sequência de tokens separados por espaço. Essa referência é usada pelo Gensim para calcular C_npmi e C_v.


In [ ]:
# Imports e paths
from pathlib import Path
import json, math, os, re, warnings
from datetime import datetime, timezone
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from itertools import combinations
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import jensenshannon
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Project layout. Every path below is derived from ROOT.
ROOT = Path.cwd()  # adjust if the notebook is not started from the project root
DATA = ROOT / "data"
PROCESSED = DATA / "processed"
# Each execution writes into its own UTC-timestamped results folder.
RESULTS = ROOT / "results" / "comparison" / datetime.now(timezone.utc).strftime("run_%Y%m%dT%H%M%SZ")
RESULTS.mkdir(parents=True, exist_ok=True)

# Path to the tokenized reference corpus (one document per line; tokens separated by spaces)
REF_TEXTS_PATH = DATA / "interim" / "lda" / "ref_corpus_tokens.txt"  # <-- ADJUST HERE
if not REF_TEXTS_PATH.exists():
    raise FileNotFoundError(f"Defina REF_TEXTS_PATH corretamente. Arquivo não encontrado: {REF_TEXTS_PATH}")

# Load the reference corpus. The context manager guarantees the file handle
# is closed as soon as the lines have been tokenized.
with open(REF_TEXTS_PATH, encoding="utf-8") as ref_file:
    # str.split() with no argument already discards leading/trailing whitespace.
    ref_texts = [line.split() for line in ref_file]
ref_dict = Dictionary(ref_texts)
ref_bow = [ref_dict.doc2bow(t) for t in ref_texts]
print(f"Corpus de referência: {len(ref_texts)} docs, vocabulário={len(ref_dict)}")


In [ ]:
# Funções auxiliares
def load_vocab(path: Path) -> list:
    """Read a vocabulary file and return its entries in file order.

    Each line of the UTF-8 file at *path* becomes one entry, with surrounding
    whitespace removed. Blank lines are kept as empty strings so that list
    positions keep matching the line numbers of the file.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Use a context manager so the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(path, encoding="utf-8") as handle:
        return [line.strip() for line in handle]

def top_words_from_matrix(mat: np.ndarray, vocab: list, topn: int = 10):
    """Return, for every row of *mat*, its *topn* highest-weighted vocabulary entries.

    Args:
        mat: topic-by-term weight matrix (K x |V|); column ``i`` is looked up
            as ``vocab[i]``.
        vocab: term strings indexed by column position.
        topn: number of terms kept per topic.

    Returns:
        A list with one entry per row of *mat*; each entry is a list of
        ``(term, weight)`` tuples sorted by decreasing weight.
    """
    def _ranked_terms(weights):
        # Negating the weights turns the ascending argsort into a descending ranking.
        order = np.argsort(-weights)[:topn]
        return [(vocab[col], float(weights[col])) for col in order]

    n_topics = mat.shape[0]
    return [_ranked_terms(mat[topic_id]) for topic_id in range(n_topics)]

def topic_diversity(topics_terms, topn=10):
    """Share of distinct words among the first *topn* terms of all topics.

    Args:
        topics_terms: iterable of topics, each a list of ``(word, weight)`` pairs.
        topn: how many leading terms of each topic are considered.

    Returns:
        ``unique words / counted term slots``, or ``np.nan`` when no term
        slot was counted (for example, an empty topic list).
    """
    distinct_words = set()
    slots = 0
    for terms in topics_terms:
        distinct_words.update(word for word, _ in terms[:topn])
        # A topic shorter than topn contributes only the terms it really has.
        slots += min(len(terms), topn)
    if not slots:
        return np.nan
    return len(distinct_words) / slots

def coherence_scores(topics_terms, measure: str):
    """Compute a Gensim coherence score for the given topics.

    Args:
        topics_terms: list of topics, each a list of ``(word, weight)`` pairs;
            only the words are passed on to Gensim.
        measure: value forwarded as the ``coherence`` argument of
            ``CoherenceModel`` (this notebook uses ``"c_npmi"`` and ``"c_v"``).

    Returns:
        The value of ``CoherenceModel.get_coherence()`` evaluated against the
        module-level reference corpus (``ref_texts`` / ``ref_dict``).
    """
    # Drop the weights: Gensim expects plain lists of tokens per topic.
    word_lists = [[word for word, _weight in topic] for topic in topics_terms]
    model = CoherenceModel(
        topics=word_lists,
        texts=ref_texts,
        dictionary=ref_dict,
        coherence=measure,
    )
    return model.get_coherence()

def avg_jsd_between_topics(mat: np.ndarray, eps=1e-12):
    """Mean pairwise Jensen-Shannon distance between the rows of *mat*.

    Each row is first rescaled by its sum (plus *eps*, which avoids a division
    by zero on all-zero rows). The value comes from
    ``scipy.spatial.distance.jensenshannon``, i.e. the Jensen-Shannon
    *distance* (square root of the divergence) with SciPy's default log base.

    Args:
        mat: topic-by-term weight matrix (K x |V|).
        eps: small constant added to every row sum before dividing.

    Returns:
        The average distance over all unordered row pairs, or ``np.nan`` when
        there are fewer than two rows.
    """
    prob = mat / (mat.sum(axis=1, keepdims=True) + eps)
    n_topics = prob.shape[0]
    if n_topics < 2:
        return np.nan
    distances = [
        float(jensenshannon(prob[a], prob[b]))
        for a, b in combinations(range(n_topics), 2)
    ]
    return float(np.mean(distances)) if distances else np.nan

def scan_trials(method_root: Path):
    """List every ``run_*/trial_*`` path found under *method_root*.

    Runs are visited in sorted order and, inside each run, trials are also
    sorted, so the result is deterministic.

    Returns:
        A list of ``Path`` objects; empty when *method_root* does not exist.
    """
    if not method_root.exists():
        return []
    return [
        trial_dir
        for run_dir in sorted(method_root.glob('run_*'))
        for trial_dir in sorted(run_dir.glob('trial_*'))
    ]


In [ ]:
# Scan LDA trials: recompute the metrics of every trial and store one row per trial.
lda_trials = scan_trials(PROCESSED / "lda")
print(f"LDA trials encontrados: {len(lda_trials)}")
# Explicit column order: when no trial loads, the frame (and its CSV) still
# carries the header, so later column lookups such as df['method'] keep working.
lda_columns = [
    "method", "run", "trial", "K", "c_npmi", "c_v",
    "diversity@10", "sep_jsd", "entropy_lt_halflnK_pct",
]
rows = []
for tdir in lda_trials:
    # Best effort: a broken trial is reported with a warning and skipped.
    try:
        phi = np.load(tdir/"phi_topics.npy")  # K x |V|
        # Vocabulary lookup order: two levels above the trial folder first,
        # otherwise the trial folder itself.
        shared_vocab = tdir.parent.parent / "vocab.txt"
        vocab = load_vocab(shared_vocab if shared_vocab.exists() else tdir / "vocab.txt")
        topics_terms = top_words_from_matrix(phi, vocab, topn=10)
        c_npmi = coherence_scores(topics_terms, "c_npmi")
        c_v    = coherence_scores(topics_terms, "c_v")
        div    = topic_diversity(topics_terms, topn=10)
        sep    = avg_jsd_between_topics(phi)
        theta_p = tdir/"theta_docs.npy"
        ent_low = np.nan  # stays NaN when the trial has no theta_docs.npy
        if theta_p.exists():
            theta = np.load(theta_p)
            # Per-row entropy (natural log), used as a sharpness proxy;
            # the small offset keeps log() finite on zero entries.
            # NOTE(review): assumes theta is (documents x K) - confirm against the exporter.
            p = theta + 1e-12
            ent = -np.sum(p*np.log(p), axis=1)
            # Share of rows with entropy < ln(K)/2. Despite the "_pct" column
            # name, the stored value is a fraction in [0, 1].
            thr = (math.log(theta.shape[1]))/2.0
            ent_low = float(np.mean(ent < thr))
        rows.append({
            "method":"lda",
            "run": tdir.parent.name,
            "trial": tdir.name,
            "K": int(phi.shape[0]),
            "c_npmi": c_npmi,
            "c_v": c_v,
            "diversity@10": div,
            "sep_jsd": sep,
            "entropy_lt_halflnK_pct": ent_low,
        })
    except Exception as e:
        warnings.warn(f"Falha ao processar {tdir}: {e}")
lda_df = pd.DataFrame(rows, columns=lda_columns)
display(lda_df.head())
lda_df.to_csv(RESULTS/"lda_trials.csv", index=False)


In [ ]:
# Scan BERTopic trials: recompute the metrics of every trial and store one row per trial.
bt_trials = scan_trials(PROCESSED / "bertopic")
print(f"BERTopic trials encontrados: {len(bt_trials)}")
# Explicit column order: when no trial loads, the frame (and its CSV) still
# carries the header, so later column lookups such as df['method'] keep working.
bt_columns = [
    "method", "run", "trial", "K", "c_npmi", "c_v",
    "diversity@10", "sep_jsd", "outliers_pct",
]
rows = []
for tdir in bt_trials:
    # Best effort: a broken trial is reported with a warning and skipped.
    try:
        # NOTE(review): if this matrix was exported directly from BERTopic it may
        # include a row for the outlier topic (-1), which would then count in K
        # and in every metric below - confirm against the exporter.
        ctf = np.load(tdir/"c_tf_idf.npy")  # K x |V|
        # Vocabulary lookup order: the trial folder first, otherwise its run folder.
        trial_vocab = tdir / "vocab.txt"
        vocab = load_vocab(trial_vocab if trial_vocab.exists() else tdir.parent / "vocab.txt")
        topics_terms = top_words_from_matrix(ctf, vocab, topn=10)
        c_npmi = coherence_scores(topics_terms, "c_npmi")
        c_v    = coherence_scores(topics_terms, "c_v")
        div    = topic_diversity(topics_terms, topn=10)
        sep    = avg_jsd_between_topics(ctf)
        outliers_pct = np.nan  # stays NaN without doc_topics.csv or its "topic" column
        labels_p = tdir/"doc_topics.csv"
        if labels_p.exists():
            # pandas is already imported at the top of the notebook as pd;
            # no per-iteration re-import is needed.
            lab = pd.read_csv(labels_p)
            if "topic" in lab.columns:
                # Share of documents labelled -1. Despite the "_pct" name,
                # the stored value is a fraction in [0, 1].
                outliers_pct = float((lab["topic"] == -1).mean())
        rows.append({
            "method":"bertopic",
            "run": tdir.parent.name,
            "trial": tdir.name,
            "K": int(ctf.shape[0]),
            "c_npmi": c_npmi,
            "c_v": c_v,
            "diversity@10": div,
            "sep_jsd": sep,
            "outliers_pct": outliers_pct,
        })
    except Exception as e:
        warnings.warn(f"Falha ao processar {tdir}: {e}")
bt_df = pd.DataFrame(rows, columns=bt_columns)
display(bt_df.head())
bt_df.to_csv(RESULTS/"bertopic_trials.csv", index=False)


In [ ]:
# Consolidation and quick plots: merge both methods into one table, save it,
# then draw a scatter (c_npmi vs diversity@10) and a per-method c_npmi boxplot.
df = pd.concat([lda_df, bt_df], ignore_index=True)
df.to_csv(RESULTS/"all_trials.csv", index=False)

print(f"Salvo: {RESULTS/'all_trials.csv'}")

if df.empty:
    # With no loaded trial there is nothing to plot, and the frame may not
    # even contain the 'method' column; skip the figures instead of failing.
    warnings.warn("Nenhum trial carregado; gráficos não gerados.")
else:
    plt.figure(figsize=(8,5))
    for m in sorted(df['method'].unique()):
        sub = df[df['method']==m]
        plt.scatter(sub['c_npmi'], sub['diversity@10'], label=m, alpha=0.7)
    plt.xlabel('c_npmi (ref)')
    plt.ylabel('diversity@10')
    plt.legend(); plt.title('c_npmi vs diversity@10')
    plt.tight_layout(); plt.show()

    # DataFrame.boxplot(by=...) opens its own figure unless an Axes is passed,
    # which would leave a blank 8x5 figure behind and ignore the requested
    # size. Create the Axes explicitly and hand it over.
    fig, ax = plt.subplots(figsize=(8,5))
    df.boxplot(column='c_npmi', by='method', ax=ax)
    # Clear the automatic "grouped by" suptitle that pandas adds.
    fig.suptitle(''); ax.set_title('Distribuição de c_npmi por método')
    fig.tight_layout(); plt.show()


In [ ]:
# Write the consolidated table into the template workbook (sheet "Replicacoes").
xlsx_out = ROOT / "reports" / "tables" / "TCC_comparacao_resultados.xlsx"
xlsx_out.parent.mkdir(parents=True, exist_ok=True)
# The default ExcelWriter mode ('w') recreates the file and would drop every
# other sheet of an existing workbook. When the workbook is already there,
# open it in append mode and replace only the target sheet.
# NOTE(review): `if_sheet_exists` requires pandas >= 1.3 - confirm the environment.
if xlsx_out.exists():
    writer_kwargs = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_kwargs = {"mode": "w"}
with pd.ExcelWriter(xlsx_out, engine='openpyxl', **writer_kwargs) as writer:
    df.to_excel(writer, index=False, sheet_name='Replicacoes')
print(f"Planilha escrita em: {xlsx_out}")
