In [3]:
import sys, platform
# Show which interpreter binary and Python version this kernel is running.
for env_value in (sys.executable, platform.python_version()):
    print(env_value)

c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\envs\lda\Scripts\python.exe
3.12.2


In [2]:
# Checagem de ambiente

import sys, platform, warnings, os, json, random, math
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import gensim
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Silence every warning category for the rest of the session and set the
# figure resolution used by matplotlib.
warnings.filterwarnings("ignore")
matplotlib.rcParams["figure.dpi"] = 120

# Report the interpreter, OS and library versions, one per line.
for env_label, env_value in (
    ("Python:", sys.version),
    ("OS:", platform.platform()),
    ("gensim:", gensim.__version__),
    ("pyLDAvis:", pyLDAvis.__version__),
    ("numpy:", np.__version__),
    ("matplotlib:", matplotlib.__version__),
):
    print(env_label, env_value)

ImportError: No module named 'sklearn.__check_build._check_build'
___________________________________________________________________________
Contents of c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\envs\lda\Lib\site-packages\sklearn\__check_build:
_check_build.cp311-win_amd64.pyd__init__.py               __pycache__
___________________________________________________________________________
It seems that scikit-learn has not been built correctly.

If you have installed scikit-learn from source, please do not forget
to build the package before using it: run `python setup.py install` or
`make` in the source directory.

If you have used an installer, please check that it is suited for your
Python version, your operating system and your platform.

In [None]:
# Definição de paths

from pathlib import Path

def find_repo_root(start: Path = Path.cwd()) -> Path:
    for p in [start, *start.parents]:
        target = p / "data" / "interim" / "lda" / "vocab_bow.dict"
        if target.exists():
            return p
    raise FileNotFoundError(
        "Raiz do projeto não encontrada. "
        "Verifique se existe 'data/interim/lda/vocab_bow.dict' a partir da raiz."
    )

# Resolve the project root once; every path below is derived from it.
REPO = find_repo_root()
print("REPO =", REPO)

# Inputs
_interim_lda = REPO / "data" / "interim" / "lda"
PATH_BOW_MM         = _interim_lda / "bow.mm"
PATH_VOCAB_DICT     = _interim_lda / "vocab_bow.dict"
PATH_BOW_INDEX_CSV  = _interim_lda / "bow_index.csv"
PATH_VOCAB_TERMS    = _interim_lda / "vocab_terms.csv"

# Outputs (directories are created if they do not exist yet)
DIR_PROCESSED_LDA   = REPO / "data" / "processed" / "lda"
DIR_REPORTS_FIGS    = REPO / "reports" / "figs"
for out_dir in (DIR_PROCESSED_LDA, DIR_REPORTS_FIGS):
    out_dir.mkdir(parents=True, exist_ok=True)

# Report whether each expected input file is present.
for input_path in (PATH_VOCAB_DICT, PATH_BOW_MM, PATH_BOW_INDEX_CSV, PATH_VOCAB_TERMS):
    print(input_path, "EXISTS?", input_path.exists())

In [None]:
# Carregar corpus e dicionário

# Load the serialized bag-of-words corpus and its gensim dictionary.
corpus = corpora.MmCorpus(str(PATH_BOW_MM))
dictionary = corpora.Dictionary.load(str(PATH_VOCAB_DICT))

# Companion CSV tables (document index and vocabulary terms).
bow_index, vocab_df = (
    pd.read_csv(csv_path) for csv_path in (PATH_BOW_INDEX_CSV, PATH_VOCAB_TERMS)
)

print(dictionary)
print(f"n_docs (corpus): {len(corpus)}")
bow_index.head(3)

In [None]:
# Utilitários (treino, coerências, estabilidade)

from typing import Dict, List
from itertools import combinations

RANDOM_STATE = 42  # base seed; also the default seed of train_lda

def train_lda(k:int,
              corpus,
              dictionary,
              passes:int=10,
              iterations:int=400,
              chunksize:int=2000,
              alpha='auto',
              eta='auto',
              random_state:int=RANDOM_STATE) -> LdaModel:
    """Fit and return a gensim ``LdaModel`` with ``k`` topics.

    Args:
        k: Number of topics (forwarded as ``num_topics``).
        corpus: Bag-of-words corpus to train on.
        dictionary: Id-to-word mapping (forwarded as ``id2word``).
        passes: Forwarded to ``LdaModel(passes=...)``.
        iterations: Forwarded to ``LdaModel(iterations=...)``.
        chunksize: Forwarded to ``LdaModel(chunksize=...)``.
        alpha: Forwarded to ``LdaModel(alpha=...)``; defaults to ``'auto'``.
        eta: Forwarded to ``LdaModel(eta=...)``; defaults to ``'auto'``.
        random_state: Seed forwarded to ``LdaModel``.

    Returns:
        The trained ``LdaModel``.
    """
    lda_kwargs = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=random_state,
        chunksize=chunksize,
        passes=passes,
        iterations=iterations,
        alpha=alpha,
        eta=eta,
        # NOTE(review): presumably turns off gensim's periodic perplexity
        # evaluation during training — confirm against the gensim docs.
        eval_every=None,
    )
    return LdaModel(**lda_kwargs)

def compute_coherences(model:LdaModel,
                       texts_like_tokens:List[List[str]]|None,
                       corpus,
                       dictionary) -> Dict[str, float]:
    """Compute UMass, C_v and C_npmi coherence for a trained model.

    Args:
        model: Trained LDA model to evaluate.
        texts_like_tokens: Tokenized documents, or ``None`` when they are not
            available.
        corpus: Bag-of-words corpus (used for UMass).
        dictionary: gensim dictionary matching ``corpus``.

    Returns:
        A dict with keys ``'u_mass'``, ``'c_v'`` and ``'c_npmi'`` (in that
        order). ``'c_v'`` and ``'c_npmi'`` are NaN when ``texts_like_tokens``
        is ``None``.
    """
    coherences = {}

    # UMass is computed from the bag-of-words corpus alone.
    cm_umass = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherences['u_mass'] = float(cm_umass.get_coherence())

    # C_v and C_npmi are sliding-window measures: gensim's CoherenceModel
    # raises ValueError for them unless tokenized texts are supplied, so a
    # corpus-only fallback cannot work. Report NaN instead, which the callers
    # already treat as "metric unavailable".
    for measure in ('c_v', 'c_npmi'):
        if texts_like_tokens is None:
            coherences[measure] = float('nan')
            continue
        cm = CoherenceModel(model=model, texts=texts_like_tokens, dictionary=dictionary, coherence=measure)
        coherences[measure] = float(cm.get_coherence())

    return coherences

def topn_terms_per_topic(model:LdaModel, topn:int=10) -> Dict[int, List[str]]:
    """Map each topic id to the list of its ``topn`` terms (weights dropped).

    Terms are taken from ``model.show_topic`` in the order it returns them.
    """
    terms_by_topic = {}
    for topic_id in range(model.num_topics):
        terms_by_topic[topic_id] = [
            term for term, _weight in model.show_topic(topic_id, topn=topn)
        ]
    return terms_by_topic

def jaccard(a:set, b:set) -> float:
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    Two empty sets have an empty union; 0.0 is returned in that case.
    """
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)

def stability_score(models:List[LdaModel], topn:int=10) -> float:
    """Mean top-term overlap between every pair of models.

    For each pair ``(m1, m2)``, every topic of ``m1`` is greedily matched (in
    topic-id order) to the not-yet-used topic of ``m2`` whose top-``topn``
    term set has the highest Jaccard similarity. The pair score is the mean
    of those best similarities over all topics of ``m1``; the final score is
    the mean over all model pairs.

    Args:
        models: Models to compare, e.g. restarts trained with different seeds.
        topn: Number of top terms per topic used in the comparison.

    Returns:
        The mean pairwise score, or NaN when fewer than two models are given.
    """
    if len(models) < 2:
        return float('nan')

    def pairwise_best_match(m1, m2):
        T1 = topn_terms_per_topic(m1, topn=topn)
        # Convert the second model's term lists to sets once, not per lookup.
        S2 = {t2: set(terms2) for t2, terms2 in topn_terms_per_topic(m2, topn=topn).items()}
        used = set()
        scores = []
        for terms1 in T1.values():
            s1 = set(terms1)
            best, best_t = 0.0, None
            for t2, s2 in S2.items():
                if t2 in used:
                    continue
                sc = jaccard(s1, s2)
                if sc > best:
                    best, best_t = sc, t2
            if best_t is not None:
                used.add(best_t)
            # Always record the score. A topic with no overlapping counterpart
            # must count as 0.0; leaving it out would average only over the
            # topics that did match and overstate stability.
            scores.append(best)
        return float(np.mean(scores)) if scores else 0.0

    vals = [pairwise_best_match(m_a, m_b) for m_a, m_b in combinations(models, 2)]
    return float(np.mean(vals)) if vals else float('nan')

In [None]:
# Carregar tokens para coerência

import pickle

TOKENS_PATH = REPO / "data" / "interim" / "lda" / "tokens.pkl"

# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# only load a tokens.pkl that comes from a trusted source.
if not TOKENS_PATH.exists():
    # No tokenized texts available: downstream code receives None.
    texts_like_tokens = None
    print("Aviso: tokens.pkl não encontrado, coerência C_v/C_npmi pode ficar limitada.")
else:
    with open(TOKENS_PATH, "rb") as tokens_file:
        texts_like_tokens = pickle.load(tokens_file)
    print("Tokens carregados:", len(texts_like_tokens))

In [None]:
# Varredura de k

# Grid of topic counts to evaluate: 5, 10, ..., 30.
K_VALUES = list(range(5, 35, 5))
# Training hyper-parameters forwarded to train_lda for every run.
PASSES = 10
ITERATIONS = 400
CHUNKSIZE = 2000
# Number of restarts per k; each restart uses a distinct seed derived from
# RANDOM_STATE (RANDOM_STATE, RANDOM_STATE + 1, ...).
N_RESTARTS = 3
random_seeds = [RANDOM_STATE + i for i in range(N_RESTARTS)]

# One row of seed-averaged metrics per k.
results = []
# For each k, the single restart kept for later use (see idx_best below).
best_models_per_k = {}

for k in K_VALUES:
    models_k, metrics_k = [], []
    # Train and evaluate one model per seed for this k.
    for seed in random_seeds:
        model = train_lda(k, corpus, dictionary,
                          passes=PASSES, iterations=ITERATIONS,
                          chunksize=CHUNKSIZE, random_state=seed)
        coh = compute_coherences(model, texts_like_tokens, corpus, dictionary)
        models_k.append(model)
        metrics_k.append(coh)
    # Agreement of top-10 terms across the restarts of this k.
    stab = stability_score(models_k, topn=10)
    dfm = pd.DataFrame(metrics_k)
    # Keep the restart with the highest C_v; when C_v is NaN for every
    # restart, fall back to the first one.
    idx_best = dfm["c_v"].idxmax() if dfm["c_v"].notna().any() else 0
    best_models_per_k[k] = models_k[int(idx_best)]
    # The reported metrics are means over all restarts, not the metrics of
    # the kept model alone.
    mean_metrics = dfm.mean(numeric_only=True).to_dict()
    mean_metrics["stability_jaccard_top10"] = stab
    mean_metrics["k"] = k
    results.append(mean_metrics)

# One row per k, ordered by k, with a fresh 0..n-1 index.
metrics_df = pd.DataFrame(results).sort_values("k").reset_index(drop=True)
metrics_df

In [None]:
# Gráfico de métricas de coerência e estabilidade em função de k

# One line per metric, all sharing the k axis.
plt.figure(figsize=(10, 6))
for metric_column, legend_label in (
    ("c_v", "C_v"),
    ("c_npmi", "C_npmi"),
    ("u_mass", "U_Mass"),
    ("stability_jaccard_top10", "Estabilidade (Jaccard)"),
):
    plt.plot(metrics_df["k"], metrics_df[metric_column], marker="o", label=legend_label)

plt.title("Métricas de coerência e estabilidade vs k")
plt.xlabel("Número de tópicos (k)")
plt.ylabel("Valor da métrica")
plt.legend()
plt.grid(True)

# Save before show() so the file is written from the still-open figure.
DIAG_PNG = DIR_REPORTS_FIGS / "lda_k_diagnostics.png"
plt.savefig(DIAG_PNG, bbox_inches="tight")
plt.show()

print("Figura de diagnóstico salva em:", DIAG_PNG)

In [None]:
# Seleção do melhor k e salvamento do modelo

def pick_k(df:pd.DataFrame) -> int:
    """Select the number of topics from a per-k metrics table.

    Selection rule:
      1. If ``c_v`` has any non-NaN value, take the row(s) with the maximum
         ``c_v``; ties are broken by the highest ``c_npmi`` when available,
         otherwise the first tied row wins.
      2. Otherwise, if ``c_npmi`` has any non-NaN value, take its maximum.
      3. Otherwise, return the largest ``k``.

    A missing ``c_v`` or ``c_npmi`` column is treated the same as an all-NaN
    column.

    Args:
        df: Table with a ``k`` column and, optionally, ``c_v`` / ``c_npmi``.

    Returns:
        The selected ``k`` as an int.

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        # Fail with a clear message instead of an opaque int(NaN) error.
        raise ValueError("Tabela de métricas vazia: não há valores de k para selecionar.")

    has_cv = "c_v" in df and df["c_v"].notna().any()
    has_npmi = "c_npmi" in df and df["c_npmi"].notna().any()

    if has_cv:
        top_cv = df["c_v"].max()
        cands = df[df["c_v"] == top_cv]
        # Tie-breaker: among rows sharing the top C_v, prefer the best C_npmi.
        if "c_npmi" in df and cands["c_npmi"].notna().any():
            return int(cands.loc[cands["c_npmi"].idxmax(), "k"])
        return int(cands.iloc[0]["k"])
    if has_npmi:
        return int(df.loc[df["c_npmi"].idxmax(), "k"])
    return int(df["k"].max())

# Choose k from the metrics table and fetch the model kept for that k.
k_star = pick_k(metrics_df)
best_model = best_models_per_k[k_star]
print(f"Melhor k selecionado: {k_star}")

# Persist the selected model.
MODEL_PATH = DIR_PROCESSED_LDA / "model.lda"
best_model.save(str(MODEL_PATH))

# Persist the full per-k metrics table.
METRICS_CSV = DIR_PROCESSED_LDA / "coherences.csv"
metrics_df.to_csv(METRICS_CSV, index=False)

for saved_label, saved_path in (
    ("Modelo salvo em:", MODEL_PATH),
    ("Métricas salvas em:", METRICS_CSV),
):
    print(saved_label, saved_path)

In [None]:
# Análise de robustez de k: inspeção da estabilidade relativa

# Stability across restarts as a function of k.
k_axis = metrics_df["k"]
stability_by_k = metrics_df["stability_jaccard_top10"]

plt.figure(figsize=(8, 5))
plt.plot(k_axis, stability_by_k, marker="s", color="darkred")
plt.title("Robustez da escolha de k: estabilidade entre reinicializações")
plt.xlabel("Número de tópicos (k)")
plt.ylabel("Estabilidade média (Jaccard top-10 termos)")
plt.grid(True)

# Save before show() so the file is written from the still-open figure.
ROBUST_PNG = DIR_REPORTS_FIGS / "lda_k_robustness.png"
plt.savefig(ROBUST_PNG, bbox_inches="tight")
plt.show()

print("Figura de robustez salva em:", ROBUST_PNG)

In [None]:
# Visualizações estáticas (tópicos e distribuição doc-tópico)

# Top termos por tópico
def plot_top_terms(model:LdaModel, topn:int=10, maxcols:int=3, figsize=(14, 10), savepath=None):
    """Draw one horizontal bar chart of top terms per topic in a grid.

    Args:
        model: Trained LDA model whose topics are plotted.
        topn: Number of terms shown per topic.
        maxcols: Maximum number of grid columns.
        figsize: Size of the whole figure.
        savepath: When truthy, the figure is also written to this path.
    """
    topics = [model.show_topic(t, topn=topn) for t in range(model.num_topics)]
    n = len(topics)
    ncols = min(maxcols, n)
    nrows = math.ceil(n / ncols)

    plt.figure(figsize=figsize)
    for topic_id, terms in enumerate(topics):
        # Subplot positions are 1-based.
        ax = plt.subplot(nrows, ncols, topic_id + 1)
        # Reverse the order returned by show_topic, so its first term is
        # drawn as the top bar.
        ranked = terms[::-1]
        labels = [term for term, _ in ranked]
        weights = [weight for _, weight in ranked]
        positions = range(len(labels))
        ax.barh(positions, weights)
        ax.set_yticks(positions)
        ax.set_yticklabels(labels)
        ax.set_title(f"Tópico {topic_id}")
        ax.set_xlabel("Peso")
    plt.tight_layout()
    if savepath:
        plt.savefig(savepath, bbox_inches="tight")
    plt.show()

# Render the selected model's top terms and keep a PNG copy of the figure.
TOPICS_PNG = DIR_REPORTS_FIGS / "lda_topics.png"
plot_top_terms(
    best_model,
    savepath=TOPICS_PNG,
    figsize=(14, 10),
    maxcols=3,
    topn=10,
)
print("Figura salva em:", TOPICS_PNG)

# Distribuição de tópicos por documento (heatmap amostra)
def doc_topic_matrix(model:LdaModel, corpus):
    """Build an array with one row of topic probabilities per document.

    Each row has ``model.num_topics`` entries; entry ``t`` holds the
    probability reported by ``model.get_document_topics`` for topic ``t``
    (0.0 for topics it does not report).
    """
    n_topics = model.num_topics

    def dense_row(bow):
        # Scatter the (topic_id, probability) pairs into a fixed-width row.
        row = [0.0] * n_topics
        for topic_id, prob in model.get_document_topics(bow, minimum_probability=0.0):
            row[topic_id] = prob
        return row

    return np.array([dense_row(bow) for bow in corpus])

M = doc_topic_matrix(best_model, corpus)
print("Matriz doc x tópicos:", M.shape)

# Heatmap of at most the first 200 documents; slicing past the end is safe.
heatmap_rows = M[:200, :]

DOC_DIST_PNG = DIR_REPORTS_FIGS / "lda_doc_topic_dist.png"
plt.figure(figsize=(10, 6))
plt.imshow(heatmap_rows, aspect="auto")
plt.colorbar()
plt.title("Distribuição de tópicos por documento (amostra)")
plt.xlabel("Tópicos")
plt.ylabel("Documentos")
plt.savefig(DOC_DIST_PNG, bbox_inches="tight")
plt.show()
print("Figura salva em:", DOC_DIST_PNG)

In [None]:
# LDAvis interativo

# Build the pyLDAvis payload for the selected model and write it as a
# standalone HTML file next to the other figures.
LDAVIS_HTML = DIR_REPORTS_FIGS / "lda_vis.html"
vis_data = gensimvis.prepare(best_model, corpus, dictionary)
pyLDAvis.save_html(vis_data, str(LDAVIS_HTML))
print("LDAvis salvo em:", LDAVIS_HTML)

In [None]:
# Manifesto de auditoria

# Imported here so this cell carries the one extra name it needs.
from datetime import timezone

# Audit manifest: run parameters, input/output locations (relative to REPO)
# and library versions, written as JSON next to the processed artifacts.
audit = {
    # datetime.utcnow() is deprecated since Python 3.12; use a timezone-aware
    # UTC timestamp and rewrite the "+00:00" suffix to "Z" so the stored
    # format stays the same as before.
    "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "random_state_base": RANDOM_STATE,
    "k_grid": list(K_VALUES),
    "k_selected": int(k_star),
    "training_params": {
        "passes": PASSES, "iterations": ITERATIONS, "chunksize": CHUNKSIZE,
        "alpha": "auto", "eta": "auto", "n_restarts": N_RESTARTS
    },
    "coherence_metrics": {
        "table_csv": str(METRICS_CSV.relative_to(REPO)),
        "primary_metric": "c_v", "tie_breaker": "c_npmi", "umass_included": True
    },
    "stability": {"method": "mean_jaccard_top10_best_matching"},
    "inputs": {
        "corpus_mm": str(PATH_BOW_MM.relative_to(REPO)),
        "vocab_dict": str(PATH_VOCAB_DICT.relative_to(REPO)),
        "bow_index_csv": str(PATH_BOW_INDEX_CSV.relative_to(REPO)),
        "vocab_terms_csv": str(PATH_VOCAB_TERMS.relative_to(REPO)),
        "tokens_used_for_cv_npmi": bool(texts_like_tokens is not None)
    },
    "outputs": {
        "model_lda": str(MODEL_PATH.relative_to(REPO)),
        "topics_png": str(TOPICS_PNG.relative_to(REPO)),
        "doc_dist_png": str(DOC_DIST_PNG.relative_to(REPO)),
        "ldavis_html": str(LDAVIS_HTML.relative_to(REPO))
    },
    "env": {
        "python": sys.version,
        "gensim": gensim.__version__,
        "pyLDAvis": pyLDAvis.__version__,
        "numpy": np.__version__
    }
}
AUDIT_JSON = DIR_PROCESSED_LDA / "audit_modelagem.json"
# ensure_ascii=False keeps accented characters readable in the JSON file.
with open(AUDIT_JSON, "w", encoding="utf-8") as f:
    json.dump(audit, f, ensure_ascii=False, indent=2)
print("Manifesto de auditoria salvo em:", AUDIT_JSON)

In [None]:
# Exportação de tabela de tópicos

def topics_table(model:LdaModel, topn:int=15) -> pd.DataFrame:
    """Return a table with one row per topic and its top terms.

    Columns: ``topic_id`` and ``top_terms`` (the ``topn`` terms from
    ``model.show_topic`` joined with ``", "``).
    """
    records = []
    for topic_id in range(model.num_topics):
        top_words = [word for word, _weight in model.show_topic(topic_id, topn=topn)]
        records.append({"topic_id": topic_id, "top_terms": ", ".join(top_words)})
    return pd.DataFrame(records)

# Build the top-15-terms table for the selected model, write it to CSV and
# preview the first rows as the cell output.
topics_df = topics_table(best_model, topn=15)
TOPICS_CSV = DIR_PROCESSED_LDA / "topics_top_terms.csv"
topics_df.to_csv(TOPICS_CSV, index=False)
print("CSV de tópicos salvo em:", TOPICS_CSV)
topics_df.head(10)