In [None]:
# Imports e paths
from pathlib import Path
import json, ast, re, unicodedata
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression

# This notebook is expected to run from the project ROOT: every path below is
# derived from the current working directory.
ROOT = Path.cwd()
DATA = ROOT.joinpath("data")
PROCESSED, INTERIM = DATA.joinpath("processed"), DATA.joinpath("interim")
RESULTS = ROOT.joinpath("results")
EXPORT = DATA.joinpath("exports", "dashboard")

# Create the dashboard export folder up front (no-op when it already exists).
EXPORT.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)

In [None]:
# Utilitários

def _latest_compare_csv(results_root: Path) -> Path | None:
    comp = results_root / "comparison"
    if not comp.exists(): return None
    runs = sorted(comp.glob("run_*"))
    if not runs: return None
    csv = runs[-1] / "all_trials_gr_rz.csv"
    return csv if csv.exists() else None

def _pick_best_bertopic(csv: Path, max_outliers=0.30, k_min=5):
    df = pd.read_csv(csv)
    cand = df[(df["method"]=="bertopic") & (df["K"]>=k_min)].copy()
    if "outliers_pct" in cand.columns:
        cand = cand[cand["outliers_pct"] <= max_outliers]
    if cand.empty:
        cand = df[df["method"]=="bertopic"].copy()
    cand = cand.sort_values(["RZ_index","GR_index"], ascending=False)
    assert not cand.empty, "Nenhum trial BERTopic encontrado no CSV."
    return cand.iloc[0].to_dict()

def _trial_dir_bertopic(bt_root: Path, run: str, trial: str) -> Path:
    if isinstance(run, str) and run.startswith("run_"):
        return bt_root / run / trial
    return bt_root / trial

def _parse_representation(val):
    if isinstance(val, (list, tuple)): return [str(x) for x in val]
    try:
        lst = ast.literal_eval(str(val))
        if isinstance(lst, (list, tuple)):
            return [str(x) for x in lst]
    except Exception:
        pass
    return [t for t in str(val).split() if t]

def _load_topic_labels(tdir: Path, topn_terms=5):
    labels = {}
    tip = tdir / "topic_info.csv"
    if tip.exists():
        df = pd.read_csv(tip)
        if "Topic" in df.columns:
            for _, row in df.iterrows():
                tid = int(row["Topic"])
                if tid == -1: 
                    continue
                name = str(row.get("Name", "")).strip()
                if name:
                    labels[tid] = name
                else:
                    reps = _parse_representation(row.get("Representation", ""))
                    if reps:
                        labels[tid] = ", ".join(reps[:topn_terms])
    # fallback com c_tf_idf + vocab
    missing = [t for t in range(max(labels.keys())+1 if labels else 0) if t not in labels]
    ctf_p, voc_p = tdir/"c_tf_idf.npy", tdir/"vocab.txt"
    if missing and ctf_p.exists() and voc_p.exists():
        ctf = np.load(ctf_p)
        vocab = [line.strip() for line in open(voc_p, encoding="utf-8")]
        for tid in range(ctf.shape[0]):
            if tid in labels: continue
            row = ctf[tid]
            idx = np.argsort(-row)[:topn_terms]
            labels[tid] = ", ".join(vocab[i] for i in idx)
    return labels

def _keyify_name(x: str) -> str:
    x = str(x).strip()
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\s+([,.;:])", r"\1", x).replace(" ,", ",").replace(" .", ".")
    x = unicodedata.normalize("NFKD", x)
    x = "".join(c for c in x if not unicodedata.combining(c))
    x = x.lower()
    x = re.sub(r"[^a-z0-9 ,.\-]", "", x)
    return re.sub(r"\s+", " ", x).strip()

def _detect_year_column(df: pd.DataFrame) -> str:
    # tenta achar 'ano' ou 'year' em qualquer caixa/variação
    for c in df.columns:
        cl = c.lower().strip()
        if cl in {"ano","year","ano_defesa","ano_defesa_tcc","ano_publicacao"}: 
            return c
        if "ano" in cl or "year" in cl:
            return c
    raise AssertionError("Coluna de ano não encontrada no prep.csv. Esperado algo contendo 'ano' ou 'year'.")

def _year_to_int(s):
    try:
        return int(str(s)[:4])
    except Exception:
        return np.nan

def _soft_prob(counts: pd.Series | np.ndarray, alpha=0.5):
    arr = np.asarray(counts, dtype=float)
    arr = arr + alpha
    s = arr.sum()
    return arr / s if s > 0 else arr * np.nan

def _jsd(p, q, eps=1e-12):
    p = np.asarray(p, dtype=float); q = np.asarray(q, dtype=float)
    p = p / max(p.sum(), eps); q = q / max(q.sum(), eps)
    m = 0.5*(p+q)
    def _kl(a, b):
        a = np.where(a<=0, eps, a)
        b = np.where(b<=0, eps, b)
        return np.sum(a * np.log(a/b))
    return float(np.sqrt(0.5*_kl(p,m) + 0.5*_kl(q,m)))

def _ols_slope(x: np.ndarray, y: np.ndarray) -> float:
    x = np.asarray(x).reshape(-1,1)
    y = np.asarray(y).reshape(-1,1)
    if len(x) < 2: return np.nan
    model = LinearRegression().fit(x, y)
    return float(model.coef_[0,0])

In [None]:
# Automatic selection of the best BERTopic trial
compare_csv = _latest_compare_csv(RESULTS)
assert compare_csv and compare_csv.exists(), "Rode compare_models antes (CSV de comparação não encontrado)."

best = _pick_best_bertopic(compare_csv, max_outliers=0.30, k_min=5)
best_run = best["run"]
best_trial = best["trial"]
K = int(best["K"])
print("Selecionado:", best_run, best_trial, "| K =", K)

# Folder holding the artefacts of the selected trial.
BT_ROOT = PROCESSED.joinpath("bertopic")
TRIAL_DIR = _trial_dir_bertopic(BT_ROOT, best_run, best_trial)
assert TRIAL_DIR.exists(), f"Pasta do trial não encontrada: {TRIAL_DIR}"
print("TRIAL_DIR:", TRIAL_DIR)

# Export directory for this temporal analysis (one folder per run/trial).
OUT_DIR = EXPORT.joinpath("bertopic_time", str(best_run), str(best_trial))
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load document metadata and the doc→topic assignment
prep_csv = INTERIM / "bertopic" / "prep.csv"
assert prep_csv.exists(), f"Meta não encontrada: {prep_csv}"

meta = pd.read_csv(prep_csv, encoding="utf-8")
assert "DOC_ID" in meta.columns, "prep.csv precisa conter coluna DOC_ID"

# Detect the year and advisor columns.
col_year = _detect_year_column(meta)
cand_orient_cols = [c for c in meta.columns if "orient" in c.lower()]
assert cand_orient_cols, "Coluna de orientador não encontrada (esperado algo contendo 'orient')."
# Prefer a column whose name starts with "orient": a co-advisor column such as
# "coorientador" also contains "orient" and must not win just by coming first.
col_orient = next(
    (c for c in cand_orient_cols if c.lower().strip().startswith("orient")),
    cand_orient_cols[0],
)

meta = meta[["DOC_ID", col_year, col_orient]].rename(columns={col_year:"ANO", col_orient:"orientador"})
meta["DOC_ID"] = meta["DOC_ID"].astype(int)
meta["ANO"] = meta["ANO"].map(_year_to_int).astype("Int64")
# fillna must run BEFORE the cast: astype(str) turns NaN into the text "nan",
# after which fillna has nothing left to fill.
meta["orientador"] = meta["orientador"].fillna("NA").astype(str).str.strip()


def _most_frequent_spelling(names: pd.Series) -> str:
    """Return the most common spelling in ``names`` (first one on an empty mode)."""
    modes = names.mode()
    return modes.iat[0] if not modes.empty else names.iloc[0]


# Normalise advisor names: spellings sharing the same key are merged into the
# most frequent spelling of that key.
meta["_key"] = meta["orientador"].map(_keyify_name)
name_map = meta.groupby("_key")["orientador"].agg(_most_frequent_spelling).to_dict()
meta["orientador"] = meta["_key"].map(name_map)
meta = meta.drop(columns=["_key"]).copy()

# Doc→topic assignment (outliers, topic -1, are dropped).
labels = pd.read_csv(TRIAL_DIR / "doc_topics.csv", encoding="utf-8")
assert {"DOC_ID","topic"}.issubset(labels.columns), "doc_topics.csv precisa conter DOC_ID e topic"
labels["DOC_ID"] = labels["DOC_ID"].astype(int)
labels = labels[labels["topic"] != -1].copy()

# Join: keep only documents present on both sides.
df = labels.merge(meta, on="DOC_ID", how="inner")
assert not df.empty, "Junção vazia; verifique DOC_ID/ANO entre prep.csv e doc_topics.csv"

topic_labels = _load_topic_labels(TRIAL_DIR, topn_terms=5)
print("Intervalo de anos:", int(df["ANO"].min()), "→", int(df["ANO"].max()))
print("Docs com tópico:", len(df), "| K:", K, "| Orientadores:", df["orientador"].nunique())

In [None]:
# Counts per year/topic and totals per year
ct_ty = df.groupby(["ANO","topic"]).size().rename("n").reset_index()
ct_y  = df.groupby("ANO").size().rename("n_docs_assigned")  # only docs with a topic (≠ -1)

# Total number of docs per year (full metadata base).
year_tot = meta.groupby("ANO").size().rename("n_docs_total")

# P(t|year) matrix with additive smoothing (alpha = 0.5), normalised per row.
# Only topics that occur in `df` get a column, so the smoothing mass is spread
# over the observed topics.
pivot_ty = ct_ty.pivot(index="ANO", columns="topic", values="n").fillna(0.0).sort_index()
# Computed directly on a float copy rather than through
# apply(..., result_type="broadcast"): that mode stores results in an array of
# the frame's own dtype, so a pivot with no missing cell (all-integer counts)
# would have its probabilities truncated to 0.
smoothed_counts = pivot_ty.astype(float) + 0.5
p_t_given_year = smoothed_counts.div(smoothed_counts.sum(axis=1), axis=0)

# Long table: one row per (year, topic); the probability column is aligned on
# the (ANO, topic) index instead of being looked up cell by cell.
long_topic_year = (
    pivot_ty.stack().rename("n_topic").to_frame()
    .assign(p_topic_given_year=p_t_given_year.stack())
    .reset_index()
)
long_topic_year["topic_label"] = long_topic_year["topic"].map(lambda t: topic_labels.get(int(t), f"topic_{t}"))

# Coverage per year: share of that year's docs that received a topic.
coverage_year = pd.concat([year_tot, ct_y], axis=1).fillna(0)
coverage_year["coverage"] = coverage_year["n_docs_assigned"] / coverage_year["n_docs_total"].replace(0, np.nan)
coverage_year = coverage_year.reset_index().rename(columns={"index":"ANO"})

display(long_topic_year.head(10))
display(coverage_year.head())

In [None]:
# Per-topic trends: OLS slope of p(t|year), delta, approximate "CAGR", Spearman rho

# Sorted list of years present in the long table (not used by the loop below).
years = np.array(sorted(long_topic_year["ANO"].dropna().unique()), dtype=float)

eps = 1e-6  # keeps the CAGR ratio finite when a proportion is zero
trend_rows = []
for t in sorted(p_t_given_year.columns):
    series = p_t_given_year[t].dropna()
    yy = series.index.values.astype(float)
    pp = series.values.astype(float)

    # Every metric defaults to NaN and is only filled in when there are at
    # least two points; this replaces the former `'rho' in locals()` guard,
    # which was always true.
    slope = delta = cagr = rho = np.nan
    if len(pp) >= 2:
        slope = _ols_slope(yy, pp)                     # change in proportion per year
        delta = float(pp[-1] - pp[0])                  # absolute change over the period
        years_span = max(int(yy[-1] - yy[0]), 1)
        cagr = float(((pp[-1] + eps) / (pp[0] + eps)) ** (1 / years_span) - 1)
        rho, _ = spearmanr(yy, pp, nan_policy="omit")

    has_points = len(series) > 0
    trend_rows.append({
        "topic": int(t),
        "topic_label": topic_labels.get(int(t), f"topic_{t}"),
        "years_min": int(yy[0]) if has_points else np.nan,
        "years_max": int(yy[-1]) if has_points else np.nan,
        "p_first": float(pp[0]) if has_points else np.nan,
        "p_last": float(pp[-1]) if has_points else np.nan,
        "delta": delta,
        "slope_per_year": slope,
        "cagr_approx": cagr,
        "spearman_rho": float(rho)
    })

topic_trends = pd.DataFrame(trend_rows).sort_values(["slope_per_year","delta"], ascending=False)
display(topic_trends.head(10))

In [None]:
# P(topic | advisor, year). A minimum number of docs per advisor/year is
# required so that the estimated distribution is reasonably stable.

MIN_DOCS_ORIENTADOR_ANO = 3
alpha = 0.5  # additive smoothing of the probabilities

# Counts per advisor/year/topic and per advisor/year.
ct_oyt = df.groupby(["orientador","ANO","topic"]).size().rename("n").reset_index()
ct_oy  = df.groupby(["orientador","ANO"]).size().rename("n_oy").reset_index()

# Doc count per (advisor, year), built once so the loop below does a dict
# lookup instead of filtering `ct_oy` for every output row.
n_docs_by_pair = {
    (o, y): int(n) for o, y, n in ct_oy.itertuples(index=False, name=None)
}
# Only cells with at least MIN_DOCS_ORIENTADOR_ANO docs are kept.
valid_pairs = {pair for pair, n in n_docs_by_pair.items() if n >= MIN_DOCS_ORIENTADOR_ANO}

rows_p = []
for (o, y), block in ct_oyt.groupby(["orientador","ANO"]):
    if (o, y) not in valid_pairs:
        continue
    # Count vector over all topics (0..K-1).
    # NOTE(review): assumes every topic id in doc_topics.csv lies in 0..K-1,
    # with K taken from the comparison CSV; a larger id raises IndexError here.
    counts = np.zeros(K, dtype=float)
    counts[block["topic"].to_numpy(dtype=int)] = block["n"].to_numpy(dtype=float)
    probs = _soft_prob(counts, alpha=alpha)
    n_docs = n_docs_by_pair[(o, y)]
    for t in range(K):
        rows_p.append({
            "orientador": o,
            "ANO": int(y),
            "topic": int(t),
            "topic_label": topic_labels.get(int(t), f"topic_{t}"),
            "P_topic_given_orientador_year": float(probs[t]),
            "n_docs_orientador_year": n_docs
        })

# Explicit columns keep the frame well-formed (and sortable) even when no
# advisor/year reaches the minimum and `rows_p` is empty.
orientador_topic_year = pd.DataFrame(
    rows_p,
    columns=["orientador","ANO","topic","topic_label",
             "P_topic_given_orientador_year","n_docs_orientador_year"],
).sort_values(["orientador","ANO","P_topic_given_orientador_year"], ascending=[True, True, False])
display(orientador_topic_year.head(10))

# Dominant topic for each advisor/year: rank rows by probability (descending)
# inside every (advisor, year) group and keep the first one.
ranked_rows = orientador_topic_year.sort_values(
    by=["orientador","ANO","P_topic_given_orientador_year"],
    ascending=[True, True, False],
)
top_columns = ["orientador","ANO","topic","topic_label","P_topic_given_orientador_year","n_docs_orientador_year"]
top_by_year = ranked_rows.groupby(["orientador","ANO"], as_index=False).first()[top_columns]
display(top_by_year.head(10))

# Preference-shift metrics per advisor: number of changes of the top topic and
# mean/max Jensen-Shannon distance between consecutive years. "Consecutive"
# means adjacent among the years kept for that advisor, which need not be
# adjacent calendar years.
shift_rows = []
for o, g in orientador_topic_year.groupby("orientador"):
    years_sorted = sorted(g["ANO"].unique())
    if len(years_sorted) < 2:
        continue

    # Sequence of top topics over the years.
    tops = top_by_year[top_by_year["orientador"]==o].sort_values("ANO")
    seq = tops["topic"].tolist()
    switches = int(np.sum(np.array(seq[1:]) != np.array(seq[:-1])))

    # One probability vector per year, ordered by topic id; built once per
    # advisor instead of re-filtering the whole table for every pair of years.
    dist_by_year = {
        y: blk.sort_values("topic")["P_topic_given_orientador_year"].values
        for y, blk in g.groupby("ANO")
    }
    jsds = []
    for y1, y2 in zip(years_sorted[:-1], years_sorted[1:]):
        p1, p2 = dist_by_year[y1], dist_by_year[y2]
        if len(p1)==K and len(p2)==K:
            jsds.append(_jsd(p1, p2))
    mean_jsd = float(np.mean(jsds)) if jsds else np.nan
    max_jsd  = float(np.max(jsds))  if jsds else np.nan

    shift_rows.append({
        "orientador": o,
        "years_covered": len(years_sorted),
        "first_year": int(years_sorted[0]),
        "last_year": int(years_sorted[-1]),
        "top_switches": switches,
        "mean_jsd_consecutive": mean_jsd,
        "max_jsd_consecutive": max_jsd,
        "last_top_topic": int(tops.iloc[-1]["topic"]),
        "last_top_label": str(tops.iloc[-1]["topic_label"])
    })

# Explicit columns: without them an empty `shift_rows` (no advisor with two or
# more valid years) yields a column-less frame and sort_values raises KeyError.
orientador_shift_summary = pd.DataFrame(
    shift_rows,
    columns=["orientador","years_covered","first_year","last_year","top_switches",
             "mean_jsd_consecutive","max_jsd_consecutive","last_top_topic","last_top_label"],
).sort_values(
    ["top_switches","mean_jsd_consecutive"], ascending=[False, False]
)
display(orientador_shift_summary.head(10))

# Slopes per advisor/topic (trend of the advisor's preference for the topic)
slope_rows = []
for (o, t), g in orientador_topic_year.groupby(["orientador","topic"]):
    g = g.sort_values("ANO")
    if len(g) < 2:
        # A single year gives no trend.
        continue
    slope = _ols_slope(g["ANO"].values.astype(float), g["P_topic_given_orientador_year"].values.astype(float))
    slope_rows.append({
        "orientador": o,
        "topic": int(t),
        "topic_label": topic_labels.get(int(t), f"topic_{t}"),
        "years": len(g),
        "slope_per_year": float(slope),
        "p_first": float(g.iloc[0]["P_topic_given_orientador_year"]),
        "p_last": float(g.iloc[-1]["P_topic_given_orientador_year"])
    })

# Explicit columns: when every advisor has a single valid year `slope_rows` is
# empty, and sorting a column-less frame would raise KeyError.
orientador_topic_slopes = pd.DataFrame(
    slope_rows,
    columns=["orientador","topic","topic_label","years","slope_per_year","p_first","p_last"],
).sort_values(
    ["slope_per_year","years"], ascending=[False, False]
)
display(orientador_topic_slopes.head(10))

In [None]:
# Exports: every table goes to OUT_DIR as UTF-8 CSV without the index.
export_tables = [
    (long_topic_year.sort_values(["ANO","topic"]), "topic_by_year.csv"),
    (topic_trends, "topic_trend_summary.csv"),
    (coverage_year, "coverage_by_year.csv"),
    (orientador_topic_year, "orientador_topic_year.csv"),
    (top_by_year, "orientador_top_topic_by_year.csv"),
    (orientador_shift_summary, "orientador_shift_summary.csv"),
    (orientador_topic_slopes, "orientador_topic_slopes.csv"),
]
for table, filename in export_tables:
    table.to_csv(OUT_DIR / filename, index=False, encoding="utf-8")

# Metadata describing which trial these exports were produced from.
with open(OUT_DIR / "selection.json", "w", encoding="utf-8") as f:
    selection_meta = {
        "selected_run": best_run,
        "selected_trial": best_trial,
        "K": int(K),
        "source_compare_csv": str(compare_csv),
        "timestamp_utc": datetime.now(timezone.utc).isoformat()
    }
    json.dump(selection_meta, f, ensure_ascii=False, indent=2)

print("Arquivos salvos em:", OUT_DIR)

In [None]:
# Heatmap 1: P(t|year) — one row per year, one column per topic.
n_years, n_topics = len(p_t_given_year.index), len(p_t_given_year.columns)
topic_tick_labels = [topic_labels.get(int(t), t) for t in p_t_given_year.columns]

# Figure grows with the matrix, with a minimum size of 8x6 inches.
plt.figure(figsize=(max(8, n_topics*0.6), max(6, n_years*0.5)))
plt.imshow(p_t_given_year.values, aspect='auto', interpolation='nearest')
plt.colorbar(label="P(tópico | ano)")
plt.yticks(range(n_years), p_t_given_year.index)
plt.xticks(range(n_topics), topic_tick_labels, rotation=90)
plt.title("Distribuição de tópicos por ano (P(t|ano))")
plt.tight_layout()
plt.savefig(OUT_DIR/"heatmap_topic_by_year.png", dpi=150)
plt.show()

# Heatmap 2: topic distribution of the TOP_ORI advisors with most documents,
# using each advisor's last available year as a comparable snapshot.
TOP_ORI = 20
ori_order = df["orientador"].value_counts().head(TOP_ORI).index.tolist()
sub = orientador_topic_year[orientador_topic_year["orientador"].isin(ori_order)]
# Keep ALL topic rows of each advisor's last year. groupby(...).tail(1) would
# keep a single row (one topic) per advisor, leaving the rest of the
# distribution to be zero-filled by the pivot below.
last_year_per_row = sub.groupby("orientador")["ANO"].transform("max")
snap = sub[sub["ANO"] == last_year_per_row]
# Advisors among the top ones that have no year with enough documents appear
# as all-zero rows.
mat = (snap.pivot(index="orientador", columns="topic", values="P_topic_given_orientador_year")
           .reindex(ori_order).fillna(0))

plt.figure(figsize=(max(8, len(mat.columns)*0.6), max(6, len(mat.index)*0.35)))
plt.imshow(mat.values, aspect='auto', interpolation='nearest')
plt.colorbar(label="P(tópico | orientador, último ano)")
plt.yticks(range(len(mat.index)), mat.index)
plt.xticks(range(len(mat.columns)), [topic_labels.get(int(t), t) for t in mat.columns], rotation=90)
plt.title("Preferências por orientador (snapshot do último ano disponível)")
plt.tight_layout()
plt.savefig(OUT_DIR/"heatmap_orientador_snapshot.png", dpi=150)
plt.show()

print("Figuras salvas em:", OUT_DIR)