In [18]:
# Imports, paths e utilitários

from pathlib import Path
import re, unicodedata, json

import polars as pl
import pandas as pd
import pandera as pa
from pandera import Column, DataFrameSchema, Check

def find_project_root(start: Path) -> Path:
    # Sobe até encontrar uma pasta que tenha "data" e "notebooks"
    for p in [start] + list(start.parents):
        if (p / "data").exists() and (p / "notebooks").exists():
            return p
    # fallback: assume dois níveis acima (funciona se estiver em notebooks/bertopic/*)
    return start.parents[1]

CWD = Path.cwd()
ROOT = find_project_root(CWD)         # <-- raiz correta do projeto
DATA = ROOT / "data"
INTERIM = DATA / "interim" / "bertopic"
EXPORT = DATA / "exports" / "dashboard"
BEST = EXPORT / "bertopic_best"       # artefatos do vencedor consolidados

# Não cria nada fora do projeto
EXPORT.mkdir(parents=True, exist_ok=True)

print("CWD  :", CWD.resolve())
print("ROOT :", ROOT.resolve())       # deve imprimir ...\analise_topicos_tcc
print("DATA :", DATA.resolve())
print("INTERIM:", INTERIM.resolve())  # ...\analise_topicos_tcc\data\interim\bertopic
print("BEST :", BEST.resolve())       # ...\analise_topicos_tcc\data\exports\dashboard\bertopic_best
print("EXPORT:", EXPORT.resolve())

def slugify(s: str) -> str:
    if s is None:
        return "na"
    s = unicodedata.normalize("NFKD", str(s))
    s = s.encode("ascii", "ignore").decode("ascii")
    s = re.sub(r"[^a-zA-Z0-9]+", "-", s).strip("-").lower()
    return s or "na"

CWD  : C:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\notebooks\bertopic
ROOT : C:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc
DATA : C:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data
INTERIM: C:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\interim\bertopic
BEST : C:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\bertopic_best
EXPORT: C:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard


In [19]:
# Leiotura de entradas: prep.csv e resultados do vencedor

# Metadados do corpus (prep.csv)
prep_path = INTERIM / "prep.csv"
assert prep_path.exists(), f"Arquivo não encontrado: {prep_path}"

prep = pl.read_csv(prep_path, infer_schema_length=10000)

# Normaliza tipos mínimos usados no dashboard
prep = prep.with_columns([
    pl.col("DOC_ID").cast(pl.Int64),
    pl.col("ano").cast(pl.Int64),
    pl.col("titulo").cast(pl.Utf8),
    pl.col("autor").cast(pl.Utf8, strict=False),
    pl.col("orientador").cast(pl.Utf8, strict=False).alias("orientador"),
    pl.col("resumo").cast(pl.Utf8, strict=False),
    pl.col("url").cast(pl.Utf8, strict=False),
    pl.col("RESUMO_PREP_BERTOPIC").cast(pl.Utf8, strict=False)
])

# Artefatos do BERTopic selecionado (consolidados em berTopic_best)
topic_info_csv = BEST / "topic_info.csv"
doc_topics_csv = BEST / "doc_topics.csv"
selection_json = BEST / "selection.json"  # informativo

assert topic_info_csv.exists(), f"Não encontrado: {topic_info_csv}"
assert doc_topics_csv.exists(), f"Não encontrado: {doc_topics_csv}"

topic_info = pl.read_csv(topic_info_csv)
doc_topics_raw = pl.read_csv(doc_topics_csv)

# Carrega info de seleção (opcional)
sel = {}
if selection_json.exists():
    try:
        sel = json.loads(selection_json.read_text(encoding="utf-8"))
    except Exception:
        sel = {}
sel

{'method': 'bertopic',
 'run': 'run_20250831T230602Z',
 'trial': 'trial_24',
 'K': 12,
 'c_npmi': 0.007560071814801735,
 'c_v': 0.6058599854512132,
 'diversity@10': 0.75,
 'sep_jsd': 0.789577921470549,
 'balance': 0.9262042731060053,
 'clarity': 0.6036024422926879,
 'outliers_pct': 0.14420803782505912,
 'scenario': 'opt',
 'RZ_index': 0.6800424648862116,
 'GR_index': 0.13238824907920146}

In [20]:
# Construção de docs.parquet

# Renomeia 'orientador' -> 'orientador_nome' e cria 'orientador_id' canônico (slug)
docs = (prep
        .with_columns([
            pl.col("orientador").alias("orientador_nome"),
            pl.col("orientador").map_elements(slugify).alias("orientador_id"),
        ])
        .select([
            "DOC_ID","ano","titulo","resumo","url",
            "autor","orientador_id","orientador_nome"
        ])
        .with_columns([
            pl.col("DOC_ID").cast(pl.Int64),
            pl.col("ano").cast(pl.Int64),
        ])
)

# Persistir
docs_out = EXPORT / "docs.parquet"
docs.write_parquet(docs_out)
print("OK ->", docs_out, "| linhas:", docs.height)
docs.head(3)

OK -> c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\docs.parquet | linhas: 423


DOC_ID,ano,titulo,resumo,url,autor,orientador_id,orientador_nome
i64,i64,str,str,str,str,str,str
0,2024,"""AGENDEVC: um sistema de agenda…","""Gerenciar a agenda e fornecer …","""http://dspace.sti.ufcg.edu.br:…","""FERREIRA, Williamberg de Albuq…","""massoni-tiago-lima""","""MASSONI, Tiago Lima."""
1,2024,"""Análise de técnicas de explica…","""Doenças oftalmológicas, como c…","""http://dspace.sti.ufcg.edu.br:…","""SILVA, Wendson Magalhães da.""","""gomes-herman-martins""","""GOMES, Herman Martins."""
2,2024,"""O impacto do uso de tags de ra…","""O presente trabalho busca comp…","""http://dspace.sti.ufcg.edu.br:…","""RIBEIRO, Vinicius Trindade Roc…","""mongiovi-melina""","""MONGIOVI, Melina."""


In [21]:
# Construção de topics.parquet

# OBS.: exclui -1 (outlier).

# Espera-se que topic_info tenha pelo menos colunas: 'Topic' e 'Name'
# 'Name' costuma conter label no estilo "topic <id>: palavra1, palavra2, ..."
# Vamos usar Name como 'label' e extrair 'keywords' de Name quando possível.

def extract_keywords_from_name(name: str) -> str:
    if not isinstance(name, str):
        return ""
    # pega sequência após ':' (se existir)
    parts = name.split(":", 1)
    if len(parts) == 2:
        return "; ".join([w.strip() for w in parts[1].split(",") if w.strip()])
    # fallback: retorna o próprio name
    return name

topics_df = (topic_info
             .rename({"Topic":"topic", "Name":"label"})
             .with_columns([
                 pl.col("topic").cast(pl.Int64),
                 pl.col("label").cast(pl.Utf8),
             ]))

# Exclui -1 (outlier)
topics_df = topics_df.filter(pl.col("topic") != -1)

# Cria 'keywords' a partir de label (quando disponível)
topics_df = topics_df.with_columns(
    pl.col("label").map_elements(extract_keywords_from_name).alias("keywords")
)

# (Opcional) adiciona coluna 'coherence' vazia para contrato estável
topics_df = topics_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("coherence"))

topics_out = EXPORT / "topics.parquet"
topics_df.select(["topic","label","keywords","coherence"]).sort("topic").write_parquet(topics_out)
print("OK ->", topics_out, "| tópicos:", topics_df.height)
topics_df.head(5)

OK -> c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\topics.parquet | tópicos: 11


topic,Count,label,Representation,Representative_Docs,keywords,coherence
i64,i64,str,str,str,str,str
0,52,"""0_modelos_dados_linguagem_imag…","""['modelos', 'dados', 'linguage…","""[""as redes adversárias generat…","""0_modelos_dados_linguagem_imag…",
1,51,"""1_nuvem_recursos_dados_sistema""","""['nuvem', 'recursos', 'dados',…","""['em um mercado globalizado e …","""1_nuvem_recursos_dados_sistema""",
2,44,"""2_alunos_computao_programao_si…","""['alunos', 'computao', 'progra…","""['a unidade acadêmica de siste…","""2_alunos_computao_programao_si…",
3,42,"""3_software_projetos_cdigo_bugs""","""['software', 'projetos', 'cdig…","""['criar projetos de programaçã…","""3_software_projetos_cdigo_bugs""",
4,39,"""4_dados_digital_sobre_privacid…","""['dados', 'digital', 'sobre', …","""['o dadosjusbr é um projeto se…","""4_dados_digital_sobre_privacid…",


In [22]:
# Construção de doc_topics.parquet

# OBS.: Mantém -1 (outlier) somente para auditoria.

# Espera-se em doc_topics.csv colunas: DOC_ID, topic, prob (ou equivalentes)
# Harmoniza nomes comuns encontrados em exportações
cand_topic_cols = [c for c in doc_topics_raw.columns if c.lower() in {"topic","topics"}]
cand_prob_cols  = [c for c in doc_topics_raw.columns if c.lower() in {"prob","probability","score"}]
cand_doc_cols   = [c for c in doc_topics_raw.columns if c.upper() == "DOC_ID" or c.lower()=="doc_id"]

assert cand_topic_cols, f"Não encontrei coluna de tópico em {doc_topics_csv}"
assert cand_prob_cols,  f"Não encontrei coluna de probabilidade em {doc_topics_csv}"
assert cand_doc_cols,   f"Não encontrei coluna DOC_ID em {doc_topics_csv}"

doc_topics = (doc_topics_raw
              .rename({
                  cand_topic_cols[0]: "topic",
                  cand_prob_cols[0]: "prob",
                  cand_doc_cols[0]: "DOC_ID"
              })
              .select(["DOC_ID","topic","prob"])
              .with_columns([
                  pl.col("DOC_ID").cast(pl.Int64),
                  pl.col("topic").cast(pl.Int64),
                  pl.col("prob").cast(pl.Float64)
              ]))

doc_topics_out = EXPORT / "doc_topics.parquet"
doc_topics.write_parquet(doc_topics_out)
print("OK ->", doc_topics_out, "| linhas:", doc_topics.height)
doc_topics.head(5)

OK -> c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\doc_topics.parquet | linhas: 423


DOC_ID,topic,prob
i64,i64,f64
0,6,1.0
1,5,0.190732
2,-1,0.097848
3,3,0.193371
4,6,0.294686


In [23]:
# topic_trends.parquet

# Exclui -1; inclui share
dt_valid = (
    doc_topics
    .filter(pl.col("topic") != -1)
    .join(docs.select(["DOC_ID", "ano"]), on="DOC_ID", how="inner")
)

if dt_valid.is_empty():
    # Evita quebrar pipeline caso tudo tenha sido classificado como -1
    trends = pl.DataFrame(schema={"topic": pl.Int64, "ano": pl.Int64, "n_docs": pl.Int64, "share": pl.Float64})
else:
    trends = (
    dt_valid
    .group_by(["topic", "ano"])
    .agg(pl.len().alias("n_docs"))
    .with_columns(pl.col("n_docs").cast(pl.Int64))
    .sort(["topic", "ano"])
)

totais_ano = trends.group_by("ano").agg(pl.col("n_docs").sum().alias("n_total_ano"))
trends = (
    trends
    .join(totais_ano, on="ano")
    .with_columns((pl.col("n_docs") / pl.col("n_total_ano")).alias("share"))
    .select(["topic", "ano", "n_docs", "share"])
)

trends_out = EXPORT / "topic_trends.parquet"
trends.write_parquet(trends_out)
print("OK ->", trends_out, "| linhas:", trends.height)
trends.head(8)

OK -> c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\topic_trends.parquet | linhas: 52


topic,ano,n_docs,share
i64,i64,i64,f64
0,2020,2,0.08
0,2021,6,0.08
0,2022,10,0.147059
0,2023,19,0.152
0,2024,15,0.217391
1,2020,1,0.04
1,2021,10,0.133333
1,2022,9,0.132353


In [24]:
# advisor_profiles.parquet e advisor_topics.parquet

# Base: docs + doc_topics (mantendo -1 apenas para listagens, não para agregados)
docs_by_advisor = docs.join(doc_topics, on="DOC_ID", how="left")

# Perfil por orientador
perfil = (
    docs
    .group_by(["orientador_id","orientador_nome"])
    .agg([
        pl.len().alias("n_tccs"),
        pl.col("ano").min().alias("ano_min"),
        pl.col("ano").max().alias("ano_max"),
    ])
    .with_columns([
        pl.col("n_tccs").cast(pl.Int64),  # <-- aqui
        (pl.col("ano_min").cast(pl.Int64).cast(pl.Utf8) + "–" + 
         pl.col("ano_max").cast(pl.Int64).cast(pl.Utf8)).alias("anos_atuacao")
    ])
)

# advisor_topics (n_docs)
advisor_topics = (
    docs_by_advisor
    .filter(pl.col("topic") != -1)
    .group_by(["orientador_id","topic"])
    .agg(pl.len().alias("n_docs"))
    .with_columns(pl.col("n_docs").cast(pl.Int64))   # <-- aqui
    .sort(["orientador_id","n_docs"], descending=[False, True])
)

tot_orient = advisor_topics.group_by("orientador_id").agg(pl.col("n_docs").sum().alias("n_total"))
advisor_topics = (
    advisor_topics
    .join(tot_orient, on="orientador_id")
    .with_columns((pl.col("n_docs")/pl.col("n_total")).alias("share_no_orientador"))
    .select(["orientador_id","topic","n_docs","share_no_orientador"])
)

# Map de topic -> label (para exibir top-k no perfil)
topics_map = pl.read_parquet(EXPORT / "topics.parquet").select(["topic","label"]).to_dict(as_series=False)
id2label = dict(zip(topics_map["topic"], topics_map["label"]))

def topk_labels_for_advisor(aid: str, k: int = 3) -> str:
    sub = advisor_topics.filter(pl.col("orientador_id")==aid).sort("n_docs", descending=True).head(k)
    if sub.is_empty():
        return "–"
    labs = []
    for r in sub.iter_rows(named=True):
        labs.append(id2label.get(int(r["topic"]), f"Topic {int(r['topic'])}"))
    return "; ".join(labs)

perfil = perfil.with_columns(
    pl.col("orientador_id").map_elements(topk_labels_for_advisor).alias("temas_top")
)

# Persistir
advisor_profiles_out = EXPORT / "advisor_profiles.parquet"
advisor_topics_out = EXPORT / "advisor_topics.parquet"

perfil.select(["orientador_id","orientador_nome","n_tccs","temas_top","anos_atuacao"]).write_parquet(advisor_profiles_out)
advisor_topics.write_parquet(advisor_topics_out)

print("OK ->", advisor_profiles_out, "| linhas:", perfil.height)
print("OK ->", advisor_topics_out, "| linhas:", advisor_topics.height)

perfil.head(3), advisor_topics.head(3)

OK -> c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\advisor_profiles.parquet | linhas: 77
OK -> c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\advisor_topics.parquet | linhas: 188


(shape: (3, 7)
 ┌──────────────────┬─────────────────┬────────┬─────────┬─────────┬──────────────┬─────────────────┐
 │ orientador_id    ┆ orientador_nome ┆ n_tccs ┆ ano_min ┆ ano_max ┆ anos_atuacao ┆ temas_top       │
 │ ---              ┆ ---             ┆ ---    ┆ ---     ┆ ---     ┆ ---          ┆ ---             │
 │ str              ┆ str             ┆ i64    ┆ i64     ┆ i64     ┆ str          ┆ str             │
 ╞══════════════════╪═════════════════╪════════╪═════════╪═════════╪══════════════╪═════════════════╡
 │ pereira-eanes-to ┆ PEREIRA, Eanes  ┆ 10     ┆ 2021    ┆ 2024    ┆ 2021–2024    ┆ 4_dados_digital │
 │ rres             ┆ Torres.         ┆        ┆         ┆         ┆              ┆ _sobre_privacid │
 │                  ┆                 ┆        ┆         ┆         ┆              ┆ …               │
 │ campelo-claudio- ┆ CAMPELO,        ┆ 15     ┆ 2021    ┆ 2024    ┆ 2021–2024    ┆ 0_modelos_dados │
 │ elizio-calazan…  ┆ Claudio Elízio  ┆        ┆         ┆         

In [25]:
# Validação de esquemas
import pandera.pandas as pa
from pandera import Check
import pandas as pd

DocsSchema = pa.DataFrameSchema({
    "DOC_ID": pa.Column(int, Check.ge(0)),
    "ano": pa.Column(int, Check.in_range(1900, 2100)),
    "titulo": pa.Column(str),
    "resumo": pa.Column(object, nullable=True),
    "url": pa.Column(object, nullable=True),
    "autor": pa.Column(object, nullable=True),
    "orientador_id": pa.Column(str),
    "orientador_nome": pa.Column(str),
}, strict=True)

TopicsSchema = pa.DataFrameSchema({
    "topic": pa.Column(int, Check.ge(0)),
    "label": pa.Column(str),
    "keywords": pa.Column(str),
    "coherence": pa.Column(object, nullable=True),
}, strict=True)

DocTopicsSchema = pa.DataFrameSchema({
    "DOC_ID": pa.Column(int, Check.ge(0)),
    "topic": pa.Column(int),  # pode conter -1
    "prob": pa.Column(float, Check.in_range(0, 1, include_min=True, include_max=True)),
}, strict=True)

TrendsSchema = pa.DataFrameSchema({
    "topic": pa.Column(int, Check.ge(0)),
    "ano": pa.Column(int, Check.in_range(1900, 2100)),
    "n_docs": pa.Column(int, Check.ge(0)),
    "share": pa.Column(float, Check.in_range(0, 1, include_min=True, include_max=True)),
}, strict=True)

AdvisorProfilesSchema = pa.DataFrameSchema({
    "orientador_id": pa.Column(str),
    "orientador_nome": pa.Column(str),
    "n_tccs": pa.Column(int, Check.ge(0)),
    "temas_top": pa.Column(str),
    "anos_atuacao": pa.Column(str),
}, strict=True)

AdvisorTopicsSchema = pa.DataFrameSchema({
    "orientador_id": pa.Column(str),
    "topic": pa.Column(int, Check.ge(0)),
    "n_docs": pa.Column(int, Check.ge(0)),
    "share_no_orientador": pa.Column(float, Check.in_range(0, 1, include_min=True, include_max=True)),
}, strict=True)

DocsSchema.validate(pd.read_parquet(EXPORT / "docs.parquet"))
TopicsSchema.validate(pd.read_parquet(EXPORT / "topics.parquet"))
DocTopicsSchema.validate(pd.read_parquet(EXPORT / "doc_topics.parquet"))
TrendsSchema.validate(pd.read_parquet(EXPORT / "topic_trends.parquet"))
AdvisorProfilesSchema.validate(pd.read_parquet(EXPORT / "advisor_profiles.parquet"))
AdvisorTopicsSchema.validate(pd.read_parquet(EXPORT / "advisor_topics.parquet"))

print("✔ Validação OK")

✔ Validação OK


In [26]:
# Checagens de sanidade e relatórios

docs_df = pl.read_parquet(EXPORT / "docs.parquet")
dt_df = pl.read_parquet(EXPORT / "doc_topics.parquet")
topics_df = pl.read_parquet(EXPORT / "topics.parquet")
trends_df = pl.read_parquet(EXPORT / "topic_trends.parquet")

# 1) unicidade de DOC_ID
assert docs_df["DOC_ID"].n_unique() == docs_df.height, "DOC_ID duplicados em docs.parquet"

# 2) cobertura: cada DOC_ID aparece ao menos uma vez em doc_topics
miss = set(docs_df["DOC_ID"].to_list()) - set(dt_df["DOC_ID"].to_list())
assert not miss, f"DOC_IDs sem atribuição em doc_topics: {sorted(list(miss))[:10]} ..."

# 3) probabilidade média do vencedor
prob_mean = float(dt_df["prob"].mean())
print(f"Probabilidade média do tópico vencedor: {prob_mean:.3f}")

# 4) presença de -1 (apenas doc_topics)
has_minus1_topics = (topics_df["topic"] == -1).any()
has_minus1_trends = (trends_df["topic"] == -1).any()
print("Tópico -1 em topics.parquet? ", bool(has_minus1_topics))
print("Tópico -1 em topic_trends.parquet? ", bool(has_minus1_trends))

# 5) anos cobertos
yrs_docs = docs_df["ano"].drop_nulls().unique().sort().to_list()
yrs_trends = trends_df["ano"].drop_nulls().unique().sort().to_list()
faltantes = set(yrs_docs) - set(yrs_trends)
if faltantes:
    print("Aviso: anos com zero temas válidos (apenas -1 ou nenhum doc):", sorted(list(faltantes)))
else:
    print("Cobertura temporal OK.")


Probabilidade média do tópico vencedor: 0.535
Tópico -1 em topics.parquet?  False
Tópico -1 em topic_trends.parquet?  False
Cobertura temporal OK.


In [27]:
# Freezer de dtypes para Parquets do dashboard
# Garante que contagens e chaves fiquem sempre em Int64 e floats em Float64.

import polars as pl

# 1) docs.parquet
docs_fp = EXPORT / "docs.parquet"
docs_fix = (pl.read_parquet(docs_fp)
            .with_columns([
                pl.col("DOC_ID").cast(pl.Int64),
                pl.col("ano").cast(pl.Int64),
            ]))
docs_fix.write_parquet(docs_fp)
print("dtypes/docs:", docs_fix.dtypes)

# 2) doc_topics.parquet
dt_fp = EXPORT / "doc_topics.parquet"
dt_fix = (pl.read_parquet(dt_fp)
          .with_columns([
              pl.col("DOC_ID").cast(pl.Int64),
              pl.col("topic").cast(pl.Int64),  # mantém -1
              pl.col("prob").cast(pl.Float64),
          ]))
dt_fix.write_parquet(dt_fp)
print("dtypes/doc_topics:", dt_fix.dtypes)

# 3) topics.parquet
topics_fp = EXPORT / "topics.parquet"
topics_fix = (pl.read_parquet(topics_fp)
              .with_columns([
                  pl.col("topic").cast(pl.Int64),
                  pl.col("label").cast(pl.Utf8),
                  pl.col("keywords").cast(pl.Utf8),
              ]))
topics_fix.write_parquet(topics_fp)
print("dtypes/topics:", topics_fix.dtypes)

# 4) topic_trends.parquet
trends_fp = EXPORT / "topic_trends.parquet"
trends_fix = (pl.read_parquet(trends_fp)
              .with_columns([
                  pl.col("topic").cast(pl.Int64),
                  pl.col("ano").cast(pl.Int64),
                  pl.col("n_docs").cast(pl.Int64),
                  pl.col("share").cast(pl.Float64),
              ]))
trends_fix.write_parquet(trends_fp)
print("dtypes/topic_trends:", trends_fix.dtypes)

# 5) advisor_profiles.parquet
ap_fp = EXPORT / "advisor_profiles.parquet"
ap_fix = (pl.read_parquet(ap_fp)
          .with_columns([
              pl.col("n_tccs").cast(pl.Int64),
          ]))
ap_fix.write_parquet(ap_fp)
print("dtypes/advisor_profiles:", ap_fix.dtypes)

# 6) advisor_topics.parquet
at_fp = EXPORT / "advisor_topics.parquet"
at_fix = (pl.read_parquet(at_fp)
          .with_columns([
              pl.col("topic").cast(pl.Int64),
              pl.col("n_docs").cast(pl.Int64),
              pl.col("share_no_orientador").cast(pl.Float64),
          ]))
at_fix.write_parquet(at_fp)
print("dtypes/advisor_topics:", at_fix.dtypes)

print("✔ Dtypes congelados.")

dtypes/docs: [Int64, Int64, String, String, String, String, String, String]
dtypes/doc_topics: [Int64, Int64, Float64]
dtypes/topics: [Int64, String, String, String]
dtypes/topic_trends: [Int64, Int64, Int64, Float64]
dtypes/advisor_profiles: [String, String, Int64, String, String]
dtypes/advisor_topics: [String, Int64, Int64, Float64]
✔ Dtypes congelados.


In [28]:
# Manifesto de exportação para reprodutibilidade
from datetime import datetime
import json, hashlib

docs = pl.read_parquet(EXPORT / "docs.parquet")
topics = pl.read_parquet(EXPORT / "topics.parquet")
dt = pl.read_parquet(EXPORT / "doc_topics.parquet")

# Básicos
n_docs = docs.height
anos = docs["ano"].drop_nulls()
anos_min = int(anos.min()) if anos.len() > 0 else None
anos_max = int(anos.max()) if anos.len() > 0 else None
n_topics = topics.height  # sem -1

# Seleção do vencedor (se existir)
sel = {}
sel_fp = BEST / "selection.json"
if sel_fp.exists():
    try:
        sel = json.loads(sel_fp.read_text(encoding="utf-8"))
    except Exception:
        sel = {}

# Outliers
outliers_count = dt.filter(pl.col("topic") == -1).height
outliers_pct = float(outliers_count / n_docs) if n_docs else 0.0
# Se selection.json tiver outliers_pct, mantém como "reported" e registra "observed"
reported_outliers_pct = sel.get("outliers_pct", None)

# Pequeno hash dos arquivos principais (para rastreio)
def file_sha1(path: Path) -> str:
    h = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1<<20), b""):
            h.update(chunk)
    return h.hexdigest()

hashes = {}
for p in ["docs.parquet", "topics.parquet", "doc_topics.parquet", "topic_trends.parquet",
          "advisor_profiles.parquet", "advisor_topics.parquet"]:
    fp = EXPORT / p
    if fp.exists():
        hashes[p] = file_sha1(fp)

manifest = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "paths": {
        "root": str(ROOT.resolve()),
        "export_dir": str(EXPORT.resolve()),
        "best_dir": str(BEST.resolve()),
    },
    "selection": {
        "method": sel.get("method"),
        "run": sel.get("run"),
        "trial": sel.get("trial"),
        "K": sel.get("K"),
        "metrics": {
            "c_npmi": sel.get("c_npmi"),
            "c_v": sel.get("c_v"),
            "diversity@10": sel.get("diversity@10"),
            "sep_jsd": sel.get("sep_jsd"),
            "balance": sel.get("balance"),
            "clarity": sel.get("clarity"),
            "RZ_index": sel.get("RZ_index"),
            "GR_index": sel.get("GR_index"),
        },
        "reported_outliers_pct": reported_outliers_pct,
    },
    "corpus": {
        "n_docs": n_docs,
        "years": {"min": anos_min, "max": anos_max},
    },
    "topics": {
        "n_topics": int(n_topics),  # sem -1
    },
    "outliers": {
        "count": int(outliers_count),
        "observed_pct": outliers_pct,
    },
    "artifacts_sha1": hashes,
}

manifest_fp = EXPORT / "_manifest.json"
manifest_fp.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
print("OK ->", manifest_fp)
print(json.dumps(manifest, ensure_ascii=False, indent=2)[:600] + "\n…")

OK -> c:\Users\User\Desktop\TCC\Notebooks locais\analise_topicos_tcc\data\exports\dashboard\_manifest.json
{
  "generated_at": "2025-09-04T20:13:37",
  "paths": {
    "root": "C:\\Users\\User\\Desktop\\TCC\\Notebooks locais\\analise_topicos_tcc",
    "export_dir": "C:\\Users\\User\\Desktop\\TCC\\Notebooks locais\\analise_topicos_tcc\\data\\exports\\dashboard",
    "best_dir": "C:\\Users\\User\\Desktop\\TCC\\Notebooks locais\\analise_topicos_tcc\\data\\exports\\dashboard\\bertopic_best"
  },
  "selection": {
    "method": "bertopic",
    "run": "run_20250831T230602Z",
    "trial": "trial_24",
    "K": 12,
    "metrics": {
      "c_npmi": 0.007560071814801735,
      "c_v": 0.6058599854512132,
      "
…


In [None]:
# === Relatório rápido de outliers ===
dt = pl.read_parquet(EXPORT / "doc_topics.parquet")
docs = pl.read_parquet(EXPORT / "docs.parquet")

outliers = dt.filter(pl.col("topic") == -1).join(docs.select(["DOC_ID","ano","titulo","orientador_nome"]), on="DOC_ID", how="left")

n_outliers = outliers.height
n_total = dt.height
pct = (n_outliers / n_total) if n_total else 0.0

print(f"Outliers (topic = -1): {n_outliers}/{n_total} = {pct:.2%}")

if n_outliers > 0:
    by_year = (outliers.group_by("ano").agg(pl.len().alias("n_docs"))
                        .sort("ano"))
    print("\nDistribuição por ano:")
    display(by_year.to_pandas())

    print("\nAmostra (até 5 docs):")
    display(outliers.select(["DOC_ID","ano","titulo"]).head(5).to_pandas())

Outliers (topic = -1): 61/423 = 14.42%


AttributeError: 'DataFrame' object has no attribute 'groupby'

In [None]:
# === Versionamento dos artefatos de tópicos ===
from shutil import copyfile
import re

topics_src = EXPORT / "topics.parquet"
topics_current = EXPORT / "topics_current.parquet"

# Determina próximo N para topics_v{N}.parquet
existing = [p.name for p in EXPORT.glob("topics_v*.parquet")]
nums = []
for name in existing:
    m = re.match(r"topics_v(\d+)\.parquet$", name)
    if m:
        nums.append(int(m.group(1)))
next_n = (max(nums) + 1) if nums else 1
topics_versioned = EXPORT / f"topics_v{next_n}.parquet"

# Copia snapshot e atualiza "current"
copyfile(topics_src, topics_versioned)
copyfile(topics_src, topics_current)

print("Criado snapshot:", topics_versioned.name)
print("Atualizado:", topics_current.name)

In [None]:
# === Smoke test final: 3 linhas de cada saída ===
for name in ["docs.parquet","topics.parquet","doc_topics.parquet","topic_trends.parquet",
             "advisor_profiles.parquet","advisor_topics.parquet","_manifest.json","topics_current.parquet"]:
    fp = EXPORT / name
    if not fp.exists():
        print(f"[faltando] {name}")
        continue
    print("\n==>", name)
    if name.endswith(".json"):
        print((EXPORT / name).read_text(encoding="utf-8")[:400] + "\n…")
    else:
        df = pl.read_parquet(fp)
        print(df.head(3))