In [None]:
import pandas as pd
import pickle
import numpy as np
import plotly.express as px
import datamapplot as dmp


In [None]:
# Load the news headlines: keep only the "text" column, as a plain Python list.
# The intermediate DataFrame is never bound to a name, so nothing needs deleting.
docs = list(pd.read_parquet("data/titulos.parquet")["text"])

In [None]:
# Load the `topics` and `probs` entries stored in the BERTopic results pickle.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('data/bertopic_results.pkl', 'rb') as results_file:
    data = pickle.load(results_file)

# The payload is fully in memory, so the lookups can happen after the file is closed.
topics, probs = data['topics'], data['probs']

In [None]:
# Load the complete topic model object.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('data/bertopic_model.pkl', 'rb') as model_file:
    topic_model = pickle.load(model_file)


In [None]:
# Load the topic titles that were generated with Gemini.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open("data/topic_titles.pkl", 'rb') as titles_file:
    topic_titles = pickle.load(titles_file)
    


In [None]:
# Load the 2-D embeddings used as plot coordinates.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open("data/embeddings_2d.pkl", "rb") as embeddings_file:
    embeddings_2d = pickle.load(embeddings_file)

In [None]:
# Replace each topic's label with the title obtained from the Google model.

# Per-document information as reported by the topic model.
docs_info = topic_model.get_document_info(docs)

# Build the topic-id -> title lookup. Titles are stored by position, and position
# `pos` belongs to topic id `pos - 1`, so position 0 is paired with topic -1.
title_positions = range(len(topic_titles))
topic_id_to_title = {pos - 1: topic_titles[pos] for pos in title_positions}

# Attach the title to every document; topic ids missing from the lookup become NaN.
docs_info["Topic_Title"] = docs_info["Topic"].map(topic_id_to_title)




In [None]:



# Get the clusters (topics) as a NumPy array so they can be compared element-wise
# when building the outlier mask.
topic_labels = np.array(topics)



In [None]:
# Sanity check: display the dimensions of the document-info frame.
docs_info.shape

In [None]:
# Drop the outliers (topic -1) from every per-document collection.
# The same boolean mask is applied to each one so they stay aligned row by row.
mask = topic_labels != -1

topic_labels_filtered = topic_labels[mask]
embeddings_2d_filtered = embeddings_2d[mask]

# `docs` must be the list of headlines; it is turned into an array so the
# boolean mask can index it.
titles_filtered = np.array(docs)[mask]

# Row-wise boolean selection of the title column.
custom_labels_filtered = docs_info.loc[mask, "Topic_Title"]



In [None]:
# Plotly scatter of the 2-D embeddings, coloured by topic title.
x_coords = embeddings_2d_filtered[:, 0]
y_coords = embeddings_2d_filtered[:, 1]

fig = px.scatter(
    x=x_coords,
    y=y_coords,
    color=custom_labels_filtered.astype(str),
    hover_name=titles_filtered,
    hover_data={"Tópico": custom_labels_filtered},
    labels={'color': 'Topic'},
    title="Clusters de noticias por BERTopic (sin outliers)",
)

# Fixed marker size; the legend is turned off.
fig.update_traces(marker={"size": 8})
fig.update_layout(showlegend=False)
fig.show()

In [None]:

# Prepare the cluster names for the visualization package: one string per document.
arr_str = custom_labels_filtered.astype(str)

# Build the hover caption for every point: the headline on the first line and
# its cluster name on the second.
combined_hover_text = [
    f"NOTICIA: {title}\nCLUSTER {text}"
    for title, text in zip(titles_filtered, arr_str)
]




In [None]:

# Interactive data map: coordinates, one label per point, and the hover captions.
plot = dmp.create_interactive_plot(embeddings_2d_filtered, arr_str, hover_text=combined_hover_text)

# Save the plot to an HTML file.
plot.save("topicos_noticias_prod.html")

# Leave the plot as the cell's last expression so the notebook renders it inline.
plot