In [None]:
from dotenv import load_dotenv
import pymysql
import os 
from bertopic import BERTopic
import nbformat
import pandas as pd
import pickle
import time
import numpy as np
import datamapplot as dmp
import plotly.express as px
import numpy as np
import google.generativeai as genai

# Print the installed nbformat version (nbformat is not used anywhere else in
# this notebook; presumably an environment sanity check — TODO confirm).
print(nbformat.__version__)


In [None]:

# Read settings from the .env file. override=True is passed so values from
# the file win over variables already set in the process environment.
load_dotenv(override=True)

# Open the MySQL connection. Each pymysql.connect() keyword is filled from the
# environment variable named on the right-hand side.
_mysql_env_vars = {
    "host": 'HOST',
    "user": 'USER2',
    "password": 'PASSWORD',
    "database": 'DATABASE',
}
conn = pymysql.connect(
    **{keyword: os.getenv(var_name) for keyword, var_name in _mysql_env_vars.items()}
)

In [None]:
# Read the news rows whose titles will be clustered.
n = 200000  # maximum number of rows requested from `noticias`

# The limit is sent as a bound parameter instead of being formatted into the
# SQL text, and the context manager closes the cursor once the rows are read.
# NOTE(review): only `titulo` (column 1) is used below; the remaining columns
# are still selected to keep the query unchanged — consider trimming them.
with conn.cursor() as cursor:
    cursor.execute(
        "SELECT id, titulo, contenido_limpio, fecha, url, created_at, medio "
        "FROM noticias LIMIT %s;",
        (n,),
    )
    rows = cursor.fetchall()

# Keep the title of every row, skipping rows whose title is NULL.
docs = [row[1] for row in rows if row[1] is not None]

# Report totals instead of echoing each row: printing up to `n` lines floods
# the cell output and bloats the saved notebook.
print(f"Filas leídas: {len(rows)} | títulos no nulos: {len(docs)}")
del rows  # the full result set is no longer needed once the titles are extracted






In [None]:

# Save the titles to a single-column ("text") parquet file.
# The output folder is created first so the write does not fail on a fresh
# checkout where `data/` does not exist yet.
os.makedirs('data', exist_ok=True)
df = pd.DataFrame({"text": docs})
df.to_parquet('data/titulos.parquet', index=False)

# Quick look at what was written, then drop the frame; `docs` remains the
# working copy used by the following cells.
print(df.shape)
print(df.head())
del df

In [None]:

# Authenticate the Gemini client with the key read from the API_KEY
# environment variable.
genai.configure(api_key=os.getenv('API_KEY'))

# Generation settings passed to every generate_content() call in the
# topic-naming loop: one candidate, at most 50 output tokens, temperature 0.7.
generation_config = genai.types.GenerationConfig(
    candidate_count=1,
    temperature=0.7,
    max_output_tokens=50,
)

# Model handle used to generate the topic titles.
model = genai.GenerativeModel('gemini-2.5-flash-lite')


In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Step 1 - Extract embeddings
# NOTE(review): the titles appear to be Spanish (table `noticias`, Spanish
# prompts below) while this checkpoint is primarily English — consider a
# multilingual sentence-transformers model. TODO confirm.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
# random_state is fixed so repeated runs give the same reduced space and hence
# the same topics; it matches the seed used for the 2-D projection later on.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=35, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
# NOTE(review): English stop words are removed here; if the corpus is Spanish
# these will not filter Spanish function words — verify and supply a Spanish
# list if needed.
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)

In [None]:
# Fit the topic model on the titles collected in `docs`.
# `topics` is later converted to an array and compared against -1 to drop
# outliers; `probs` is only stored in the results pickle.
# NOTE(review): presumably one topic id / probability per document — confirm
# against the BERTopic documentation.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Persist the run with pickle, one file per artifact, written in this order:
#   1. the fit outputs (topic assignments and probabilities)
#   2. the complete fitted model
_artifacts = (
    ('data/bertopic_results.pkl', {'topics': topics, 'probs': probs}),
    ('data/bertopic_model.pkl', topic_model),
)
for _path, _payload in _artifacts:
    with open(_path, 'wb') as f:
        pickle.dump(_payload, f)



In [None]:
# Explore the result: keep the topic-info table in `x`.
# NOTE(review): `x` is not read by any later cell in this notebook.
x = topic_model.get_topic_info()


In [None]:
# Display the "Representation" entry stored under index label 4 of the
# topic-info table (ad-hoc inspection; raises if that label does not exist).
topic_model.get_topic_info()["Representation"][4]

In [None]:
# Topic-info table; the topic-naming cell below rebuilds this same variable
# before iterating over it.
info_topicos = topic_model.get_topic_info()

In [None]:
# Generate a short descriptive title for every topic with the Gemini model,
# using the topic's top keywords and its representative documents as context.
waiting_time = 60  # seconds to pause after every 10th row (presumably to respect API quotas — TODO confirm)
max_attempts = 5   # requests attempted per topic before a placeholder title is stored

# One entry per row of the topic-info table, in the same order as its rows.
topic_titles = []
info_topicos = topic_model.get_topic_info()
total_topics = info_topicos.shape[0]


def _generar_titulo(prompt, topic_id):
    """Request a title for one topic, retrying on failure.

    Args:
        prompt: Full prompt text sent to the model.
        topic_id: Topic identifier, used only in the error messages.

    Returns:
        The stripped response text, or a placeholder string when all
        `max_attempts` requests raised an exception.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = model.generate_content(prompt, generation_config=generation_config)
            return response.text.strip()
        except Exception as e:
            # Deliberately broad: any failure (request error or a response
            # without text) is logged and retried so that one bad topic does
            # not abort the whole run.
            print(f"⚠️ Error al generar título para el tópico {topic_id}: {e}")
            if attempt < max_attempts:
                # Pause before retrying (2, 4, 8, ... seconds, capped at
                # waiting_time) instead of re-sending the request immediately.
                time.sleep(min(waiting_time, 2 ** attempt))
    return "⚠️ Error al generar título después de {} intentos".format(max_attempts)


for index, row in info_topicos.iterrows():
    topic_id = row['Topic']

    # The outlier cluster gets a fixed label and no API call.
    if topic_id == -1:
        topic_titles.append("cluster outlier")
        continue

    # Top 5 keywords of the topic.
    palabras_topico = topic_model.get_topic(topic_id)
    palabras_principales = [palabra for palabra, _peso in palabras_topico[:5]]
    palabras_texto = ", ".join(palabras_principales)

    # All representative documents of the topic, joined into one context string.
    docs_representativos = row['Representative_Docs']
    contexto_docs = ". ".join(docs_representativos)

    print("Tópico ", index, "de ", total_topics )
    prompt = f"Estoy tratando de ponerle nombre a clusters que provienen de bertopic. Basándote en estas palabras clave: {palabras_texto} y estos ejemplos: {contexto_docs}. Crea un solo título descriptivo corto para mi cluster. No agregues ningún comentario. Quiero solo el título"

    topic_titles.append(_generar_titulo(prompt, topic_id))

    # Periodic pause after every 10th row of the table.
    if index % 10 == 0:
        print(f"Durmiendo por {waiting_time} segundos")
        time.sleep(waiting_time)

# Later cells pair titles with topics by position, so exactly one title per
# topic row is required.
assert len(topic_titles) == total_topics

    

In [None]:
# Save the list of generated topic titles to a pickle file.
with open("data/topic_titles.pkl", "wb") as f:
    pickle.dump(topic_titles, f)

In [None]:
# Display the representative documents left over from the last topic handled
# by the naming loop (`docs_representativos` is that loop's variable).
docs_representativos

In [None]:
# Display the full list of generated titles for visual inspection.
topic_titles

In [None]:
# Attach the Gemini-generated title to every document.

# Per-document information table for the fitted model.
docs_info = topic_model.get_document_info(docs)

# Pair each title with the topic id of the topic-info row it was generated
# for. `topic_titles` holds one title per row of that table, in row order, so
# zipping against the actual "Topic" column stays correct even when there is
# no outlier topic (-1); hard-coding "first title belongs to -1" would shift
# every title by one topic in that case.
topic_ids = topic_model.get_topic_info()["Topic"].tolist()
if len(topic_ids) != len(topic_titles):
    raise ValueError(
        f"Se esperaban {len(topic_ids)} títulos (uno por tópico) pero hay {len(topic_titles)}"
    )
topic_id_to_title = dict(zip(topic_ids, topic_titles))

docs_info["Topic_Title"] = docs_info["Topic"].map(topic_id_to_title)

docs_info.head()


In [None]:
import numpy as np

# Compute the document embeddings with the model's embedding backend.
# NOTE(review): `_extract_embeddings` is a private BERTopic method and this
# call embeds every document again after fit_transform — consider embedding
# once up front and reusing the result.
embeddings = topic_model._extract_embeddings(docs, method="document")

# Project to 2-D for plotting. The reducer gets its own name so the global
# `umap_model` (the 5-D reducer the topic model was built with) is not
# overwritten; otherwise re-running the BERTopic(...) cell later would
# silently pick up this 2-D reducer.
umap_2d_model = UMAP(n_components=2, random_state=42)
embeddings_2d = umap_2d_model.fit_transform(embeddings)

# Topic assignments as an array, so they can be used to build boolean masks.
topic_labels = np.array(topics)


# Save the 2-D coordinates to a pickle file.
with open("data/embeddings_2d.pkl", "wb") as f:
    pickle.dump(embeddings_2d, f)


In [None]:

# Remove the outliers (topic -1) from everything that gets plotted. The same
# boolean mask is applied to the coordinates, the topic ids, the generated
# titles and the original texts so they stay aligned.
mask = np.not_equal(topic_labels, -1)
embeddings_2d_filtered, topic_labels_filtered = embeddings_2d[mask], topic_labels[mask]
titles_filtered = np.asarray(docs)[mask]  # docs must be the list of titles
custom_labels_filtered = docs_info["Topic_Title"].loc[mask]



In [None]:




# Interactive scatter of the 2-D projection: one point per non-outlier
# document, coloured by its generated topic title, with the news title shown
# on hover.
x_coords = embeddings_2d_filtered[:, 0]
y_coords = embeddings_2d_filtered[:, 1]

fig = px.scatter(
    x=x_coords,
    y=y_coords,
    color=custom_labels_filtered.astype(str),
    hover_name=titles_filtered,
    hover_data={"Tópico": custom_labels_filtered},
    labels={'color': 'Topic'},
    title="Clusters de noticias por BERTopic (sin outliers)",
)

# Fixed marker size; the legend is hidden.
fig.update_traces(marker={"size": 8}).update_layout(showlegend=False)
fig.show()

In [None]:

# Cluster names as strings, the form handed to the visualization package.
arr_str = custom_labels_filtered.astype(str)

# Hover caption for each point: the news title on the first line and its
# cluster name on the second.
combined_hover_text = []
for title, text in zip(titles_filtered, arr_str):
    combined_hover_text.append(f"NOTICIA: {title}\nCLUSTER {text}")




In [None]:


# Visualization: build the interactive datamapplot figure from the filtered
# 2-D coordinates, using the cluster names as labels and the combined
# title/cluster captions as hover text.
# NOTE(review): `arr_str` is a pandas Series carrying the filtered index;
# verify that create_interactive_plot accepts it as-is (or pass
# `arr_str.to_numpy()`).
plot = dmp.create_interactive_plot(
    embeddings_2d_filtered,
    arr_str,
    hover_text=combined_hover_text
)



# Export a standalone HTML copy (path relative to the working directory).
plot.save("topicos_noticias_dev.html")

# Last expression of the cell: renders the plot inline.
plot
