<a href="https://colab.research.google.com/github/gabrielfernandorey/ITBA-NLP/blob/main/NLP_02_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Trabajo Práctico NLP - Detección de Tópicos y Clasificación
- ITBA 2024
- Alumno: Gabriel Rey
---

### >>> Solo para correr en Colab

In [2]:
from google.colab import userdata

git_token = userdata.get('GIT_TOKEN')
git_username = "gabrielfernandorey"
git_repository = "ITBA-NLP.git"

!git clone https://{git_token}@github.com/{git_username}/{git_repository}

!pip install -r /content/ITBA-NLP/requirements.txt

fatal: destination path 'ITBA-NLP' already exists and is not an empty directory.
Collecting es-core-news-lg@ https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.7.0/es_core_news_lg-3.7.0-py3-none-any.whl#sha256=08020b83e0c6da1584e567551a5e0de7b15dc0534eaaee21acc1ce908d1be742 (from -r /content/ITBA-NLP/requirements.txt (line 26))
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.7.0/es_core_news_lg-3.7.0-py3-none-any.whl (568.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.0/568.0 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting absl-py==2.1.0 (from -r /content/ITBA-NLP/requirements.txt (line 1))
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting altair==5.3.0 (from -r /content/ITBA-NLP/requirements.txt (line 2))
  Downloading altair-5.3.0-p

In [5]:
!pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


## MODELO

In [6]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from matplotlib import pyplot as plt
import os
import json
from datetime import datetime, date
from dateutil.parser import parse
from dotenv import load_dotenv

from NLP_tools import Cleaning_text, top_keywords, top_entities, get_topic_name, best_document, clean_all, topic_documents
from core.functions import *

ModuleNotFoundError: No module named 'NLP_tools'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [None]:
from opensearch_data_model import Topic, TopicKeyword, News, os_client, TOPIC_INDEX_NAME, NEWS_INDEX_NAME
from opensearch_io import init_opensearch, get_news
from opensearchpy import helpers

In [9]:
from openai import OpenAI

ModuleNotFoundError: No module named 'openai'

### Path

In [7]:
# Load variables from a .env file, if present, into the process environment.
load_dotenv()
# Default data directory inside the Colab clone of the repository.
PATH_REMOTO='/content/ITBA-NLP/data/'
# PATH_LOCAL, when set, overrides the Colab default.
PATH=os.environ.get('PATH_LOCAL', PATH_REMOTO)
PATH

'/content/ITBA-NLP/data/'

In [8]:
# The API key comes from the environment when working with the local data path,
# and from Colab's secret store otherwise.
running_locally = (PATH == os.environ.get('PATH_LOCAL'))
if running_locally:
    openai_api_key = os.environ.get('OPENAI_API_KEY')
else:
    from google.colab import userdata
    openai_api_key = userdata.get('OPENAI_API_KEY')

client = OpenAI(api_key=openai_api_key)

NameError: name 'OpenAI' is not defined

### Inicializamos la base vectorial

In [None]:
# Prepare the Topic and News indices; judging by the recorded output, indices that
# already exist are left as they are.
init_opensearch()



El índice Topic ya existe. Saltando inicialización de base de datos.
El índice News ya existe. Saltando inicialización de base de datos.


### Data de noticias original

In [None]:
# Parquet file that holds each available chunk of news.
df_params = {'0_1000':'0_1000_data.parquet',
             '1000_2000':'1000_2000_data.parquet',
             '2000_3000':'2000_3000_data.parquet',
             'df_joined':'df_joined_2024-04-01 00_00_00.parquet'
            }

# CHUNK selects the dataset; it falls back to the first 1000 news when unset.
# (The value read from the environment used to be overwritten unconditionally.)
chunk = os.environ.get('CHUNK', '0_1000')

df_parquet = pd.read_parquet(PATH+df_params[chunk])
data = list(df_parquet['in__text'])

# Chunk in use and total number of documents
print(chunk)
print(len(data))


In [None]:
# Load the vocabulary stored for this chunk (later passed to TfidfVectorizer as `vocabulary`)
with open(PATH+f'modelos/vocabulary_{chunk}.json', 'r') as json_file:
    vocab = json.load(json_file)
len(vocab)

### StopWords

In [None]:
# Spanish stop words, read from the 'stopwords' column of the CSV shipped with the data.
# NOTE(review): SPANISH_STOPWORDS is not referenced again in the visible notebook — confirm it is still needed.
SPANISH_STOPWORDS = list(pd.read_csv(PATH+'spanish_stop_words.csv' )['stopwords'].values)

### Modelo

In [None]:
# TF-IDF vectorizer restricted to the precomputed vocabulary loaded above, fitted on all documents.
# NOTE(review): with a fixed `vocabulary`, scikit-learn is expected to skip the
# max_df / min_df pruning — confirm whether these two settings have any effect here.
tfidf_vectorizer = TfidfVectorizer(
        tokenizer=None,
        max_df=0.9,
        min_df=0.1,
        ngram_range=(1, 2),
        vocabulary=vocab,
        # max_features=100_000
)
tfidf_vectorizer.fit(data)

Capas del modelo BERTopic

In [None]:
# Building blocks of the BERTopic pipeline, one object per stage.
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Tokenize topics (reuses the TF-IDF vectorizer fitted earlier)
vectorizer_model = tfidf_vectorizer
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# Step 6 - (Optional) Fine-tune topic representations with a `bertopic.representation` model
# Created here but not handed to BERTopic below (that argument is commented out).
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,              # Step 1 - Extract embeddings
  umap_model=umap_model,                        # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,                  # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,            # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                    # Step 5 - Extract topic words
  # representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
  # language='multilingual',                    # This is not used if embedding_model is used.
  verbose=True,
  # calculate_probabilities=True
)

In [None]:
def my_callback(stage, **kwargs):
    """Print the stage name followed by one ``key: value`` line per keyword argument."""
    report = [f"Stage: {stage}"]
    report.extend(f"{name}: {detail}" for name, detail in kwargs.items())
    print("\n".join(report))

In [None]:
# Smoke test of the callback: prints only the stage line because no extra keywords are given.
my_callback(stage="start_training")

### Entrenamiento

In [None]:
# Load a previously trained model and its artifacts, or skip this cell and train instead.
# NOTE(review): running this cell AND the training cells that follow re-fits the model
# and replaces everything loaded here.
ahora = datetime.today()  # NOTE(review): not read anywhere in the visible notebook
topic_model = BERTopic.load(PATH+f"modelos/bertopic_model_{chunk}")
topics = np.load(PATH+f"modelos/topics_{chunk}.npy")
probs = np.load(PATH+f"modelos/probs_{chunk}.npy")

# Load the document embeddings saved alongside the model
docs_embedding = np.load(PATH+f"modelos/topic_embeddings_{chunk}.npy")

In [None]:
# Train on every document; `topics` and `probs` hold one entry per document.
topics, probs = topic_model.fit_transform(data)

In [None]:
# Embeddings of all documents, computed with the model's own embedding backend
docs_embedding = topic_model.embedding_model.embed(data)

In [None]:
# Save the model together with the per-document topics and probabilities
topic_model.save(PATH+f"modelos/bertopic_model_{chunk}")
np.save(PATH+f"modelos/topics_{chunk}.npy", topics)
np.save(PATH+f"modelos/probs_{chunk}.npy", probs)

# Save the document embeddings as a NumPy file
# (the file is named topic_embeddings_* but stores `docs_embedding`)
np.save(PATH+f"modelos/topic_embeddings_{chunk}.npy", docs_embedding)

## Resultados

In [None]:
# Number of distinct topic ids assigned to the documents
print(f"Cantidad de tópicos {len(set(topics))} (incluye topico -1)")

In [None]:
# Number of documents per topic, as reported by the model
topic_freq = topic_model.get_topic_freq()

# Print how many topics were found
num_topics = len(topic_freq)
print(f"Número de tópicos encontrados: {num_topics} (incluye el topico -1)")

# Print the number of documents of each topic
print(topic_freq)

### Recuperar todos los topicos y sus etiquetas generadas por el modelo

In [None]:
# Labels generated by the model for every topic (displayed as the cell output)
topic_model.generate_topic_labels()

### Buscar topicos ingresando un texto

In [None]:
# Topics the model considers closest to a free-text query
topic_res = topic_model.find_topics("cambio climatico")
topic_res

### Buscar los titulos de los primeros n documentos de un topico

In [None]:
# Titles of the first n documents assigned to a topic
topic_id = 10
n_docs = 5
titles = df_parquet['in__title']
# Row positions of the documents whose assigned topic is `topic_id`
topic_docs_idx = [pos for pos, assigned in zip(range(len(titles)), topics) if assigned == topic_id]
n_docs = min(n_docs, len(topic_docs_idx))

print(f"{n_docs} de {len(topic_docs_idx)} titulos de noticias encontrados en el tópico {topic_id:}")
for idx in topic_docs_idx[:n_docs]:
    print("- ",titles.iloc[idx])

#### Busqueda de documentos por topico, ordenados por mayor probabilidad

In [None]:

# Row positions of the documents, grouped by assigned topic
T = topic_model.get_document_info(data)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()

topic = 10
# topic = np.random.randint(0, len(docs_per_topics)-1) # Random topic

print("Ejemplo para tópico:", topic)

# One [document id, title, rounded probability] row per document of the topic
doc_probs_x_topic = [
    [df_parquet.index[pos], df_parquet.iloc[pos].in__title, round(probs[pos], 4)]
    for pos in docs_per_topics[topic]
]

df_query_1 = pd.DataFrame(doc_probs_x_topic)
print(len(df_query_1), "docs encontrados")
df_query_1.sort_values(2, ascending=False)

#### Palabras clave del topico

In [None]:
# Keywords the model holds for the selected topic
topic_keywords = topic_model.get_topic(topic)
print("Topico:", topic)
print(topic_keywords)

### Criterio de corte (umbral)
El criterio de corte utilizado para filtrar las noticias que pertenecen a un tópico es la media menos un desvío estándar (−1 std) de las probabilidades de los documentos del tópico.

In [None]:
# Probabilities of the documents of the selected topic (third field of each row)
topic_probs = [fila[2] for fila in doc_probs_x_topic]

# Mean, standard deviation and the cut-off one standard deviation below the mean
mean = np.mean(topic_probs)
std_dev = np.std(topic_probs)
cut_off = mean - std_dev

# Histogram of the probabilities
plt.hist(topic_probs, bins=10, edgecolor='black')

# Dashed guides for the mean and for the cut-off
guides = [
    (mean, 'r', f'Media: {mean:.2f}'),
    (cut_off, 'b', f'-1 STD: {cut_off:.2f}'),
]
for position, colour, caption in guides:
    plt.axvline(position, color=colour, linestyle='dashed', linewidth=1, label=caption)

# Title, axis labels and legend
plt.title(f'Histograma de probabilidades topico: {topic}')
plt.xlabel('Valor')
plt.ylabel('Frecuencia')
plt.legend()

# Show the figure
plt.show()

#### Documentos mas representativos de un topico

- [1]  Obtenido por el metodo del modelo

In [None]:
# Representative documents of the selected topic according to the model itself
docs_representative = topic_model.get_representative_docs(topic=topic)
docs_representative

- [2] Obtenido por busqueda de probabilidad de documentos perteneciente al topico ( utilizando el umbral de corte )

In [None]:
# Row styler for the results table
def color_rows(row, label, value):
    """Return one CSS string per cell: cyan text when ``row[label] >= value``, no style otherwise."""
    css = 'color: cyan' if row[label] >= value else ''
    return [css] * len(row)

# Row positions of the documents, grouped by assigned topic
T = topic_model.get_document_info(data)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()
topic_rows = docs_per_topics[topic]

# Document ids and probabilities of the selected topic
docs_ids, doc_probs_x_topic = [], []
for doc_ID in tqdm(topic_rows):
    docs_ids.append(df_parquet.index[doc_ID])
    doc_probs_x_topic.append(probs[doc_ID])

# Cut-off: one standard deviation below the mean probability
mean, std_dev = np.mean(doc_probs_x_topic), np.std(doc_probs_x_topic)
threshold = mean - std_dev

# Fetch all those documents from the news index in a single multi-get request
index_name = 'news'
mget_query = {"docs": [{"_index": index_name, "_id": doc_id} for doc_id in docs_ids]}
response = os_client.mget(body=mget_query, index=index_name)

# One row per document found: id, title, probability and entities ([] when the field is absent).
# The response entries are matched to `topic_rows` by position.
docs_topic = [
    [
        doc['_id'],
        df_parquet.iloc[topic_rows[i]].in__title,
        probs[topic_rows[i]],
        doc['_source'].get('entities', []),
    ]
    for i, doc in enumerate(response['docs'])
    if doc['found']
]

# Highest probability first; rows at or above the cut-off are highlighted
df_view = pd.DataFrame(docs_topic, columns = ['indice','titulo','prob','entidades']).sort_values('prob', ascending=False)
df_view.style.apply(lambda row: color_rows(row, 'prob', threshold), axis=1)


Nota: Los documentos más representativos obtenidos con el método "get_representative_docs" no coinciden con los que se obtienen al ordenar los documentos del tópico por probabilidad máxima.

- [3] Por similitud coseno entre el tópico y sus tres documentos más cercanos

In [None]:
# Cosine similarity between every topic embedding (rows) and every document embedding (columns)
sim_matrix = cosine_similarity(topic_model.topic_embeddings_, docs_embedding)
sim_matrix.shape

In [None]:
# Cosine similarity between the topic embedding and each document of the selected topic.
# A single vectorised call replaces one cosine_similarity() call per document.
# NOTE(review): row `topic + 1` assumes row 0 of topic_embeddings_ belongs to the
# outlier topic (-1) — confirm for models without outliers.
topic_rows = np.asarray(docs_per_topics[topic])
s_coseno = cosine_similarity([topic_model.topic_embeddings_[topic + 1]], docs_embedding[topic_rows])[0]

# Positions within the topic, from most to least similar
idx_coseno_sort = np.argsort(s_coseno)[::-1]

# Three closest documents: position within the topic and title
for idx in idx_coseno_sort[:3]:
    print(idx, df_parquet.iloc[topic_rows[idx]].in__title)

Nota: Del mismo modo que en el punto anterior, los documentos más cercanos al tópico no son exactamente los mismos que los hallados en el punto 1.

- [4] Primer documento mas cercano al embedding del topico

In [None]:
# Document with the highest similarity to the topic.
# The argmax runs over ALL documents (sim_matrix has one column per document),
# not only over those assigned to the topic.
# NOTE(review): row `topic + 1` assumes row 0 belongs to the outlier topic (-1) — confirm.

simil_docs_topic = sim_matrix[topic + 1].argmax()
print(f"Noticia de maxima similitud con el topico: {topic}")
print(f"Doc ID: {df_parquet.index[simil_docs_topic]}")
print(f"Titulo: {df_parquet.iloc[simil_docs_topic].in__title}")
print(f"Noticia: {data[simil_docs_topic][:80]}...")
best_doc = data[simil_docs_topic]

#### Keywords de solo un topico

In [None]:
# (keyword, score) pairs of the selected topic wrapped as TopicKeyword objects,
# dropping entries whose keyword is the empty string
keywords = topic_model.topic_representations_[topic]
topic_keywords = [TopicKeyword(name=keyword, score=score) for keyword, score in keywords if keyword != '']
topic_keywords

In [None]:
# Cut-off for the keywords: the mean of their scores
freq_k = [kw['score'] for kw in topic_keywords]
umbral_k = np.array(freq_k).mean()
print(umbral_k)

# Keywords scoring at or above the cut-off, as {name: score}
topic_keywords_top = {
    kw['name']: kw['score']
    for kw in topic_keywords
    if kw['score'] >= umbral_k
}

topic_keywords_top

#### Entidades de un Topico a partir de los n documentos mas cercanos al embedding del topico

In [None]:
from collections import defaultdict

# Entities of the first n rows of df_view, i.e. the n highest-probability documents
# of the selected topic (df_view is sorted by 'prob' where it is built).
# The column is read by name; positional access on a label-indexed row is deprecated in pandas.
n_docs = 5
entities_topic = df_view['entidades'].iloc[:n_docs].tolist()

# Number of documents in which each entity appears (counted once per document)
document_frequencies = defaultdict(int)
for lista in entities_topic:
    for palabra in set(lista):
        document_frequencies[palabra] += 1

# Entities from most to least frequent
sorted_frequencies = sorted(document_frequencies.items(), key=lambda item: item[1], reverse=True)

# Cut-off: mean document frequency (0.0 when there are no entities, instead of the
# NaN that the mean of an empty array would give)
freq_e = [freq for _, freq in sorted_frequencies]
umbral_e = np.mean(freq_e) if freq_e else 0.0

# At most the 10 most frequent entities at or above the cut-off
topic_entities_top = dict([pair for pair in sorted_frequencies if pair[1] >= umbral_e][:10])

topic_entities_top


### Grabar todos los registros en Topic y actualizar en News

In [None]:
def topic_documents(topic, topic_model, probs, df_news, data=None):
    """
    Select the documents of a topic whose probability reaches the topic's cut-off
    (mean minus one standard deviation of the topic's own probabilities).

    Parameters
    ----------
    topic : int
        Topic id, compared against the entries of ``topic_model.topics_``.
    topic_model : object
        Only its ``topics_`` attribute (one topic id per document) is read.
    probs : sequence of float
        Probability of each document, aligned with ``topic_model.topics_``.
    df_news : pandas.DataFrame
        Needs an ``indice`` column with the document ids, addressed by the same
        labels as the positions in ``topics_``; titles are taken from the second
        column (position 1).
    data : optional
        Ignored. Accepted so that callers passing the raw documents as a fifth
        positional argument keep working.

    Returns
    -------
    tuple (dict, list, float)
        ``{doc_id: prob}`` sorted by descending probability, the titles in the
        same order, and the cut-off. ``({}, [], 0.0)`` is returned when the topic
        has no documents or when the inputs cannot be processed.
    """
    try:
        # Positions of the documents assigned to the topic
        doc_rows = [i for i, assigned in enumerate(topic_model.topics_) if assigned == topic]
        if not doc_rows:
            # Avoids a NaN threshold from the mean of an empty list
            return {}, [], 0.0

        id_column = df_news['indice']
        prob_by_id = {id_column[row]: probs[row] for row in doc_rows}
        topic_probs = [probs[row] for row in doc_rows]

        # Cut-off: mean minus one standard deviation
        threshold = np.mean(topic_probs) - np.std(topic_probs)

        # Documents at or above the cut-off, highest probability first
        kept = {doc_id: prob for doc_id, prob in prob_by_id.items() if prob >= threshold}
        ids_filter_sort = dict(sorted(kept.items(), key=lambda item: item[1], reverse=True))

        # Title of the first row found for each id, built in one pass over the frame
        title_by_id = {}
        for doc_id, title in zip(id_column, df_news.iloc[:, 1]):
            title_by_id.setdefault(doc_id, title)
        title_filter_sort = [title_by_id[doc_id] for doc_id in ids_filter_sort]

        return ids_filter_sort, title_filter_sort, threshold
    except Exception:
        # Best effort is kept on purpose, but KeyboardInterrupt / SystemExit are no longer swallowed.
        return {}, [], 0.0

In [None]:
# Quick check of topic_documents for the selected topic.
# NOTE(review): `data` is the list of raw texts, while the function reads an 'indice'
# column from its fourth argument — this call presumably lands in the function's
# fallback return; confirm which DataFrame was meant.
topic_documents(topic, topic_model, probs, data)

In [None]:
# Step-by-step copy of the first half of topic_documents, kept for debugging.
# Note that `docs_per_topics` becomes a plain list of positions here, replacing the
# per-topic dict that other cells build under the same name.
docs_per_topics = [i for i, x in enumerate(topic_model.topics_) if x == topic]
print(docs_per_topics)

# Document ids and their probabilities
# NOTE(review): assumes df_parquet has an 'indice' column — other cells take the ids from df_parquet.index; confirm.
docs_IDs = {}
doc_probs = []
for doc_idx in docs_per_topics:

    docs_IDs[df_parquet.indice[doc_idx]] = probs[doc_idx]
    doc_probs.append(probs[doc_idx])

In [None]:
# Debug probe: 'indice' value for the last position visited by the loop in the previous cell
df_parquet.indice[doc_idx]

In [None]:
# Fetch the news not processed yet from the database (process == False);
# the original note adds "at least 10,000".
# NOTE(review): news_no_process is not defined in the visible notebook — presumably it
# comes from the `core.functions` wildcard import; confirm.
db_news = news_no_process()

# Tabular view of the result plus parallel lists of ids and titles
df_news = pd.DataFrame(db_news , columns=["indice", "titulo", "noticia", "keywords", "entidades", "creado"])
id_data    = list(df_news['indice'])
title_data = list(df_news['titulo'])

In [None]:
# Load the entities extracted for this chunk
with open(PATH+f'modelos/entities_{chunk}.json', 'r') as json_file:
    entities = json.load(json_file)

# Store every topic in the database; the outlier topic (-1) is skipped.
# The loop variable is `topic_id` so the notebook-level `topic` selection is not overwritten.
for topic_id in topic_model.get_topics().keys():
    if topic_id <= -1:
        continue

    topic_keywords_top  = top_keywords(topic_id, topic_model)
    topic_entities_top  = top_entities(topic_id, topic_model, docs_embedding, data, entities)
    # NOTE(review): called with five arguments — make sure the `topic_documents`
    # in scope at this point accepts them.
    topic_documents_ids, topic_documents_title, threshold  = topic_documents(topic_id, topic_model, probs, df_parquet, data)
    id_best_doc, title_best_doc, best_doc = best_document(topic_id, topic_model, docs_embedding, id_data, title_data, data)

    # Keywords and entities are handed to the naming helper as a comma-separated list;
    # joining them with an empty separator glued all the terms into a single word.
    naming_terms = ', '.join({**topic_keywords_top, **topic_entities_top})

    topic_doc = Topic(
        index = topic_id,
        name = get_topic_name(naming_terms, client),
        # NOTE(review): row `topic_id + 1` assumes row 0 belongs to the outlier topic — confirm.
        vector = list(topic_model.topic_embeddings_[topic_id + 1 ]),
        similarity_threshold = threshold,
        created_at = datetime.now(),
        to_date = parse('2024-04-02'),
        from_date = parse('2024-04-01'),
        keywords = topic_keywords_top,
        entities = topic_entities_top,
        id_best_doc = id_best_doc,
        title_best_doc = title_best_doc,
        best_doc = best_doc,
    )

    topic_doc.save()

##### Actualizar datos en News

In [None]:
# Mark the unprocessed news as processed, storing the document embedding when available
index_name = 'news'
search_query = {
    'query': {
        'match': {
            'process': False
        }
    },
    'size': 10000
}

# Run the search
response = os_client.search( body=search_query, index=index_name )

# Embeddings of the loaded chunk keyed by news id. `docs_embedding` follows the row
# order of df_parquet, whose index holds the ids used as `_id` in the news index.
# Keys are text because that is how the search hits expose `_id`.
embedding_by_id = {str(news_id): vector for news_id, vector in zip(df_parquet.index, docs_embedding)}

for reg in response['hits']['hits']:
    doc_id = reg['_id']

    # The original cell left "vector" without a value, which is a syntax error.
    # The vector is written only for news that belong to the loaded chunk.
    # NOTE(review): confirm that the News mapping defines a `vector` field.
    update_fields = {"process": True}
    if doc_id in embedding_by_id:
        update_fields["vector"] = np.asarray(embedding_by_id[doc_id]).tolist()

    # Apply the partial update
    os_client.update(index=index_name, id=doc_id, body={"doc": update_fields})

### Recuperar todos los topicos

In [None]:
index_name = 'topic'

# Read every stored topic, keep it as a dict and print its id and name
db_topics = []
for stored_topic in Topic.search().query().scan():
    record = stored_topic.to_dict()
    db_topics.append(record)
    print(record['index'], record['name'])

In [None]:
# Topics the model considers closest to the query "israel"
# (note: `T` is also the name other cells use for the document-info table)
T = topic_model.find_topics("israel")
T

### Recuperar de la base el documento mas cercano a un topico

In [None]:
index_name = 'topic'
# Stored topic whose 'index' field is 10
search_query = {'query': {'match': {'index': 10}}}

# Run the search
response = os_client.search(body=search_query, index=index_name)

# Source document of the first hit
texto = response['hits']['hits'][0]['_source']

# Topic name followed by the text of its closest document
print(f"Topico: {texto['name']}")
print("\n"+ texto['best_doc'])

In [None]:
# Embedding stored with the topic returned by the previous search
ver_embedding = response['hits']['hits'][0]['_source']['vector']

In [None]:
# Dimensionality of the stored embedding
np.array(ver_embedding).shape

### Nuevo documento consultando embeddings generados por el modelo

In [None]:
new_doc = "Fuga de presos en San Telmo"

# Similarity of the new document to every topic embedding of the model
new_doc_embedding = topic_model.embedding_model.embed(new_doc)
sim_matrix_new = cosine_similarity(topic_model.topic_embeddings_, new_doc_embedding.reshape(1, -1))

# Topic ids run -1, 0, 1, ..., n, so the best row minus one is the topic id
idx = np.argmax(sim_matrix_new)-1

# db_topics comes from an index scan, so its order need not follow the topic ids
# (and position -1 would pick the last element): look the topic up by its 'index' field.
matched_topic = next((t for t in db_topics if t['index'] == idx), None)
if matched_topic is not None:
    print(matched_topic['index'], matched_topic['name'])
else:
    print(f"Topico {idx} no encontrado en la base")

### Nuevo documento consultando embeddings de la base

In [None]:
new_doc = "Fuga de presos en San Telmo"

new_doc_embedding = topic_model.embedding_model.embed(new_doc)

# k-NN search on the stored topic vectors to find the topic of the new document
knn_clause = {"vector": new_doc_embedding, "k" : 3}
knn_query = {"size": 1, "query": {"knn": {"vector": knn_clause}}}
response = os_client.search(index='topic', body=knn_query)

found = response['hits']
if found['total']['value'] <= 0:
    print(f"Topico no encontrado")
else:
    nearest = found['hits'][0]
    print(f"Topico: {nearest['_source']['name']}")
    print(f"Estimacion: {nearest['_score']}")

### Graficar los topicos en 3d

In [None]:
# Topic embeddings of the model
topic_embeddings = topic_model.topic_embeddings_

new_doc = "Fuga de presos en San Telmo"
new_doc_embedding = topic_model.embedding_model.embed([new_doc])[0]

# Topic embeddings with the new document appended as the last row
total = np.vstack((topic_embeddings, new_doc_embedding))

# Reduce the embeddings to 3D with UMAP. A fixed random_state (the same seed used for
# the model's own UMAP step) keeps the layout identical between runs.
umap_model = UMAP(n_components=3, random_state=42)
embeddings_3d = umap_model.fit_transform(total)


In [None]:
import plotly.express as px

# Topic ids as reported by the model
topic_labels = topic_model.get_topic_info()['Topic']

# DataFrame for Plotly: one row per topic embedding plus the new document
df = pd.DataFrame(embeddings_3d, columns=['Dim1', 'Dim2', 'Dim3'])
# NOTE(review): 'Topico' numbers the rows from 0 while 'Etiqueta' uses the model's topic ids;
# if the model has an outlier topic (-1) the two columns differ by one — confirm which is intended.
df['Topico'] = list(range(len(topic_embeddings))) + ['Nuevo Doc'] # Marks the new document
df['Etiqueta'] = list(topic_labels) + ['Nuevo Documento']

# Interactive 3D scatter of the topics with Plotly
fig = px.scatter_3d(df, x='Dim1', y='Dim2', z='Dim3', color='Topico', text='Etiqueta', title='Visualización 3D de Tópicos con BERTopic')


# Figure size and margins
fig.update_layout(
    autosize=False,
    width=1200,  # Figure width
    height=800,  # Figure height
    margin=dict(l=65, r=50, b=65, t=90)
)

# Marker size for the traces drawn in 'markers+text' mode
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers+text'))

# Show the figure
fig.show()

In [None]:
# Debug probe: container type of the topic labels
type(topic_labels)

In [None]:
# Debug probe: raw response of the most recent OpenSearch request stored in `response`
response

### Busqueda de un documento por su indice, topico asociado, keywords, entities.

In [None]:

# Look up one news item by id, embed its text and find the closest stored topic
index_name = 'news'
search_query = {
    'query': {
        'match': {
            '_id': '105640350'
        }
    }
}

# Run the search
response = os_client.search(
                            body=search_query,
                            index=index_name
)
news_hits = response['hits']['hits']

if not news_hits:
    # Without this guard a missing id ended in an IndexError
    print("No se encontró la noticia.")
else:
    news_text = news_hits[0]['_source']['news']
    print(f"Texto de Noticia: {news_text[:200]}...\n")

    new_doc_embedding = topic_model.embedding_model.embed(news_text)

    # Index and vector field used for the nearest-neighbour search
    index_name = 'topic'
    vector_field = 'vector'

    # k-NN query for the most similar topic embedding
    knn_query = {
        "size": 1,  # Number of results to return
        "query": {
            "knn": {
                vector_field: {
                    "vector": new_doc_embedding,
                    "k": 3  # Number of nearest neighbours
                }
            }
        }
    }

    # Run the search
    response_topic = os_client.search(index=index_name, body=knn_query)

    # Closest topic
    if response_topic['hits']['total']['value'] > 0:
        best_hit = response_topic['hits']['hits'][0]
        closest_topic = best_hit['_source']
        print(f"El nuevo documento pertenece al tópico: {closest_topic['index']}")
        print(closest_topic['name'])
        # The estimate is the score of the hit (as in the other k-NN cell);
        # the stored threshold of the topic is printed on its own line.
        print(f"Estimacion: {best_hit['_score']}")
        print(f"Umbral de similitud del topico: {closest_topic['similarity_threshold']}")
        print(f"Keywords del topico: {closest_topic['keywords']}")
        print(f"Entidades del topico: {closest_topic['entities']}")

    else:
        print("No se encontró un tópico cercano.")

### Agrupamiento de topicos

In [None]:
index_name = 'topic'

# Read every stored topic, keep it as a dict and print its id and name
db_topics = []
for stored_topic in Topic.search().query().scan():
    record = stored_topic.to_dict()
    db_topics.append(record)
    print(record['index'], record['name'])

#### Agrupando por embeddings cercanos

In [None]:
# Topic embeddings of the model, one row per topic
topic_embeddings = topic_model.topic_embeddings_

In [None]:
# Reduce the topic embeddings to 3D with UMAP. A fixed random_state (the same seed used
# for the model's own UMAP step) keeps the layout identical between runs.
umap_model = UMAP(n_neighbors=5, n_components=3, metric='cosine', random_state=42)
umap_embeddings = umap_model.fit_transform(topic_embeddings)

In [None]:
import plotly.express as px
import pandas as pd

# DataFrame with the reduced embeddings and the topic ids
df = pd.DataFrame(umap_embeddings, columns=['UMAP Dimension 1', 'UMAP Dimension 2', 'UMAP Dimension 3'])
df['Topic'] = topic_model.get_topic_info()['Topic'].values

# Interactive 3D figure with Plotly
fig = px.scatter_3d(df, x='UMAP Dimension 1', y='UMAP Dimension 2', z='UMAP Dimension 3',
                    text='Topic', title='Embeddings de los Tópicos Reducidos a 3D con UMAP',
                    width=1000, height=800)  # Figure size

# Show the figure
fig.show()



In [None]:
# Drop the first row, taken to be the outlier topic (-1)
# NOTE(review): confirm the model actually has an outlier topic; otherwise this drops topic 0.
new_topic_embeddings = topic_embeddings[1:]

In [None]:
# Pairwise cosine similarity between the remaining topic embeddings
similarities = cosine_similarity(new_topic_embeddings)

In [None]:
# Zero the diagonal in place so a topic is never paired with itself
np.fill_diagonal(similarities, 0)

In [None]:
# All (row, column) index pairs of `similarities`, from most to least similar.
# Because the matrix compares the embeddings with themselves, each pair of topics
# appears in both orientations, (a, b) and (b, a).
topic_pairs = np.dstack(np.unravel_index(np.argsort(similarities.ravel())[::-1], similarities.shape))[0]

In [None]:
# Show the 5 closest pairs of topics.
# `topic_pairs` lists every pair in both orientations, so only the a < b form is kept
# (this also leaves out the diagonal). Slicing copes with fewer than 5 pairs, where
# indexing a fixed number of entries would fail.
# NOTE(review): the ids are row positions of `similarities`; they match the topic ids
# only if the row removed earlier was the outlier topic — confirm.
n_pairs = 5
unique_pairs = [(a, b) for a, b in topic_pairs if a < b][:n_pairs]
for topic_id_1, topic_id_2 in unique_pairs:
    similarity_score = similarities[topic_id_1, topic_id_2]
    print(f"Topico {topic_id_1} y Topico {topic_id_2} tienen una similitud de: {similarity_score:.4f}")


In [None]:
topics_to_merge = [16, 17]

# Document ids stored with each topic selected for merging.
# The loop variable is not named `topic`: that name holds the selected topic id that
# other cells use as a dictionary key, and it used to be replaced by a dict here.
# NOTE(review): assumes every stored topic carries a 'docs' mapping keyed by document id;
# the Topic(...) built in this notebook does not set one — confirm.
id_docs_to_merge = [
    db_topic['docs'].keys()
    for db_topic in db_topics
    if db_topic['index'] in topics_to_merge
]

# Flatten into a single list of ids
list_id_docs_to_merge = [ item for sublist in id_docs_to_merge for item in sublist ]


In [None]:
index_name = 'news'

# Body of the `mget` request: one entry per document id to fetch (ids are cast to int)
body = {
    "docs": [{"_index": "news", "_id": int(doc_id)} for doc_id in list_id_docs_to_merge]
}

# Run the `mget` request
response = os_client.mget(body=body)


In [None]:
# Ids and texts of the fetched documents. Entries that mget reports as not found are
# skipped, the same check the other mget cell applies before reading '_source'.
idx_relativo = []
docs_input = []
for doc in response['docs']:
    if not doc.get('found'):
        continue
    idx_relativo.append(doc['_id'])
    docs_input.append(doc['_source']['news'])



In [None]:
# Number of document texts collected for the merge
len(docs_input)

In [None]:
# Merge the selected topics in the model
# NOTE(review): `docs_input` holds only the documents of the topics being merged;
# BERTopic's merge_topics presumably expects the documents the model was fitted on — confirm.
topic_model.merge_topics(docs_input, topics_to_merge)

In [None]:
def style_tags(tags):
    """Join the tags into one string, each padded with a space and separated by ' | '."""
    return ' | '.join(' {} '.format(tag) for tag in tags)

# One display row per stored news item
db_news = []
for doc in News.search().query().scan():
    fields = doc.to_dict()   # serialise once instead of once per field

    # keywords / entities may be missing from a document: fall back to a single empty tag.
    # Local names are used so the notebook-level `entities` is not overwritten.
    doc_keywords = fields.get('keywords', [""])
    doc_entities = fields.get('entities', [""])

    db_news.append([
        doc.meta.id,
        fields['title'],
        style_tags(doc_keywords),
        style_tags(doc_entities),
        fields['author'],
        fields['created_at'],
        fields['process'],
    ])

# Row positions of the documents, grouped by assigned topic
T = topic_model.get_document_info(data)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()

# Document ids and probabilities of the selected topic
# NOTE(review): `topic` must hold a topic id at this point — confirm no earlier cell rebound it.
docs_ids = []
docs_topic = []
doc_probs_x_topic = []
for doc_ID in tqdm(docs_per_topics[topic]):
    docs_ids.append(df_parquet.index[doc_ID])
    doc_probs_x_topic.append(probs[doc_ID])


In [None]:
# Probabilities of the documents of the selected topic
doc_probs_x_topic

In [None]:
# Replace the rows built by hand above with the result of the get_news() helper
db_news = get_news()

In [None]:
from collections import defaultdict

# Number of news items per calendar day (year-month-day only).
# An int-valued defaultdict counts directly; it used to be declared with `list`
# and initialised by hand on first sight of each date.
fechas_dict = defaultdict(int)

for registro in db_news:
    # NOTE(review): assumes the last field of each record is an ISO timestamp string — confirm against get_news()
    fecha_completa = registro[-1]
    fecha_solo_dia = fecha_completa.split('T')[0]  # Keep the date part only
    fechas_dict[fecha_solo_dia] += 1




In [None]:
# News count per day
fechas_dict

defaultdict(list, {'2024-04-01': 1000})