<a href="https://colab.research.google.com/github/gabrielfernandorey/ITBA-NLP/blob/main/ITBA_nlp01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Trabajo Práctico NLP - Detección de Tópicos y Clasificación
- ITBA 2024
- Alumno: Gabriel Rey
---

## MODELO

In [30]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from matplotlib import pyplot as plt
import os
import json
from datetime import datetime, date
from dateutil.parser import parse
from dotenv import load_dotenv

from NLP_tools import Cleaning_text, top_keywords, top_entities, get_topic_name, best_document, clean_all, topic_documents
from core.functions import *

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import unicodedata
from tqdm import tqdm

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [3]:
from opensearch_data_model import Topic, TopicKeyword, News, os_client, TOPIC_INDEX_NAME, NEWS_INDEX_NAME

In [4]:
from openai import OpenAI

### Inicializamos la base vectorial

In [5]:
# Initialise the OpenSearch indices used by this notebook (the captured output shows
# it skips initialisation for indices that already exist).
# NOTE(review): init_opensearch is not imported by name; presumably it comes from
# `from core.functions import *` — confirm.
init_opensearch()



El índice Topic ya existe. Saltando inicialización de base de datos.
El índice News ya existe. Saltando inicialización de base de datos.


### Path

In [6]:
# Load variables from a .env file (if present) into the process environment
load_dotenv()
# Data directory used when PATH_LOCAL is not defined
PATH_REMOTO='/content/ITBA-NLP/data/'
# Prefer the PATH_LOCAL environment variable; otherwise fall back to PATH_REMOTO
PATH=os.environ.get('PATH_LOCAL', PATH_REMOTO)
PATH

'C:/Users/gabri/OneDrive/Machine Learning/Github/ITBA-NLP/data/'

In [7]:
# Pick the source of the OpenAI API key: the environment when the local data path
# is in use, the Colab secret store otherwise.
running_locally = PATH == os.environ.get('PATH_LOCAL')
if not running_locally:
    from google.colab import userdata
    api_key = userdata.get('OPENAI_API_KEY')
else:
    api_key = os.environ.get('OPENAI_API_KEY')

client = OpenAI(api_key=api_key)

### Data de noticias original 

In [8]:
# Parquet file available for each data chunk
df_params = {'0_1000':'0_1000_data.parquet',
             '1000_2000':'1000_2000_data.parquet',
             '2000_3000':'2000_3000_data.parquet',
             'df_joined':'df_joined_2024-04-01 00_00_00.parquet'
            }

# The chunk to process is selected through the CHUNK environment variable.
# Fail early with an explicit message instead of an opaque `KeyError: None`
# when the variable is missing or misspelled.
chunk = os.environ.get('CHUNK')
if chunk not in df_params:
    raise ValueError(
        f"CHUNK must be one of {sorted(df_params)}; got {chunk!r}. "
        "Define it in the environment or in the .env file."
    )

df_parquet = pd.read_parquet(PATH+df_params[chunk])
data = list(df_parquet['in__text'])

# Total number of documents
len(data)

3104

In [None]:
# Load the entity vocabulary saved for the selected chunk
with open(PATH+f'modelos/entities_vocabulary{chunk}.json', 'r') as json_file:
    vocab_entities = json.load(json_file)

### StopWords

In [None]:
# Spanish stop words, read from the 'stopwords' column of the CSV file
# NOTE(review): SPANISH_STOPWORDS is not referenced in the visible cells — confirm it is still needed.
SPANISH_STOPWORDS = list(pd.read_csv(PATH+'spanish_stop_words.csv' )['stopwords'].values)

### Modelo

In [None]:
# Vectorizer later handed to BERTopic as its vectorizer model, using the entity
# vocabulary loaded above and unigrams + bigrams.
# NOTE(review): per the scikit-learn docs, max_df/min_df are ignored when a fixed
# `vocabulary` is supplied — confirm these two settings are intentional.
tfidf_vectorizer = TfidfVectorizer(
        tokenizer=None,
        max_df=0.9,
        min_df=0.1,
        ngram_range=(1, 2),
        vocabulary=vocab_entities,
        # max_features=100_000
)
#tfidf_vectorizer.fit(data)

Capas del modelo BERTopic

In [10]:
# Build each BERTopic layer explicitly and assemble them into `topic_model`.
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric='cosine', random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Tokenize topics
vectorizer_model = tfidf_vectorizer
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# Step 6 - (Optional) Fine-tune topic representations with a `bertopic.representation` model
# (created here, but its argument is commented out in the BERTopic call below)
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,              # Step 1 - Extract embeddings
  umap_model=umap_model,                        # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,                  # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,            # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                    # Step 5 - Extract topic words
  # representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
  # language='multilingual',                    # This is not used if embedding_model is used.
  verbose=True,
  # calculate_probabilities=True
)

In [None]:
def my_callback(stage, **kwargs):
    """Print the stage name, then one ``key: value`` line per keyword argument.

    Args:
        stage: Label printed after ``Stage: ``.
        **kwargs: Extra values, printed in the order they were passed.
    """
    lines = [f"Stage: {stage}"]
    lines.extend(f"{key}: {value}" for key, value in kwargs.items())
    print("\n".join(lines))

In [None]:
# Example call: prints the stage label only, since no extra keyword arguments are given
my_callback(stage="start_training")

### Entrenamiento

In [None]:
# Load a previously trained model with its saved topics and probabilities,
# or skip this cell and train in the next one
# NOTE(review): `ahora` is not used in the visible cells.
ahora = datetime.today()
topic_model = BERTopic.load(PATH+"modelos/bertopic_model_app")
topics = np.load(PATH+"modelos/topics_app.npy")
probs = np.load(PATH+"modelos/probs_app.npy")

In [11]:
# Train the topic model on all documents; `topics` and `probs` are indexed per document
# by the later cells
topics, probs = topic_model.fit_transform(data)

2024-07-08 11:45:26,030 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/97 [00:00<?, ?it/s]

2024-07-08 11:50:35,676 - BERTopic - Embedding - Completed ✓
2024-07-08 11:50:35,678 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-08 11:50:59,757 - BERTopic - Dimensionality - Completed ✓
2024-07-08 11:50:59,757 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-08 11:50:59,913 - BERTopic - Cluster - Completed ✓
2024-07-08 11:50:59,927 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-08 11:51:01,759 - BERTopic - Representation - Completed ✓


In [14]:
# Save the model together with the topics and probabilities returned by training
topic_model.save(PATH+"modelos/bertopic_model_app")
np.save(PATH+"modelos/topics_app.npy", topics)
np.save(PATH+"modelos/probs_app.npy", probs)



### Resultados

In [12]:
# Number of distinct topic ids in `topics` (the count includes the -1 topic)
print(f"Cantidad de tópicos {len(set(topics))} (incluye topico -1)")

Cantidad de tópicos 76 (incluye topico -1)


In [31]:
# Look up the documents assigned to one topic, listed from highest to lowest probability

T = topic_model.get_document_info(data)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()

topic = 70
# topic = np.random.randint(0, len(docs_per_topics)-1)

print("Ejemplo para tópico:", topic)

# One [document id, title, rounded probability] row per document of the topic
doc_probs_x_topic = [
    [df_parquet.index[doc], df_parquet.iloc[doc].in__title, round(probs[doc], 4)]
    for doc in docs_per_topics[topic]
]

df_query_1 = pd.DataFrame(doc_probs_x_topic)
print(len(df_query_1), "docs encontrados")
df_query_1.sort_values(2, ascending=False)

Ejemplo para tópico: 70
11 docs encontrados


Unnamed: 0,0,1,2
0,105638712,Pedro Castillo: Magistrado del TC votó a favor de anular arresto y proceso penal por el golpe de Estado,1.0
1,105594298,Podemos recurre al TC para que el Supremo investigue su querella contra el juez García Castellón por supuesto 'lawfare',1.0
2,105593952,Uno de los jueces que juzgará a Cristina por los Cuadernos de las Coimas le pide audiencia a Milei,1.0
3,105580987,"Susana Medina, la jueza entrerriana que quiere llegar a la Corte Suprema",1.0
4,105593793,Un senador de La Libertad Avanza rechazó la postulación de Ariel Lijo a la Corte Suprema,1.0
5,105558658,Las empresas de EEUU juegan fuerte contra la postulación de Lijo para la Corte,1.0
6,105580339,Paoltroni se opone a la designación de Lijo porque salvó a Insfrán en una causa vinculada al caso Ciccone,1.0
7,105594425,La Corte Suprema acumula demandas de gobernadores por más de $600.000 millones y preocupa a Milei,1.0
8,105591618,"Montenegro, sobre la candidatura de Lijo a la Corte: ""Está bien, es un tipo preparado""",1.0
9,105595372,Javier Milei: Papas fritas y jamoncito,1.0


### Palabras clave del topico

In [32]:
# Get the keywords (with their scores) of the selected topic
topic_keywords = topic_model.get_topic(topic)
print(topic_keywords)

[('corte', 0.029647818069535512), ('lijo', 0.02596069496028331), ('suprema', 0.0157537929430417), ('la', 0.014801340287951928), ('tribunal', 0.014216508490334208), ('jueces', 0.01398201336864679), ('que', 0.013643079164380269), ('el', 0.01348919625347898), ('juez', 0.012589202175765337), ('de', 0.012503295638684627)]


### Criterio de corte (umbral)
El criterio de corte utilizado para filtrar las noticias que pertenecen a un tópico es la media de las probabilidades menos un desvío estándar (media − 1 desvío estándar).

In [None]:
# Probabilities of the documents assigned to the selected topic.
# They are read straight from `probs`: when the notebook runs top to bottom,
# `doc_probs_x_topic` still holds [doc_id, title, probability] rows built by the
# search cell above, which cannot be averaged or plotted as a histogram.
topic_probs = np.asarray([probs[doc] for doc in docs_per_topics[topic]], dtype=float)

# Mean and standard deviation of those probabilities
mean = np.mean(topic_probs)
std_dev = np.std(topic_probs)

# Histogram of the probabilities
plt.hist(topic_probs, bins=10, edgecolor='black')

# Vertical lines for the mean and for the cut-off (mean minus one standard deviation)
plt.axvline(mean, color='r', linestyle='dashed', linewidth=1, label=f'Media: {mean:.2f}')
plt.axvline(mean - std_dev, color='b', linestyle='dashed', linewidth=1, label=f'-1 STD: {mean - std_dev:.2f}')


# Title, axis labels and legend
plt.title(f'Histograma de probabilidades topico: {topic}')
plt.xlabel('Valor')
plt.ylabel('Frecuencia')
plt.legend()

# Show the figure
plt.show()

#### Documentos mas representativos de un topico 

- [1]  Obtenido por el metodo del modelo

In [None]:
# Representative documents of the selected topic, as reported by the model itself
docs_representative = topic_model.get_representative_docs(topic=topic)
docs_representative

- [2] Obtenido por busqueda de probabilidad de documentos perteneciente al topico ( utilizando el umbral de corte )

In [None]:
# Row-styling helper
def color_rows(row, label, value):
    """Return one CSS style per cell of ``row``.

    Every cell gets ``'color: cyan'`` when ``row[label] >= value``; otherwise every
    cell gets an empty style string.
    """
    style = 'color: cyan' if row[label] >= value else ''
    return [style] * len(row)
    
# Group document positions by topic
T = topic_model.get_document_info(data)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()

# Ids and probabilities of the documents assigned to the selected topic
docs_ids = []
docs_topic = []
doc_probs_x_topic = []
for doc_ID in tqdm(docs_per_topics[topic]):
    docs_ids.append(df_parquet.index[doc_ID])
    doc_probs_x_topic.append(probs[doc_ID])

# Cut-off: mean minus one standard deviation of those probabilities
mean = np.mean(doc_probs_x_topic)
std_dev = np.std(doc_probs_x_topic)
threshold = mean - std_dev

# Fetch all those documents from the news index in a single multi-get request
index_name = 'news'
mget_query = {"docs": [{"_index": index_name, "_id": doc_id} for doc_id in docs_ids]}
response = os_client.mget(body=mget_query, index=index_name)

# Build one [id, title, probability, entities] row per document found.
# Entry i of the response is matched to entry i of docs_per_topics[topic].
for i, doc in enumerate(response['docs']):
    if not doc['found']:
        continue
    position = docs_per_topics[topic][i]
    idx = doc['_id']
    title = df_parquet.iloc[position].in__title
    prob_doc = probs[position]
    ent = doc['_source'].get('entities', [])  # documents without entities get an empty list
    docs_topic.append([idx, title, prob_doc, ent])

df_view = pd.DataFrame(
    docs_topic, columns=['indice', 'titulo', 'prob', 'entidades']
).sort_values('prob', ascending=False)
# Rows at or above the cut-off are shown in cyan
df_view.style.apply(lambda row: color_rows(row, 'prob', threshold), axis=1)


Nota: Los documentos más representativos obtenidos con el método `get_representative_docs` no coinciden con los que se obtienen al ordenar los documentos del tópico por probabilidad máxima.

- [3] Por similitud coseno entre el tópico y sus tres documentos más cercanos

In [None]:
# Load the document embeddings from a NumPy file, or run the next cell to compute them
docs_embedding = np.load(PATH+f"modelos/topic_embeddings_{chunk}.npy")

In [None]:
# Compute the embeddings of all documents with the topic model's embedding backend
docs_embedding = topic_model.embedding_model.embed(data)

In [None]:
# Save the document embeddings to a NumPy file (one file per chunk)
np.save(PATH+f"modelos/topic_embeddings_{chunk}.npy", docs_embedding)

In [None]:
# Cosine-similarity matrix between topic embeddings (rows) and document embeddings (columns)
sim_matrix = cosine_similarity(topic_model.topic_embeddings_, docs_embedding)
sim_matrix.shape

In [None]:
# Cosine similarity between the selected topic's embedding and each document assigned to it
# (topic embeddings are read at position topic + 1)
s_coseno = [
    cosine_similarity([topic_model.topic_embeddings_[topic + 1]], [docs_embedding[i]])[0][0]
    for i in docs_per_topics[topic]
]

# Positions within the topic's documents, from most to least similar
idx_coseno_sort = np.argsort(s_coseno)[::-1]

# Titles of the three most similar documents
for idx in idx_coseno_sort[:3]:
    print(idx, df_parquet.iloc[docs_per_topics[topic][idx]].in__title)

Nota: Del mismo modo que en el punto anterior, los documentos más cercanos al tópico no son exactamente los mismos que los hallados en el punto 1.

- [4] Primer documento mas cercano al embedding del topico

In [None]:
# Document with the highest cosine similarity to the selected topic, over all documents

# Row topic + 1 of sim_matrix is used for the selected topic
simil_docs_topic = sim_matrix[topic + 1].argmax()
print(f"Noticia de maxima similitud con el topico: {topic}")
print(f"Doc ID: {df_parquet.index[simil_docs_topic]}")
print(f"Titulo: {df_parquet.iloc[simil_docs_topic].in__title}")
print(f"Noticia: {data[simil_docs_topic][:80]}...")
best_doc = data[simil_docs_topic]

**Ojo:** comparar la selección del documento más cercano con las probabilidades de los documentos del tópico.

#### Keywords de solo un topico

In [None]:
# (keyword, score) pairs of the selected topic, wrapped as TopicKeyword objects;
# empty keyword strings are dropped
keywords = topic_model.topic_representations_[topic]
topic_keywords = [TopicKeyword(name=keyword, score=score) for keyword, score in keywords if keyword != '']
topic_keywords

- Nota: el umbral de corte para las keywords es la media de sus scores; se conservan las keywords del tópico cuyo score iguala o supera ese umbral.

In [None]:
# Scores of the topic keywords; their mean is the cut-off
freq_k = [name_score['score'] for name_score in topic_keywords]
umbral_k = np.array(freq_k).mean()
umbral_k

In [None]:
# Keep the keywords whose score reaches the cut-off, as a name -> score mapping
topic_keywords_top = {
    name_score['name']: name_score['score']
    for name_score in topic_keywords
    if name_score['score'] >= umbral_k
}

topic_keywords_top


#### Entidades de un Topico a partir de los n documentos mas cercanos al embedding del topico

In [None]:
# Entities of the first n_docs rows of df_view (the documents of the selected topic,
# sorted by probability)
n_docs = 5  # number of documents to inspect
# Select the column by name: positional access such as row[3] on a label-indexed
# Series is deprecated in recent pandas releases.
entities_topic = list(df_view['entidades'].iloc[:n_docs])

from collections import defaultdict

# Number of documents in which each entity appears (counted once per document)
document_frequencies = defaultdict(int)
for lista in entities_topic:
    for palabra in set(lista):
        document_frequencies[palabra] += 1

# Entities sorted by document frequency, highest first
sorted_frequencies = sorted(document_frequencies.items(), key=lambda item: item[1], reverse=True)

# Cut-off: mean document frequency (0.0 when no entity was found, instead of a NaN mean)
freq_e = [frequency for _, frequency in sorted_frequencies]
umbral_e = np.array(freq_e).mean() if freq_e else 0.0

# Keep at most 10 entities whose frequency reaches the cut-off
topic_entities_top = dict(
    [(entity, frequency) for entity, frequency in sorted_frequencies if frequency >= umbral_e][:10]
)

topic_entities_top
   

### Grabar todos los registros en Topic y actualizar en News

In [None]:
# Load the saved entities for the selected chunk, or skip this cell and process the next one
with open(PATH+f'modelos/entities{chunk}.json', 'r') as json_file:
    entities = json.load(json_file)

In [None]:
# Store every topic in the Topic index, skipping topic -1.
# The loop variable is `topic_id` so the notebook-wide `topic` selection is not overwritten.
for topic_id in topic_model.get_topics():
    if topic_id <= -1:
        continue

    topic_keywords_top  = top_keywords(topic_id, topic_model)
    topic_entities_top  = top_entities(topic_id, topic_model, docs_embedding, data, entities)
    # NOTE(review): two values are unpacked here, but the last cells of the notebook unpack
    # three values from topic_documents — confirm which signature is current.
    topic_documents_ids, threshold  = topic_documents(topic_id, topic_model, probs, df_parquet, data)

    # Keywords and entities are joined with spaces so the naming prompt receives
    # separate words instead of a single run-together string.
    topic_terms = ' '.join({**topic_keywords_top, **topic_entities_top})

    topic_doc = Topic(
        index = topic_id,
        name = get_topic_name(topic_terms, client),
        vector = list(topic_model.topic_embeddings_[topic_id + 1]),  # embeddings are read at position id + 1
        similarity_threshold = threshold,
        created_at = datetime.now(),
        to_date = parse('2024-04-02'),
        from_date = parse('2024-04-01'),
        keywords = topic_keywords_top,
        entities = topic_entities_top,
        best_doc = best_document(topic_id, topic_model, docs_embedding, data),
        docs = topic_documents_ids
    )

    topic_doc.save()

##### Actualizar datos en News

In [None]:
# Flag as processed every news record that is still marked as unprocessed
index_name = 'news'
search_query = {
    'query': {'match': {'process': False}},
    'size': 10000
}

# Run the search
response = os_client.search(body=search_query, index=index_name)

# The same partial document is sent for every hit
update_body = {"doc": {"process": True}}
for reg in response['hits']['hits']:
    doc_id = reg['_id']
    os_client.update(index=index_name, id=doc_id, body=update_body)

### Recuperar todos los topicos

In [None]:
index_name = 'topic'

# Read every stored topic as a dict and print its index and name
db_topics = []
for doc in Topic.search().query().scan():
    topic_record = doc.to_dict()
    db_topics.append(topic_record)
    print(topic_record['index'], topic_record['name'])

### Recuperar documento mas cercano a un topico

In [None]:
index_name = 'topic'
# Match the topic whose 'index' field equals 6 (change field and value to search for something else)
search_query = {'query': {'match': {'index': 6}}}

# Run the search
response = os_client.search(body=search_query, index=index_name)

# Stored fields of the first hit
texto = response['hits']['hits'][0]['_source']

# Print the topic name and its stored best document
print(f"Topico: {texto['name']}")
print("\n"+ texto['best_doc'])

### Nuevo documento

In [None]:
# Text of the new document to classify.
# NOTE: do not take the text from `response` here. After the previous cell it holds a
# hit from the Topic index, and the Topic documents written by this notebook are not
# given a 'news' field, so response[...]['_source']['news'] raises KeyError.
new_doc = "Bitcoin esta en alza"
new_doc_embedding = topic_model.embedding_model.embed(new_doc)

# Ask the Topic index for the stored topic whose vector is closest to the new document
query = {
    "size": 1,
    "query": {
        "knn": {
            "vector": {
                "vector": list(new_doc_embedding),
                "k" : 3
            }
        }
    }
}
response = os_client.search(index='topic', body=query)

# Name and stored similarity threshold of the closest topic
closest_topic = response['hits']['hits'][0]['_source']
print(f"Topico: {closest_topic['name']}")
print(f"Estimacion: {closest_topic['similarity_threshold']}")

### Busqueda de un documento por su indice, topico asociado, keywords, entities.

In [None]:

# Fetch one news document by its id
index_name = 'news'
search_query = {'query': {'match': {'_id': '105640350'}}}

# Run the search
response = os_client.search(body=search_query, index=index_name)
news_text = response['hits']['hits'][0]['_source']['news']
print(f"Texto de Noticia: {news_text[:200]}...\n")

new_doc_embedding = topic_model.embedding_model.embed(news_text)

# Index and vector field queried for the nearest topic
index_name = 'topic'
vector_field = 'vector'

# KNN query for the most similar stored embedding
knn_query = {
    "size": 1,  # number of results to return
    "query": {
        "knn": {
            vector_field: {
                "vector": new_doc_embedding,
                "k": 3  # number of nearest neighbours
            }
        }
    }
}

# Run the search
response_topic = os_client.search(index=index_name, body=knn_query)

# Report the closest topic, if any
if response_topic['hits']['total']['value'] <= 0:
    print("No se encontró un tópico cercano.")
else:
    closest_topic = response_topic['hits']['hits'][0]['_source']
    print(f"El nuevo documento pertenece al tópico: {closest_topic['index']}")
    print(closest_topic['name'])
    print(f"Estimacion: {closest_topic['similarity_threshold']}")
    print(f"Keywords del topico: {closest_topic['keywords']}")
    print(f"Entidades del topico: {closest_topic['entities']}")

### Agrupamiento de topicos

In [None]:
index_name = 'topic'

# Read every stored topic as a dict and print its index and name
db_topics = []
for doc in Topic.search().query().scan():
    topic_record = doc.to_dict()
    db_topics.append(topic_record)
    print(topic_record['index'], topic_record['name'])

#### Agrupando por embeddings cercanos

In [None]:
# Embeddings of all topics as stored on the fitted model
topic_embeddings = topic_model.topic_embeddings_

In [None]:
# Reduce the topic embeddings to 3 dimensions with UMAP.
# random_state is fixed (same seed as the UMAP layer of the topic model) so the
# projection, and therefore the plot, is reproducible across runs.
# Note: this rebinds the name `umap_model` that was used when building the topic model.
umap_model = UMAP(n_neighbors=5, n_components=3, metric='cosine', random_state=42)
umap_embeddings = umap_model.fit_transform(topic_embeddings)

In [None]:
import plotly.express as px
import pandas as pd

# DataFrame with the reduced embeddings and the topic ids
df = pd.DataFrame(umap_embeddings, columns=['UMAP Dimension 1', 'UMAP Dimension 2', 'UMAP Dimension 3'])
df['Topic'] = topic_model.get_topic_info()['Topic'].values

# Interactive 3D scatter plot built with Plotly; each point is labelled with its topic id
fig = px.scatter_3d(df, x='UMAP Dimension 1', y='UMAP Dimension 2', z='UMAP Dimension 3',
                    text='Topic', title='Embeddings de los Tópicos Reducidos a 3D con UMAP',
                    width=1000, height=800)  # figure size

# Show the figure
fig.show()



In [None]:
# Drop the first row, which holds topic -1, so that row i corresponds to topic i
new_topic_embeddings = topic_embeddings[1:]

In [None]:
# Pairwise cosine similarity between the topic embeddings
similarities = cosine_similarity(new_topic_embeddings)

In [None]:
# Zero the diagonal in place so a topic is never paired with itself
np.fill_diagonal(similarities, 0)

In [None]:
# (row, column) index pairs of `similarities`, ordered from highest to lowest similarity
topic_pairs = np.dstack(np.unravel_index(np.argsort(similarities.ravel())[::-1], similarities.shape))[0]

In [None]:
# Show the 5 closest pairs of topics.
# `similarities` compares the topic embeddings with themselves, so every pair shows up
# twice in `topic_pairs` ((a, b) and (b, a)). Only the a < b ordering is kept, which
# prints each pair once and also leaves out the zeroed diagonal.
n_pairs = 5
shown = 0
for topic_id_1, topic_id_2 in topic_pairs:
    if topic_id_1 >= topic_id_2:
        continue
    similarity_score = similarities[topic_id_1, topic_id_2]
    print(f"Topico {topic_id_1} y Topico {topic_id_2} tienen una similitud de: {similarity_score:.4f}")
    shown += 1
    if shown == n_pairs:
        break


In [None]:
# Topics to merge
topics_to_merge = [16, 17]

# Document ids stored with each of the topics to merge.
# The iteration variable is not named `topic`, so the notebook-wide topic id used by
# other cells is not replaced by a dict.
id_docs_to_merge = [
    topic_record['docs'].keys()
    for topic_record in db_topics
    if topic_record['index'] in topics_to_merge
]

# Flatten into a single list of document ids
list_id_docs_to_merge = [item for sublist in id_docs_to_merge for item in sublist]


In [None]:
index_name = 'news'

# One entry per document id for the `mget` request
requested_docs = [{"_index": index_name, "_id": int(doc_id)} for doc_id in list_id_docs_to_merge]
body = {"docs": requested_docs}

# Send the `mget` request
response = os_client.mget(body=body)


In [None]:
# Collect id and text of every requested document that exists in the index.
# Entries that were not found carry no '_source', so they are skipped instead of
# raising KeyError.
idx_relativo = []
docs_input = []
for doc in response['docs']:
    if not doc.get('found'):
        continue
    idx_relativo.append(doc['_id'])
    docs_input.append(doc['_source']['news'])

    

In [None]:
# Number of document texts collected for the topics to merge
len(docs_input)

In [None]:
# merge_topics expects the documents the model was fitted on, in the same order
# (`data`), not only the documents of the topics being merged.
# NOTE(review): `topics` and `docs_per_topics` computed before this call presumably no
# longer reflect the merged assignment — recompute them before reuse.
topic_model.merge_topics(data, topics_to_merge)

In [None]:
# Retrieve the news already flagged as processed (process == True), up to 10,000 hits
index_name = 'news'
search_query = {
    'query': {
        'match': {
            'process': True
        }
    },
    'size': 10000
}

# Run the search
response = os_client.search( body=search_query, index=index_name )

# One row per hit. Optional fields default to [''] when they are absent. Fields are read
# through `source` instead of notebook-level names, so variables such as `entities`
# (loaded earlier from JSON) are not overwritten.
db_news = []
for reg in response['hits']['hits']:
    source = reg['_source']
    db_news.append([
        reg['_id'],
        source['title'],
        source['news'],
        source.get('keywords', ['']),
        source.get('entities', ['']),
        source['created_at'],
    ])

df_news = pd.DataFrame(db_news , columns=["indice", "titulo", "noticia", "keywords", "entidades", "creado"])

In [None]:
# Topic id used by the following cells
topic=10

In [None]:
# Group document positions by topic
T = topic_model.get_document_info(data)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()

# Map document id -> probability for the documents of the selected topic.
# `doc_idx` is a position in `data`/`df_parquet`, so the id is taken from
# df_parquet.index (the ids used to fetch news earlier in the notebook) rather than from
# the row order of the search results in df_news, which need not match that position.
# Ids are stored as strings, like the '_id' values held in df_news['indice'].
docs_IDs = {}
doc_probs_x_topic = []
for doc_idx in docs_per_topics[topic]:
    docs_IDs[str(df_parquet.index[doc_idx])] = probs[doc_idx]
    doc_probs_x_topic.append(probs[doc_idx])

# Cut-off: mean minus one standard deviation
mean = np.mean(doc_probs_x_topic)
std_dev = np.std(doc_probs_x_topic)
threshold = mean - std_dev

# Keep the documents at or above the cut-off.
# (The name `filter` shadows the builtin; it is kept because the next cell reads it.)
filter = {k: v for k, v in docs_IDs.items() if v >= threshold}

In [None]:
# Sort the selected documents from highest to lowest probability
ids_filter_sort = dict(sorted(filter.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Display the id -> probability mapping, sorted
ids_filter_sort

In [None]:
# Title of each selected document, in the same order as ids_filter_sort
title_filter_sort = []
for idx in ids_filter_sort.keys():
    matching_rows = df_news.loc[df_news['indice'] == idx]
    # first matching row, second column ("titulo")
    title_filter_sort.append(matching_rows.values[0][1])



In [None]:
# Display the selected titles
title_filter_sort 

In [None]:
# All selected titles joined into one space-separated string
' '.join(title_filter_sort)

In [None]:
# Preview only the first rows: the frame holds full news texts and the notebook sets
# display.max_colwidth to None, so displaying it whole produces a very large output.
df_news.head()

In [None]:

# Document ids, titles and threshold for the selected topic, computed by the helper
# imported from NLP_tools.
# NOTE(review): three values are unpacked here and df_news is passed, while the cell that
# saves the topics unpacks two values and passes df_parquet — confirm which usage is current.
x_ids, x_titles, umbral = topic_documents(topic, topic_model, probs, df_news, data)

In [None]:
# Display the titles returned by topic_documents
x_titles

In [None]:
# Display the document ids returned by topic_documents
x_ids