<a href="https://colab.research.google.com/github/gabrielfernandorey/ITBA-NLP/blob/main/ITBA_nlp01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Trabajo Práctico NLP - Detección de tópicos y clasificación
- ITBA 2024
- Alumno: Gabriel Rey
---

## Merged models

In [None]:
import warnings
warnings.filterwarnings('ignore')

import json
import os
from collections import defaultdict
from datetime import datetime, date
from datetime import timedelta

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
from dateutil.parser import parse
from dotenv import load_dotenv
from matplotlib import pyplot as plt

from NLP_tools import Cleaning_text, top_keywords, top_entities, get_topic_name, best_document, clean_all, topic_documents
from core.functions import *

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [None]:
from opensearch_data_model import Topic, TopicKeyword, News, os_client, TOPIC_INDEX_NAME, NEWS_INDEX_NAME
from opensearch_io import init_opensearch, get_news
from opensearchpy import helpers

from openai import OpenAI

### Inicializamos la base vectorial

In [None]:
# Initialise the vector store through the project helper imported from opensearch_io.
# NOTE(review): presumably creates/opens the OpenSearch indices used below — confirm in opensearch_io.
init_opensearch()

### Path

In [None]:
# Read the .env file (if present) and resolve the data directory:
# PATH_LOCAL wins when it is defined, otherwise the Colab checkout path is used.
load_dotenv()
PATH_REMOTO = '/content/ITBA-NLP/data/'
PATH = os.getenv('PATH_LOCAL', PATH_REMOTO)
PATH

In [None]:
# When PATH did not come from PATH_LOCAL we are on Colab: take the key from the
# Colab secrets store. Otherwise take it from the local environment.
if os.environ.get('PATH_LOCAL') != PATH:
    from google.colab import userdata
    client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
else:
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

### Cargamos noticias de dos chunks

In [None]:
# Parquet test batches available under PATH, keyed by chunk label
df_params = {
    '0_1000': '0_1000_data.parquet',
    '1000_2000': '1000_2000_data.parquet',
    '2000_3000': '2000_3000_data.parquet',
    'df_joined': 'df_joined_2024-04-01 00_00_00.parquet',
}

# First chunk: full frame plus the raw article texts
chunk_1 = '0_1000'
df_parquet_1 = pd.read_parquet(f"{PATH}{df_params[chunk_1]}")
data_1 = list(df_parquet_1['in__text'])

# Second chunk, loaded the same way
chunk_2 = '1000_2000'
df_parquet_2 = pd.read_parquet(f"{PATH}{df_params[chunk_2]}")
data_2 = list(df_parquet_2['in__text'])

In [None]:
# Stack both chunks into a single frame (chunk 1 rows first);
# ignore_index=True renumbers the rows consecutively from 0.
df_parquet = pd.concat([df_parquet_1, df_parquet_2], ignore_index=True)

### Modelo 1

In [None]:
# Load the BERTopic model trained on the first chunk

topic_model_1 = BERTopic.load(PATH+f"modelos/bertopic_model_{chunk_1}")
# Disabled: loading of the saved topic assignments / probabilities
#topics_1 = np.load(PATH+f"modelos/topics_{chunk}.npy")
#probs_1 = np.load(PATH+f"modelos/probs_{chunk}.npy")

# Load the document embeddings saved for the first chunk
docs_embedding_1 = np.load(PATH+f"modelos/docs_embeddings_{chunk_1}.npy")

### Modelo 2

In [None]:
# Load the BERTopic model trained on the second chunk.
# The chunk label comes from `chunk_2` (defined when the parquet files were read);
# the redundant, unused `chunk` variable that duplicated it was removed.
topic_model_2 = BERTopic.load(PATH+f"modelos/bertopic_model_{chunk_2}")
# Disabled: loading of the saved topic assignments / probabilities
#topics_2 = np.load(PATH+f"modelos/topics_{chunk_2}.npy")
#probs_2 = np.load(PATH+f"modelos/probs_{chunk_2}.npy")

# Load the document embeddings saved for the second chunk
docs_embedding_2 = np.load(PATH+f"modelos/docs_embeddings_{chunk_2}.npy")

### Merge

In [None]:
# Combine both models into one. The first model in the list acts as the base.
# NOTE(review): per the BERTopic docs, topics of later models are appended only when
# they are dissimilar enough from the base topics — confirm the default threshold.
merged_model = BERTopic.merge_models([topic_model_1, topic_model_2])


## Resultados

In [None]:
# Number of rows in each model's topic-info table (one row per topic)
n_topics_1 = len(topic_model_1.get_topic_info())
n_topics_2 = len(topic_model_2.get_topic_info())
print(f"Cantidad de tópicos modelo 1: {n_topics_1} -- del 0 al {n_topics_1-1} (incluye topico -1)")
print(f"Cantidad de tópicos modelo 2: {n_topics_2} -- del 0 al {n_topics_2-1} (incluye topico -1)")

In [None]:
# Same report for the merged model
n_topics_merged = len(merged_model.get_topic_info())
print(f"Cantidad de tópicos modelo merge: {n_topics_merged} -- del 0 al {n_topics_merged-1} (incluye topico -1)")

### Topicos de ambos modelos

In [None]:
# Generated topic labels of model 1
topic_model_1.topic_labels_

In [None]:
# Generated topic labels of model 2
topic_model_2.topic_labels_

Nota: cada modelo puede numerar los tópicos en un orden distinto; al fusionarlos, el orden del modelo 1 se mantiene en el modelo fusionado y los tópicos nuevos se agregan al final.

In [None]:
# Generated topic labels of the merged model
merged_model.topic_labels_

#### Comparamos modelos ordenados por topicos

In [None]:
# Per-topic document counts for model 1 and for the merged model
topic_freq_1 = topic_model_1.get_topic_freq()
topic_freq_m = merged_model.get_topic_freq()

# Order both tables by topic id and give each count column a distinct name
df1 = (
    topic_freq_1
    .sort_values(by='Topic')
    .reset_index(drop=True)
    .rename(columns={'Count': 'Count1'})
)
dfm = (
    topic_freq_m
    .sort_values(by='Topic')
    .reset_index(drop=True)
    .rename(columns={'Count': 'Merged'})
)

# Outer join keeps topics that appear in only one of the two tables
df_combined = df1.merge(dfm, on='Topic', how='outer')

# Count2 = Merged - Count1, i.e. what the merge added; a missing count is treated as 0
df_combined['Count2'] = df_combined['Merged'].fillna(0).sub(df_combined['Count1'].fillna(0))

# Final column order for display
df_combined = df_combined[['Topic', 'Count1', 'Count2', 'Merged']]
df_combined



Nota: podemos verificar qué tópicos se agregaron y cuáles de los tópicos existentes incorporaron nuevos documentos (Count2 > 0) y cuáles no.

In [None]:
db_news = get_news()

# Count how many news records fall on each calendar day (YYYY-MM-DD).
# The factory is int (not list): the mapping stores counters, so a missing
# date starts at 0 and no explicit membership test is needed.
fechas_dict = defaultdict(int)

for registro in db_news:
    # The timestamp is the last field of each record (it is the "creado" column
    # when these records are later loaded into df_news).
    fecha_completa = registro[-1]
    fecha_solo_dia = fecha_completa.split('T')[0]  # keep only the date part before 'T'
    fechas_dict[fecha_solo_dia] += 1

In [None]:
# NOTE(review): `from_date` is first assigned in the following cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError.
from_date

In [None]:
# One (from, to) pair per topic of model 1: every topic starts on 2024-04-03
# (kept as a string) and ends one day later (kept as a datetime).
from_date = ['2024-04-03'] * len(topic_model_1.get_topics())
to_date = [datetime.strptime('2024-04-03', '%Y-%m-%d') + timedelta(days=1)] * len(from_date)

In [None]:
# Show the end dates built in the previous cell
to_date

### Analisis de documentos fusionados en un topico

In [None]:
# Model 1: documents assigned to `topic`, with their titles and probabilities.
topic = 12
docs_per_topics_1 = [i for i, x in enumerate(topic_model_1.topics_) if x == topic]
probs_1 = topic_model_1.probabilities_[docs_per_topics_1]

# Build the frame column by column so each column keeps its own dtype.
# (Stacking ids, titles and probabilities with np.hstack turned every value into a string.)
pd.DataFrame({
    'ID doc': docs_per_topics_1,
    'title': df_parquet_1['in__title'].iloc[docs_per_topics_1].tolist(),
    'probs': np.asarray(probs_1).ravel(),
})

In [None]:
# Merged model: documents assigned to `topic`, with their titles.
topic = 12
docs_per_topics_m = [i for i, x in enumerate(merged_model.topics_) if x == topic]

# Positions refer to the concatenated frame (chunk 1 rows first, then chunk 2).
# Built column by column so 'ID doc' stays numeric instead of being coerced to
# a string by np.hstack.
pd.DataFrame({
    'ID doc': docs_per_topics_m,
    'title': df_parquet['in__title'].iloc[docs_per_topics_m].tolist(),
})

In [None]:
# Model 2: positions of the documents assigned to `topic` and their probabilities.
topic = 12
docs_per_topics_2 = [i for i, x in enumerate(topic_model_2.topics_) if x == topic]
# Fix: the probabilities must come from model 2 itself; the previous code indexed
# model 1's probability array with model-2 document positions.
probs_2 = topic_model_2.probabilities_[docs_per_topics_2]
docs_per_topics_2

In [None]:
# Spot check: title stored at position 58 of the second chunk
df_parquet_2.iloc[58]["in__title"]

In [None]:
# Spot check on the concatenated frame: position 1058.
# NOTE(review): this equals position 58 of chunk 2 only if chunk 1 holds exactly 1000 rows — confirm.
df_parquet.iloc[1058]["in__title"]

### Recuperar todos los topicos y sus etiquetas generadas por el modelo

In [None]:
# Titles of the documents the merged model assigned to topic 10.
# Positions 0-1000 belong to the first chunk, 1000-2000 to the second chunk.
docs = [pos for pos, assigned in enumerate(merged_model.topics_) if assigned == 10]
for pos, titulo in zip(docs, df_parquet['in__title'].iloc[docs]):
    print(pos, titulo)

### Buscar topicos ingresando un texto en el modelo merged

In [None]:
# Free-text search over the merged model's topics; the result is displayed as returned by find_topics
topic_res = merged_model.find_topics("cambio climatico")
topic_res

### Verificacion de documentos con topico -1, recuperados en topicos positivos luego de fusion

In [None]:
# Documents that model 1 left without a topic (topic id -1)
docs = [pos for pos, assigned in enumerate(topic_model_1.topics_) if assigned == -1]
print(f"Cantidad de docs sin topico: {len(docs)} \n")

# `a` keeps those positions for the later comparison against the merged model
a = list(docs)
for pos in a:
    print(pos, df_parquet_1.iloc[pos]['in__title'])

In [None]:
# Documents that the merged model left without a topic (topic id -1)
docs = [pos for pos, assigned in enumerate(merged_model.topics_) if assigned == -1]
print(f"Cantidad de docs sin topico: {len(docs)} \n")

# `m` keeps those positions (they refer to the concatenated frame)
m = list(docs)
for pos in m:
    print(pos, df_parquet.iloc[pos]['in__title'])

In [None]:
# Turn both position lists into column vectors
a = np.array(a).reshape(-1,1)
m = np.array(m).reshape(-1,1)

# Print every model-1 outlier position that is no longer an outlier in the merged model
posiciones_m = set(m.ravel().tolist())
for fila in a:
    if fila[0] not in posiciones_m:
        print(fila)


In [None]:
# Number of model-1 outlier positions collected in `a`
len(a)

In [None]:
# First outlier position of the merged model
m[0]

In [None]:
# Repeat of the model-1 outlier listing. It also reassigns `a` to a plain list
# of positions, replacing whatever `a` held before this cell ran.
docs = [i for i, x in enumerate(topic_model_1.topics_) if x == -1]
print(f"Cantidad de docs sin topico: {len(docs)} \n")
a = []
for i in docs:
    a.append(i)
    print(i, df_parquet_1.iloc[i]['in__title'])

In [None]:
# Length of model 1's stored probabilities_ attribute
len(topic_model_1.probabilities_)

In [None]:
# Topic ids known to model 1 (keys of the get_topics() mapping)
topics_1 = topic_model_1.get_topics().keys()
topics_1

In [None]:
# Topic ids known to the merged model (keys of the get_topics() mapping)
topics_m = merged_model.get_topics().keys()
topics_m

### Modificar news 

In [None]:
# Fetch up to 1000 documents from the 'topic' index and keep, for each one,
# the tuple (index, from_date, to_date).
index_name = 'topic'
query = {"size": 1000, "query": {"match_all": {}}}

response = os_client.search(index=index_name, body=query)
topics = [
    (fuente['index'], fuente['from_date'], fuente['to_date'])
    for fuente in (hit['_source'] for hit in response['hits']['hits'])
]
topics

### Obtenemos los topicos del indice topic de Opensearch filtrados por fecha

In [None]:
# Topics returned by the project helper for the selected date; show (index, name)
# of those whose end date (first 10 characters of to_date) is not after that date.
date_str = '2024-04-02'
topics = get_topics_opensearch(date_filter=date_str)
sorted((row['index'], row['name']) for row in topics if row['to_date'][:10] <= date_str)

In [None]:
# db_topics  : one summary row per topic (displayed as a table below)
# data_topics: topic index -> full payload (name, best document, entities, keywords, threshold, vector)
db_topics = []
data_topics = {}
for reg in topics:
    umbral = reg['similarity_threshold']
    db_topics.append([
        reg['index'],
        reg['name'],
        round(umbral, 4),
        format_date(reg['created_at']),
        reg['from_date'][:10],   # keep only YYYY-MM-DD
        reg['to_date'][:10],
        reg['title_best_doc'],
        reg['id_best_doc'],
    ])
    data_topics[reg['index']] = [
        reg['name'],
        reg['title_best_doc'],
        reg['best_doc'],
        reg['entities'],
        reg['keywords'],
        umbral,
        reg['vector'],
    ]

# Order the rows (by topic index first) and show them as a DataFrame
db_topics.sort()
columnas_topics = ["indice", "nombre", "umbral", "creado", "desde", "hasta", "titulo noticia mas cercana", "id noticia"]
topics_df = pd.DataFrame(db_topics, columns=columnas_topics)
topics_df

### Buscar en news opensearch las 20 noticias mas cercanas a un topico dado

In [None]:
topic = 0

# Step 1: fetch the topic document(s) whose 'index' field equals `topic`
query = {
    "size": 1000,
    "query": {"bool": {"must": [{"term": {"index": topic}}]}},
}
response = os_client.search(index='topic', body=query)

topic_hits = response['hits']['hits']
name = [hit['_source']['name'] for hit in topic_hits]
embedding = [hit['_source']['vector'] for hit in topic_hits]

# Step 2: vector (k-NN) search on the 'news' index using the first topic embedding
query = {
    "size": 20,                        # number of results returned
    "_source": ["title", "vector"],    # fields to retrieve
    "query": {
        "knn": {
            "vector": {                # field that stores the embeddings
                "vector": embedding[0],
                "k": 50,
            }
        }
    },
}
response = os_client.search(index='news', body=query)

# Step 3: print each hit and collect its embedding
results = response['hits']['hits']
for result in results:
    print(f"ID: {result['_id']}, Score: {result['_score']}, {result['_source']['title']}")
embeddings = [result['_source']['vector'] for result in results]


In [None]:
# Document embeddings saved for chunk 0_1000
docs_embedding = np.load(PATH+f"modelos/docs_embeddings_0_1000.npy")

In [None]:
# Similarity of every document embedding against the topic embedding(s), as a 1-D array
matrix = cosine_similarity(docs_embedding, embedding)
similarities = matrix.ravel()

# Positions of the 10 highest similarities, best first
top_10_indices = np.argsort(similarities)[::-1][:10]

print("Los 10 índices de embeddings más cercanos son:", top_10_indices)

In [None]:
# First 100 characters of the title of each of the 10 closest documents
for titulo in df_parquet_1["in__title"].iloc[top_10_indices]:
    print(titulo[:100])

In [None]:
# Reload the topics for 2024-04-02 through the project helper
topics = get_topics_opensearch(date_filter='2024-04-02')

In [None]:

# Start from empty containers: without this reset the loop appended to the
# db_topics list built by an earlier cell, so every topic appeared twice and
# each re-run of the cell added further duplicates.
db_topics = []
data_topics = {}

for reg in topics:
    index = reg['index']
    name = reg['name']
    similarity_threshold = reg['similarity_threshold']
    create_at = format_date(reg['created_at'])
    from_date = reg['from_date'][:10]   # keep only YYYY-MM-DD
    to_date = reg['to_date'][:10]
    title_best_doc = reg['title_best_doc']
    id_best_doc = reg['id_best_doc']

    # Summary row for tabular display
    db_topics.append([index, name, round(similarity_threshold, 4), create_at, from_date, to_date, title_best_doc, id_best_doc])
    # Full payload of the topic, keyed by topic index
    data_topics[index] = [name,
                        title_best_doc,
                        reg['best_doc'],
                        reg['entities'],
                        reg['keywords'],
                        similarity_threshold,
                        reg['vector']
                        ]
    

In [None]:
# Call the project helper without a date filter and display what it returns
res = get_topics_opensearch()
res

### Prueba

In [None]:
# Load the BERTopic model saved for chunk 0_1000 under the generic name `topic_model`
topic_model = BERTopic.load(PATH+f"modelos/bertopic_model_0_1000")

In [None]:
# Document embeddings saved for chunk 0_1000 (same file loaded earlier in the notebook)
docs_embedding = np.load(PATH+f"modelos/docs_embeddings_0_1000.npy")

In [None]:
# Fetch news through the project helper, passing the date '2024-04-01', and show the first record
db_news = get_news( '2024-04-01' )
db_news[0]

In [None]:
# Tabular view of the news records; the column order follows the record layout
columnas_news = ["indice", "titulo", "noticia", "keywords", "entidades", "creado"]
df_news = pd.DataFrame(db_news, columns=columnas_news)

# Article bodies and per-article entities as plain lists
data = df_news['noticia'].tolist()
entities = df_news['entidades'].tolist()

In [None]:
#topics, probs = topic_model.transform(data)

In [None]:
# Load the topic assignments and probabilities stored on disk for this chunk.
# The np.save calls that wrote them are kept disabled.
chunk='0_1000'
#np.save(PATH+f"modelos/topics_{chunk}.npy", topics)
#np.save(PATH+f"modelos/probs_{chunk}.npy", probs)
topics = np.load(PATH+f"modelos/topics_{chunk}.npy")
probs = np.load(PATH+f"modelos/probs_{chunk}.npy")


### Desarrollo

In [None]:
# Inline prototype of the entity-extraction logic for a single topic.
topic_id = 4
n_entities=10

# Positions of all documents assigned to the topic
topic_docs_idx = [i for i, (_, topic) in enumerate(zip(list(df_news.index), topics)) if topic == topic_id]
df_data = pd.DataFrame(np.array(topic_docs_idx).reshape(-1,1), columns=["idx"])

# Cosine similarity between the topic embedding and each document of the topic.
# NOTE(review): the `topic_id + 1` offset assumes row 0 of topic_embeddings_ is the outlier topic -1 — confirm.
s_coseno = []
for i in topic_docs_idx:
    s_coseno.append(cosine_similarity([topic_model.topic_embeddings_[topic_id + 1]], [docs_embedding[i]])[0][0])

df_data['similitud'] = s_coseno
# The mean similarity is the cut-off
threshold = df_data['similitud'].mean()

# Keep only documents strictly above the cut-off, most similar first
df_filtered = df_data[df_data["similitud"] > threshold].sort_values("similitud", ascending=False)

# Entities of the retained documents (at most n_entities per document)
entities_topic = []
for doc in list(df_filtered["idx"]):
    entities_topic.append(entities[doc][:n_entities])

# Document frequency of each entity: in how many documents it appears
document_frequencies = defaultdict(int)

# Each document counts an entity once (duplicates collapse in the set)
for lista in entities_topic:
    unique_words = set(lista)
    for palabra in unique_words:
        document_frequencies[palabra] += 1

# Entities ordered by document frequency, highest first
sorted_frequencies = sorted(document_frequencies.items(), key=lambda item: item[1], reverse=True)

# Cut-off for entities: the mean document frequency
freq_e = [item[1] for item in sorted_frequencies]
umbral_e = np.mean(freq_e)

# Keep the first 10 entities whose frequency reaches the cut-off
topic_entities_top = {}
c=0
for idx in range(len(sorted_frequencies)):
    if sorted_frequencies[idx][1] >= umbral_e:
        if c != 10:
            topic_entities_top[sorted_frequencies[idx][0]] = sorted_frequencies[idx][1]
        else:
            break
        c += 1


# Show the topic label, the entity cut-off and the selected entities
print(topic_model.topic_labels_[topic_id])
print(umbral_e)
print(topic_entities_top)



In [None]:
# Run the function version of the entity extraction for topic 0.
# NOTE(review): `top_entities_` is defined in a later cell of this notebook, so on a
# fresh kernel this call fails unless that definition cell is executed first.
topic = 0
data_news = list(df_news.index)
topic_entities_top = top_entities_(topic, topic_model, topics, docs_embedding, data_news, entities)
topic_entities_top

In [None]:
# Attach the news title to every document of the topic and show them, most similar first
df_data['titulo'] = df_news['titulo'].iloc[topic_docs_idx].tolist()
df_data.sort_values("similitud", ascending=False)

In [None]:
# Mean similarity of the topic's documents (the value used as cut-off)
df_data['similitud'].mean()

In [None]:
# Positions of the documents assigned to topic 0; overwrites the earlier topic_docs_idx
topic_docs_idx = [i for i, (doc, topic) in enumerate(zip(list(df_news["indice"]), topics)) if topic == 0]

In [None]:
def top_entities_(topic_id: int, topic_model: object, topics: list, docs_embedding, data_news, entities: list, n_entities=10, top_n=10):
    """
    Return the most representative entities of a topic.

    The entities are taken from the documents of the topic whose cosine similarity
    to the topic embedding is at least the mean similarity of the topic's documents.
    Each entity is scored by the number of those documents it appears in, and only
    entities whose score reaches the mean score are kept.

    topic_id        : id of the topic
    topic_model     : topic model; only its `topic_embeddings_` attribute is read
    topics          : topic id assigned to each document, by position
    docs_embedding  : embedding of each document, by position
    data_news       : sequence paired with `topics`; only its length matters
                      (positions beyond the shorter of the two are ignored)
    entities        : entities of each document, by position
    n_entities      : number of entities taken from each document
    top_n           : maximum number of entities returned (default 10)

    Returns a dict {entity: document frequency} ordered from most to least frequent,
    an empty dict when the topic has no documents, or False when an error occurs
    (the error message is printed).
    """
    from collections import Counter  # local import keeps this cell self-contained

    try:
        # Positions of all documents assigned to the topic
        topic_docs_idx = [i for i, (_, topic) in enumerate(zip(data_news, topics)) if topic == topic_id]
        if not topic_docs_idx:
            return {}

        # One vectorised similarity call instead of one call per document.
        # NOTE(review): the `topic_id + 1` offset assumes row 0 of topic_embeddings_
        # belongs to the outlier topic -1 — confirm for models without outliers.
        topic_vector = np.asarray(topic_model.topic_embeddings_[topic_id + 1]).reshape(1, -1)
        doc_vectors = np.asarray([docs_embedding[i] for i in topic_docs_idx])
        similarities = cosine_similarity(topic_vector, doc_vectors)[0]

        df_data = pd.DataFrame({"idx": topic_docs_idx, "similitud": similarities})

        # Cut-off: mean similarity. `>=` (not `>`) so that a topic with a single
        # document, or with identical similarities, still keeps its documents.
        threshold = df_data["similitud"].mean()
        df_filtered = df_data[df_data["similitud"] >= threshold].sort_values("similitud", ascending=False)

        # Document frequency of each entity. dict.fromkeys removes duplicates within
        # a document while keeping first-seen order, which makes ties deterministic.
        document_frequencies = Counter()
        for doc in df_filtered["idx"]:
            document_frequencies.update(list(dict.fromkeys(entities[doc][:n_entities])))

        if not document_frequencies:
            return {}

        # Highest frequency first (stable sort keeps first-seen order among ties)
        sorted_frequencies = sorted(document_frequencies.items(), key=lambda item: item[1], reverse=True)

        # Cut-off for entities: mean document frequency
        umbral_e = np.mean([freq for _, freq in sorted_frequencies])

        eligible = [(entity, freq) for entity, freq in sorted_frequencies if freq >= umbral_e]
        return dict(eligible[:max(top_n, 0)])

    except Exception as e:
        print(f"Ha ocurrido un error: {e}")
        return False

In [None]:
def top_entities_2(topic_id: int, topic_model: object, topics: list, docs_embedding, xx, entities: list, n_entities=10):
    """
    Return the positions of the documents assigned to a topic.

    topic_id        : id of the topic to look for
    topics          : topic id assigned to each document, by position
    xx              : sequence paired with `topics`; positions beyond the shorter
                      of the two are ignored
    topic_model, docs_embedding, entities, n_entities : accepted but not used
    """
    topic_docs_idx = []
    # Walk both sequences in parallel and keep the positions whose topic matches
    for position, (_, assigned_topic) in enumerate(zip(xx, topics)):
        if assigned_topic == topic_id:
            topic_docs_idx.append(position)

    return topic_docs_idx

In [None]:
# Row labels of df_news as a plain list (passed to top_entities_2 below)
xx = list(df_news.index)
xx

In [None]:
# Despite the variable name, top_entities_2 returns the document positions of the topic, not entities
topic_entities_top = top_entities_2(topic, topic_model, topics, docs_embedding, xx , entities)
topic_entities_top

In [None]:
# Show the value computed in the previous cell again
topic_entities_top

In [None]:
# Row index of the news frame
df_news.index

In [None]:
# Topic ids known to the model loaded as `topic_model`
topic_model.get_topics().keys()