# Trabajo Practico NLP - Detección de Tópicos y clasificación
- ITBA 2024
- Alumno: Gabriel Rey
---

## Merged models

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from matplotlib import pyplot as plt
import os
import json
from datetime import datetime, date
from dateutil.parser import parse
from dotenv import load_dotenv

from NLP_tools import *

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [3]:
from opensearch_data_model import Topic, TopicKeyword, News, os_client, TOPIC_INDEX_NAME, NEWS_INDEX_NAME
from opensearch_io import *
from opensearchpy import helpers

from openai import OpenAI

### Inicializamos la base vectorial

In [None]:
init_opensearch()

### Path

In [5]:
load_dotenv()
PATH_REMOTO='/content/ITBA-NLP/data/'
PATH=os.environ.get('PATH_LOCAL', PATH_REMOTO)
PATH

'C:/Users/gabri/OneDrive/Machine Learning/Github/ITBA-NLP/data/'

In [6]:
if PATH == os.environ.get('PATH_LOCAL'):
    if os.environ.get('OPENAI_API_KEY'):
        client = OpenAI(api_key= os.environ.get('OPENAI_API_KEY'))
    else:
        client = None
    print(client)

<openai.OpenAI object at 0x0000023BBF766B90>


### Cargamos noticias 
- Notas:
    - Se carga el segundo batch de noticias, luego de haber corrido al menos el primero y segundo batch con la notebook NLP_01_data
    - Se debe haber generado el primer modelo en NLP_02_model
    - Para cargar el segundo batch, se utiliza la fecha como identificador del lote

In [None]:
# Cargar batch de noticias ( a partir del segundo batch en adelante ) por fecha previamente preprocesado en NLP_01_data 
choice = "20240717"  
date_choice = choice[:4]+"-"+choice[4:6]+"-"+choice[6:8]
batch_news = get_news(date_choice)

if batch_news == []:
    print("No hay noticias a procesar")

id_data     = [reg[0] for reg in batch_news]
title_data  = [reg[1] for reg in batch_news]
news_data   = [reg[2] for reg in batch_news]
keywords    = [reg[3] for reg in batch_news]
entities    = [reg[4] for reg in batch_news]
created     = [reg[5] for reg in batch_news]

# Cargar vocabulario previamente procesado
with open(PATH+f'preproc_notebook/vocabulary_{choice}.json', 'r') as json_file:
    vocab = json.load(json_file)
len(vocab)

### Preprocesar batch de noticias

In [None]:
SPANISH_STOPWORDS = list(pd.read_csv(PATH+'spanish_stop_words.csv' )['stopwords'].values)
SPANISH_STOPWORDS_SPECIAL = list(pd.read_csv(PATH+'spanish_stop_words_spec.csv' )['stopwords'].values)

clean_data = Cleaning_text()

proc_data = []
for data_in in tqdm(news_data):
    aux = clean_data.unicode(data_in)
    aux = clean_data.urls(aux)
    aux = clean_data.simbols(aux)
    aux = clean_data.escape_sequence(aux)
    aux = " ".join([ word for word in aux.split() if word.lower() not in SPANISH_STOPWORDS_SPECIAL])
    proc_data.append(aux)

### Modelo para el batch

In [None]:
tfidf_vectorizer = TfidfVectorizer(
        tokenizer=None,
        max_df=0.9,
        min_df=0.1,
        ngram_range=(1, 2),
        vocabulary=vocab,
        # max_features=100_000
)
tfidf_vectorizer.fit(news_data)

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Tokenize topics
vectorizer_model = tfidf_vectorizer
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# Step 6 - (Optional) Fine-tune topic representations with a `bertopic.representation` model
# representation_model = KeyBERTInspired()

# All steps together
topic_model_2 = BERTopic(
  embedding_model=embedding_model,              # Step 1 - Extract embeddings
  umap_model=umap_model,                        # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,                  # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,            # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                    # Step 5 - Extract topic words
  # representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
  # language='multilingual',                    # This is not used if embedding_model is used.
  verbose=True,
  # calculate_probabilities=True
)

### Entrenamiento

In [None]:
# Entrenamiento
_, _ = topic_model_2.fit_transform(proc_data)

# No necesito generar topicos, ni probabilidades, ni embeddings del modelo 2, ya que vamos a fusionar modelos.

### Merge de modelos

In [None]:
# Cargamos el modelo anterior
topic_model_1 = BERTopic.load(PATH+"modelos_notebook/bertopic_model_last")
print(f"Topicos anteriores: {len(set(topic_model_1.get_topics().keys()))-1}")

In [None]:
# Combinar los modelos en uno solo
merged_model = BERTopic.merge_models([topic_model_1, topic_model_2])

In [None]:
# Grabar modelo fusionado
merged_model.save(PATH+f"modelos_notebook/bertopic_model_last")

In [None]:
print(f"Cantidad de tópicos modelo 1: {len(topic_model_1.get_topic_info())} -- del 0 al {len(topic_model_1.get_topic_info())-1} (incluye topico -1)")
print(f"Cantidad de tópicos modelo 2: {len(topic_model_2.get_topic_info())} -- del 0 al {len(topic_model_2.get_topic_info())-1} (incluye topico -1)")
print(f"Cantidad de tópicos modelo merge: {len(merged_model.get_topic_info())} -- del 0 al {len(merged_model.get_topic_info())-1} (incluye topico -1)")

In [None]:
# Armar set de datos para inferir (ya procesados y nuevos)
news_data_1 = get_news( process=True )
df_news_1 = pd.DataFrame(news_data_1 , columns=["indice", "titulo", "noticia", "keywords", "entidades", "creado", "pos_id"])
df_news_1.sort_values("pos_id", ascending=True, inplace=True)

news_data_2 = {"indice":id_data, "titulo":title_data, "noticia":news_data, "keywords":keywords, "entidades":entities, "creado":created}
df_news_2 = pd.DataFrame(news_data_2)

df_unificado = pd.concat([df_news_1, df_news_2], ignore_index=True)

In [None]:
# Listas de atributos de datos unificados
u_id_data      = list(df_unificado['indice'])
u_title_data   = list(df_unificado['titulo'])
u_data         = list(df_unificado['noticia'])

In [None]:
# Topicos validos (quitamos el topico -1)
topics_to_save = list(merged_model.get_topics().keys())[1:]
len(topics_to_save)

In [None]:
# Visualizar cantidad de documentos que aportan a los topicos el modelo anterior, el modelo fusionado, y el total de ambos
df_combined = merged_results(topic_model_1, merged_model)
print(df_combined)

In [None]:
# Topicos modelo merged
merged_model.topic_labels_

In [None]:
# Elegir un topico que tenga aporte de noticias al topico por ambos modelos
topic_id = 0
docs_per_topics = [i for i, x in enumerate(merged_model.topics_) if x == topic_id]


print(f"Noticias totales del topico {topic_id}:", len(docs_per_topics))
# Lista de columnas que quieres excluir
columnas_a_excluir = ['noticia', 'keywords','entidades','pos_id']


df_query = df_unificado.loc[docs_per_topics, df_unificado.columns.difference(columnas_a_excluir)]
df_query.style.set_properties(subset=['titulo'], **{'text-align': 'left'}).set_table_styles([{'selector': 'th', 'props': [('min-width', '50px')]}, {'selector': 'td', 'props': [('min-width', '140px')]}])

### Obtenemos transformaciones y embeddings del batch

In [None]:
topics_batch, probs_batch = merged_model.transform(news_data)

In [None]:
# Obtenemos embeddings de todos los documentos
docs_embedding_batch = merged_model.embedding_model.embed(news_data)

### Actualizar datos en news 

In [None]:
# Marcar registros de noticias como procesados y grabar sus embeddings, topicos, probs 
update_news( id_data, docs_embedding_batch, topics_batch, probs_batch )

### Actualizar en Topicos

In [None]:
# Obtener las fechas desde/hasta de los topicos existentes de opensearch
from_date, to_date = get_topics_date()
from_date, to_date

In [None]:
date_choice = choice[:4]+"-"+choice[4:6]+"-"+choice[6:8]

# Preparar fechas para actualizar en los topicos ( excluyendo topico -1)
from_date_to_save, to_date_to_save = update_topics_date(from_date, to_date, df_combined[1:], date_choice)
from_date_to_save, to_date_to_save

In [None]:
# Topicos validos (quitamos el topico -1)
topics_to_save = list(merged_model.get_topics().keys())[1:]

In [None]:
# Obtener los nombres de los tópicos existentes
topics_name = get_topics_opensearch()
topics_name = [name['name'] for name in topics_name]
topics_name


In [None]:
# Eliminar topicos existentes
delete_index_opensearch("topic")

# Grabar todos los topicos en la base
for topic_id in merged_model.get_topics().keys():
    
    if topic_id > -1:

        topic_keywords_top  = top_keywords(topic_id, merged_model, PATH)
        topic_entities_top  = get_top_entities_os(topic_id)
        topic_documents_title, threshold  = get_top_documents_threshold(topic_id)
        id_best_doc, title_best_doc, best_doc = best_document(topic_id, merged_model, docs_embedding_batch, u_id_data, u_title_data, u_data)
        
        topic_doc = Topic(
            index = topic_id,
            name = get_topic_name(''.join(topic_documents_title), topic_id, merged_model, client),
            vector = list(merged_model.topic_embeddings_[topic_id + 1 ]),
            similarity_threshold = threshold,
            created_at = parse(from_date[topic_id]),
            from_date = parse(from_date[topic_id]),
            to_date = parse(to_date[topic_id]),
            keywords = topic_keywords_top,
            entities = topic_entities_top,
            id_best_doc = id_best_doc,
            title_best_doc = title_best_doc,
            best_doc = best_doc,
        )

        topic_doc.save()


Nota: Cada modelo puede representar ordenes distintos de los topicos, pero al fusionarlos, el orden del modelo 1 se mantiene en el fusionado y se agregan los nuevos al final

### Verificacion

#### Temporalidad de los topicos

In [None]:
from matplotlib.dates import DateFormatter, AutoDateLocator
import matplotlib.pyplot as plt
import pandas as pd

data_plot = {'topic': range(len(merged_model.get_topics().keys())-1),
             'from': from_date_to_save,
             'to': to_date_to_save}

df_plot = pd.DataFrame(data_plot)
df_plot['from'] = pd.to_datetime(df_plot['from'], format='ISO8601')
df_plot['to'] = pd.to_datetime(df_plot['to'], format='ISO8601')

plt.figure(figsize=(12, 6))

for index, row in df_plot.iterrows():
    plt.plot([row['from'], row['to']], [row['topic'], row['topic']], marker='o')

# Formatear las fechas en el eje X
date_form = DateFormatter("%d-%m-%Y")
plt.gca().xaxis.set_major_formatter(date_form)

# Ajustar los ticks del eje X para que no se repitan las fechas
locator = AutoDateLocator()
plt.gca().xaxis.set_major_locator(locator)

plt.yticks(df_plot['topic'].unique())
plt.xlabel('Fecha')
plt.ylabel('Número de Tópico')
plt.title('Representación de Tópicos a lo Largo del Tiempo')
plt.grid(True)

# Rotar las etiquetas de fecha para mejor legibilidad (opcional)
plt.xticks(rotation=45)

plt.show()


### Panel de Topicos

In [None]:
def panel_topicos(fecha):
    df_topics = pd.DataFrame(get_topics_opensearch(fecha))

    if not df_topics.empty:

        columnas_a_excluir = ['vector', 'created_at', 'best_doc']

        df_query = df_topics[df_topics.columns.difference(columnas_a_excluir)].sort_values("index", ascending=True)
        column_order = ['index','name','from_date', 'to_date', 'similarity_threshold', 'keywords', 'entities', 'id_best_doc', 'title_best_doc']

        df_result = df_query.reindex(columns=column_order).style.set_properties(subset=['name'], **{'text-align': 'left'}).set_table_styles([{'selector': 'th','props': [('min-width', '40px')]},
                                                                                                                                             {'selector': 'td', 'props': [('min-width', '140px')]}])
        
        return df_result
    else:
        print("No hay tópicos para los parámetros elegidos")


In [None]:
# la fecha de consulta se relaciona con la vigencia, es decir con "to_date"
fecha = None
fecha = "2024-07-18"
panel_topicos(fecha)

### Panel de noticias por topico ( filtrado por umbral del topico y fecha )

In [None]:
def panel_news(topic_id, fecha):

    news_results = select_data_from_news(topic=topic_id)
    if news_results:
        if fecha in news_results[4]:
            data_view = {   'ID': news_results[0],
                            'titulo': news_results[1],
                            'estimacion': news_results[3]}
            
            name, threshold = get_one_topic(topic_id)
            print(f"Noticias del topico {topic_id}: <<< {name} >>> | umbral: {threshold}")
            df_view = pd.DataFrame(data_view)
            df_view.sort_values('estimacion', ascending=False, inplace=True)
            df_result = df_view[df_view['estimacion'] > threshold]

            return df_result
        else:
            print("No hay tópicos para los parámetros elegidos")
    else:
        print("No hay tópicos para los parámetros elegidos")

In [None]:
fecha = None
fecha = '2024-07-16'
topic_id = 8
panel_news(topic_id, fecha)


### Inferencia

In [88]:
# Cargamos el ultmo modelo fusionado
merged_model = BERTopic.load(PATH+"modelos_notebook/bertopic_model_last")

In [109]:
def inferencia(texto_entrada):

    results = merged_model.find_topics(texto_entrada)
    if results[0][0] == -1:
        return print("Sin resultados")
    else:
        try:
            index = results[0].index(-1)
        except ValueError:
            index = len(results[0])  # Si no hay -1, se mantiene toda la lista

        # Eliminar los valores desde el índice encontrado hasta el final en ambas listas
        results_ = (results[0][:index-1], results[1][:index-1])

        print("El texto proporcionado se infiere en los siguientes tópicos: ")
        names = [get_one_topic(topic_id) for topic_id in results_[0]]

        t_keywords_entities = [get_topic_keywords_entities(topic_id) for topic_id in results_[0]]

        lista_keywords = [list(tupla[0].keys()) for tupla in t_keywords_entities ]
        lista_entities = [list(tupla[1].keys()) for tupla in t_keywords_entities ]

        data = {"topico": results_[0],
                "nombre": [name[0] for name in names],
                "estimacion": results_[1],
                "keywords": lista_keywords,
                "entities": lista_entities}

        df_res = pd.DataFrame(data)
        df_res.sort_values("estimacion", ascending=False, inplace=True)
        return df_res

In [114]:
texto_entrada = 'Gobierno consiguió superávit financiero por sexto mes consecutivo'
texto_entrada = 'capacidad instalada de la industria'

inferencia(texto_entrada)

El texto proporcionado se infiere en los siguientes tópicos: 


Unnamed: 0,topico,nombre,estimacion,keywords,entities
0,11,Crisis en la industria y empleo,0.566074,"[soles, em, capacidad instalada, crecimiento]","[UIA, Unión Industrial Argentina, Argentina, Gobierno, Javier Milei, AMBA, Adimra, Demanda, Gobierno nacional, República Argentina]"
1,3,Actualizaciones y cambios gubernamentales.,0.431123,"[cuotas, categoría, multas]","[AFIP, DÓLAR, Gobierno, Argentina, Buenos Aires, Provincia, la Ley, ANSES, CBU, Ganancias]"
2,1,Reuniones entre Gobierno y CGT,0.347498,"[viviendas, pacto, ley, reunión]","[Gobierno, Javier Milei, Axel Kicillof, Consejo de Mayo, Congreso, Gobierno nacional, Provincia, CGT, PRO, Río Negro]"
