In [22]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from matplotlib import pyplot as plt
import os
import json
from datetime import datetime, date
from dateutil.parser import parse
from dotenv import load_dotenv

from NLP_tools import Cleaning_text, top_keywords, top_entities, get_topic_name, best_document, clean_all, topic_documents
from core.functions import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from opensearch_data_model import Topic, TopicKeyword, News, os_client, TOPIC_INDEX_NAME, NEWS_INDEX_NAME
from opensearch_io import init_opensearch, get_news
from opensearchpy import helpers

In [2]:
# Initialise the OpenSearch indices used by this notebook. Judging by the
# recorded output, indices that already exist (Topic, News) are skipped.
init_opensearch()

El índice Topic ya existe. Saltando inicialización de base de datos.
El índice News ya existe. Saltando inicialización de base de datos.


In [3]:
# Load variables from a .env file (python-dotenv) before reading the environment.
load_dotenv()

# Data directory: PATH_LOCAL from the environment when it is set, otherwise
# the PATH_REMOTO default.
PATH_REMOTO = '/content/ITBA-NLP/data/'
path_local = os.environ.get('PATH_LOCAL')
PATH = PATH_REMOTO if path_local is None else path_local

# NOTE(review): `OpenAI` is not imported by name in this notebook; presumably it
# arrives through `from core.functions import *` — confirm.
if path_local is None:
    # PATH_LOCAL is unset: take the API key from google.colab's userdata store.
    from google.colab import userdata
    client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
else:
    # PATH_LOCAL is set: take the API key from the environment.
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# For testing the Streamlit app

In [5]:
# Parquet file backing each selectable slice of the dataset, keyed by slice name.
df_params = {
    '0_1000': '0_1000_data.parquet',
    '1000_2000': '1000_2000_data.parquet',
    '2000_3000': '2000_3000_data.parquet',
    'df_joined': 'df_joined_2024-04-01 00_00_00.parquet',
}

# Slice to load; use another key of df_params (e.g. '1000_2000') to switch.
chunk = '0_1000'

# Read the selected file and keep its 'in__text' column as a plain list.
df_parquet = pd.read_parquet(PATH + df_params[chunk])
data = df_parquet['in__text'].tolist()

# Show which slice was loaded.
print(chunk)

0_1000


In [7]:
# Tag selecting which saved model artifacts to load.
# NOTE(review): this rebinds `chunk`, which the data-loading cell set to the
# name of the dataset slice.
chunk = "app"

# Load an already trained BERTopic model (or skip this cell and train instead).
model_path = PATH + f"modelos/bertopic_model_{chunk}"
topic_model = BERTopic.load(model_path)

# Load the saved document embeddings.
embeddings_path = PATH + f"modelos/docs_embeddings_{chunk}.npy"
docs_embedding = np.load(embeddings_path)

In [8]:
# Run the loaded model over `data`; the call returns the topic assignments and
# their probabilities.
# NOTE(review): `docs_embedding` loaded earlier is not passed here, so the model
# presumably re-embeds `data` (see the "Batches" progress output) — confirm
# whether that is intended.
topics, probs = topic_model.transform(data)

# Alternative: load previously saved results instead of recomputing them.
#topics = np.load(PATH+f"modelos/topics_{chunk}.npy")
#probs = np.load(PATH+f"modelos/probs_{chunk}.npy")

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [17]:
def funcion_aux(ID):
    """Print the stored OpenSearch record of one news item next to its source row.

    Searches the 'news' index for documents whose ``_id`` matches ``ID`` and
    prints the title, topic, probability, keywords and entities found in the
    hits, together with the 'Entity Name' / 'Keyword Name' values and the
    positional row of the same ID in the global ``df_parquet``.

    Args:
        ID: Identifier of the news item; used both as the OpenSearch ``_id``
            and as an index label of ``df_parquet``.

    Returns:
        None. Everything is written to stdout. When the index holds no
        document for ``ID`` a short message is printed instead of the report.
    """

    query = {
                'query': {
                    'match': {
                        '_id': ID
                    }
                }
    }

    response = os_client.search(index='news', body=query)

    # Process the response
    results = response['hits']['hits']
    if not results:
        # Without this guard the final print failed on news[0] with IndexError.
        print(f"Noticia ID: {ID} no encontrada en el índice 'news'")
        return

    title = [result['_source']['title'] for result in results]
    news = [result['_source']['news'] for result in results]
    topic = [result['_source']['topic'] for result in results]
    prob = [result['_source']['prob'] for result in results]
    keywords = [result['_source']['keywords'] for result in results]
    entities = [result['_source']['entities'] for result in results]

    # The ID may be indexed in OpenSearch but absent from the loaded dataframe.
    try:
        fila = df_parquet.index.get_loc(ID)
    except KeyError:
        fila = None

    def _original_value(column):
        """Return the `column` value of the row labelled ID, or "" if unavailable."""
        try:
            return df_parquet[df_parquet.index == ID][column].values[0]
        except (KeyError, IndexError):
            # KeyError: column missing; IndexError: no row carries this ID.
            return ""

    keywords_df = _original_value('Keyword Name')
    entities_df = _original_value('Entity Name')

    print(f"Noticia ID: {ID} {title}")
    print(f"Entities originales: {entities_df}")
    print(f"Keywords originales: {keywords_df}")
    print("-"*80)
    print(f"Topico: {topic}")
    print(f"Fila: {fila}")
    print(f"Prob. modelo: {prob}")
    print(f"Entities calculadas: {entities}")
    print(f"Keywords calculadas: {keywords}")
    print("-"*80)
    print(f"Noticia: {news[0]}")

In [21]:
# Spot-check a single news item by its ID.
funcion_aux(105579854)

Noticia ID: 105579854 ['El cielo estará nublado y se esperan lluvias durante toda la jornada en Tucumán']
Entities originales: ['']
Keywords originales: ['madrugada' 'precipitaciones' 'humedad' 'lluvias' 'cielo']
--------------------------------------------------------------------------------
Topico: [14]
Fila: 505
Prob. modelo: [1.0]
Entities calculadas: [[]]
Keywords calculadas: [[]]
--------------------------------------------------------------------------------
Noticia: Por la noche se espera que el cielo continúe cubierto, que la humedad sea del 97 %, mientras que la temperatura retrocederá nuevamente, esta vez hasta los 22 °C. Se espera que las precipitaciones se extienden hasta la madrugada del martes.


### Resetear la base de news

In [24]:
# Name of the index
index_name = 'news'

# Fields to reset and the value each one is reset to
campo_a_actualizar_1 = 'process'
campo_a_actualizar_2 = 'topic'
campo_a_actualizar_3 = 'prob'
nuevo_valor_1 = False
nuevo_valor_2 = -1
nuevo_valor_3 = 0


def _reset_action(doc_id):
    """Build the bulk 'update' action that resets one document's fields."""
    return {
        "_op_type": "update",
        "_index": index_name,
        "_id": doc_id,
        "doc": {
            campo_a_actualizar_1: nuevo_valor_1,
            campo_a_actualizar_2: nuevo_valor_2,
            campo_a_actualizar_3: nuevo_valor_3
        }
    }


# Match every document in the index
search_query = {
    "query": {
        "match_all": {}
    }
}

# Open a scroll over the index, 1000 hits per page
response = os_client.search(index=index_name, body=search_query, scroll='2m', size=1000)

documents = response['hits']['hits']
scroll_id = response['_scroll_id']

# Collect one update action per document
bulk_operations = []

try:
    # An empty page marks the end of the scroll.
    while len(documents) > 0:
        bulk_operations.extend(_reset_action(doc["_id"]) for doc in documents)

        response = os_client.scroll(scroll_id=scroll_id, scroll='2m')
        documents = response['hits']['hits']
        scroll_id = response['_scroll_id']

    # Run the bulk operations
    helpers.bulk(os_client, bulk_operations)
finally:
    # Release the scroll context even when paging or the bulk update fails.
    clear_scroll_response = os_client.clear_scroll(scroll_id=scroll_id)

# Last expression of the cell: shows the clear_scroll response as the cell output.
clear_scroll_response

{'succeeded': True, 'num_freed': 1}

In [11]:
# Fetch every record of the news index together with its topic

query = {
        "query": {
            "bool": {
                "must": [
                    {"match_all": {}}
                ],
            }
        }
    }

# helpers.scan pages through the index with the scroll API (1000 hits per
# page), so the export is not capped at the first 1000 hits of a single search.
results = list(helpers.scan(os_client, index='news', query=query, size=1000))

# Build the frame column by column so each field keeps its own type; stacking
# mixed columns with np.hstack coerced the numeric topic to text.
df = pd.DataFrame(
    {
        "id": [result['_id'] for result in results],
        "title": [result['_source']['title'] for result in results],
        "text": [result['_source']['news'] for result in results],
        "topic": [result['_source']['topic'] for result in results],
    },
    columns=["id", "title", "text", "topic"],
)

# Save the dataframe with the topic labels
file_path = 'data_clasif.xlsx'
df.to_excel(PATH+file_path, index=False)

In [23]:
# Scratch check: len() of a dict counts its keys.
d = dict(a=1, b=2, c=3)
len(d)

3