### Desarrollo del índice en Elasticsearch (local)

In [None]:
# 1. Importar librerías necesarias

import os
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch, helpers
import streamlit as st
from pathlib import Path

In [19]:
# 2. Connect to the Elasticsearch node listening at http://localhost:9200
es = Elasticsearch(hosts="http://localhost:9200")

In [20]:
# 3. Define the index: synonym-aware text analysis plus field mappings
index_name = "movies"

# Each rule is a comma-separated group of terms handed to the "synonym"
# token filter below.
_synonym_rules = [
    "sci-fi, science fiction",
    "thriller, suspense",
    "animation, cartoon",
    "drama, tragedy",
]

# Custom analyzer: standard tokenizer, then lowercase, then synonym filter.
_analysis = {
    "analyzer": {
        "synonym_analyzer": {
            "tokenizer": "standard",
            "filter": ["lowercase", "synonym_filter"],
        },
    },
    "filter": {
        "synonym_filter": {
            "type": "synonym",
            "synonyms": _synonym_rules,
        },
    },
}

# title and genres are full-text fields analyzed with the synonym analyzer;
# avg_rating is stored as a float.
_mappings = {
    "properties": {
        "title": {"type": "text", "analyzer": "synonym_analyzer"},
        "genres": {"type": "text", "analyzer": "synonym_analyzer"},
        "avg_rating": {"type": "float"},
    },
}

index_config = {
    "settings": {"analysis": _analysis},
    "mappings": _mappings,
}

In [None]:

# Project base folder: the parent of the current working directory
# (adjust as needed if the notebook is started from somewhere else).
BASE_DIR = Path.cwd().resolve().parent
PROCESSED_DATA_DIR = BASE_DIR.joinpath("data", "processed")

# Load the preprocessed movies table from its parquet file
movies_path = PROCESSED_DATA_DIR.joinpath("movies_processed.parquet")
movies = pd.read_parquet(movies_path)

In [25]:
# 6. Index the data into Elasticsearch
# Create the index with the custom settings/mappings first. Without this step
# the bulk request below auto-creates the index with dynamic mappings, and the
# synonym analyzer declared in index_config is never applied to title/genres.
# The existence check keeps the cell safe to re-run.
# `body=` mirrors the call style this notebook already uses for es.search.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_config)

# One bulk action per movie row. movieId is used as the document _id, so
# re-running the cell overwrites existing documents instead of duplicating them.
bulk_data = [
    {
        "_index": index_name,
        "_id": movie["movieId"],
        "_source": {
            "title": movie["title"],
            "genres": movie["genres"],
            "avg_rating": movie["avg_rating"],
        },
    }
    for movie in movies.to_dict(orient="records")
]

helpers.bulk(es, bulk_data)

# 7. Elasticsearch search function with boosting and fuzziness
@st.cache_data
def search_movies(query):
    """Return the titles of the movies matching *query*.

    Sends a ``multi_match`` query over the ``title`` and ``genres`` fields to
    the index named by the module-level ``index_name`` through the
    module-level ``es`` client, and returns the ``title`` of every hit in the
    order Elasticsearch returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["title^3", "genres^1"],  # title matches are boosted x3
        "fuzziness": "AUTO",  # tolerate typos in the query
    }
    request = {"query": {"multi_match": multi_match}}
    response = es.search(index=index_name, body=request)

    titles = []
    for hit in response["hits"]["hits"]:
        titles.append(hit["_source"]["title"])
    return titles


# 8. Smoke-test the Elasticsearch search with a sample query
query_test = "sci fi"
search_results = search_movies(query_test)
# Build the label first, then print it alongside the list of titles.
results_label = f" Resultados de búsqueda para '{query_test}':"
print(results_label, search_results)

2025-03-14 12:35:37.103 No runtime found, using MemoryCacheStorageManager


 Resultados de búsqueda para 'sci fi': ['Ski School (1991)', 'Legend, The (Legend of Fong Sai-Yuk, The) (Fong Sai Yuk) (1993)', 'Children of Huang Shi, The (2008)', 'Sacrifice (Zhao shi gu er) (2010)', 'Who Am I? (Wo shi shei) (1998)', 'Grandmaster, The (Yi dai zong shi) (2013)', 'Fallen Angels (Duo luo tian shi) (1995)', 'Beijing Bicycle (Shiqi sui de dan che) (2001)', 'Ashes of Time (Dung che sai duk) (1994)', 'House of Flying Daggers (Shi mian mai fu) (2004)']


In [None]:
# 8. Re-run the sample search (same query as the previous cell)
query_test = "sci fi"
search_results = search_movies(query_test)
# Build the label first, then print it alongside the list of titles.
results_label = f" Resultados de búsqueda para '{query_test}':"
print(results_label, search_results)