<a href="https://colab.research.google.com/github/davidlealo/sic_ai_2025_sept/blob/main/4_pnl/contribuciones_estudiantes/buscador_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
# 1. Instalación e importación de librerías
# ============================================================
!pip install sentence-transformers faiss-cpu gradio kagglehub -q

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
import os

# ============================================================
# 2. Load the dataset from KaggleHub
# ============================================================
import kagglehub

# dataset_download returns the local directory that holds the dataset files.
path = kagglehub.dataset_download("mauridb/product-data-from-walmart-usa-with-embeddings")
print("Ruta de los archivos del dataset:", path)

# os.listdir() yields entries in arbitrary order; sorting makes the choice of
# the "first" CSV reproducible if the dataset ever ships more than one file.
csv_files = sorted(f for f in os.listdir(path) if f.endswith(".csv"))
print("Archivos encontrados:", csv_files)

# Fail with an explicit message instead of an opaque IndexError on csv_files[0].
if not csv_files:
    raise FileNotFoundError(f"No se encontró ningún archivo .csv en {path}")

df = pd.read_csv(os.path.join(path, csv_files[0]))
print("Columnas disponibles:", df.columns.tolist())

# ============================================================
# 3. Cleaning and field selection
# ============================================================
# Text fields merged into the single string that gets embedded
# (name + description + category + brand).
campos_texto = ["product_name", "description", "category", "brand"]
partes = df[campos_texto].fillna("")

df["text"] = (
    partes["product_name"] + ". " +
    partes["description"] + ". " +
    partes["category"] + ". " +
    partes["brand"]
)

# The combined text always contains the ". " separators, so comparing it with
# "" can never drop a row. Test the source fields instead and discard rows in
# which every one of them is blank.
tiene_contenido = partes.apply(lambda col: col.str.strip() != "").any(axis=1)
# .copy() makes the filtered frame independent of the frame it was sliced from.
df = df[tiene_contenido].copy()

# Show an example. A bare expression in the middle of a cell is not displayed
# by the notebook, so print it explicitly.
print(df[["product_name", "category", "brand", "text"]].head(2))

# ============================================================
# 4. Generate semantic embeddings
# ============================================================
# The same model object is reused later to encode the search queries.
modelo = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
textos_catalogo = df["text"].tolist()
embeddings = modelo.encode(textos_catalogo, show_progress_bar=True)

# ============================================================
# 5. Build the FAISS index
# ============================================================
# The index dimensionality is taken from the second axis of the embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# The vectors are handed to the index as a float32 array.
matriz_float32 = np.array(embeddings, dtype=np.float32)
index.add(matriz_float32)

# ============================================================
# 6. Función de búsqueda
# ============================================================
def buscar(query, k=5):
    """Return the catalogue rows whose embeddings are nearest to *query*.

    Args:
        query: Free-text search string; it is encoded with the global
            ``modelo``.
        k: Number of neighbours requested from the global FAISS ``index``.

    Returns:
        A new DataFrame with the ``product_name``, ``category``, ``brand``
        and ``description`` columns of the matched rows of the global ``df``,
        plus a ``distancia`` column holding the distances reported by
        ``index.search``.
    """
    query_vec = modelo.encode([query])
    D, I = index.search(np.array(query_vec, dtype=np.float32), k)
    columnas = ["product_name", "category", "brand", "description"]
    # Only one query was sent, so the first row of I / D holds its results.
    # .copy() detaches the selection from df; without it the column
    # assignment below is a chained assignment on a temporary selection and
    # pandas may emit SettingWithCopyWarning.
    resultados = df.iloc[I[0]][columnas].copy()
    resultados["distancia"] = D[0]
    return resultados

# ============================================================
# 7. Interfaz interactiva con Gradio
# ============================================================
def interfaz(query):
    """Format the top five matches for *query* as one Markdown string.

    Each result becomes a block with a heading (product name and category),
    the brand, the distance formatted to four decimals, and the description,
    followed by a horizontal rule.
    """
    bloques = [
        (
            f"### {fila['product_name']} ({fila['category']})\n"
            f"Marca: {fila['brand']}\n"
            f"Distancia: {fila['distancia']:.4f}\n\n"
            f"{fila['description']}\n\n---\n"
        )
        for _, fila in buscar(query, k=5).iterrows()
    ]
    # Joining once yields the same text as appending block by block.
    return "".join(bloques)

# Text input in which the user types the product query.
caja_consulta = gr.Textbox(
    label="Consulta de producto",
    placeholder="Ejemplo: snack saludable sin azúcar",
)

# Wire the formatter function to a Markdown output panel.
demo = gr.Interface(
    fn=interfaz,
    inputs=caja_consulta,
    outputs="markdown",
    title="Buscador Semántico Walmart",
    description="Busca productos de Walmart por significado, combinando nombre, descripción, categoría y marca.",
)

demo.launch()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading from https://www.kaggle.com/api/v1/datasets/download/mauridb/product-data-from-walmart-usa-with-embeddings?dataset_version_number=2...


100%|██████████| 246M/246M [00:06<00:00, 38.2MB/s]

Extracting files...





Ruta de los archivos del dataset: /root/.cache/kagglehub/datasets/mauridb/product-data-from-walmart-usa-with-embeddings/versions/2
Archivos encontrados: ['walmart-product-with-embeddings-dataset-usa.csv']
Columnas disponibles: ['id', 'source_unique_id', 'crawl_timestamp', 'product_url', 'product_name', 'description', 'list_price', 'sale_price', 'brand', 'item_number', 'gtin', 'package_size', 'category', 'postal_code', 'available', 'embedding']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/938 [00:00<?, ?it/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c36e9aa3ac69903c2c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [2]:
import pandas as pd
import numpy as np
import ast
from google.colab import files

# ============================================================
# 1. Load the dataset from KaggleHub
# ============================================================
# NOTE: the download call and read_csv below run unconditionally every time
# this cell is executed.
# os is imported here because this cell's own import list does not include
# it; without this the cell raises NameError when run in a fresh session.
import os

import kagglehub

path = kagglehub.dataset_download("mauridb/product-data-from-walmart-usa-with-embeddings")
csv_files = [f for f in os.listdir(path) if f.endswith(".csv")]
df = pd.read_csv(os.path.join(path, csv_files[0]))

# ============================================================
# 2. Convert the embeddings from text to Python literals
# ============================================================
# The 'embedding' column holds each vector as a string; literal_eval turns
# every cell into the Python object it spells out.
df["embedding"] = df["embedding"].map(ast.literal_eval)

# ============================================================
# 3. Create the vectors.tsv file
# ============================================================
# Each TSV row is one embedding vector, components separated by tabs.
vectors = np.array(df["embedding"].to_list())
np.savetxt(fname="vectors.tsv", X=vectors, delimiter="\t")

# ============================================================
# 4. Create the metadata.tsv file
# ============================================================
# Descriptive fields used to identify each point; missing values are written
# as empty strings.
columnas_metadata = ["product_name", "category", "brand", "sale_price"]
metadata = df.loc[:, columnas_metadata].fillna("")
metadata.to_csv("metadata.tsv", sep="\t", index=False)

# ============================================================
# 5. Download the files so they can be uploaded to TensorFlow Projector
# ============================================================
for nombre_archivo in ("vectors.tsv", "metadata.tsv"):
    files.download(nombre_archivo)


Using Colab cache for faster access to the 'product-data-from-walmart-usa-with-embeddings' dataset.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
# ============================================================
# 1. Instalación de librerías necesarias
# ============================================================
!pip install sentence-transformers faiss-cpu gradio PyMuPDF -q

import fitz  # PyMuPDF para leer el PDF
import pandas as pd
import numpy as np
import re
import faiss
from sentence_transformers import SentenceTransformer
import gradio as gr
from google.colab import files
import requests
import os

# ============================================================
# 2. Download the Reina Valera 1960 Bible PDF
# ============================================================
url = "https://ibvictoria.cl/wp-content/uploads/2013/07/biblia-reina-valera-1960.pdf"
pdf_path = "biblia-reina-valera-1960.pdf"

# Skip the download when the file is already present in the working directory.
if not os.path.exists(pdf_path):
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=60)
    # Stop on HTTP errors (404, 500, ...) so an error page is never written to
    # disk as if it were the PDF and then reused on every later run.
    response.raise_for_status()
    with open(pdf_path, "wb") as f:
        f.write(response.content)
print("PDF descargado correctamente:", pdf_path)

# ============================================================
# 3. Extract the text from the PDF
# ============================================================
# The context manager closes the document once every page has been read.
# Joining the page texts in one pass avoids repeated string concatenation
# inside the loop.
with fitz.open(pdf_path) as doc:
    texto = "".join(page.get_text("text") for page in doc)

# Basic clean-up: collapse every run of whitespace (including line breaks)
# into a single space and trim both ends.
texto = re.sub(r'\s+', ' ', texto).strip()

# ============================================================
# 4. Split the text into manageable fragments
# ============================================================
# Fixed-width, non-overlapping windows of `tamano` characters (adjustable);
# the last fragment may be shorter.
tamano = 1000
fragmentos = [
    texto[inicio:inicio + tamano]
    for inicio in range(0, len(texto), tamano)
]

df = pd.DataFrame({"fragmento": fragmentos})
print(f"Total de fragmentos creados: {len(df)}")

# ============================================================
# 5. Generate semantic embeddings
# ============================================================
# The same model object is reused later to encode the search queries.
modelo = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
lista_fragmentos = df["fragmento"].tolist()
embeddings = modelo.encode(lista_fragmentos, show_progress_bar=True)

# ============================================================
# 6. Build the FAISS index used for semantic search
# ============================================================
# The index dimensionality is taken from the second axis of the embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# The vectors are handed to the index as a float32 array.
matriz_float32 = np.array(embeddings, dtype=np.float32)
index.add(matriz_float32)

# ============================================================
# 7. Función de búsqueda semántica
# ============================================================
def buscar(query, k=5):
    """Return the text fragments whose embeddings are nearest to *query*.

    Args:
        query: Free-text search string; it is encoded with the global
            ``modelo``.
        k: Number of neighbours requested from the global FAISS ``index``.

    Returns:
        A copy of the matched rows of the global ``df`` with an extra
        ``distancia`` column holding the distances reported by
        ``index.search``.
    """
    vector_consulta = np.array(modelo.encode([query]), dtype=np.float32)
    distancias, posiciones = index.search(vector_consulta, k)
    # A single query was sent, so row 0 of each result array is the answer.
    coincidencias = df.iloc[posiciones[0]].copy()
    coincidencias["distancia"] = distancias[0]
    return coincidencias

# ============================================================
# 8. Interfaz interactiva con Gradio
# ============================================================
def interfaz(query):
    """Format the top five fragments for *query* as one Markdown string.

    Each result becomes a heading that shows the distance to four decimals,
    followed by the fragment text and a horizontal rule.
    """
    bloques = [
        (
            f"### Fragmento (distancia {fila['distancia']:.4f})\n"
            f"{fila['fragmento']}\n\n---\n"
        )
        for _, fila in buscar(query, k=5).iterrows()
    ]
    # Joining once yields the same text as appending block by block.
    return "".join(bloques)

# Text input in which the user types the semantic query.
caja_consulta = gr.Textbox(
    label="Consulta semántica",
    placeholder="Ejemplo: amor al prójimo, fe, sabiduría...",
)

# Wire the formatter function to a Markdown output panel.
demo = gr.Interface(
    fn=interfaz,
    inputs=caja_consulta,
    outputs="markdown",
    title="Buscador Semántico - Biblia Reina Valera 1960",
    description="Busca fragmentos de la Biblia según su significado utilizando embeddings semánticos.",
)

# share=False: do not request a public share link for this app.
demo.launch(share=False)

# ============================================================
# 9. Export embeddings and metadata for TensorFlow Projector
# ============================================================
# Save the vectors: one embedding per line, tab-separated components.
np.savetxt("vectors.tsv", embeddings, delimiter="\t")

# Save the metadata (the text fragments). df has a single column here, and
# the Embedding Projector expects single-column metadata to have NO header
# row; writing one would give the file one more line than there are vectors.
df.to_csv("metadata.tsv", sep="\t", index=False, header=False)

# Download both files through the Colab helper.
files.download("vectors.tsv")
files.download("metadata.tsv")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hPDF descargado correctamente: biblia-reina-valera-1960.pdf
Total de fragmentos creados: 3959


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>