In [1]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
import uuid

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the product catalogue export.
# NOTE(review): absolute, machine-specific path — point this at a configurable
# data directory before sharing the notebook.
df = pd.read_csv("/Users/diegohernandez/Downloads/Maestri Milano - Products.csv")

# Clean columns: trim, lowercase and replace spaces with underscores so the
# headers can be looked up with predictable keys further down.
df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

# Connect to Qdrant
# NOTE(review): https=False means the client talks plain HTTP to this host —
# confirm that is acceptable for the network it runs on.
client = QdrantClient(host="vps.maestri.com.co", port=6333, https=False)
collection_name = "maestri_products"
# Vector size declared for the collection; presumably the output dimension of
# the sentence-transformer loaded below — TODO confirm if the model changes.
embedding_size = 512

# Recreate the collection from scratch on every run.
# `recreate_collection` is deprecated in qdrant-client, so the drop and the
# create are done explicitly instead.
if client.collection_exists(collection_name=collection_name):
    client.delete_collection(collection_name=collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=embedding_size, distance=Distance.COSINE),
)

# Model used to vectorize the short text, and the accumulator for the points
# that will be uploaded.
model = SentenceTransformer("distiluse-base-multilingual-cased-v2")
points = []

def clean(text):
    """Normalise a cell value to a trimmed string.

    Returns "" when ``pd.isna(text)`` is true; otherwise returns
    ``str(text)`` with leading and trailing whitespace removed.
    """
    return "" if pd.isna(text) else str(text).strip()

def strip_html(text):
    """Return the text content of an HTML string.

    The input is parsed with BeautifulSoup's "html.parser" and the result of
    ``get_text(separator=" ", strip=True)`` is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ", strip=True)

# Collect one (text to embed, payload) pair per product that will be indexed.
texts = []
payloads = []

for _, row in df.iterrows():
    # Rows without a publication date are not indexed.
    if pd.isna(row.get("published_on")):
        continue

    product_name = clean(row.get("nombre"))
    bodega = clean(row.get("bodega"))
    region = clean(row.get("región"))
    tipo = clean(row.get("tipo"))
    # Merge both pairing columns, dropping whichever is empty.
    maridaje = " | ".join(
        m for m in (clean(row.get("maridaje_1")), clean(row.get("maridaje_2"))) if m
    )
    notas = clean(row.get("notas_de_cata"))
    # The description column is passed through strip_html so only its text is stored.
    descripcion = strip_html(clean(row.get("descripción")))
    # NOTE(review): clean() returns a string, so the price is stored as text —
    # confirm consumers do not need a numeric payload field.
    precio = clean(row.get("precio"))

    # Only these fields feed the embedding; notes, description and price are
    # carried in the payload but not vectorized.
    short_text = " | ".join(
        part for part in (product_name, bodega, tipo, region, maridaje) if part
    )
    if not short_text:
        continue

    fields = {
        "product_name": product_name,
        "bodega": bodega,
        "region": region,
        "tipo": tipo,
        "precio": precio,
        "notas": notas,
        "descripcion": descripcion,
        "maridaje": maridaje,
    }
    texts.append(short_text)
    # Empty values are left out of the payload entirely.
    payloads.append({key: value for key, value in fields.items() if value})

# Encode every text in a single call instead of invoking the model once per row.
if texts:
    vectors = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    points.extend(
        PointStruct(id=str(uuid.uuid4()), vector=vector.tolist(), payload=payload)
        for vector, payload in zip(vectors, payloads)
    )

# Insert into Qdrant; skip the request when no row qualified rather than
# sending an empty batch.
if points:
    client.upsert(collection_name=collection_name, points=points)

print(f"✅ Inserted {len(points)} products into Qdrant collection: {collection_name}")


  client.recreate_collection(


✅ Inserted 170 products into Qdrant collection: maestri_products


In [19]:
# Show the normalised column names of the loaded catalogue.
print(df.columns)

Index(['nombre', 'slug', 'collection_id', 'locale_id', 'item_id', 'created_on',
       'updated_on', 'published_on', 'precio', 'imagen_del_producto',
       'categoria', 'descripción', 'maridaje_1', 'maridaje_2', 'notas_de_cata',
       'temperatura_de_servicio', 'pasillo', 'tipo', 'región',
       'cepa_principal', 'ocasión', 'bodega', 'denominación', 'peso_/_volumen',
       'gr/ml', 'item_id', 'precio_descuento', 'categories', 'precios',
       'descuento', 'descuento_2x1', 'descuento_3x2', 'ciudad',
       'productoreserva', 'descuento%off'],
      dtype='object')
