# Libraries

In [7]:
from pathlib import Path
import sys

ROOT = Path("..").resolve()
SRC = ROOT / "src"

if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

ROOT, SRC


(WindowsPath('C:/Users/GLORIA MARENA/projects/context-engine-crewai'),
 WindowsPath('C:/Users/GLORIA MARENA/projects/context-engine-crewai/src'))

In [8]:
import pandas as pd
import numpy as np
import json
import ast

In [9]:
csv_path = ROOT / "data" / "new_items_dataset.csv"
items_raw = pd.read_csv(csv_path, low_memory=False)

print("Loaded:", items_raw.shape)
items_raw.head(3)


Loaded: (100000, 26)


Unnamed: 0,id,title,date_created,base_price,price,category_id,tags,attributes,variations,pictures,...,shipping_mode,shipping_admits_pickup,shipping_is_free,status,sub_status,warranty,is_new,initial_quantity,sold_quantity,available_quantity
0,MLA578569012,Escritorio Secretter Espectacular,2015-09-08T21:17:57.000Z,6700.0,6700.0,MLA1902,['dragged_bids_and_visits'],[],[],"[{'size': '500x375', 'secure_url': 'https://a2...",...,custom,True,False,active,,,0,1.0,0.0,1.0
1,MLA576883746,Stassen Espatulas Milenio Numero 3 Codigo 590-3,2015-08-29T12:55:03.000Z,119.0,119.0,MLA3530,['dragged_bids_and_visits'],[],[],"[{'size': '500x375', 'secure_url': 'https://a2...",...,not_specified,False,False,active,,Sí,1,19.0,0.0,19.0
2,MLA581002506,Charm Britania Plata 925 Ovalado!!! Armá La Tu...,2015-09-23T12:41:07.000Z,450.0,450.0,MLA6440,['dragged_bids_and_visits'],[],[],"[{'size': '312x308', 'secure_url': 'https://a2...",...,me2,False,False,active,,Sí,1,2.0,0.0,2.0


In [10]:
items = items_raw.rename(columns={"id": "item_id"}).copy()

def parse_maybe_json(x):
    """Decode a JSON-like or Python-literal string into a Python object.

    Args:
        x: Raw cell value (string, dict, list, None, NaN, or anything else).

    Returns:
        - None for None, float NaN, blank strings and the string "nan"
          (case-insensitive).
        - The decoded object when a string parses as JSON (only attempted for
          text wrapped in {} or []) or as a Python literal.
        - The stripped string when nothing parses.
        - Any non-string value unchanged.
    """
    if not isinstance(x, str):
        # Missing markers collapse to None; every other non-string passes through.
        is_missing = x is None or (isinstance(x, float) and np.isnan(x))
        return None if is_missing else x

    text = x.strip()
    if text.lower() in ("", "nan"):
        return None

    # JSON is only tried on text that looks like an object or an array.
    looks_like_object = text[:1] == "{" and text[-1:] == "}"
    looks_like_array = text[:1] == "[" and text[-1:] == "]"
    if looks_like_object or looks_like_array:
        try:
            return json.loads(text)
        except Exception:
            pass

    # Python literal syntax covers single-quoted payloads that JSON rejects.
    try:
        return ast.literal_eval(text)
    except Exception:
        return text

# Robust tags/attributes parsing.
# When a column is missing, build one fresh container per row: `[[]] * n` or
# `[{}] * n` would place the SAME list/dict object in every row, so a later
# in-place edit of one row would silently show up in all the others.
if "tags" in items.columns:
    items["tags"] = items["tags"].apply(parse_maybe_json)
else:
    items["tags"] = [[] for _ in range(len(items))]

if "attributes" in items.columns:
    items["attributes"] = items["attributes"].apply(parse_maybe_json)
else:
    items["attributes"] = [{} for _ in range(len(items))]

# Base dtypes expected by the pipeline: ids as strings, quantities/prices numeric
# (unparseable numbers become NaN through errors="coerce").
items["item_id"] = items["item_id"].astype(str)
items["seller_id"] = items["seller_id"].astype(str)
items["price"] = pd.to_numeric(items["price"], errors="coerce")
items["available_quantity"] = pd.to_numeric(items["available_quantity"], errors="coerce")
items["sold_quantity"] = pd.to_numeric(items["sold_quantity"], errors="coerce")

items[["item_id","title","seller_id","price","available_quantity","sold_quantity"]].head(3)


Unnamed: 0,item_id,title,seller_id,price,available_quantity,sold_quantity
0,MLA578569012,Escritorio Secretter Espectacular,99151748,6700.0,1.0,0.0
1,MLA576883746,Stassen Espatulas Milenio Numero 3 Codigo 590-3,65545512,119.0,19.0,0.0
2,MLA581002506,Charm Britania Plata 925 Ovalado!!! Armá La Tu...,101563090,450.0,2.0,0.0


In [11]:
expected = {"item_id","title","seller_id","price","available_quantity","sold_quantity","tags","attributes"}
print("Missing:", expected - set(items.columns))
print("OK columns:", list(expected & set(items.columns)))


Missing: set()
OK columns: ['sold_quantity', 'available_quantity', 'price', 'title', 'attributes', 'tags', 'item_id', 'seller_id']


In [12]:
import random
from datetime import datetime, timedelta

random.seed(42)
np.random.seed(42)

queries = [
    "Busco una laptop para edición de video que sea económica",
    "audifonos bluetooth con cancelación de ruido",
    "silla ergonomica para oficina",
    "celular con buena camara barato",
    "tv 4k smart economico",
    "zapatos deportivos running",
    "cafetera espresso",
    "monitor 27 pulgadas 144hz",
]

# Build a synthetic click-event log: each event pairs a random query with a
# random clicked item. Results depend on the `random` seed set earlier in this
# cell and on the order of the list comprehensions below.
n_events = 10_000
# Candidate items for clicks: at most 50,000 item ids sampled reproducibly.
item_pool = items["item_id"].sample(min(len(items), 50_000), random_state=42).tolist()

# Timestamps are `base` plus a random whole number of minutes.
base = datetime(2026, 1, 1)
events = pd.DataFrame({
    "event_id": [f"E{i:06d}" for i in range(n_events)],
    "query": [random.choice(queries) for _ in range(n_events)],
    "event_ts": [base + timedelta(minutes=random.randint(0, 60*24*14)) for _ in range(n_events)],  # 2 weeks
    "clicked_item_id": [random.choice(item_pool) for _ in range(n_events)],
})

print("Events:", events.shape)
events.head(3)


Events: (10000, 4)


Unnamed: 0,event_id,query,event_ts,clicked_item_id
0,E000000,audifonos bluetooth con cancelación de ruido,2026-01-05 13:30:00,MLA580332585
1,E000001,Busco una laptop para edición de video que sea...,2026-01-08 14:42:00,MLA581808423
2,E000002,tv 4k smart economico,2026-01-02 08:26:00,MLA582568132


In [14]:
ART = ROOT / "artifacts"
ART.mkdir(exist_ok=True)

def _to_list(x):
    # Normaliza tags para que SIEMPRE sea list[str]
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, list):
        return [str(t).strip().lower() for t in x if str(t).strip()]
    if isinstance(x, (set, tuple, np.ndarray)):
        return [str(t).strip().lower() for t in list(x) if str(t).strip()]
    if isinstance(x, str):
        s = x.strip()
        if s == "" or s.lower() == "nan":
            return []
        # si viene como "['a','b']" o '["a","b"]'
        if s.startswith("[") and s.endswith("]"):
            try:
                v = ast.literal_eval(s)
                if isinstance(v, list):
                    return [str(t).strip().lower() for t in v if str(t).strip()]
            except Exception:
                try:
                    v = json.loads(s)
                    if isinstance(v, list):
                        return [str(t).strip().lower() for t in v if str(t).strip()]
                except Exception:
                    pass
        # fallback: separados por coma
        return [t.strip().lower() for t in s.split(",") if t.strip()]
    # cualquier otro tipo raro
    return [str(x).strip().lower()] if str(x).strip() else []

def _to_dict(x):
    # Normaliza attributes para que SIEMPRE sea dict
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return {}
    if isinstance(x, dict):
        return x
    if isinstance(x, str):
        s = x.strip()
        if s == "" or s.lower() == "nan":
            return {}
        if s.startswith("{") and s.endswith("}"):
            try:
                v = json.loads(s)
                return v if isinstance(v, dict) else {}
            except Exception:
                try:
                    v = ast.literal_eval(s)
                    return v if isinstance(v, dict) else {}
                except Exception:
                    return {}
        return {}
    return {}

# --- Asegura consistencia para Parquet ---
items_fixed = items.copy()
items_fixed["tags"] = items_fixed["tags"].apply(_to_list)
items_fixed["attributes"] = items_fixed["attributes"].apply(_to_dict)

# Para evitar líos con estructuras en Parquet, guardamos también una versión "json string"
# (útil si luego quieres leerlo en otras herramientas)
items_fixed["tags_json"] = items_fixed["tags"].apply(json.dumps)
items_fixed["attributes_json"] = items_fixed["attributes"].apply(json.dumps)

# Guardar (Parquet): usando tags_json/attributes_json para máxima compatibilidad
cols_to_save = [c for c in items_fixed.columns if c not in ["tags", "attributes"]]
items_fixed[cols_to_save].to_parquet(ART / "items.parquet", index=False)

# Events (sin problemas)
events.to_parquet(ART / "events.parquet", index=False)

print("Saved:", ART / "items.parquet")
print("Saved:", ART / "events.parquet")
print("items_fixed tags example:", items_fixed["tags"].head(3).tolist())


Saved: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts\items.parquet
Saved: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts\events.parquet
items_fixed tags example: [['dragged_bids_and_visits'], ['dragged_bids_and_visits'], ['dragged_bids_and_visits']]


In [15]:
def ensure_attributes_min(row):
    """Return the row's attributes with a guaranteed, non-empty "category" key.

    Args:
        row: Mapping-like row (e.g. a pandas Series from `DataFrame.apply(axis=1)`)
             exposing "attributes" and, optionally, "category_id".

    Returns:
        dict: a NEW dict holding the row's attributes (empty when the stored
        value is not a dict). When "category" is missing or falsy it is filled
        from `row.get("category_id")`, which is None if that field is absent.
    """
    raw_attrs = row["attributes"]
    # Copy before editing: the stored dict may be shared with other frames
    # (DataFrame.copy() does not deep-copy Python objects held in cells), so an
    # in-place write here would leak into them.
    attrs = dict(raw_attrs) if isinstance(raw_attrs, dict) else {}
    if not attrs.get("category"):
        attrs["category"] = row.get("category_id", None)
    # brand/model may be absent; that is fine.
    return attrs

items = items.copy()
items["attributes"] = items["attributes"].apply(lambda x: x if isinstance(x, dict) else {})
items["attributes"] = items.apply(ensure_attributes_min, axis=1)

items[["item_id", "title", "category_id", "attributes"]].head(3)


Unnamed: 0,item_id,title,category_id,attributes
0,MLA578569012,Escritorio Secretter Espectacular,MLA1902,{'category': 'MLA1902'}
1,MLA576883746,Stassen Espatulas Milenio Numero 3 Codigo 590-3,MLA3530,{'category': 'MLA3530'}
2,MLA581002506,Charm Britania Plata 925 Ovalado!!! Armá La Tu...,MLA6440,{'category': 'MLA6440'}


# Bloque 1: construir item_360 + health report

In [16]:
from features import build_item_360

item_360, health = build_item_360(items)

print("item_360:", item_360.shape)
item_360.head(3)


item_360: (99997, 15)


Unnamed: 0,item_id,title,seller_id,price,price_bucket,available_quantity,sold_quantity,stock_ratio,sell_through,category,brand,model,n_tags,tags_norm,title_len
0,MLA461611298,Renault Clio Mio 0km Confort Plus 5p Aa Da Gus...,57789250,151400.0,5000+,1.0,0.0,2.0,0.0,MLA6606,,,0,[],57
1,MLA461656958,"Piaggio Vespa Lx 150, Www.agrobikes.com.ar",51859727,83000.0,5000+,1.0,0.0,2.0,0.0,MLA32873,,,0,[],42
2,MLA468074443,Rodante De 4 M Full Full En Bahia!! Entrega Ya!!,64979139,87000.0,5000+,1.0,0.0,2.0,0.0,MLA1781,,,0,[],48


In [17]:
item_360.to_parquet(ART / "item_360.parquet", index=False)
print("Saved:", ART / "item_360.parquet")

Saved: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts\item_360.parquet


# BLOQUE 2 : Construir artifacts de retrieval (embeddings + índice)

In [None]:
import sys
from pathlib import Path
import pandas as pd

ROOT = Path("..").resolve()
SRC = ROOT / "src"
ART = ROOT / "artifacts"

# Guard the insert so re-running this cell does not stack duplicate entries
# on sys.path.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Reload the persisted item_360 table so this section can run without
# recomputing the previous one.
item_360 = pd.read_parquet(ART / "item_360.parquet")
print("item_360:", item_360.shape)
item_360.head(2)


In [42]:
from retrieval import build_retrieval_artifacts

art = build_retrieval_artifacts(item_360)
type(art), getattr(art, "embeddings", None).shape

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

(retrieval.RetrievalArtifacts, (99997, 384))

## Probar búsqueda (precio = filtro duro + semántica)

In [43]:
from retrieval import search

query = "Laptop para edición de video económica"
hits = search(query, art, top_k=5, max_price=12000)

hits[["item_id","title","price","score"]]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,price,score
0,MLA583894352,Video Camara Deportes Extremos 1080 Hd Wifi Su...,2399.0,0.616373
1,MLA583214129,Placa De Video Ati Radeon Hd 6570 En Excelente...,900.0,0.614378
2,MLA578072379,Adaptador De Computadora A Tv - Ideal Para Lap...,245.0,0.607902
3,MLA577236427,Camara Inalambrica De Bebe Con Video Lcd 3 Pul...,4999.99,0.582607
4,MLA584619154,Placa De Video Gt 240 Msi 1gb - Memoria Ram Oc...,1400.0,0.578281


In [44]:
import re

# Non-capturing group (?:...): with a capturing group pandas emits
# "UserWarning: This pattern is interpreted as a regular expression, and has
# match groups" on every call; the rows matched are the same either way.
pattern = r"\b(?:laptop|notebook|port[aá]til)\b"
hits_laptop = hits[hits["title"].astype(str).str.lower().str.contains(pattern, regex=True, na=False)]

hits_laptop[["item_id","title","price","score"]]



  hits_laptop = hits[hits["title"].astype(str).str.lower().str.contains(pattern, regex=True, na=False)]


Unnamed: 0,item_id,title,price,score


In [45]:
# Keep rows whose lowercase title matches `kw_laptop` and does NOT match
# `kw_not_laptop`.
# NOTE(review): `laptops_by_cat`, `kw_laptop` and `kw_not_laptop` are not
# defined in any cell visible in this notebook; they presumably come from a
# cell that was deleted or not saved. Confirm and restore their definitions,
# otherwise this cell raises NameError on a fresh kernel.
t = laptops_by_cat["title"].astype(str).str.lower()

laptops_clean = laptops_by_cat[
    t.str.contains(kw_laptop, regex=True, na=False) &
    ~t.str.contains(kw_not_laptop, regex=True, na=False)
].copy()

print("laptops_clean:", laptops_clean.shape)
laptops_clean[["item_id","title","price","category"]].head(20)


laptops_clean: (3, 15)


Unnamed: 0,item_id,title,price,category
73996,MLA582475432,Notebook Dell Alienware 18 I7-4710 1tb 8gb 4gb...,39000.0,MLA82598
74506,MLA582527285,Notebook Lenovo 15.6 B5070 I5 4210u 4g 500 Dv...,16300.0,MLA81387
90085,MLA583935002,Notebook Lenovo G50 - Core I5 Windows 10 - Com...,11000.0,MLA81387


In [46]:
from retrieval import build_retrieval_artifacts, search

art_laptops = build_retrieval_artifacts(laptops_clean)

query = "Laptop para edición de video económica"
hits = search(query, art_laptops, top_k=5, max_price=12000)

hits[["item_id","title","price","score"]]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,price,score
0,MLA583935002,Notebook Lenovo G50 - Core I5 Windows 10 - Com...,11000.0,0.354755


In [47]:
bad_rate = hits["title"].astype(str).str.lower().str.contains(kw_not_laptop, regex=True, na=False).mean()
bad_rate


np.float64(0.0)

In [48]:
print("laptops_clean:", laptops_clean.shape)

n_under = (laptops_clean["price"] <= 12000).sum()
print("<=12000:", int(n_under))

laptops_clean["price"].describe()


laptops_clean: (3, 15)
<=12000: 1


count        3.000000
mean     22100.000000
std      14873.802473
min      11000.000000
25%      13650.000000
50%      16300.000000
75%      27650.000000
max      39000.000000
Name: price, dtype: float64

In [20]:
# Guardar el índice

In [49]:
import pickle

out_pkl = ART / "retrieval_artifacts.pkl"
with open(out_pkl, "wb") as f:
    pickle.dump(art, f)

print("Saved:", out_pkl, "size:", out_pkl.stat().st_size)


Saved: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts\retrieval_artifacts.pkl size: 334960891


In [22]:
## Cargar el pickle

In [50]:
with open(ART / "retrieval_artifacts.pkl", "rb") as f:
    art2 = pickle.load(f)

hits2 = search("Laptop para edición de video económica", art2, top_k=5, max_price=12000)
hits2[["item_id","title","price","score"]]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,price,score
0,MLA583894352,Video Camara Deportes Extremos 1080 Hd Wifi Su...,2399.0,0.616373
1,MLA583214129,Placa De Video Ati Radeon Hd 6570 En Excelente...,900.0,0.614378
2,MLA578072379,Adaptador De Computadora A Tv - Ideal Para Lap...,245.0,0.607902
3,MLA577236427,Camara Inalambrica De Bebe Con Video Lcd 3 Pul...,4999.99,0.582607
4,MLA584619154,Placa De Video Gt 240 Msi 1gb - Memoria Ram Oc...,1400.0,0.578281


## afinar 

In [68]:
import pandas as pd

t = laptops_real["title"].astype(str).str.lower()

# "Seed" rows: titles that are almost certainly laptops (a laptop word plus a
# known brand). Both patterns use non-capturing groups (?:...) so that
# str.contains does not emit the pandas "has match groups" UserWarning.
seed_kw = r"\b(?:notebook|laptop|port[aá]til)\b"
brand_kw = r"\b(?:acer|lenovo|dell|hp|toshiba|samsung|asus|msi|sony|vaio|bangho|packard)\b"

seed = laptops_real[t.str.contains(seed_kw, regex=True, na=False) & t.str.contains(brand_kw, regex=True, na=False)].copy()
print("seed shape:", seed.shape)
seed[["item_id","title","price","category"]].head(15)


seed shape: (36, 15)


  seed = laptops_real[t.str.contains(seed_kw, regex=True, na=False) & t.str.contains(brand_kw, regex=True, na=False)].copy()


Unnamed: 0,item_id,title,price,category
4014,MLA575040089,Notebook Acer Aspire 5742 Dual Core 3gb 15.6 3...,3099.99,MLA53841
13268,MLA576106379,Notebook Toshiba Satellite L20 1gb Ram Cpu:cel...,1800.0,MLA54687
18783,MLA576741626,Notebook Toshiba Satellite I7 8gb 1 Tb 15.6'',23900.0,MLA82708
19896,MLA576869283,Notebook Samsung Led Hd I3 8gb Ram Windows 7 P...,6500.0,MLA83739
32831,MLA578287538,Notebook Acer 4740 I5 Sonido Dolby Surround,4099.0,MLA81324
36357,MLA578659610,"Notebook Dell 17 Touch Full Hd, I7 5ta , 16gb,...",32890.0,MLA82598
36649,MLA578688836,"Notebook Packard Bell 15.6 , Intel T6500 2.1 G...",2900.0,MLA54477
37890,MLA578830551,Notebook Hp Core2duo 1gb Ram Hd 160gb Wifi Dvd...,2999.99,MLA54379
38402,MLA578897712,Notebook Samsung 14 Core I5 4g Ram 1 Tera Disco,5300.0,MLA83732
40302,MLA579095946,Notebook Lenovo Intel I3 4gb Ram 500gb Hdd Cam...,13469.07,MLA81492


In [69]:
cat_seed = seed["category"].value_counts()
cat_seed.head(20)


category
MLA81387    3
MLA82598    2
MLA83739    2
MLA82542    2
MLA54687    1
MLA82708    1
MLA53841    1
MLA54379    1
MLA83732    1
MLA81492    1
MLA50938    1
MLA37165    1
MLA81526    1
MLA81324    1
MLA54477    1
MLA54097    1
MLA81462    1
MLA54310    1
MLA81428    1
MLA54418    1
Name: count, dtype: int64

In [123]:
import re

df = candidates.copy()
t = df["title"].astype(str).str.lower()

# 1) Señales fuertes de que es computador
is_pc_word = t.str.contains(r"\b(?:laptop|notebook|netbook|ultrabook)\b", regex=True, na=False)

# Extra: "computadora portátil" como computador (no cualquier "portátil")
is_computadora_portatil = t.str.contains(r"\bcomputador(?:a)?\s+port[áa]til\b", regex=True, na=False)

# 2) "portátil" (ambigua)
is_portatil = t.str.contains(r"\bport[áa]til\b", regex=True, na=False)

# Contexto PC para permitir "portátil"
pc_context = t.str.contains(r"\b(?:pc|computador(?:a)?|laptop|notebook|netbook|ultrabook)\b", regex=True, na=False)

# 3) Exclusión fuerte (accesorios/repuestos/muebles/juguetes/NO laptops)
pat_not_a_laptop = re.compile(
    r"\b(?:"
    # repuestos / partes
    r"repuesto(?:s)?|pieza(?:s)?|refacci[oó]n(?:es)?|"
    r"carcas(?:a|as)|carcaz(?:a|as)|tapa|cover|bisagra|"
    r"tecla(?:s)?|teclado|touchpad|trackpad|pad|"
    r"mother|motherboard|mainboard|placa(?:s)?|board|"
    r"cargador|fuente|transformador|bater(?:i|í)a|"
    r"pantalla|display|lcd|touch|flex|jack|inverter|"
    r"cooler|disipador|ventilador|"

    # componentes / electrónicos
    r"microprocesador|procesador|cpu|"
    r"memoria|ram|ssd|hdd|disco\s+duro|disco|caddy|"
    r"fusible|smd|"

    # periféricos / accesorios
    r"mouse|rat[oó]n|parlante|bocina|webcam|camara|"
    r"dock|hub|adaptador|cable|"
    r"grabador(?:a|as)?|regrabador(?:a|as)?|lectograbador(?:a|as)?|dvd|cd|"
    r"funda(?:s)?|estuche|malet(?:i|í)n|mochila|bolso|"
    r"porta(?:\s|-)?notebook|portafolios|"

    # iluminación/otros “para notebook”
    r"l[áa]mpara|velador|leds?|"

    # muebles
    r"mesa|escritorio|bandeja|soporte|stand|cajonera|"

    # juguetes
    r"barbie|juguete|ni[nñ]os|kids?|did[aá]ctic[ao]|actividades"
    r")\b",
    re.IGNORECASE
)
is_not_a_laptop = t.str.contains(pat_not_a_laptop, regex=True, na=False)

# 4) “Portátil” que típicamente NO es laptop (discos externos, etc.)
pat_portatil_non_pc = re.compile(
    r"\b(?:"
    r"disco\s+r[ií]gido|disco\s+duro|externo|usb|tb|"
    r"power\s*bank|bater[ií]a\s+externa|parlante|bocina"
    r")\b",
    re.IGNORECASE
)
is_portatil_non_pc = t.str.contains(pat_portatil_non_pc, regex=True, na=False)

pat_exclude_even_if_anchor = re.compile(
    r"\b(?:"
    r"bisagra(?:s)?|tecla(?:s)?|teclado|touchpad|trackpad|carcas(?:a|as)|carcaz(?:a|as)|"
    r"vinilo|skin|sticker|calcoman[ií]a|adhesivo|"
    r"sleeve|bag|mochila|malet(?:i|í)n|bolso|porta(?:\s|-)?notebook|portafolios|"
    r"manny|many\s+manos|manos\s+a\s+la\s+obra|"
    r"recuperaci[oó]n|datos|formateo|reparaci[oó]n|mantenimiento|servicio|t[eé]cnico"
    r")\b",
    re.IGNORECASE
)
is_exclude_even_if_anchor = t.str.contains(pat_exclude_even_if_anchor, regex=True, na=False)


# Final row mask. A title is kept when EITHER:
#   (a) it has a strong computer word (laptop/notebook/netbook/ultrabook) or
#       "computador(a) portátil", and matches neither exclusion pattern; or
#   (b) it has the ambiguous word "portátil" together with a PC-context word,
#       and matches none of the three exclusion patterns.
keep = (
    ((is_pc_word | is_computadora_portatil) & ~is_not_a_laptop & ~is_exclude_even_if_anchor)
    | (is_portatil & pc_context & ~is_not_a_laptop & ~is_portatil_non_pc & ~is_exclude_even_if_anchor)
)

# Split the candidates into kept and rejected rows; .copy() gives independent frames.
laptops_ok = df[keep].copy()
dropped = df[~keep].copy()

print("candidates:", df.shape)
print("kept laptops_ok:", laptops_ok.shape)
print("dropped:", dropped.shape)

print("\n--- examples kept ---")
display(laptops_ok[["title", "price"]].head(15))

print("\n--- examples dropped ---")
display(dropped[["title", "price"]].head(15))

bad_words = ["bisagra", "vinilo", "skin", "sleeve", "bag", "manny", "recuperacion", "datos", "formateo"]
print("\n--- quick bad-word check in kept ---")
for w in bad_words:
    print(w, laptops_ok["title"].astype(str).str.lower().str.contains(w, na=False).sum())


candidates: (433, 15)
kept laptops_ok: (82, 15)
dropped: (351, 15)

--- examples kept ---


Unnamed: 0,title,price
666,Laptop Sony Vaio Modelo Pcg-3e2l Sin Funcionar,4800.0
677,Notebook Bgh Ql-310 Serie 300 11.6 Windows 8 ...,6500.0
3148,Notebook Toshiba C50 Intel 4gb 500gb Hdmi W8 C...,11326.99
4014,Notebook Acer Aspire 5742 Dual Core 3gb 15.6 3...,3099.99
4632,Mini Notebook Packard Bell Modelo Dot M/a 100 ...,2200.0
16839,Notebook Ken Brown Mb40ii1 B800 Perfecta,4900.0
17552,Laptop Acer Aspire One V5,3200.0
18665,Notebook Hp Pavillon Dv5,5800.0
18783,Notebook Toshiba Satellite I7 8gb 1 Tb 15.6'',23900.0
22261,Asus Notebook R512m,6999.0



--- examples dropped ---


Unnamed: 0,title,price
1476,Carcasa Base Inferior Para Notebook Lg E300 Lge23,250.0
1524,Notebook Commodore Ke-8327-mb Todos Los Repues...,200.0
1666,Cable De Seguridad Para Notebook Con Llave - C...,120.0
2522,Repues P/notebook Hp 420 425 Bisagra Carcasa C...,238.0
2762,Tapa Plastica Wifi Para Notebook Commodore Ke8...,90.0
2976,Cargador Notebook Hp Hp24 Mini 110-1035tu 110-...,344.89
2977,Cargador Notebook Hp Hp23 Pa-1900-08h2 Pavilio...,474.89
2978,Cargador Notebook Hp Hp23 Pavilion Dv7-2000 43...,474.89
3038,Power Bank Samsung 9000mah Cargador Portatil C...,219.9
3575,Mouse Microsoft Para Notebook Edicion Especia...,265.0



--- quick bad-word check in kept ---
bisagra 0
vinilo 0
skin 0
sleeve 0
bag 0
manny 0
recuperacion 0
datos 0
formateo 0


In [124]:
bad_probe = t[keep].str.contains(r"\b(?:manny|recuperaci[oó]n|datos|formateo)\b", regex=True, na=False).mean()
print("bad_probe_rate:", bad_probe)


bad_probe_rate: 0.0


In [125]:
from retrieval import build_retrieval_artifacts
import pickle

art_laptops = build_retrieval_artifacts(laptops_ok)

out_pkl = ART / "retrieval_artifacts_laptops.pkl"
with open(out_pkl, "wb") as f:
    pickle.dump(art_laptops, f)

print("Saved:", out_pkl, "size:", out_pkl.stat().st_size)


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Saved: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts\retrieval_artifacts_laptops.pkl size: 275148


In [126]:
out_pkl = ART / "retrieval_artifacts_laptops.pkl"
print("PKL:", out_pkl)
print("exists:", out_pkl.exists())
print("size:", out_pkl.stat().st_size)
print("modified:", out_pkl.stat().st_mtime)


PKL: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts\retrieval_artifacts_laptops.pkl
exists: True
size: 275148
modified: 1769376342.4857714


In [127]:
import pickle
from retrieval import search

with open(ART / "retrieval_artifacts_laptops.pkl", "rb") as f:
    art2 = pickle.load(f)

hits = search("Laptop para edición de video económica", art2, top_k=15, max_price=12000)
hits[["item_id","title","price","score"]]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,price,score
0,MLA581846989,"Notebook Lenovo G580 Dual Core 2,3 Ghz 8gb 500...",10199.0,0.452283
1,MLA579554484,Notebook Imb A20m No Da Video,400.0,0.449793
2,MLA582347833,"Notebook Compaq Presario V2000 ,en Muy Buen Es...",2000.0,0.448551
3,MLA574956381,Notebook Toshiba C50 Intel 4gb 500gb Hdmi W8 C...,11326.99,0.437857
4,MLA572649647,Laptop Sony Vaio Modelo Pcg-3e2l Sin Funcionar,4800.0,0.435237
5,MLA583758439,Notebook Compaq Presario F500 Como Nueva !!,3500.0,0.412639
6,MLA582455184,Notebook Toshiba Satellite L515-sp4012 Usada B...,4000.0,0.410945
7,MLA578805993,Notebook Nueva En Caja Hp 15-f004,6999.0,0.399298
8,MLA579214575,Escaner Portatil Obdii No Requiere Pc,600.0,0.391131
9,MLA580391106,Vendo Notebook Lenovo,5500.0,0.39041


In [128]:
bad_probe = r"\b(?:bisagra|vinilo|skin|sleeve|bag|mochila|malet|portafolios|manny|recuperaci[oó]n|formateo|servicio|t[eé]cnico)\b"
rate = hits["title"].astype(str).str.lower().str.contains(bad_probe, regex=True, na=False).mean()
print("bad_probe_rate:", rate)


bad_probe_rate: 0.0


In [129]:
import pickle
from retrieval import search

pkl_path = ART / "retrieval_artifacts_laptops.pkl"
print("USING:", pkl_path.resolve())
print("SIZE:", pkl_path.stat().st_size)

with open(pkl_path, "rb") as f:
    art_laptops_loaded = pickle.load(f)

hits = search("Laptop para edición de video económica", art_laptops_loaded, top_k=15, max_price=12000)
display(hits[["item_id","title","price","score"]])


USING: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts\retrieval_artifacts_laptops.pkl
SIZE: 275148


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,price,score
0,MLA581846989,"Notebook Lenovo G580 Dual Core 2,3 Ghz 8gb 500...",10199.0,0.452283
1,MLA579554484,Notebook Imb A20m No Da Video,400.0,0.449793
2,MLA582347833,"Notebook Compaq Presario V2000 ,en Muy Buen Es...",2000.0,0.448551
3,MLA574956381,Notebook Toshiba C50 Intel 4gb 500gb Hdmi W8 C...,11326.99,0.437857
4,MLA572649647,Laptop Sony Vaio Modelo Pcg-3e2l Sin Funcionar,4800.0,0.435237
5,MLA583758439,Notebook Compaq Presario F500 Como Nueva !!,3500.0,0.412639
6,MLA582455184,Notebook Toshiba Satellite L515-sp4012 Usada B...,4000.0,0.410945
7,MLA578805993,Notebook Nueva En Caja Hp 15-f004,6999.0,0.399298
8,MLA579214575,Escaner Portatil Obdii No Requiere Pc,600.0,0.391131
9,MLA580391106,Vendo Notebook Lenovo,5500.0,0.39041


# BLOQUE 3 — Search Engine (load → search → filters → eval → opcional API)

In [130]:
from pathlib import Path
import pickle
import pandas as pd

from retrieval import search 
ART = Path("artifacts") 


In [138]:
from pathlib import Path

HERE = Path.cwd().resolve()

# Walk upwards from the working directory looking for an artifacts/ folder
# that contains every expected pickle.
need_files = {"retrieval_artifacts.pkl", "retrieval_artifacts_laptops.pkl"}

ART = None
for parent in [HERE, *HERE.parents]:
    cand = parent / "artifacts"
    if cand.is_dir():
        files = {p.name for p in cand.glob("*.pkl")}
        if need_files.issubset(files):
            ART = cand
            break

print("CWD:", HERE)
# Fail loudly when nothing qualifies; otherwise the cell would report
# "ART found: None" and then crash with AttributeError on ART.glob.
if ART is None:
    raise FileNotFoundError(
        f"No 'artifacts' directory containing {sorted(need_files)} found in {HERE} or its parents"
    )
print("✅ ART found:", ART)

print("Files in ART:")
for p in sorted(ART.glob("*")):
    print(" -", p.name)


CWD: C:\Users\GLORIA MARENA\projects\context-engine-crewai\notebooks
✅ ART found: C:\Users\GLORIA MARENA\projects\context-engine-crewai\artifacts
Files in ART:
 - .gitkeep
 - events.parquet
 - item_360.parquet
 - items.parquet
 - raw
 - retrieval_artifacts.pkl
 - retrieval_artifacts_laptops.pkl


In [139]:
import pickle

pkl_general = ART / "retrieval_artifacts.pkl"
pkl_laptops = ART / "retrieval_artifacts_laptops.pkl"

with open(pkl_general, "rb") as f:
    art_general = pickle.load(f)

with open(pkl_laptops, "rb") as f:
    art_laptops = pickle.load(f)

type(art_general), type(art_laptops)


(retrieval.RetrievalArtifacts, retrieval.RetrievalArtifacts)

In [140]:
def run_search(query: str, *, top_k=10, max_price=None, domain="general"):
    """
    Query one of the two loaded retrieval indexes and return a compact view.

    domain:
      - "general": uses the general index (`art_general`)
      - any other value (e.g. "laptops"): uses the laptops index (`art_laptops`)

    Returns the hits restricted to whichever of item_id, title, price,
    category, brand, model and score are present, in that order.
    """
    if domain == "general":
        index = art_general
    else:
        index = art_laptops

    results = search(query, index, top_k=top_k, max_price=max_price)

    wanted = ("item_id", "title", "price", "category", "brand", "model", "score")
    present = [name for name in wanted if name in results.columns]
    return results[present]


In [141]:
display(run_search("laptop para edición de video económica", top_k=10, max_price=12000, domain="laptops"))
display(run_search("heladera inverter usada", top_k=10, max_price=200000, domain="general"))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,price,category,brand,model,score
0,MLA581846989,"Notebook Lenovo G580 Dual Core 2,3 Ghz 8gb 500...",10199.0,MLA54873,,,0.452283
1,MLA579554484,Notebook Imb A20m No Da Video,400.0,MLA54860,,,0.449793
2,MLA582347833,"Notebook Compaq Presario V2000 ,en Muy Buen Es...",2000.0,MLA53765,,,0.448551
3,MLA574956381,Notebook Toshiba C50 Intel 4gb 500gb Hdmi W8 C...,11326.99,MLA54687,,,0.437857
4,MLA572649647,Laptop Sony Vaio Modelo Pcg-3e2l Sin Funcionar,4800.0,MLA54566,,,0.435237
5,MLA583758439,Notebook Compaq Presario F500 Como Nueva !!,3500.0,MLA53790,,,0.412639
6,MLA582455184,Notebook Toshiba Satellite L515-sp4012 Usada B...,4000.0,MLA81421,,,0.410945


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,item_id,title,price,category,brand,model,score
0,MLA577061327,Soldadora Inverter Tig Cortadora Plasma Omaha ...,6699.99,MLA5231,,,0.601086
1,MLA576711156,Inverter De 10 Hp Industria Argentina,13431.0,MLA30216,,,0.573414
2,MLA579931740,Heladera Trial Gas / 220 Ca /12v Rg410 Premium,16587.0,MLA35923,,,0.567515
3,MLA576304112,Heladera Mostrador Usada,3500.0,MLA30813,,,0.565282
4,MLA580244654,Soldadora Electrica Inverter Mma Modelo Arc 19...,4750.0,MLA5231,,,0.564193
5,MLA575532169,Soldadora Lincoln Elecetric Inverter 270sx,45500.0,MLA30778,,,0.558457
6,MLA576451909,Soldadora Inverter Esab Conarco Buddy Arc 145 ...,4769.99,MLA5231,,,0.548448
7,MLA583174316,"Heladera 12v Solar 237 Litros, Hace Hielo, Mo...",12737.0,MLA1070,,,0.544885
8,MLA579674406,Soldadora Inverter Industrial Rectificadora Lu...,10985.0,MLA5231,,,0.52418
9,MLA579587063,Mig Mag 315 Inverter Trifasica Completa Tubo1m...,23990.0,MLA5231,,,0.51893


In [142]:
import re

# Accessory / spare-part / service words that should not appear in laptop hits.
# Accent-tolerant alternatives (malet[ií]n, bater[ií]a, reparaci[oó]n) are
# required because listing titles are written both with and without accents.
BAD = re.compile(
    r"\b(?:bisagra|vinilo|skin|funda|malet[ií]n|mochila|teclado|touchpad|bater[ií]a|"
    r"cargador|carcasa|memoria|ram|ssd|disco|servicio|reparaci[oó]n|datos)\b",
    re.I,
)

def bad_probe_rate(hits_df: pd.DataFrame) -> float:
    """Share of hits whose title contains a word from the BAD pattern.

    Args:
        hits_df: Search results with a "title" column.

    Returns:
        float: fraction in [0, 1] of rows whose title matches BAD
        (case-insensitive); 0.0 for an empty frame.
    """
    if len(hits_df) == 0:
        return 0.0
    # float(...) turns the numpy scalar into the plain float the signature promises.
    return float(hits_df["title"].astype(str).str.contains(BAD, na=False).mean())

hits_test = run_search("laptop para edición de video económica", top_k=30, max_price=12000, domain="laptops")
rate = bad_probe_rate(hits_test)
print("bad_probe_rate:", rate)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

bad_probe_rate: 0.0


# Bloque 3 (Real-time Insights & Summarization)

In [143]:
hits.columns

Index(['item_id', 'title', 'seller_id', 'price', 'price_bucket',
       'available_quantity', 'sold_quantity', 'stock_ratio', 'sell_through',
       'category', 'brand', 'model', 'n_tags', 'tags_norm', 'title_len',
       'score'],
      dtype='object')

In [199]:
import os, json
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
print("OPENAI_API_KEY loaded?", bool(os.getenv("OPENAI_API_KEY")))

client = OpenAI()


OPENAI_API_KEY loaded? True


In [212]:
MODEL_INSIGHTS = "gpt-4o"

def extract_response_text(resp) -> str:
    """Return the message content of the first choice of a chat-completion response.

    Args:
        resp: Response object exposing `choices[0].message.content`.

    Returns:
        The content of the first choice; if that lookup raises for any reason,
        `str(resp)` instead.
    """
    try:
        first_choice = resp.choices[0]
        text = first_choice.message.content
    except Exception:
        text = str(resp)
    return text

In [213]:
#  Lógica del Context Engine
import json

def generate_insights_json(query: str, hits: pd.DataFrame, top_n: int = 5) -> dict:
    """Ask the chat model for a JSON insights card about the search hits.

    Args:
        query: User search query, interpolated verbatim into the prompt.
        hits: Search results passed to `build_llm_context_from_hits`.
        top_n: Forwarded to `build_llm_context_from_hits` as `top_n`.

    Returns:
        dict: the model reply decoded with `json.loads`. The prompt requests the
        keys comparative_summary, top_recommendation, risk_alerts and
        market_insight.

    Raises:
        json.JSONDecodeError: if the text returned by the model is not valid JSON.

    NOTE(review): `build_llm_context_from_hits` is not defined in any cell
    visible in this notebook; confirm where it comes from.
    """
    ctx = build_llm_context_from_hits(hits, top_n=top_n)

    system_prompt = (
        "Eres un analista experto de Mercado Libre. Tu objetivo es ayudar a un agente de IA "
        "a entender por qué ciertos productos son mejores que otros basándote en datos reales."
    )

    # The context is embedded as a JSON string; ensure_ascii=False keeps accented
    # characters readable inside the prompt.
    user_content = f"""
    QUERY DEL USUARIO: "{query}"
    
    CONTEXTO DE PRODUCTOS:
    {json.dumps(ctx, ensure_ascii=False)}

    INSTRUCCIONES:
    Genera un JSON con esta estructura exacta:
    {{
      "comparative_summary": "Breve análisis de los resultados",
      "top_recommendation": {{
        "item_id": "ID del mejor",
        "reason": "Por qué es el mejor para esta búsqueda"
      }},
      "risk_alerts": [
        {{ "item_id": "ID", "issue": "Descripción del riesgo de stock o precio" }}
      ],
      "market_insight": "Una frase sobre la tendencia de estos productos"
    }}
    Responde ÚNICAMENTE el JSON.
    """

    # Standard chat-completions call; response_format requests a JSON object reply.
    resp = client.chat.completions.create(
        model=MODEL_INSIGHTS,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content}
        ],
        response_format={ "type": "json_object" }
    )

    txt = extract_response_text(resp)
    return json.loads(txt)

In [214]:
query = "Busco una laptop para edición de video que sea económica"

try:
    insights = generate_insights_json(query, hits, top_n=5)
    
    print(json.dumps(insights, indent=2, ensure_ascii=False))
    
    OUT = Path("insights_block3.json")
    with open(OUT, "w", encoding="utf-8") as f:
        json.dump(insights, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Archivo guardado en: {OUT.resolve()}")

except Exception as e:
    print(f"❌ Error en Bloque 3: {e}")

{
  "comparative_summary": "Los productos listados son laptops de varias marcas y modelos con precios que van desde $400 hasta $11,326.99. Sin embargo, la mayoría presentan limitaciones significativas para la edición de video, ya sea por falta de potencia o funcionalidad.",
  "top_recommendation": {
    "item_id": "MLA581846989",
    "reason": "A pesar de no ser ideal para edición de video, la Notebook Lenovo G580 ofrece la mejor combinación de precio y especificaciones, con un Dual Core, 8GB de RAM y un disco duro de 500GB, a un precio económico."
  },
  "risk_alerts": [
    {
      "item_id": "MLA579554484",
      "issue": "Precio anormalmente bajo sugiere que podría no estar en condiciones de uso funcional."
    },
    {
      "item_id": "MLA582347833",
      "issue": "Solo hay una unidad disponible y sin ventas anteriores, lo que indica posible falta de interés o funcionalidad."
    },
    {
      "item_id": "MLA572649647",
      "issue": "La laptop no funciona, lo que la hace prác

In [160]:
import json
import pandas as pd

# Model identifier passed as `model=` to the API call made by generate_insights_json.
MODEL_INSIGHTS = "gpt-4.1-mini" 

def _extract_output_text(resp) -> str:
    """Collect the text produced by the model from a response object.

    Uses the ``output_text`` attribute when it is present and non-empty.
    Otherwise walks every item in ``resp.output`` and concatenates the
    ``text`` of each content part. Returns ``""`` when no text is found.
    """
    direct = getattr(resp, "output_text", None)
    if direct:
        return direct

    pieces = []
    # Scan every item: the first one is not guaranteed to carry text content.
    for item in getattr(resp, "output", None) or []:
        for part in getattr(item, "content", None) or []:
            text = getattr(part, "text", None)
            if isinstance(text, str):
                pieces.append(text)
    return "".join(pieces)


def _parse_json_payload(txt: str):
    """Parse ``txt`` as JSON, tolerating extra text around a single object.

    If the whole string is not valid JSON (e.g. it is wrapped in a markdown
    fence), the span from the first ``{`` to the last ``}`` is parsed instead.
    Raises ``json.JSONDecodeError`` when neither attempt succeeds.
    """
    txt = txt.strip()
    try:
        return json.loads(txt)
    except json.JSONDecodeError:
        start, end = txt.find("{"), txt.rfind("}")
        if start != -1 and end > start:
            return json.loads(txt[start:end + 1])
        raise


def generate_insights_json(
    query: str,
    hits: pd.DataFrame,
    top_n: int = 5,
    max_output_tokens: int = 700,
) -> dict:
    """Ask the LLM for a comparative insights sheet and return it parsed from JSON.

    Relies on names defined elsewhere in the notebook: ``client``,
    ``MODEL_INSIGHTS``, ``INSIGHTS_TEMPLATE`` and ``build_llm_context_from_hits``.

    NOTE(review): this notebook appears to contain another cell that also
    defines ``generate_insights_json`` with a different output schema; whichever
    cell ran last is the one in effect. Confirm which one is intended.

    Args:
        query: The user's search request, forwarded to the model verbatim.
        hits: Search results handed to ``build_llm_context_from_hits``.
        top_n: Number of hits put in the context and maximum number of
            ``top_items`` requested from the model.
        max_output_tokens: Output-token cap sent with the request. A reply cut
            off by this cap is likely to be invalid JSON, so raise it if
            parsing fails on long answers.

    Returns:
        The decoded JSON value produced by the model (expected to be a dict
        following ``INSIGHTS_TEMPLATE``; the structure is not validated here).

    Raises:
        json.JSONDecodeError: If the response carries no text, or the text
            cannot be parsed as JSON even after trimming to the outermost braces.
    """
    ctx = build_llm_context_from_hits(hits, top_n=top_n)

    system = (
        "Eres un analista de compras de Mercado Libre. "
        "Tu tarea: generar una ficha comparativa de insights para un agente. "
        "REGLAS: "
        "1) Responde SOLO con JSON válido (sin markdown, sin texto extra). "
        "2) No inventes specs (RAM, GPU, etc.). Solo usa lo que viene en context. "
        "3) Si falta info, dilo como 'unknown' o explica en bullets sin inventar."
    )

    user_payload = {
        "desired_schema_example": INSIGHTS_TEMPLATE,
        "query": query,
        "context": ctx,
        "instructions": (
            "Llena el JSON siguiendo exactamente las llaves del ejemplo. "
            f"top_items debe tener máximo {top_n} elementos, cada uno con: "
            "item_id, title, price, seller_id, key_reasons (lista), stock_risk (low/medium/high), caveats (lista). "
            "why_these_are_better: lista de bullets (strings). "
            "stock_risk.overall: low/medium/high con una frase corta."
        ),
    }

    resp = client.responses.create(
        model=MODEL_INSIGHTS,
        input=[
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user_payload, ensure_ascii=False)},
        ],
        max_output_tokens=max_output_tokens,
    )

    txt = _extract_output_text(resp)
    if not txt.strip():
        # Never fall back to str(resp): the object's repr can contain braces
        # (e.g. an empty dict field) that would be parsed and returned as if
        # they were the model's answer.
        raise json.JSONDecodeError("Model response contained no text output", "", 0)

    return _parse_json_payload(txt)


In [161]:
# Natural-language shopping request used for this insights run.
query = "Busco una laptop para edición de video que sea económica"

# `hits` is not created in this cell; it must already exist in the kernel,
# e.g. from the existing search cell:
# hits = search(query, art_laptops, top_k=10, max_price=12000)
insights = generate_insights_json(query, hits, top_n=5)

# Bare last expression so the notebook displays the returned object.
insights 


{'query': 'Busco una laptop para edición de video que sea económica',
 'top_items': [{'item_id': 'MLA581846989',
   'title': 'Notebook Lenovo G580 Dual Core 2,3 Ghz 8gb 500gb Usb3.0 Hdmi',
   'price': 10199.0,
   'seller_id': '92607234',
   'key_reasons': ['8GB RAM adecuada para edición básica de video',
    'Puerto USB 3.0 y HDMI para conexiones externas',
    'Precio económico comparado con otras laptops con más memoria'],
   'stock_risk': 'low',
   'caveats': ['Procesador Dual Core puede ser limitante para edición intensiva',
    'No se especifica GPU dedicada, importante para edición de video']},
  {'item_id': 'MLA574956381',
   'title': 'Notebook Toshiba C50 Intel 4gb 500gb Hdmi W8 Consultar Stock',
   'price': 11326.99,
   'seller_id': '126686216',
   'key_reasons': ['Incluye puerto HDMI para conexión a monitores externos',
    'Disco de 500GB suficiente para almacenamiento inicial'],
   'stock_risk': 'medium',
   'caveats': ['Solo 4GB de RAM, podría ser insuficiente para edición