In [30]:
import os, json, math, re, ast, asyncio, aiohttp, hashlib, io
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import pandas as pd
from tqdm.auto import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential
from dotenv import load_dotenv
from PIL import Image
from rapidfuzz import process, fuzz

# ---- env ----
load_dotenv()

# Required: Pinecone and (optionally) OpenAI if you want OpenAI embeddings instead of local ST
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV      = os.getenv("PINECONE_ENV", "us-east-1-aws")
PINECONE_TEXT_INDEX = os.getenv("PINECONE_TEXT_INDEX", "products-text")
EMBED_PROVIDER    = os.getenv("EMBED_PROVIDER", "local")

# Paths
CSV_PATH          = os.getenv("CSV_PATH", "intern_data_ikarus.csv")  # set this path
IMAGE_DIR         = os.getenv("IMAGE_DIR", "./data/images")          # will be created
os.makedirs(IMAGE_DIR, exist_ok=True)

assert PINECONE_API_KEY, "Set PINECONE_API_KEY in .env"


In [31]:
# Load
df_raw = pd.read_csv(CSV_PATH)

# Keep only expected columns (add if any missing)
expected_cols = ["uniq_id","title","brand","description","price","categories","images",
                 "manufacturer","package_dimensions","country_of_origin","material","color"]
for c in expected_cols:
    if c not in df_raw.columns:
        df_raw[c] = pd.NA

df = df_raw[expected_cols].copy()

# Normalize types
def to_list(x):
    if pd.isna(x): return []
    sx = str(x).strip()
    if sx.startswith('['):  # JSON-like
        try: return list(ast.literal_eval(sx))
        except: pass
    # fallback: comma-separated
    return [s.strip() for s in sx.split(",") if s.strip()]

def to_float(x):
    if pd.isna(x): return pd.NA
    try:
        s = str(x).replace(",","").replace("₹","").strip()
        return float(re.findall(r"[-+]?\d*\.?\d+", s)[0]) if re.findall(r"[-+]?\d*\.?\d+", s) else pd.NA
    except:
        return pd.NA

df["categories"] = df["categories"].apply(to_list)
df["images"]     = df["images"].apply(to_list)
df["price"]      = df["price"].apply(to_float)


In [32]:
missing_summary = df.isna().mean().sort_values(ascending=False).to_frame("missing_ratio")
missing_summary["count_missing"] = df.isna().sum()
missing_summary["n"] = len(df)
missing_summary


Unnamed: 0,missing_ratio,count_missing,n
country_of_origin,0.599359,187,312
description,0.490385,153,312
manufacturer,0.342949,107,312
price,0.310897,97,312
material,0.301282,94,312
color,0.150641,47,312
package_dimensions,0.019231,6,312
uniq_id,0.0,0,312
title,0.0,0,312
brand,0.0,0,312


In [33]:
# Categorical mode with fallback
def mode_or_na(series: pd.Series):
    vc = series.dropna().astype(str).value_counts()
    return vc.idxmax() if not vc.empty else pd.NA

# Price imputation: median by (brand, primary_category) -> then by primary_category -> global median
df["primary_category"] = df["categories"].apply(lambda lst: (lst[0] if lst else pd.NA))

price_global = df["price"].median(skipna=True)
brand_cat_med = df.groupby(["brand","primary_category"])["price"].median().dropna()
cat_med = df.groupby(["primary_category"])["price"].median().dropna()

def impute_price(row):
    if not pd.isna(row["price"]):
        return row["price"]
    key = (row["brand"], row["primary_category"])
    if key in brand_cat_med:
        return float(brand_cat_med[key])
    if row["primary_category"] in cat_med:
        return float(cat_med[row["primary_category"]])
    return float(price_global) if not math.isnan(price_global) else 0.0

df["price"] = df.apply(impute_price, axis=1)

# Brand impute (use manufacturer or title hint)
brand_mode = mode_or_na(df["brand"])
def impute_brand(row):
    b = row["brand"]
    if pd.isna(b) or str(b).strip()=="":
        cand = row.get("manufacturer")
        if pd.notna(cand) and str(cand).strip()!="":
            return str(cand).strip()
        # fuzzy extract from title using frequent brands (top 100)
        common = df["brand"].dropna().astype(str).str.strip().value_counts().head(100).index.tolist()
        title = str(row.get("title",""))
        match, score, _ = process.extractOne(title, common, scorer=fuzz.partial_ratio) if common else (None,0,None)
        if score>=85: return match
        return brand_mode
    return b

df["brand"] = df.apply(impute_brand, axis=1).astype(str).str.strip()

# Description impute
def impute_desc(row):
    d = row["description"]
    if pd.notna(d) and str(d).strip()!="":
        return str(d).strip()
    bits = [str(row.get("title","")).strip(), str(row.get("brand","")).strip()]
    cats = ", ".join(row.get("categories", []))
    if cats: bits.append(cats)
    return " - ".join([b for b in bits if b])
df["description"] = df.apply(impute_desc, axis=1)

# Material, Color impute (simple: mode per primary_category, else global mode)
mat_mode_global = mode_or_na(df["material"])
col_mode_global = mode_or_na(df["color"])
mat_mode_by_cat = df.groupby("primary_category")["material"].agg(mode_or_na)
col_mode_by_cat = df.groupby("primary_category")["color"].agg(mode_or_na)

def impute_by_cat(row, col, by_cat, global_mode):
    v = row[col]
    if pd.notna(v) and str(v).strip()!="":
        return str(v).strip()
    cat = row["primary_category"]
    if pd.notna(cat) and cat in by_cat and pd.notna(by_cat[cat]):
        return str(by_cat[cat]).strip()
    return str(global_mode) if pd.notna(global_mode) else ""

df["material"] = df.apply(lambda r: impute_by_cat(r,"material", mat_mode_by_cat, mat_mode_global), axis=1)
df["color"]    = df.apply(lambda r: impute_by_cat(r,"color",    col_mode_by_cat, col_mode_global), axis=1)

# Package dimensions: normalize to a canonical string "L x W x H (units)" if possible; else keep
def clean_dims(x):
    if pd.isna(x): return ""
    s = str(x).lower().strip()
    s = s.replace("×","x").replace("*","x")
    return re.sub(r"\s+", " ", s)
df["package_dimensions"] = df["package_dimensions"].apply(clean_dims)

# Country: fill with mode
df["country_of_origin"] = df["country_of_origin"].fillna(mode_or_na(df["country_of_origin"]))


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [35]:
# --- FIX CELL: ensure primary_image and has_image exist ---

import re, ast
from typing import List

def _to_list(x):
    if x is None or (isinstance(x, float) and pd.isna(x)): return []
    s = str(x).strip()
    if s.startswith('['):
        try: return list(ast.literal_eval(s))
        except: pass
    return [t.strip() for t in s.split(",") if t.strip()]

def _is_url(s: str) -> bool:
    return bool(re.match(r'^https?://', s or "", re.I))

def _pick_primary_image(img_list: List[str]) -> str:
    exts = (".jpg",".jpeg",".png",".webp")
    for u in img_list:
        if _is_url(u) and any(u.lower().split("?")[0].endswith(e) for e in exts):
            return u
    for u in img_list:
        if _is_url(u):
            return u
    return ""

# Make sure required columns exist
for col in ["images", "primary_image", "has_image"]:
    if col not in df.columns:
        df[col] = pd.NA

# Normalize images to list
df["images"] = df["images"].apply(_to_list)

# (Re)compute primary_image and has_image
df["primary_image"] = df["images"].apply(_pick_primary_image)
df["has_image"] = df["primary_image"].apply(lambda s: bool(s))

# Optional quick sanity print
print("Rows:", len(df),
      "| with image:", int(df["has_image"].sum()),
      "| without image:", int((~df["has_image"]).sum()))


Rows: 312 | with image: 312 | without image: 0


In [36]:
from urllib.parse import urlparse
import pandas as pd

# Count images we *intend* to download
candidates = df[df["has_image"]].copy()
print("rows with primary_image:", len(candidates))

# Domain stats (helps spot hosts that block bots)
domains = (
    candidates["primary_image"]
    .dropna().astype(str)
    .apply(lambda u: urlparse(u).netloc.lower())
    .value_counts()
    .head(15)
)
domains


rows with primary_image: 312


primary_image
m.media-amazon.com    312
Name: count, dtype: int64

In [37]:
import re
from typing import List

def is_url(s: str) -> bool:
    return bool(re.match(r'^https?://', s or "", re.I))

PREFERRED_EXTS = (".jpg",".jpeg",".png",".webp",".bmp")

def best_image_candidates(img_list: List[str]) -> List[str]:
    """Return a prioritized list of image URLs from the product's images array."""
    if not isinstance(img_list, list):
        img_list = []
    cleaned = []
    for u in img_list:
        if not isinstance(u, str):
            continue
        u = u.strip().strip('"').strip("'")
        if is_url(u):
            cleaned.append(u)
    # 1) URLs with preferred extensions first
    pref = [u for u in cleaned if any(u.lower().split("?")[0].endswith(ext) for ext in PREFERRED_EXTS)]
    # 2) Then any remaining valid URL (some servers hide extension)
    others = [u for u in cleaned if u not in pref]
    return pref + others

# Build per-row candidate lists
df["image_candidates"] = df["images"].apply(best_image_candidates)

# sanity
df[["uniq_id","primary_image","image_candidates"]].head(3)


Unnamed: 0,uniq_id,primary_image,image_candidates
0,02593e81-5c09-5069-8516-b0b29f439ded,https://m.media-amazon.com/images/I/416WaLx10j...,[https://m.media-amazon.com/images/I/416WaLx10...
1,5938d217-b8c5-5d3e-b1cf-e28e340f292e,https://m.media-amazon.com/images/I/31SejUEWY7...,[https://m.media-amazon.com/images/I/31SejUEWY...
2,b2ede786-3f51-5a45-9a5b-bcf856958cd8,https://m.media-amazon.com/images/I/41RgefVq70...,[https://m.media-amazon.com/images/I/41RgefVq7...


# IMAGE DOWNLOAD

In [56]:
import os, re, io, ast, hashlib, asyncio, aiohttp, random, string
from dataclasses import dataclass
from typing import List, Dict, Any
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential
from dotenv import load_dotenv

load_dotenv()

# ⬅️ set these
CSV_PATH  = os.getenv("CSV_PATH", "intern_data_ikarus.csv")
IMAGE_DIR = os.getenv("IMAGE_DIR", "./data/images_all")

os.makedirs(IMAGE_DIR, exist_ok=True)


In [57]:
df = pd.read_csv(CSV_PATH)

# Ensure expected cols exist
for c in ["uniq_id","images"]:
    if c not in df.columns: df[c] = pd.NA

def to_list(x):
    if pd.isna(x): return []
    s = str(x).strip()
    if s.startswith('['):
        try:
            v = ast.literal_eval(s)
            return [str(u).strip() for u in v if str(u).strip()]
        except:
            pass
    return [u.strip() for u in s.split(",") if u.strip()]

df["images_list"] = df["images"].apply(to_list)

total_expected = int(df["images_list"].apply(len).sum())
print("Rows:", len(df), "| total image URLs found:", total_expected)


Rows: 312 | total image URLs found: 1966


In [None]:
def is_url(s: str) -> bool:
    return bool(re.match(r"^https?://", s or "", re.I))

@dataclass
class ImageTask:
    uniq_id: str
    idx: int
    url: str
    out_path: str

def file_name(uniq_id: str, idx: int, url: str) -> str:
    base = os.path.splitext(url.split("?")[0])[1].lower()
    ext = base if base in [".jpg",".jpeg",".png",".webp"] else ".jpg"
    h = hashlib.md5(url.encode("utf-8")).hexdigest()[:8]
    return f"{idx:03d}_{h}{ext}"

tasks: List[ImageTask] = []
for _, row in df.iterrows():
    uid = str(row.get("uniq_id","unknown"))
    imgs = [u for u in row["images_list"] if is_url(u)]
    if not imgs:
        continue
    out_dir = os.path.join(IMAGE_DIR, uid)
    os.makedirs(out_dir, exist_ok=True)
    for i, url in enumerate(imgs):
        out_path = os.path.join(out_dir, file_name(uid, i, url))
        if not os.path.exists(out_path):
            tasks.append(ImageTask(uniq_id=uid, idx=i, url=url, out_path=out_path))

print("Planned downloads:", len(tasks), " (will skip already-downloaded files)")


Planned downloads: 1966  (will skip already-downloaded files)


In [None]:
UA_LIST = [
    # a few realistic UAs
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

def rand_headers(url: str) -> Dict[str,str]:
    return {
        "User-Agent": random.choice(UA_LIST),
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": f"https://{re.sub(r'^https?://','',url).split('/')[0]}/",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }

def ensure_ext(path: str, content_type: str) -> str:
    if content_type:
        ct = content_type.lower().split(";")[0]
        if ct.endswith("jpeg") or ct.endswith("jpg"):
            return os.path.splitext(path)[0] + ".jpg"
        if ct.endswith("png"):
            return os.path.splitext(path)[0] + ".png"
        if ct.endswith("webp"):
            return os.path.splitext(path)[0] + ".webp"
    return path

@retry(stop=stop_after_attempt(4), wait=wait_exponential(multiplier=0.6, min=0.6, max=6))
async def fetch_and_save(session: aiohttp.ClientSession, task: ImageTask) -> bool:
    async with session.get(task.url, headers=rand_headers(task.url), allow_redirects=True) as resp:
        if resp.status != 200:
            raise RuntimeError(f"HTTP {resp.status}")
        ctype = resp.headers.get("Content-Type","").lower()
        raw = await resp.read()
        # Validate & normalize
        try:
            im = Image.open(io.BytesIO(raw))
            im = im.convert("RGB") if im.mode not in ("RGB","L") else im
        except Exception as e:
            raise RuntimeError(f"Not an image or corrupt: {e}")
        out_path = ensure_ext(task.out_path, ctype)
        # Save (JPEG for non-lossless; preserve PNG if originally PNG)
        ext = os.path.splitext(out_path)[1].lower()
        if ext == ".png":
            im.save(out_path, format="PNG", optimize=True)
        elif ext == ".webp":
            im.save(out_path, format="WEBP", quality=90, method=4)
        else:
            im.save(out_path, format="JPEG", quality=92, optimize=True)
        return True

async def run_download(tasks: List[ImageTask], concurrency: int = 24, per_host: int = 8, timeout_s: int = 20):
    timeout = aiohttp.ClientTimeout(total=timeout_s)
    connector = aiohttp.TCPConnector(limit=concurrency, limit_per_host=per_host, ttl_dns_cache=300)
    success = 0
    failed: List[ImageTask] = []

    pbar = tqdm(total=len(tasks), desc="downloading images")
    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        sem = asyncio.Semaphore(concurrency)
        async def worker(t: ImageTask):
            nonlocal success
            async with sem:
                try:
                    ok = await fetch_and_save(session, t)
                    if ok: success += 1
                    else: failed.append(t)
                except Exception:
                    failed.append(t)
                finally:
                    pbar.update(1)

        await asyncio.gather(*[worker(t) for t in tasks])
    pbar.close()
    return success, failed


In [60]:
if not tasks:
    print("Nothing to download (all files already exist).")
else:
    ok, failed = asyncio.run(run_download(tasks, concurrency=32, per_host=8, timeout_s=30))
    print(f"\nDownloaded OK: {ok} / Planned: {len(tasks)}")
    print(f"Total URLs in CSV: {total_expected} (some may already exist on disk)")
    if failed:
        print("\nFailed samples (showing up to 10):")
        for t in failed[:10]:
            print("-", t.uniq_id, t.idx, t.url)


downloading images: 100%|██████████| 1966/1966 [01:18<00:00, 25.04it/s]


Downloaded OK: 1948 / Planned: 1966
Total URLs in CSV: 1966 (some may already exist on disk)

Failed samples (showing up to 10):
- fb5af385-aee6-568c-a22f-e6b90ef92dac 3 https://m.media-amazon.com/images/G/01/HIT/ImageBlockDimension/dimensions_SS522_.png
- 599c5d85-f15d-57ee-a301-30da352c2013 6 https://m.media-amazon.com/images/G/01/HIT/ImageBlockDimension/dimensions_SS522_.png
- 72675ea3-0e2e-5752-a1fb-61a1a3031f4e 6 https://m.media-amazon.com/images/G/01/HIT/ImageBlockDimension/dimensions_SS522_.png
- 579657ef-e010-5fb7-b301-a50db999bba7 7 https://m.media-amazon.com/images/G/01/HIT/ImageBlockDimension/dimensions_SS522_.png
- 9b0e0b55-3984-5624-9bb6-3552bb4d262f 6 https://m.media-amazon.com/images/G/01/HIT/ImageBlockDimension/dimensions_SS522_.png
- ed575bc1-c87c-51f4-b34b-a7b64b1b70f3 4 https://m.media-amazon.com/images/G/01/HIT/ImageBlockDimension/dimensions_SS522_.png
- c5ab3463-5b96-5d58-9f88-8175cdea76a3 6 https://m.media-amazon.com/images/G/01/HIT/ImageBlockDimension/dimensions




# EMBEDDINGS

In [67]:
import os, ast, re, glob, io
from typing import List, Dict, Any, Tuple
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from dotenv import load_dotenv

load_dotenv()

# --- REQUIRED ENV ---
PINECONE_API_KEY     = os.getenv("PINECONE_API_KEY")
PINECONE_ENV         = os.getenv("PINECONE_ENV", "us-east-1-aws")
PINECONE_TEXT_INDEX  = os.getenv("PINECONE_TEXT_INDEX", "products-text")
PINECONE_IMAGE_INDEX = os.getenv("PINECONE_IMAGE_INDEX", "products-image")  # can be unused if no images

CSV_PATH   = os.getenv("CSV_PATH", "intern_data_ikarus.csv")
IMAGE_DIR  = os.getenv("IMAGE_DIR", "./data/images_all")  # pre-downloaded, one folder per uniq_id

assert PINECONE_API_KEY, "Missing PINECONE_API_KEY in .env"
assert os.path.exists(CSV_PATH), f"CSV not found at {CSV_PATH}"


In [68]:
# Expected schema
expected_cols = [
    "uniq_id","title","brand","description","price","categories","images",
    "manufacturer","package_dimensions","country_of_origin","material","color"
]

df = pd.read_csv(CSV_PATH)
for c in expected_cols:
    if c not in df.columns:
        df[c] = pd.NA
df = df[expected_cols].copy()

def to_list(x):
    if pd.isna(x): return []
    s = str(x).strip()
    if s.startswith("["):
        try:
            return list(ast.literal_eval(s))
        except Exception:
            pass
    return [u.strip() for u in s.split(",") if u.strip()]

def to_float(x):
    if pd.isna(x): return pd.NA
    s = str(x).replace(",", "").replace("₹", "").strip()
    m = re.findall(r"[-+]?\d*\.?\d+", s)
    return float(m[0]) if m else pd.NA

df["categories"] = df["categories"].apply(to_list)
df["images"]     = df["images"].apply(to_list)
df["price"]      = df["price"].apply(to_float)

len(df), df.columns.tolist()[:5]


(312, ['uniq_id', 'title', 'brand', 'description', 'price'])

In [None]:
# Build a mapping uniq_id -> list of local image file paths (already downloaded earlier)
uid_to_imgs: Dict[str, List[str]] = {}
if os.path.isdir(IMAGE_DIR):
    for uid in os.listdir(IMAGE_DIR):
        folder = os.path.join(IMAGE_DIR, uid)
        if not os.path.isdir(folder):
            continue
        imgs = sorted(
            p for p in glob.glob(os.path.join(folder, "*"))
            if os.path.splitext(p)[1].lower() in (".jpg",".jpeg",".png",".webp")
        )
        if imgs:
            uid_to_imgs[str(uid)] = imgs

print("Products with local images:", len(uid_to_imgs))


Products with local images: 305


In [71]:
# Cell 4 — Load Embedders (robust dim detection; GPU if available)
import torch, numpy as np
from sentence_transformers import SentenceTransformer
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"

# Text encoder (384-d): small, strong for metadata
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# Image encoder via CLIP; averages multiple images per product
img_model  = SentenceTransformer("clip-ViT-B-32", device=device)

def _safe_dim_for_text(model) -> int:
    vec = model.encode(["dimension probe"], normalize_embeddings=True, convert_to_numpy=True)
    # vec shape: (1, D)
    return int(vec.shape[-1])

def _safe_dim_for_image(model) -> int:
    # tiny black image probe
    probe = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
    vec = model.encode([probe], normalize_embeddings=True, convert_to_numpy=True)
    # vec shape: (1, D)
    return int(vec.shape[-1])

TEXT_DIM = _safe_dim_for_text(text_model)   # expected 384
IMG_DIM  = _safe_dim_for_image(img_model)   # expected 512

TEXT_DIM, IMG_DIM, device


(384, 512, 'cpu')

In [73]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

existing = [ix["name"] for ix in pc.list_indexes()]
if PINECONE_TEXT_INDEX not in existing:
    pc.create_index(
        name=PINECONE_TEXT_INDEX,
        dimension=TEXT_DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
if PINECONE_IMAGE_INDEX not in existing:
    pc.create_index(
        name=PINECONE_IMAGE_INDEX,
        dimension=IMG_DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

text_index  = pc.Index(PINECONE_TEXT_INDEX)
image_index = pc.Index(PINECONE_IMAGE_INDEX)


In [76]:
# Cell 6 — Helpers: Build Text & Metadata (robust to missing/empty values)

def _safe_float(x, default=0.0):
    # Accepts float/int/str/None/NaN and returns a float
    try:
        if x is None:
            return float(default)
        # If already numeric:
        if isinstance(x, (int, float)):
            return float(x)
        s = str(x).strip()
        if s == "" or s.lower() == "nan" or s.lower() == "none":
            return float(default)
        # extract first numeric token if needed
        import re
        m = re.findall(r"[-+]?\d*\.?\d+", s)
        return float(m[0]) if m else float(default)
    except Exception:
        return float(default)

def product_text(row: pd.Series) -> str:
    cats = ", ".join(row["categories"]) if isinstance(row["categories"], list) else ""
    parts = [
        f"Title: {row['title']}" if pd.notna(row['title']) else "",
        f"Brand: {row['brand']}" if pd.notna(row['brand']) else "",
        f"Category: {cats}" if cats else "",
        f"Material: {row['material']}" if pd.notna(row['material']) and str(row['material']).strip() else "",
        f"Color: {row['color']}" if pd.notna(row['color']) and str(row['color']).strip() else "",
        f"Country: {row['country_of_origin']}" if pd.notna(row['country_of_origin']) and str(row['country_of_origin']).strip() else "",
        f"Price: {_safe_float(row['price'])}" if pd.notna(row['price']) else "",
        f"Description: {row['description']}" if pd.notna(row['description']) and str(row['description']).strip() else "",
    ]
    return " | ".join([p for p in parts if p and str(p).strip() != ""])

def meta_from_row(row: pd.Series) -> Dict[str, Any]:
    cats = row["categories"] if isinstance(row["categories"], list) else []
    return {
        "uniq_id": str(row["uniq_id"]),
        "title": "" if not pd.notna(row["title"]) else str(row["title"]),
        "brand": "" if not pd.notna(row["brand"]) else str(row["brand"]),
        "price": _safe_float(row["price"], default=0.0),
        "categories": cats,
        "material": "" if not pd.notna(row["material"]) else str(row["material"]),
        "color": "" if not pd.notna(row["color"]) else str(row["color"]),
        "country_of_origin": "" if not pd.notna(row["country_of_origin"]) else str(row["country_of_origin"]),
    }


In [77]:
BATCH = 128
rows = df.fillna("").to_dict(orient="records")

def chunks(it, n):
    for i in range(0, len(it), n):
        yield it[i:i+n]

for batch in tqdm(list(chunks(rows, BATCH)), desc="text embed + upsert"):
    texts = [product_text(pd.Series(r)) for r in batch]
    vecs = text_model.encode(texts, normalize_embeddings=True).tolist()
    to_upsert = []
    for r, v in zip(batch, vecs):
        to_upsert.append({
            "id": str(r["uniq_id"]),
            "values": v,
            "metadata": meta_from_row(pd.Series(r))
        })
    text_index.upsert(vectors=to_upsert, namespace="default")


text embed + upsert: 100%|██████████| 3/3 [00:14<00:00,  4.99s/it]


# image embedding

In [78]:
if image_index is not None:
    MAX_IMG = 8  # cap per product to control runtime/memory

    def embed_images(paths: List[str]) -> np.ndarray | None:
        ims = []
        for p in paths:
            try:
                im = Image.open(p).convert("RGB")
                ims.append(im)
            except Exception:
                pass
        if not ims:
            return None
        embs = img_model.encode(ims, normalize_embeddings=True, convert_to_numpy=True)  # (n, 512)
        vec = embs.mean(axis=0)
        vec = vec / (np.linalg.norm(vec) + 1e-12)
        return vec.astype(np.float32)

    uid_list = df["uniq_id"].astype(str).tolist()
    to_process: List[Tuple[str, List[str]]] = []
    for uid in uid_list:
        imgs = uid_to_imgs.get(uid, [])
        if imgs:
            to_process.append((uid, imgs[:MAX_IMG]))

    BATCH_IMG = 64
    total_upserts = 0
    for chunk in tqdm(list(chunks(to_process, BATCH_IMG)), desc="image embed + upsert"):
        upserts = []
        for uid, paths in chunk:
            vec = embed_images(paths)
            if vec is None:
                continue
            row = df.loc[df["uniq_id"].astype(str) == uid].iloc[0]
            upserts.append({
                "id": uid,
                "values": vec.tolist(),
                "metadata": meta_from_row(row),
            })
        if upserts:
            image_index.upsert(vectors=upserts, namespace="default")
            total_upserts += len(upserts)

    print("Image embeddings upserted:", total_upserts)
else:
    print("Skipping image embeddings (no image_index or no local images).")


image embed + upsert: 100%|██████████| 5/5 [02:12<00:00, 26.48s/it]

Image embeddings upserted: 312





In [79]:
# TEXT → TEXT query
query_text = "modern wooden dining chair under 6000, natural finish"
qv = text_model.encode([query_text], normalize_embeddings=True, convert_to_numpy=True)[0].tolist()
res = text_index.query(vector=qv, top_k=5, include_metadata=True, namespace="default")

print("TEXT query results:")
for m in res.get("matches", []):
    md = m.get("metadata", {})
    print(f"{m.get('score',0):.4f} | {md.get('title')} — {md.get('brand')} — ₹{md.get('price')}")

# IMAGE → IMAGE query (only if we have images locally and image_index exists)
if image_index is not None and uid_to_imgs:
    sample_uid = next(iter(uid_to_imgs))
    probe_path = uid_to_imgs[sample_uid][0]
    im = Image.open(probe_path).convert("RGB")
    iv = img_model.encode([im], normalize_embeddings=True, convert_to_numpy=True)[0].tolist()

    res2 = image_index.query(vector=iv, top_k=5, include_metadata=True, namespace="default")
    print("\nIMAGE query results (probe:", os.path.basename(probe_path), ")")
    for m in res2.get("matches", []):
        md = m.get("metadata", {})
        print(f"{m.get('score',0):.4f} | {md.get('title')} — {md.get('brand')}")
else:
    print("\n(No image query performed.)")


TEXT query results:
0.5897 | CangLong Mid Century Modern Side Chair with Wood Legs for Kitchen, Living Dining Room, Set of 1, Black — CangLong Store — ₹0.0
0.5895 | Leather At Home, Decorative 13 Inch Rounded Pillow Handmade from Full Grain Leather - Chair Seat, Confortable Sitting for Round Wooden/Metal Stools - Bourbon Brown — Leather At Home Store — ₹26.49
0.5858 | Armen Living Julius 30" Cream Faux Leather and Walnut Wood Bar Stool — Armen Living Store — ₹0.0
0.5667 | Christopher Knight Home Munro Recliner, Navy Blue + Teak — Christopher Knight Home Store — ₹0.0
0.5644 | Adeco Euro Style Fabric Arm Bench Chair Footstool Cubic Ottomans, Brown — Adeco Store — ₹0.0

IMAGE query results (probe: 000_237f60e9.jpg )
0.9631 | GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway — GOYMFK
0.9284 | sogesfurniture 5 Tier Free Standing Wooden Shoe Storage Shelf Shoe Organizer, 29.5 inches Shoe Rack Shoe Organizer Storage Cabi