In [13]:
# backfill_image_url_v3.py
import os, json, ast, math, pandas as pd
from tqdm import tqdm
from pinecone import Pinecone

# ---------- CONFIG ----------
INDEX_NAME = "products-text"     # ← change to your Pinecone index name
NAMESPACE  = "default"             # ← change if you used another namespace
CSV_PATH   = "intern_data_ikarus.csv"  # ← path to your dataset CSV
# Read from the environment when this module is loaded; raises KeyError if
# PINECONE_API_KEY is not set.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]

# If you serve local images via FastAPI (app.mount("/static/images", ...)),
# you can have rows WITHOUT any http(s) URL fall back to a local path of the
# form "<prefix>/<uniq_id>/0.jpg" by setting this prefix. Remote http(s) URLs
# found in the CSV always take precedence over the local fallback.
LOCAL_STATIC_PREFIX = None  # e.g. "/static/images" or keep None to use remote URLs from CSV
# ----------------------------

def is_nan(x):
    """Return True when *x* is ``None`` or a float NaN, otherwise False.

    Non-float values (strings, ints, lists, ...) are never treated as
    missing, even if they spell "nan".
    """
    if x is None:
        return True
    if not isinstance(x, float):
        return False
    try:
        return math.isnan(x)
    except Exception:
        # Defensive: treat anything that cannot be inspected as "not NaN".
        return False

def parse_images_field(v):
    """Normalize an ``images`` cell into a list of trimmed URL/path strings.

    Accepted inputs:
    - ``None`` / float NaN                     -> ``[]``
    - real ``list`` or ``tuple`` objects       -> string/path-like items only
    - Python literal list string, e.g. ``"['https://...', ' https://...']"``
    - JSON list string (double quotes)
    - bracketed but unparseable strings, e.g. ``"[https://a, https://b]"``
      (brackets and per-item quotes are stripped, then split on a delimiter)
    - delimited strings using ``,``, ``;`` or ``|`` (first one present wins)
    - a single plain string

    Returns:
        list[str]: non-empty, whitespace-stripped candidates in input order.
    """
    if is_nan(v):
        return []

    def _clean(items):
        # Keep only string/path-like entries and drop blanks after trimming.
        return [
            str(item).strip()
            for item in items
            if isinstance(item, (str, os.PathLike)) and str(item).strip()
        ]

    if isinstance(v, (list, tuple)):
        return _clean(v)

    s = str(v).strip()
    if not s:
        return []

    bracketed = s.startswith("[") and s.endswith("]")
    if bracketed:
        # Python literal first (handles single quotes), then strict JSON
        # (handles null/true/false, which literal_eval rejects).
        for parser in (ast.literal_eval, json.loads):
            try:
                parsed = parser(s)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, list):
                return _clean(parsed)

        # Neither parser produced a list (e.g. unquoted items). Drop the
        # enclosing brackets so the delimiter fallback does not leave "[" or
        # "]" glued to the first/last candidate.
        s = s[1:-1].strip()
        if not s:
            return []

    def _tidy(part):
        part = part.strip()
        # Only for the bracketed fallback: remove leftover quotes per item.
        return part.strip("'\"").strip() if bracketed else part

    # Delimited fallback: split on the first delimiter that occurs.
    for delim in (",", ";", "|"):
        if delim in s:
            parts = (_tidy(p) for p in s.split(delim))
            return [p for p in parts if p]

    # Single string
    single = _tidy(s)
    return [single] if single else []

def first_image_url(row):
    """Choose the image URL to store for one CSV row.

    The first parsed candidate starting with ``http://`` or ``https://`` wins.
    If there is none and ``LOCAL_STATIC_PREFIX`` is set, a local path of the
    form ``<prefix>/<uniq_id>/0.jpg`` is returned; otherwise ``""``.

    ``row`` must support ``row["uniq_id"]`` and ``row.get("images", ...)``.
    """
    product_id = str(row["uniq_id"])
    candidates = parse_images_field(row.get("images", ""))

    remote = next(
        (c for c in candidates if c.startswith(("http://", "https://"))),
        None,
    )
    if remote is not None:
        return remote

    if not LOCAL_STATIC_PREFIX:
        return ""

    # Adjust the filename if your local files are named differently.
    return f"{LOCAL_STATIC_PREFIX}/{product_id}/0.jpg"

def main():
    """Backfill the ``image_url`` metadata field on Pinecone vectors from the CSV.

    Steps:
    1. Connect to the index named ``INDEX_NAME``.
    2. Load ``CSV_PATH`` and print basic diagnostics plus a parse preview of
       the first three rows.
    3. Probe-fetch the first ten ``uniq_id`` values to sanity-check that the
       CSV ids line up with the index/namespace.
    4. For every row with a usable URL, call ``index.update`` to set
       ``image_url``; rows without a URL are skipped, failed updates are
       counted and a few of their errors are reported.
    5. Fetch the metadata of the first id that was actually updated and print it.
    """
    max_error_samples = 5  # cap on how many update errors are echoed back

    # Connect Pinecone
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(INDEX_NAME)

    # Load CSV and basic diagnostics
    df = pd.read_csv(CSV_PATH)
    print("Rows:", len(df))
    has_imgs = df["images"].notna().sum() if "images" in df.columns else 0
    print("Rows with non-null `images`:", has_imgs)

    # Preview parse for first 3 rows
    for r in df.head(3).to_dict(orient="records"):
        print("SAMPLE uniq_id:", r.get("uniq_id"))
        print("  RAW images:", r.get("images"))
        parsed = parse_images_field(r.get("images"))
        print("  PARSED   :", parsed[:3])
        print("  PICKED   :", first_image_url(r))
        print()

    # (Optional) probe id match
    probe_ids = [str(r["uniq_id"]) for _, r in df.head(10).iterrows()]
    try:
        fetched = index.fetch(ids=probe_ids, namespace=NAMESPACE)
        print("Probe fetch found:", len(fetched.get("vectors", {})), "of", len(probe_ids))
    except Exception as e:
        print("Fetch probe error (check index/namespace):", e)

    # Update one-by-one
    updated, skipped, missing = 0, 0, 0
    first_updated_id = None  # remembered so verification checks a row we really wrote
    error_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        uid = str(row["uniq_id"])
        url = first_image_url(row)
        if not url:
            skipped += 1
            continue
        try:
            index.update(id=uid, set_metadata={"image_url": url}, namespace=NAMESPACE)
        except Exception as e:
            # Any failure lands in this bucket (unknown id, but also network or
            # auth problems), so keep a few errors to show what really happened.
            missing += 1
            if len(error_samples) < max_error_samples:
                error_samples.append((uid, e))
        else:
            updated += 1
            if first_updated_id is None:
                first_updated_id = uid

    print(f"Updated: {updated} | Skipped (no URL): {skipped} | Missing IDs (not in index/ns): {missing}")
    for failed_id, err in error_samples:
        print(f"  update failed for {failed_id}: {err!r}")

    # Quick verify: fetch the first ID that was actually updated (if any)
    if first_updated_id is not None:
        try:
            check = index.fetch(ids=[first_updated_id], namespace=NAMESPACE)
            md = (check.get("vectors", {}).get(first_updated_id, {}) or {}).get("metadata")
            print("Sample metadata for", first_updated_id, "->", md)
        except Exception as e:
            # Verification is best-effort, but say so instead of failing silently.
            print("Verification fetch failed:", e)

# Run the backfill only when executed as a script (or notebook cell), not when
# this module is imported.
if __name__ == "__main__":
    main()


Rows: 312
Rows with non-null `images`: 312
SAMPLE uniq_id: 02593e81-5c09-5069-8516-b0b29f439ded
  RAW images: ['https://m.media-amazon.com/images/I/416WaLx10jL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/41kuxipTsuL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/51T9x4yZd3L._SS522_.jpg ', ' https://m.media-amazon.com/images/I/61w6ifIrCpL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/51bBQlUn8GL._SS522_.jpg']
  PARSED   : ['https://m.media-amazon.com/images/I/416WaLx10jL._SS522_.jpg', 'https://m.media-amazon.com/images/I/41kuxipTsuL._SS522_.jpg', 'https://m.media-amazon.com/images/I/51T9x4yZd3L._SS522_.jpg']
  PICKED   : https://m.media-amazon.com/images/I/416WaLx10jL._SS522_.jpg

SAMPLE uniq_id: 5938d217-b8c5-5d3e-b1cf-e28e340f292e
  RAW images: ['https://m.media-amazon.com/images/I/31SejUEWY7L._SS522_.jpg ', ' https://m.media-amazon.com/images/I/41mr+A9JmbL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/41JjrWgA0XL._SS522_.jpg ', ' https://m.media-amazon.co

100%|██████████| 312/312 [02:04<00:00,  2.51it/s]


Updated: 312 | Skipped (no URL): 0 | Missing IDs (not in index/ns): 0
Sample metadata for 02593e81-5c09-5069-8516-b0b29f439ded -> {'brand': 'GOYMFK', 'categories': ['Home & Kitchen', 'Storage & Organization', 'Clothing & Closet Storage', 'Shoe Organizers', 'Free Standing Shoe Racks'], 'cluster_id': 0.0, 'cluster_tag': 'storage cabinet, bathroom organizer, shoe organizer', 'color': 'White', 'country_of_origin': 'China', 'dup_of': '', 'image_url': 'https://m.media-amazon.com/images/I/416WaLx10jL._SS522_.jpg', 'is_duplicate': False, 'material': 'Metal', 'pred_conf': 0.9522345066070557, 'predicted_category': 'home & kitchen', 'price': 24.99, 'title': 'GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway', 'uniq_id': '02593e81-5c09-5069-8516-b0b29f439ded'}
