In [1]:
import os, io, re, json, math, random, requests, torch
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

# paths
DATA_CSV = os.path.join('..','data','products.csv')
assert os.path.exists(DATA_CSV), f"CSV not found: {DATA_CSV}"

# display
pd.set_option('display.max_colwidth', 200)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(DATA_CSV).fillna("")
print("Rows, Cols:", df.shape)
df.head(3)


Rows, Cols: (312, 12)


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway",GOYMFK,"multiple shoes, coats, hats, and other items Easy to assemble: Includes all necessary hardware and instructions for easy assembly Versatile: Perfect for use in living rooms, bathrooms, hallways, a...",$24.99,"['Home & Kitchen', 'Storage & Organization', 'Clothing & Closet Storage', 'Shoe Organizers', 'Free Standing Shoe Racks']","['https://m.media-amazon.com/images/I/416WaLx10jL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/41kuxipTsuL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/51T9x4yZd3L._SS522_.jpg ', '...",GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,"subrtex Leather ding Room, Dining Chairs Set of 2, Black",subrtex,subrtex Dining chairs Set of 2,,"['Home & Kitchen', 'Furniture', 'Dining Room Furniture', 'Chairs']","['https://m.media-amazon.com/images/I/31SejUEWY7L._SS522_.jpg ', ' https://m.media-amazon.com/images/I/41mr+A9JmbL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/41JjrWgA0XL._SS522_.jpg ', '...",Subrtex Houseware INC,"18.5""D x 16""W x 35""H",,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e
2,"Plant Repotting Mat MUYETOL Waterproof Transplanting Mat Indoor 26.8"" x 26.8"" Portable Square Foldable Easy to Clean Gardening Work Mat Soil Changing Mat Succulent Plant Transplanting Mat Garden G...",MUYETOL,,$5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doormats']","['https://m.media-amazon.com/images/I/41RgefVq70L._SS522_.jpg ', ' https://m.media-amazon.com/images/I/414SPEuzxlL._SS522_.jpg ', ' https://m.media-amazon.com/images/I/51gknsPKCHL._SS522_.jpg ', '...",MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8


In [3]:
# helpful coalesce
def coalesce(*vals):
    return ' '.join([str(v) for v in vals if pd.notna(v) and str(v).strip()])

# standardize helper columns
def first_cat(s):
    s = str(s)
    parts = [p.strip() for p in s.split(',') if p.strip()]
    return parts[0] if parts else ""

def first_img(s):
    s = str(s)
    return s.split(',')[0].strip() if s else ""

# clean price numeric
def parse_price(x):
    s = str(x)
    s = re.sub(r'[^\d.\-]', '', s)   # remove currency symbols/commas
    try:
        return float(s)
    except:
        return np.nan

df['price_num'] = df.get('price', '').apply(parse_price)
df['category_main'] = df.get('categories', '').apply(first_cat)
df['image_0'] = df.get('images', '').apply(first_img)

# master text field for embeddings (mirrors backend)
df['doc'] = df.apply(lambda r: coalesce(
    r.get('title'), r.get('brand'), r.get('description'),
    r.get('material'), r.get('color')), axis=1)

print(df[['uniq_id','title','category_main']].head(5).to_string(index=False))
print("\nPrice stats:\n", df['price_num'].describe())
print("\nCategory samples:\n", df['category_main'].value_counts().head(10))


                             uniq_id                                                                                                                                                                                                    title     category_main
02593e81-5c09-5069-8516-b0b29f439ded                                                                               GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway ['Home & Kitchen'
5938d217-b8c5-5d3e-b1cf-e28e340f292e                                                                                                                                                 subrtex Leather ding Room, Dining Chairs Set of 2, Black ['Home & Kitchen'
b2ede786-3f51-5a45-9a5b-bcf856958cd8 Plant Repotting Mat MUYETOL Waterproof Transplanting Mat Indoor 26.8" x 26.8" Portable Square Foldable Easy to Clean Gardening Work Mat Soil Changing Mat Succulent Plant Transplanting Mat Garden 

In [4]:
from sentence_transformers import SentenceTransformer

EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # fast & good
embedder = SentenceTransformer(EMB_MODEL_NAME)

texts = df['doc'].tolist()
emb = embedder.encode(texts, batch_size=64, normalize_embeddings=True, show_progress_bar=True)
emb = np.asarray(emb, dtype=np.float32)  # (N, D)
emb.shape


Batches: 100%|██████████| 5/5 [00:07<00:00,  1.48s/it]


(312, 384)

In [5]:
def cosine_sim_matrix_row(vectors, i):
    return vectors @ vectors[i]

idx = 0  # take first row as anchor
sims = cosine_sim_matrix_row(emb, idx)
top = np.argsort(-sims)[:6]  # top6 incl. self
df.iloc[top][['uniq_id','title','brand','category_main']].assign(score=sims[top]).reset_index(drop=True)


Unnamed: 0,uniq_id,title,brand,category_main,score
0,02593e81-5c09-5069-8516-b0b29f439ded,"GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway",GOYMFK,['Home & Kitchen',1.0
1,02593e81-5c09-5069-8516-b0b29f439ded,"GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway",GOYMFK,['Home & Kitchen',1.0
2,122c5c2a-5490-51ce-8555-9526c9698a38,"LANTEFUL Shoe Rack Organizer Shoe Storage Cabinet 8 Tiers 32 Pair Portable Shoe Storage Sturdy Plastic Black Shoe Shelf with Hooks Shoe Rack with Door for Entryway, Bedroom and Hallway",LANTEFUL Store,['Home & Kitchen',0.656346
3,f28d5cba-ecd4-5d82-87da-d926d48e1155,"sogesfurniture 5 Tier Free Standing Wooden Shoe Storage Shelf Shoe Organizer, 29.5 inches Shoe Rack Shoe Organizer Storage Cabinet for Entryway, Living Room, Hallway, Doorway, Black",sogesfurniture Store,['Home & Kitchen',0.656067
4,c118a4dc-505e-512f-ba76-bd4e56569556,"FLYJOE Shoe Rack Bench, 3-Tier Freestanding Wooden Shoe Organizer with Seat, Entryway Bench, Storage Shelf for Kitchen Living Room Bathroom Bedroom, Walnut",FLYJOE,['Home & Kitchen',0.633523
5,a10176fb-74af-5428-9aaf-2787aa4d66d2,"MoNiBloom Foldable Storage Free Standing Shoes Shelf, Bamboo Multifunctional 4-Tier Shoe Organizer for 16-20 Pairs Entryway, Hallway, Corridor, Natural",MoNiBloom Store,['Home & Kitchen',0.585027


In [7]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings

class SBERTEmbeddings(Embeddings):
    def __init__(self, model: SentenceTransformer):
        self.model = model
    def embed_documents(self, texts):
        return self.model.encode(texts, normalize_embeddings=True).tolist()
    def embed_query(self, text):
        return self.model.encode([text], normalize_embeddings=True)[0].tolist()

metas = df.to_dict(orient='records')
store = FAISS.from_texts(df['doc'].tolist(), embedding=SBERTEmbeddings(embedder), metadatas=metas)

# test a few queries that should work across many catalogs
for q in ["modern wooden chair", "office desk", "table lamp", "leather sofa"]:
    hits = store.similarity_search(q, k=5)
    print("\nQUERY:", q)
    for h in hits:
        print(" -", h.metadata.get('title','(no title)'))



QUERY: modern wooden chair
 - Black Leather Office Chair Mid Back Leather Desk Chair Modern Excutive Office Chair with Arms and Wheels for Home Office, by Artswish
 - PONTMENT Foot Stool Leather Footstool Solid Wood Vintage Foot Rest Faux Leather Ottoman Upholstered Footrest for Living Room/Sofa/Couch.
 - Armen Living Julius 30" Cream Faux Leather and Walnut Wood Bar Stool
 - Adeco Euro Style Fabric Arm Bench Chair Footstool Cubic Ottomans, Brown
 - Christopher Knight Home Munro Recliner, Navy Blue + Teak

QUERY: office desk
 - ODK Small Computer Desk, 27.5 Inch, Compact Tiny Study Desk with Storage and Monitor Stand for Home Office, Small Spaces, Black
 - ODK Small Computer Desk, 27.5 inch Desk for Small Spaces with Storage, Compact Table with Monitor & Storage Shelves for Home Office, Modern Style Laptop Desk, Pure White
 - It's_Organized Gaming Desk 55 inch PC Computer Desk, K-Frame Home Office Desk Professional Gamer Workstation with Cup Holder Headphone Hook Gaming Handle Rack Fr

In [8]:
SAVE_DIR = os.path.join('..','models','faiss')
os.makedirs(SAVE_DIR, exist_ok=True)
store.save_local(os.path.join(SAVE_DIR, 'sbert_faiss'))
print("Saved:", os.listdir(SAVE_DIR))


Saved: ['sbert_faiss']


In [9]:
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_proc  = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# build label prompts from category_main
labels = sorted(list({c for c in df['category_main'].tolist() if c}))
label_texts = [f"a photo of {c}" for c in labels]
print("Label count:", len(labels), "Example labels:", labels[:10])

# pick a row that has an HTTP(s) image
row = df[df['image_0'].str.startswith('http')].head(1)
if not row.empty:
    row = row.iloc[0]
    img_url = row['image_0']
    gt = row['category_main']  # "ground truth" main category

    try:
        resp = requests.get(img_url, timeout=10)
        image = Image.open(io.BytesIO(resp.content)).convert("RGB")

        inputs = clip_proc(text=label_texts, images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = clip_model(**inputs).logits_per_image[0]
        pred_idx = int(torch.argmax(logits))
        pred = labels[pred_idx]
        print("GT:", gt, "| Pred:", pred, "| Title:", row.get('title'))
    except Exception as e:
        print("Failed to fetch or classify:", e)
else:
    print("No rows with valid http image URL; skip CLIP demo.")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install 

Label count: 7 Example labels: ["['Baby Products'", "['Beauty & Personal Care'", "['Electronics'", "['Home & Kitchen'", "['Office Products'", "['Patio", "['Tools & Home Improvement'"]
No rows with valid http image URL; skip CLIP demo.


In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("distilgpt2")
lm  = AutoModelForCausalLM.from_pretrained("distilgpt2")

def generate_description(sample):
    prompt = (f"Write a catchy 2-sentence product description.\n"
              f"Title: {sample.get('title','')}\n"
              f"Brand: {sample.get('brand','')}\n"
              f"Material: {sample.get('material','')}\n"
              f"Color: {sample.get('color','')}\n"
              f"Description:")
    ids = tok.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        out = lm.generate(
            ids, max_length=ids.shape[1]+60, do_sample=True,
            top_p=0.92, top_k=40, temperature=0.8,
            pad_token_id=tok.eos_token_id
        )
    text = tok.decode(out[0], skip_special_tokens=True)
    return text.split("Description:")[-1].strip()

sample_row = df.iloc[0].to_dict()
gen = generate_description(sample_row)
print(gen)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.





In [11]:
def nearest_titles(query, k=5):
    docs = store.similarity_search(query, k=k)
    return [d.metadata.get('title','') for d in docs]

tests = ["wooden chair", "office desk", "sofa", "table lamp"]
for q in tests:
    print("\n", q, "→")
    for t in nearest_titles(q, 5):
        print("  -", t)



 wooden chair →
  - Black Leather Office Chair Mid Back Leather Desk Chair Modern Excutive Office Chair with Arms and Wheels for Home Office, by Artswish
  - PONTMENT Foot Stool Leather Footstool Solid Wood Vintage Foot Rest Faux Leather Ottoman Upholstered Footrest for Living Room/Sofa/Couch.
  - Adeco Euro Style Fabric Arm Bench Chair Footstool Cubic Ottomans, Brown
  - Christopher Knight Home Munro Recliner, Navy Blue + Teak
  - Armen Living Julius 30" Cream Faux Leather and Walnut Wood Bar Stool

 office desk →
  - ODK Small Computer Desk, 27.5 Inch, Compact Tiny Study Desk with Storage and Monitor Stand for Home Office, Small Spaces, Black
  - ODK Small Computer Desk, 27.5 inch Desk for Small Spaces with Storage, Compact Table with Monitor & Storage Shelves for Home Office, Modern Style Laptop Desk, Pure White
  - It's_Organized Gaming Desk 55 inch PC Computer Desk, K-Frame Home Office Desk Professional Gamer Workstation with Cup Holder Headphone Hook Gaming Handle Rack Free Mous

In [12]:
report = {
    "rows": int(len(df)),
    "n_categories": int(df['category_main'].nunique()),
    "faiss_saved": os.path.exists(os.path.join(SAVE_DIR, 'sbert_faiss', 'index.faiss')),
    "genai_sample": gen[:200] if isinstance(gen, str) else ""
}
os.makedirs(os.path.join('..','data','derived'), exist_ok=True)
with open(os.path.join('..','data','derived','model_report.json'), 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)
report


{'rows': 312, 'n_categories': 7, 'faiss_saved': True, 'genai_sample': ''}