In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

PINECONE_API_KEY = user_secrets.get_secret("PINECONE_API_KEY")
COHERE_API_KEY    = user_secrets.get_secret("COHERE_API_KEY")

import os, math, ast, time, json, numpy as np, pandas as pd, requests
from tqdm import tqdm
from pinecone import Pinecone, ServerlessSpec
import torch
from torchvision import models, transforms
from PIL import Image
import io
import cohere

assert PINECONE_API_KEY, "Add PINECONE_API_KEY in Kaggle Secrets"
assert COHERE_API_KEY,   "Add COHERE_API_KEY in Kaggle Secrets"

pc = Pinecone(api_key=PINECONE_API_KEY)
co = cohere.Client(COHERE_API_KEY)


In [3]:
df = pd.read_csv('/kaggle/input/cleaned-intern-data/cleaned_intern_data.csv')
df.head(3)


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,Plant Repotting Mat MUYETOL Waterproof Transpl...,5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8
2,"Pickleball Doormat, Welcome Doormat Absorbent ...",VEWETOL,The decorative doormat features a subtle textu...,13.99,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/61vz1Igl...,Contrence,"24""L x 16""W",,Rubber,A5589,8fd9377b-cfa6-5f10-835c-6b8eca2816b5


In [4]:
df['combined_text'] = (
    df['title'].fillna('') + '. ' +
    df['description'].fillna('') + '. ' +
    df['categories'].fillna('') + '. Material: ' +
    df['material'].fillna('') + '. Color: ' +
    df['color'].fillna('')
)


In [5]:
TEXT_DIM = 384
EMBED_MODEL_ID = "embed-english-light-v2.0"

def cohere_embed_batch(texts, input_type="search_document"):
    resp = co.embed(texts=texts, model=EMBED_MODEL_ID, input_type=input_type)
    embs = np.array(resp.embeddings, dtype=np.float32)
    if embs.shape[1] >= TEXT_DIM:
        embs = embs[:, :TEXT_DIM]
    else:
        pad = np.zeros((embs.shape[0], TEXT_DIM - embs.shape[1]), dtype=np.float32)
        embs = np.hstack([embs, pad])
    return embs

def embed_texts(series, batch=96):
    out = []
    buf = []
    for s in tqdm(series.tolist(), desc="Text embedding (Cohere)"):
        buf.append(s if isinstance(s, str) else "")
        if len(buf) == batch:
            out.append(cohere_embed_batch(buf))
            buf = []
    if buf:
        out.append(cohere_embed_batch(buf))
    return np.vstack(out)

text_embeddings = embed_texts(df['combined_text'])
print(text_embeddings.shape)  # (N, 384)


Text embedding (Cohere): 100%|██████████| 210/210 [00:01<00:00, 126.46it/s]


(210, 384)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
resnet50.eval()
img_backbone = torch.nn.Sequential(*list(resnet50.children())[:-1]).to(device).eval()

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])

def get_first_image_url(val):
    if val is None or (isinstance(val, float) and np.isnan(val)): return None
    if isinstance(val, list): return val[0] if val else None
    if isinstance(val, str):
        s = val.strip()
        if s.startswith('[') and s.endswith(']'):
            try:
                L = ast.literal_eval(s)
                if isinstance(L, list) and L: return L[0]
            except Exception:
                pass
        return s
    return None

df['image_url'] = df['images'].apply(get_first_image_url)

def embed_image(url, timeout=10):
    DIM = 2048
    if not isinstance(url, str) or not url:
        return np.zeros(DIM, dtype=np.float32)
    try:
        r = requests.get(url, timeout=timeout); r.raise_for_status()
        img = Image.open(io.BytesIO(r.content)).convert('RGB')
        x = preprocess(img).unsqueeze(0).to(device)
        with torch.no_grad():
            v = img_backbone(x).view(1, -1).cpu().numpy().squeeze(0)
        return v.astype(np.float32)
    except Exception:
        return np.zeros(DIM, dtype=np.float32)

IMG_DIM = 2048
image_embeddings = np.zeros((len(df), IMG_DIM), dtype=np.float32)
for i, url in enumerate(tqdm(df['image_url'].tolist(), desc="Image embedding (ResNet50)")):
    image_embeddings[i] = embed_image(url)

print(image_embeddings.shape)  # (N, 2048)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 190MB/s] 
Image embedding (ResNet50): 100%|██████████| 210/210 [00:35<00:00,  5.89it/s]

(210, 2048)





In [7]:
assert text_embeddings.shape[0] == image_embeddings.shape[0]
multi_modal_embeddings = np.hstack([text_embeddings, image_embeddings])
print(multi_modal_embeddings.shape)  # (N, 2432)


(210, 2432)


In [8]:
INDEX_NAME = "product-recommendations"
DIM = multi_modal_embeddings.shape[1]  # 2432

if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("Index created.")
else:
    print("Index exists.")

desc = pc.describe_index(INDEX_NAME)
index = pc.Index(host=desc.host)


Index created.


In [9]:
def clean_meta(d):
    out = {}
    for k, v in d.items():
        if v is None: continue
        if isinstance(v, float) and np.isnan(v): continue
        out[k] = v
    return out

vecs = []
for i, row in df.reset_index(drop=True).iterrows():
    vid = str(row.get("uniq_id", i))
    vals = multi_modal_embeddings[i].astype(float).tolist()
    meta = clean_meta({
        "title": row.get("title"),
        "brand": row.get("brand"),
        "price": row.get("price"),
        "image_url": row.get("image_url"),
    })
    vecs.append({"id": vid, "values": vals, "metadata": meta})

B = 100
print("Upserting…")
for s in tqdm(range(0, len(vecs), B)):
    index.upsert(vectors=vecs[s:s+B])


Upserting…


100%|██████████| 3/3 [00:02<00:00,  1.30it/s]


In [10]:
time.sleep(10)
print(index.describe_index_stats())


{'dimension': 2432,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 210}},
 'total_vector_count': 210,
 'vector_type': 'dense'}


In [11]:
def encode_query_mm(q, w_text=1.0):
    resp = co.embed(texts=[q], model=EMBED_MODEL_ID, input_type="search_query")
    v = np.array(resp.embeddings[0], dtype=np.float32)
    if v.shape[0] >= TEXT_DIM:
        v = v[:TEXT_DIM]
    else:
        v = np.hstack([v, np.zeros(TEXT_DIM - v.shape[0], dtype=np.float32)])
    v = (w_text * v).tolist() + [0.0]*IMG_DIM
    return v

def search(query, top_k=8):
    qvec = encode_query_mm(query)
    res = index.query(vector=qvec, top_k=top_k, include_metadata=True)
    rows = []
    for m in res.get("matches", []):
        md = m.get("metadata", {}) or {}
        rows.append({
            "id": m.get("id"),
            "score": float(m.get("score", 0.0)),
            "title": md.get("title"),
            "brand": md.get("brand"),
            "price": md.get("price"),
            "image_url": md.get("image_url"),
        })
    return pd.DataFrame(rows)

search("sofa", top_k=5).head(5)


Unnamed: 0,id,score,title,brand,price,image_url
0,92d31095-e65e-5890-b90b-6c5b7efa78c1,0.445554,"Pekokavo Sofa Arm Clip Tray, Side Table for Re...",NDL Store,24.99,https://m.media-amazon.com/images/I/51yz-83kj+...
1,e037f8af-d28c-51a1-8f1c-3f524620910e,0.426796,AnRui Folding Floor Chair with Adjustable Back...,AnRui Store,52.99,https://m.media-amazon.com/images/I/51iuIrMVq+...
2,54e4f202-a43e-5859-b47e-3c81ef395b31,0.426572,Xchouxer Side Tables Natural Bamboo Sofa Armre...,Xchouxer Store,27.99,https://m.media-amazon.com/images/I/511LXRAxI+...
3,d5a75f7b-b874-5757-9778-790a1f33be14,0.413489,"Tiita Comfy Saucer Chair, Soft Faux Fur Oversi...",Tiita Store,79.99,https://m.media-amazon.com/images/I/41O7mY3lUv...
4,ee91361a-5882-5bd8-9d08-0da8c1863bd8,0.410236,nimboo Kids Couch - Modular Kids Play Couch Se...,nimboo Store,99.95,https://m.media-amazon.com/images/I/51He1KLeOs...
