In [23]:
import pandas as pd 
import numpy as np 

In [24]:
df = pd.read_csv('ProductStructuredInfo.csv', index_col=0)

In [25]:
def get_image_urls(row):
    image_urls = row['image_urls']
    image_urls = [url for url in image_urls.split("'") if "http" in url]
    return image_urls

In [26]:
import torch
import clip
from PIL import Image
from io import BytesIO
import requests

# Set up device and load the CLIP model and its image preprocessing pipeline.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def get_image_embedding(image_paths):
    """
    Given a list of image paths for an item, this function:
    - Loads and preprocesses each image using CLIP's preprocessing.
    - Uses CLIP's image encoder to produce an embedding for each image.
    - Aggregates the embeddings (here, by averaging) to produce a single embedding.
    - Normalizes the resulting embedding so its length is 1.
    """
    embeddings = []
    for path in image_paths:
        # Open the image and convert to RGB in case it's grayscale or another format.
        response = requests.get(path)
        image = Image.open(BytesIO(response.content)).convert("RGB")
        # Preprocess the image (resize, crop, normalize, etc.) and add a batch dimension.
        image_input = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            # Encode the image to get its embedding.
            image_embedding = model.encode_image(image_input)
        embeddings.append(image_embedding)
    
    # Concatenate embeddings along the batch dimension.
    embeddings = torch.cat(embeddings, dim=0)
    # Aggregate embeddings by taking the mean across all images.
    aggregated_embedding = embeddings.mean(dim=0, keepdim=True)
    # Normalize the aggregated embedding.
    aggregated_embedding = aggregated_embedding / aggregated_embedding.norm(dim=-1, keepdim=True)
    
    return aggregated_embedding

def get_image_embedding_for_item(row):
    """
    Given a row of the DataFrame, this function retrieves the image URLs,
    computes the image embeddings, and returns the aggregated embedding.
    """
    image_urls = [get_image_urls(row)[0]]
    if not image_urls:
        return torch.zeros(1, 512).to(device)
    return get_image_embedding(image_urls)

In [32]:
rows = df.to_dict('records')

In [35]:
from multiprocessing import Pool, cpu_count

# your “lambda” – turn it into a named function
def process_row(row_dict):
    print(row_dict)
    # row_dict is a plain dict of your columns
    # e.g. return your_lambda(row_dict)
    image_urls = get_image_urls(row_dict)
    if not image_urls:
        return torch.zeros(1, 512).to(device)
    else:
        return get_image_embedding(image_urls)

# 2) spin up a pool
with Pool(processes=cpu_count()) as pool:
    # 3) map your function onto all rows
    results = pool.map(process_row, rows)

Process SpawnPoolWorker-63:
Process SpawnPoolWorker-66:
Traceback (most recent call last):
Traceback (most recent call last):
Process SpawnPoolWorker-65:
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/multiprocessing/queues.py", line 389, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_row' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 1

KeyboardInterrupt: 

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Function to process a single row with a timeout mechanism
def process_row_with_timeout(row):
    try:
        start_time = time.time()
        image_urls = get_image_urls(row)
        if not image_urls:
            return torch.zeros(1, 512).to(device)
        # Check if processing exceeds 10 seconds
        if time.time() - start_time > 10:
            print(f"Skipping row {row.name} due to timeout.")
            return None
        return get_image_embedding(image_urls)
    except Exception as e:
        print(f"Error processing row {row.name}: {e}")
        return None

# Process rows in parallel
embeddings_list = []
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(get_image_embedding, row): row for _, row in df.iterrows()}
    for future in as_completed(futures):
        result = future.result()
        if result is not None:
            embeddings_list.append(result)

# Combine embeddings into a single tensor
embeddings = torch.cat(embeddings_list, dim=0) if embeddings_list else torch.empty(0, 512)

# Replace NaN cosine similarity values with 0
cosine_similarity_matrix = embeddings @ embeddings.T if embeddings.size(0) > 0 else torch.empty(0, 0)
cosine_similarity_matrix = torch.nan_to_num(cosine_similarity_matrix, nan=0.0)

MissingSchema: Invalid URL 'beams plus multi stripe pocket t-shirt': No scheme supplied. Perhaps you meant https://beams plus multi stripe pocket t-shirt?

In [None]:
import multiprocessing
from multiprocessing import Pool, TimeoutError
from tqdm import tqdm
import torch

# Your existing helper functions:
#   get_image_urls(row)
#   get_image_embedding(image_urls)
# And `device` is already defined.

def process_row(row_dict):
    """
    Given a row as a dict, compute its embedding (or zero vector on failure).
    """
    # Turn it back into a Series if you need the original API:
    # row = pd.Series(row_dict)
    image_urls = get_image_urls(row_dict)
    if not image_urls:
        return torch.zeros(1, 512).to(device)
    return get_image_embedding(image_urls)


# 1) Make each row picklable
rows = df.to_dict("records")

# 2) Spin up a pool using all CPUs (you can set `processes=` manually)
with Pool(processes=multiprocessing.cpu_count()) as pool:
    # 3) Launch all rows asynchronously
    async_results = [pool.apply_async(process_row, (row,)) for row in rows]

    # 4) Collect embeddings, enforcing a 10 s timeout per row
    embeddings_list = []
    for async_res in tqdm(async_results, desc="Processing rows"):
        try:
            emb = async_res.get(timeout=10)  # waits up to 10 s for this row
        except TimeoutError:
            # took too long → skip / zero‐vector
            emb = torch.zeros(1, 512).to(device)
        except Exception:
            # any other error → zero‐vector
            emb = torch.zeros(1, 512).to(device)

        embeddings_list.append(emb)

# 5) Stack and compute similarity as before
embeddings = torch.cat(embeddings_list, dim=0)  # (num_items, 512)
cosine_similarity_matrix = embeddings @ embeddings.T
cosine_similarity_matrix = torch.nan_to_num(cosine_similarity_matrix, nan=0.0)

In [22]:
from tqdm import tqdm

# Compute embeddings for all items.
embeddings_list = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):

    image_urls = get_image_urls(row)
    if not image_urls:
        embeddings_list.append(torch.zeros(1, 512).to(device))
    else:
        embeddings_list.append(get_image_embedding(image_urls))

embeddings = torch.cat(embeddings_list, dim=0)  # Shape: (num_items, embedding_dim)

# Since embeddings are normalized, cosine similarity is just the dot product.
cosine_similarity_matrix = embeddings @ embeddings.T

# Replace the nan cosine similarity values with 0
cosine_similarity_matrix = torch.nan_to_num(cosine_similarity_matrix, nan=0.0)

Processing rows:  32%|███▏      | 4790/14912 [14:47:42<31:15:52, 11.12s/it]


KeyboardInterrupt: 

In [21]:
df.iloc[398]

product_name                       acne studios olando fluid viscose jacket
category                                                           clothing
sub-category                                                coats & jackets
specific_category                                                       NaN
brand                                                          acne-studios
fashion_season                                                          NaN
primary_colour                                                         navy
secondary_colour                                                        NaN
pattern                                                               plain
fabric                                                              viscose
price                                                                 600.0
price_category                                                       luxury
size                                                                    NaN
fit         

In [18]:
df.iloc[398]['image_urls'][0]

'['

In [None]:
import requests

requests.get("https://media.endclothing.com/media/f_auto,q_auto:eco,w_200/prodmedia/media/catalog/product/1/3/13-09-2024-LB_B90785-BG3_1_1.jpg").content


b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00\x84\x00\x06\x06\x06\x06\x07\x06\x07\x08\x08\x07\n\x0b\n\x0b\n\x0f\x0e\x0c\x0c\x0e\x0f\x16\x10\x11\x10\x11\x10\x16"\x15\x19\x15\x15\x19\x15"\x1e$\x1e\x1c\x1e$\x1e6*&&*6>424>LDDL_Z_||\xa7\x01\x06\x06\x06\x06\x07\x06\x07\x08\x08\x07\n\x0b\n\x0b\n\x0f\x0e\x0c\x0c\x0e\x0f\x16\x10\x11\x10\x11\x10\x16"\x15\x19\x15\x15\x19\x15"\x1e$\x1e\x1c\x1e$\x1e6*&&*6>424>LDDL_Z_||\xa7\xff\xc2\x00\x11\x08\x00\xc8\x00\xc8\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x003\x00\x01\x00\x00\x07\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x08\x07\t\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\xff\xda\x00\x0c\x03\x01\x00\x02\x10\x03\x10\x00\x00\x00\xea\x90\x00\x00\x004\x1c\xf7\x06\xeeu6\xcd\xc3t-\xfab\xd46\xfc\x02\x00\x00\x00\x00\x00\x01\xab\x1bO\x8f\xf8G\x99u\x9b\x1e\xb3S%\xb9\x83\xb0\xde\xf1<\xf5\xb0t\xb7.[w\xc7\xd0\xd7\x11\xf5g\x9f{\x80\xc0\x00\x00\x00\x0

In [None]:
df['image_embeddings'] = [embeddings_list[i].cpu().numpy() for i in range(len(embeddings_list))]
df.to_csv('ProductStructuredInfoImageEmbeddings.csv', index=False)

NameError: name 'embeddings_list' is not defined

In [None]:
top_k = 3  # Adjust this to see more similar items.
for i in [150, 300, 1200, 2500, 5000]:
    # Get similarity scores for the i-th item.
    sim_scores = cosine_similarity_matrix[i]
    # Sort indices in descending order of similarity.
    # Exclude the item itself (which will have a similarity of 1.0).
    similar_indices = torch.topk(sim_scores, top_k + 1).indices.cpu().numpy()
    similar_indices = [idx for idx in similar_indices if idx != i][:top_k]
    print(f"Item {i} (descriptors: {df.iloc[i]['image_urls'].split("'")[1]}) is similar to items: {[(idx, df.iloc[idx]['image_urls'].split("'")[1]) for idx in similar_indices]}")