In [1]:
import json
import urllib.error
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import tqdm
import tqdm.auto

# Absolute locations of the data and image folders, both taken relative to the
# parent of the current working directory.
data_dir = Path("..", "data").absolute()
images_dir = Path("..", "images").absolute()

# Number of product records to sample (and the row count of the embedding matrix).
SAMPLE_SIZE = 2_000

In [2]:
# Load the product table from the data folder and preview ten random rows.
products_path = data_dir / "product_images.parquet"
df = pd.read_parquet(products_path)
df.sample(n=10)

Unnamed: 0,asin,title,primary_image
3339,B07J3BWTDZ,Under Armour Women's Tech V-Neck Twist Short-S...,https://m.media-amazon.com/images/I/414fCX0YIb...
93493,B08QMZJQVR,AOHITE Women's Knit Turtleneck Dress Balloon S...,https://m.media-amazon.com/images/I/51g8YUci3t...
68420,B087Q4Y2RV,Heavy Duty Large Rolling Garment Rack Stainles...,https://m.media-amazon.com/images/I/41Ahst8wn0...
499,B07VQY3QFF,New Balance Men's 608 V5 Casual Comfort Cross ...,https://m.media-amazon.com/images/I/418wOvAI4y...
90496,B091TYK8GQ,BEAUTEX Fleece Throw Blanket for Couch Sofa or...,https://m.media-amazon.com/images/I/51vSSiTBmg...
69917,B08CHNMXJ2,Washable Kids Makeup Girl Toys - Non Toxic Rea...,https://m.media-amazon.com/images/I/61nAYRECd0...
38592,B07XKYRYT5,DWVO Radiator Compatible with Ford F-150 F-200...,https://m.media-amazon.com/images/I/51BOgmVbLM...
2096,B07RL5SKDW,New Balance Women's Fresh Foam Roav V1 Sneaker...,https://m.media-amazon.com/images/I/31EG99piKL...
80688,B092M3XXQT,SheIn Women's Cloak Long Sleeve Mini Cape Dres...,https://m.media-amazon.com/images/I/41eNrBzFel...
44084,B08MWXWCZZ,Speedo Women's Swimsuit One Piece Zip Front Lo...,https://m.media-amazon.com/images/I/41GPlW64oB...


## (One-time) Sample IDs and download their images from the Amazon dataset

In [None]:
# Sample some records.
# A fixed seed makes the draw repeatable: re-running this cell selects the same
# products instead of silently overwriting original_clip_ids.json with a new
# sample that no longer matches the images already downloaded.
SAMPLE_SEED = 42
sampled_df = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Store the sampled ids so later cells (and later kernel sessions) can reuse them.
clip_ids = sampled_df["asin"].tolist()
with (data_dir / "original_clip_ids.json").open("w") as f:
    json.dump(clip_ids, f)


In [None]:
# Download the sampled records' images.
# urlretrieve does not create the destination folder, so make sure it exists.
images_dir.mkdir(parents=True, exist_ok=True)

image_urls = list(sampled_df["primary_image"])
failed_urls = []
for img_url in tqdm.auto.tqdm(image_urls):
    # Only the last path segment is needed, hence maxsplit=1.
    local_file = images_dir / img_url.rsplit("/", 1)[-1]
    try:
        urllib.request.urlretrieve(img_url, local_file)
    except urllib.error.URLError as err:
        # URLError also covers HTTPError and ContentTooShortError.
        # Some images cannot be fetched; skip them instead of aborting the whole
        # download, and remove any partial file so it is not mistaken for a
        # valid image by the embedding step.
        if local_file.exists():
            local_file.unlink()
        failed_urls.append((img_url, str(err)))

print(f"downloaded {len(image_urls) - len(failed_urls)} of {len(image_urls)} images")


## Generate embeddings for the sampled products (titles or images)

In [None]:
# A NOTE FOR THE REVIEWER:
#   Embedding 2000 images and 2000 texts in one run requires tens of GBs.
#   Assuming the goal is not to solve this performance problem, I worked around it by
#   embedding the texts and the images in two different runs of the notebook (resetting it
#   in between) and saving the results to two different files.
#   Which of the two is embedded is controlled by the EMBEDDING_TYPE const.

import sys, os
import torch
import clip
import enum

@enum.unique
class EmbeddingType(enum.Enum):
    """Which modality a single run of the notebook embeds.

    The member value is used as the infix of the saved file name
    (``clip_<value>_emb.npy``).
    """

    EMB_TEXT = "txt"
    EMB_IMAGES = "img"


# Active modality for this run; swap with the commented line below to embed images.
EMBEDDING_TYPE = EmbeddingType.EMB_TEXT
# EMBEDDING_TYPE = EmbeddingType.EMB_IMAGES

# Put the parent directory on sys.path (once) so the project-local `src`
# package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
from src import embedding

# Load the list of ids that the sampling step wrote to disk.
original_ids_path = data_dir / "original_clip_ids.json"
embedding_ids = json.loads(original_ids_path.read_text())

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Generate embeddings for the titles or the images, whichever EMBEDDING_TYPE selects.

# Look-up table keyed by asin, built once instead of scanning the whole frame for
# every sampled id. keep="first" reproduces the previous `.iloc[0]` choice should an
# asin occur more than once.
records_by_asin = df.drop_duplicates(subset="asin", keep="first").set_index("asin")

count = 0  # number of rows of clip_emb filled so far
# 512 columns: presumably the embedding width produced by src.embedding — TODO confirm
clip_emb = np.zeros(shape=(SAMPLE_SIZE, 512))
verified_embedding_ids = []
for sample_id in tqdm.auto.tqdm(embedding_ids[:SAMPLE_SIZE]):
    sample_record = records_by_asin.loc[sample_id]
    title = sample_record["title"]
    img_url = sample_record["primary_image"]
    # Only the last path segment is the file name, hence maxsplit=1.
    img_file_name = images_dir / img_url.rsplit("/", 1)[-1]

    # NOTE FOR REVIEWER: not all images can be downloaded from Amazon, hence only the verified ones are embedded
    if not os.path.exists(img_file_name):
        continue

    verified_embedding_ids.append(sample_id)
    if EMBEDDING_TYPE == EmbeddingType.EMB_TEXT:
        clip_emb[count] = embedding.embed_text(model, device, title)
    elif EMBEDDING_TYPE == EmbeddingType.EMB_IMAGES:
        clip_emb[count] = embedding.embed_image(model, preprocess, device, img_file_name)
    count += 1

# Drop the unused trailing rows: products without a downloaded image were skipped,
# so only the first `count` rows hold embeddings. Without this the saved matrix has
# SAMPLE_SIZE rows (the extras all zeros) while clip_ids.json lists only `count` ids.
clip_emb = clip_emb[:count]
assert len(clip_emb) == len(verified_embedding_ids)

# save the embeddings
np.save(data_dir / f"clip_{EMBEDDING_TYPE.value}_emb.npy", clip_emb)

# store the actually embedded sampled ids (row i of the saved matrix belongs to id i)
print(f"there are {len(verified_embedding_ids)} embeddings")
with (data_dir / "clip_ids.json").open("w") as f:
    json.dump(verified_embedding_ids, f)


## Show a sample of the embedded product titles

In [5]:
# Load the ids of the products that were actually embedded.
verified_ids_path = data_dir / "clip_ids.json"
verified_embedding_ids = json.loads(verified_ids_path.read_text())

# Preview ten random rows among the embedded products.
is_embedded = df["asin"].isin(verified_embedding_ids)
display(df.loc[is_embedded].sample(n=10))

Unnamed: 0,asin,title,primary_image
89867,B098D7QN6C,BGment Grey Bedroom Blackout Curtains 84 Inch ...,https://m.media-amazon.com/images/I/51k2DZiBqi...
76076,B09DSS93YX,JW PEI Women's Gabbi Ruched Hobo Handbag (Cora...,https://m.media-amazon.com/images/I/31yvbnLl8u...
24734,B08BND9BFK,New Balance Baby Fresh Foam Roav V1 Bungee Run...,https://m.media-amazon.com/images/I/411LOIJTir...
78308,B09DPR11NS,iPhone 12 13 Fast Charger [Apple MFi Certified...,https://m.media-amazon.com/images/I/31hAsFWRyS...
46595,B01IP89LVW,Rokinon Xeen 135mm T2.2 Professional Cine Lens...,https://m.media-amazon.com/images/I/51G7mwK1Fe...
55408,B08ZXDLHJW,WAOWAO Baby Gate Extra Wide Pressure Mounted W...,https://m.media-amazon.com/images/I/51eCGy2cPF...
59648,B073VC7R4B,Sidefeel Women Off The Shoulder Ruffles Party ...,https://m.media-amazon.com/images/I/41yiBC-laq...
41871,B08XN3Q447,NUVISION LIGHTING Pair H4 9003 Brightest LED H...,https://m.media-amazon.com/images/I/513GsFITPP...
19235,B079FYSSMF,Disney Junior Doc McStuffins Pet Rescue 8.5 In...,https://m.media-amazon.com/images/I/41qeBvzieX...
67364,B094CL27QS,"Yoga Mat Double-Sided Non Slip, 72'' x 32'' x ...",https://m.media-amazon.com/images/I/41MgCYYRV5...
