In [2]:
!pip install open-clip-torch faiss-cpu --quiet

In [3]:
# Mount Google Drive so the catalog spreadsheet and image archive are readable
# from the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install openpyxl --quiet

In [5]:
# 📂 STEP 3: Import libraries (paths are defined in later cells)
import os
import pandas as pd
import torch
import open_clip
import faiss
import pickle
import numpy as np
from PIL import Image
from tqdm import tqdm

In [6]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLIP ViT-B/32 with the OpenAI weights; the middle return value is discarded.
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")

# open_clip hands the model back in train mode; this notebook only does inference,
# so switch to eval mode. Chaining keeps the cell from echoing the model repr.
model = model.to(device).eval()

# Tokenizer matching the ViT-B-32 text tower.
tokenizer = open_clip.get_tokenizer("ViT-B-32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Load the product catalog spreadsheet from Drive into a DataFrame.
catalog_xlsx_path = "/content/drive/MyDrive/Colab Notebooks/product_data.xlsx"
products_df = pd.read_excel(catalog_xlsx_path)


In [8]:
!unzip -q "/content/drive/MyDrive/Colab Notebooks/images.zip" -d "/content/drive/MyDrive/Colab Notebooks/images"


replace /content/drive/MyDrive/Colab Notebooks/images/images/116103_11358.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace /content/drive/MyDrive/Colab Notebooks/images/images/116103_11359.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: none
replace /content/drive/MyDrive/Colab Notebooks/images/images/116103_11360.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [9]:
# Directory containing the extracted product images. The archive is unpacked into
# ".../images" and itself contains an "images/" folder, hence the doubled segment.
image_dir = "/content/drive/MyDrive/Colab Notebooks/images/images"

In [10]:
def row_to_text(row):
    """Flatten one catalog row into a single descriptive string.

    Each non-null field is rendered as "<column>: <value>" and the pieces are
    joined with single spaces, in the order of the row's index.

    Args:
        row: A pandas Series, e.g. one row yielded by DataFrame.iterrows().

    Returns:
        The joined text; an empty string if every field is null or the row is empty.
    """
    fields = ((name, row[name]) for name in row.index)
    return " ".join(f"{name}: {val}" for name, val in fields if pd.notnull(val))


In [12]:
# Map each product ID to one image filename. Files are named "<id>_<suffix>.jpg",
# so everything before the first underscore is treated as the product ID.
available_images = {}
# os.listdir() returns entries in arbitrary order; sorting makes the image picked
# for a product with several files the same on every run.
for f in sorted(os.listdir(image_dir)):
    if f.endswith(".jpg"):
        base_id = f.split("_")[0]  # Extract the ID before the underscore
        # Keep only the first (lexicographically smallest) file per ID.
        available_images.setdefault(base_id, f)

# # Embed
# embeddings = []
# meta_data = []

# print("🔁 Generating embeddings from local images...")

# for idx, row in tqdm(products_df.iterrows(), total=len(products_df)):
#     product_id = str(row["id"])

#     matching_file = available_images.get(product_id)
#     if not matching_file:
#         continue

#     try:
#         image_path = os.path.join(image_dir, matching_file)
#         image = Image.open(image_path).convert("RGB")
#         image_input = preprocess(image).unsqueeze(0).to(device)

#         text = f"{row['title']} {row.get('category', '')}"
#         text_input = tokenizer(text).unsqueeze(0).to(device)

#         with torch.no_grad():
#             image_feat = model.encode_image(image_input)
#             text_feat = model.encode_text(text_input)
#             combined = (image_feat + text_feat) / 2
#             combined = combined / combined.norm(dim=-1, keepdim=True)

#         embeddings.append(combined.cpu().squeeze().numpy())
#         meta_data.append(row.to_dict())

#     except Exception as e:
#         print(f"❌ Failed to process ID {row['id']}: {e}")
# Build one joint image+text embedding per product. Products with no image on
# disk are skipped; any other failure is reported and the loop moves on.
embeddings = []
meta_data = []

for _, row in tqdm(products_df.iterrows(), total=len(products_df)):
    try:
        image_name = available_images.get(str(row["id"]))
        if not image_name:
            # No image file for this product ID: leave it out of the catalog.
            continue

        # Text side: every non-null column rendered as "column: value".
        caption = row_to_text(row)

        # Image side: load as RGB, apply the CLIP preprocessing, add a batch dimension.
        picture = Image.open(os.path.join(image_dir, image_name)).convert("RGB")
        image_input = preprocess(picture).unsqueeze(0).to(device)

        # The tokenizer output is already batched, so it must NOT be unsqueezed.
        text_input = tokenizer(caption).to(device)

        # Encode both modalities without tracking gradients.
        with torch.no_grad():
            image_feat = model.encode_image(image_input)
            text_feat = model.encode_text(text_input)

            # NOTE(review): the two features are averaged *before* normalisation, so
            # the modality with the larger raw norm weighs more — confirm this is
            # intended and matches how queries are embedded.
            fused = (image_feat + text_feat) / 2
            fused = fused / fused.norm(dim=-1, keepdim=True)

        embeddings.append(fused.cpu().squeeze().numpy())
        meta_data.append(row.to_dict())

    except Exception as e:
        print(f"❌ Failed to process ID {row['id']}: {e}")


100%|██████████| 969/969 [02:23<00:00,  6.77it/s]


In [13]:
# Sanity check: how many products ended up with an embedding.
embedded_count = len(embeddings)
print(f"Total successful embeddings: {embedded_count}")

Total successful embeddings: 967


In [14]:

# Build and save the FAISS index
if not embeddings:
    # np.vstack would fail with a less helpful message on an empty list.
    raise ValueError("No embeddings were generated; nothing to index.")

# Stack the per-product vectors into one (n_products, dim) float32 matrix,
# the dtype FAISS expects.
embedding_matrix = np.vstack(embeddings).astype("float32")

# Inner-product index; the vectors were L2-normalised when they were built,
# so inner product ranks results by cosine similarity.
index = faiss.IndexFlatIP(embedding_matrix.shape[1])
# The vectors must be added explicitly, otherwise an empty index is written to disk.
index.add(embedding_matrix)
# Row i of the index must correspond to meta_data[i].
assert index.ntotal == len(meta_data), "index size and metadata length differ"

os.makedirs("utils", exist_ok=True)
faiss.write_index(index, "utils/faiss_catalog.index")
with open("utils/catalog_meta.pkl", "wb") as f:
    pickle.dump(meta_data, f)

print(f"\n✅ Embedded and saved FAISS index for {len(meta_data)} products.")

# 🎁 Zip FAISS output for download
import shutil
shutil.make_archive("faiss_output", 'zip', "utils")
print("📦 Zipped faiss_output.zip and ready for download.")

# ⬇️ Download result from Colab
from google.colab import files
files.download("faiss_output.zip")


✅ Embedded and saved FAISS index for 967 products.
📦 Zipped faiss_output.zip and ready for download.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>