In [1]:

import IPython
import os
import pickle
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import io
import random



In [2]:
# Create a directory to save the downloaded images
file_path = "/".join(
        IPython.extract_module_locals()[1]["__vsc_ipynb_file__"].split("/")[-5:]
    )

dir_path = os.path.dirname(os.path.realpath(file_path))

index = -1
while(file_path[index] != '/'):
    index -= 1

file_path = file_path[:index]

image_dir = f"product_images"
os.makedirs(image_dir, exist_ok=True)


In [3]:
#Import dataset
dataset_path = f"dataset.csv"
dataset = pd.read_csv(dataset_path)

dataset.head(), dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12041 entries, 0 to 12040
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   category_1      12041 non-null  object 
 1   category_2      12041 non-null  object 
 2   category_3      12041 non-null  object 
 3   title           12023 non-null  object 
 4   product_rating  11961 non-null  float64
 5   selling_price   12013 non-null  object 
 6   mrp             11666 non-null  object 
 7   seller_name     11825 non-null  object 
 8   seller_rating   11827 non-null  float64
 9   description     5021 non-null   object 
 10  highlights      6560 non-null   object 
 11  image_links     12041 non-null  object 
dtypes: float64(2), object(10)
memory usage: 1.1+ MB


(               category_1 category_2 category_3  \
 0  Sports, Books and More     Sports   Cricket    
 1  Sports, Books and More     Sports   Cricket    
 2  Sports, Books and More     Sports   Cricket    
 3  Sports, Books and More     Sports   Cricket    
 4  Sports, Books and More     Sports   Cricket    
 
                                                title  product_rating  \
 0  ITWOSERVICES CRICKET NET 100X10 CRICKET NET NY...             4.4   
 1  ITWOSERVICES CRICKET NET GROUND BOUNDARY NET 1...             4.4   
 2  VICTORY Medium Weight ( Pack of 1 ) Rubber Cri...             3.7   
 3  VICTORY Cricket Wind Ball (Pack of 1) - Made i...             3.8   
 4  CEAT Hitman Full Size Double Blade Poplar Cric...             3.4   
 
   selling_price     mrp     seller_name  seller_rating  \
 0        ₹1,615  ₹4,000      I2SERVICES            4.4   
 1          ₹152    ₹600      I2SERVICES            4.4   
 2           ₹59    ₹199  VictoryOutlets            4.7   
 3        

In [4]:
# Filter valid image links
valid_image_links = [url for url in dataset['image_links'].dropna().unique() if url.startswith("http")]

valid_image_links

['https://rukminim1.flixcart.com/image/612/612/xif0q/net/s/f/l/cricket-net-100x10-cricket-net-nylon-hdpe-i2services19-10-original-imagj78bgyphbt7v.jpeg?q=70',
 'https://rukminim1.flixcart.com/image/612/612/xif0q/net/c/u/m/cricket-net-ground-boundary-net-10x10-feet-i2services10-3-048-original-imagj5d4ypybak3h.jpeg?q=70',
 'https://rukminim1.flixcart.com/image/612/612/xif0q/ball/m/b/8/110-medium-weight-pack-of-1-rubber-cricket-tennis-ball-standard-original-imagk6zzxzz6uvez.jpeg?q=70',
 'https://rukminim1.flixcart.com/image/612/612/klb78nk0/wicket/7/n/o/heavy-cricket-kit-15-age-group-victory-original-imagygyqncgf8t6a.jpeg?q=70',
 'https://rukminim1.flixcart.com/image/612/612/jmp79u80/bat/p/d/f/1-2-long-handle-hitman-full-size-double-blade-poplar-cricket-bat-original-imaf9jagezy8zf7r.jpeg?q=70',
 'https://rukminim1.flixcart.com/image/612/612/xif0q/ball/k/g/g/110-cricket-wind-ball-pack-of-1-made-in-india-smooth-cricket-original-imagk6zysvatjvck.jpeg?q=70',
 'https://rukminim1.flixcart.com/i

In [5]:
# Step 1: Extract Image Embeddings and Cache Locally using ResNet
EMBEDDING_CACHE = "embeddings_cache.pkl"
embeddings_cache = {}

In [6]:
# Define a transformation pipeline for images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
transform

Compose(
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=True)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)

In [7]:
# Load a pre-trained ResNet model
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model = torch.nn.Sequential(*(list(model.children())[:-1]))  # Remove the classification layer
model.eval()


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [8]:
f = open(EMBEDDING_CACHE, "rb")

In [9]:
# Load cached embeddings if available
if os.path.exists(EMBEDDING_CACHE) and os.path.getsize(EMBEDDING_CACHE) > 0:
    with open(EMBEDDING_CACHE, "rb") as f:
        embeddings_cache = pickle.load(f)
    print("Embeddings cache loaded successfully.")
else:
    print("No cache!!!")

Embeddings cache loaded successfully.


In [10]:
def extract_embedding_from_url(image_url):
    # Check cache first
    if image_url in embeddings_cache:
        return embeddings_cache[image_url]
    
    try:
        # Use requests.get() without stream to ensure full content is downloaded
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # Raise an exception for bad HTTP responses
        
        # Use BytesIO to create a file-like object
        image_data = io.BytesIO(response.content)
        
        # Open image with explicit error handling
        try:
            image = Image.open(image_data).convert("RGB")
        except UnidentifiedImageError:
            print(f"Could not identify image from {image_url}")
            return None
        
        # Apply transformations
        image = transform(image).unsqueeze(0)  # Add batch dimension
        
        with torch.no_grad():
            embedding = model(image).squeeze().numpy()
        
        # Cache the embedding
        embeddings_cache[image_url] = embedding
        return embedding
    
    except requests.RequestException as e:
        print(f"Error fetching image from {image_url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error processing {image_url}: {e}")
        return None

In [11]:
# Save the cache after each run
def save_embeddings_cache():
    with open(EMBEDDING_CACHE, "wb") as f:
        pickle.dump(embeddings_cache, f)

In [12]:
progress_index = 0
progress_length = len(valid_image_links)

# for url in valid_image_links:
#     extract_embedding_from_url(url)
#     print(f"{progress_index}/{progress_length}: extracting embeddings from {url}")
#     progress_index += 1

def extract_embedding_multithread(image_links, max_thread = 10):
    with ThreadPoolExecutor(max_workers=max_thread) as executor:
        results = list(tqdm(
            executor.map(extract_embedding_from_url, image_links),
            total = len(image_links),
            desc = "Extracting embeddings"
        ))
    total_images = len(image_links)
    successful_embeddings = sum(1 for result in results if result is not None)
    failed_count = total_images - successful_embeddings
    print(f"Extraction complete. "
                 f"Total images: {total_images}, "
                 f"Successful: {successful_embeddings}, "
                 f"Failed: {failed_count}")

extract_embedding_multithread(valid_image_links)

Extracting embeddings: 100%|██████████| 11133/11133 [00:00<00:00, 522481.16it/s]

Extraction complete. Total images: 11133, Successful: 11133, Failed: 0





In [13]:
save_embeddings_cache()

In [14]:
# res = random.choice(valid_image_links)
# print(str(res))
# len(res)

In [15]:
# Step 2: Image Similarity Matching Logic

# Function to find similar images from the cache
# Args:
#   query_image_url (str): URL of the query image to compare.
#   image_urls (list, optional): List of additional image URLs (not currently used).
#   top_k (int): Number of top similar images to retrieve. Default is 5.
# Returns:
#   list: A list of top_k similar image URLs from the cache.

def find_similar_images_from_urls(query_image_url, image_urls=None, top_k=5):
    # Step 2.1: Extract embedding for the query image
    print("Extracting query image embedding...")
    query_embedding = extract_embedding_from_url(query_image_url)
    if query_embedding is None:  # Handle invalid or failed extraction
        return []

    embeddings = []  # List to store cached embeddings
    valid_image_urls = []  # List to store corresponding image URLs

    # Step 2.2: Load embeddings from the cache
    print("Loading embeddings from cache...")
    for url, embedding in embeddings_cache.items():
        embeddings.append(embedding)  # Append embedding to the list
        valid_image_urls.append(url)  # Append corresponding URL

    # Step 2.3: Check if the cache is empty
    if not embeddings:
        print("No embeddings available in the cache.")
        return []

    # Step 2.4: Compute cosine similarity between the query embedding and cached embeddings
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    print(similarities.shape)

    # Step 2.5: Sort similarities and retrieve indices of the top_k most similar embeddings
    top_indices = similarities.argsort()[-top_k:][::-1]  # Sort in descending order
    print("\nTop Similar Images:")
    for i, idx in enumerate(top_indices, 1):
        print(f"{i}. Image: {valid_image_links[idx]}")
        print(f"   Similarity Score: {similarities[idx]:.4f}")

    # Step 2.6: Save the updated cache (if any updates were made elsewhere)
    save_embeddings_cache()

res = random.choice(valid_image_links)
print(res)
find_similar_images_from_urls(res)


https://rukminim1.flixcart.com/image/612/612/xif0q/sunglass/c/n/b/free-size-cat-eye-kingsunglasses-original-imaggv5hefwr25qg.jpeg?q=70
Extracting query image embedding...
Loading embeddings from cache...
(11133,)

Top Similar Images:
1. Image: https://rukminim1.flixcart.com/image/612/612/xif0q/sunglass/c/n/b/free-size-cat-eye-kingsunglasses-original-imaggv5hefwr25qg.jpeg?q=70
   Similarity Score: 1.0000
2. Image: https://rukminim1.flixcart.com/image/612/612/l55nekw0/sunglass/k/d/t/this-product-is-sold-as-small-by-the-brand-this-product-is-sold-original-imagfwahwwcgvy9k.jpeg?q=70
   Similarity Score: 0.8816
3. Image: https://rukminim1.flixcart.com/image/612/612/j7m7y4w0/sunglass/f/h/t/free-size-p357bk2-fastrack-original-imaexth2vqtbyrth.jpeg?q=70
   Similarity Score: 0.8406
4. Image: https://rukminim1.flixcart.com/image/612/612/xif0q/smart-glass/w/v/0/glares-s-ambrane-original-imaghyfv5bfbft3m.jpeg?q=70
   Similarity Score: 0.8391
5. Image: https://rukminim1.flixcart.com/image/612/612/k

In [16]:

# Step 2: Build Similarity Search Engines
# # Build FAISS index
# def build_faiss_index(embeddings):
#     dimension = embeddings[0].shape[0]
#     index = faiss.IndexFlatL2(dimension)  # L2 distance
#     faiss_embeddings = np.vstack(embeddings)
#     index.add(faiss_embeddings)
#     return index

# # Build HNSW index
# def build_hnsw_index(embeddings, space="cosine"):
#     dimension = embeddings[0].shape[0]
#     index = hnswlib.Index(space=space, dim=dimension)
#     index.init_index(max_elements=len(embeddings), ef_construction=200, M=16)
#     for i, embedding in enumerate(embeddings):
#         index.add_items(embedding, i)
#     index.set_ef(50)  # Controls recall quality at query time
#     return index

# Function to find similar images using cosine similarity
def find_similar_cosine(query_embedding, embeddings, top_k=5):
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    top_indices = similarities.argsort()[-top_k:][::-1]
    return top_indices, similarities

# Step 3: Find Similar Images from Cache
def find_similar_images(query_image_url, top_k=20):
    print("Extracting query image embedding...")
    query_embedding = extract_embedding_from_url(query_image_url)
    if query_embedding is None:
        return []

    embeddings = []
    valid_image_urls = []

    print("Loading embeddings from cache...")
    for url, embedding in embeddings_cache.items():
        embeddings.append(embedding)
        valid_image_urls.append(url)

    if not embeddings:
        print("No embeddings available in the cache.")
        return []

    embeddings = np.vstack(embeddings)
    query_embedding = np.array(query_embedding)

    top_indices, similarities = find_similar_cosine(query_embedding, embeddings, top_k)
    # if method == "cosine":
    #     # Cosine Similarity Search
    # elif method == "faiss":
    #     # FAISS Search
    #     faiss_index = build_faiss_index(embeddings)
    #     _, top_indices = faiss_index.search(np.expand_dims(query_embedding, axis=0), top_k)
    #     top_indices = top_indices[0]
    #     similarities = None
    # elif method == "hnsw":
    #     # HNSW Search
    #     hnsw_index = build_hnsw_index(embeddings)
    #     labels, _ = hnsw_index.knn_query(query_embedding, k=top_k)
    #     top_indices = labels[0]
    #     similarities = None
    # else:
    #     raise ValueError("Invalid method. Choose from ['cosine', 'faiss', 'hnsw']")

    # save_embeddings_cache()  # Save updated cache

    print("Top similar images:")
    
    for i, index in enumerate(top_indices, 1):
        print(f"{index}.{similarities[index]}: {valid_image_urls[index]}")
    return [valid_image_urls[i] for i in top_indices]

# Example usage
query_image_url = random.choice(valid_image_links) # Replace with a query image URL
query_image_url = "https://thumbs.dreamstime.com/b/beach-ball-12760024.jpg"
print(query_image_url)
similar_images = find_similar_images(query_image_url)  # Options: "cosine", "faiss", "hnsw"
print("Top similar images:")
for img_url in similar_images:
    print(img_url)


https://thumbs.dreamstime.com/b/beach-ball-12760024.jpg
Extracting query image embedding...
Loading embeddings from cache...
Top similar images:
11133.0.9999997615814209: https://thumbs.dreamstime.com/b/beach-ball-12760024.jpg
8089.0.8672886490821838: https://rukminim1.flixcart.com/image/612/612/k6pd7680/inflatable-product/c/u/8/beach-ball-transparent-blow-up-water-ball-pool-toys-games-arsh-original-imafp3m4ahybjf2z.jpeg?q=70
125.0.8192352056503296: https://rukminim1.flixcart.com/image/612/612/kp4difk0/ball/r/h/y/420-470-5-storm-niviafb353-football-nivia-original-imag3fdvg5vrezfc.jpeg?q=70
155.0.8164737820625305: https://rukminim1.flixcart.com/image/612/612/k44hksw0/ball/7/p/y/400-5-28-germany-machine-stitched-football-with-black-28-cm-pump-original-imafn2yy9mbv3wep.jpeg?q=70
1.0.8079051375389099: https://rukminim1.flixcart.com/image/612/612/klb78nk0/wicket/7/n/o/heavy-cricket-kit-15-age-group-victory-original-imagygyqncgf8t6a.jpeg?q=70
119.0.8017953634262085: https://rukminim1.flixcar