In [1]:
import chromadb
import os
import pandas as pd
from typing import List, Dict
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
import base64
import io
import numpy as np
from PIL import Image
from io import BytesIO

In [2]:
def create_chroma_client(path: str) -> chromadb.Client:
    """
    Build a disk-backed ChromaDB client.

    Args:
        path (str): Directory handed to ``chromadb.PersistentClient`` as the
            location of its persistent files.

    Returns:
        chromadb.Client: The client object produced by
            ``chromadb.PersistentClient``.
    """
    persistent_client = chromadb.PersistentClient(path=path)
    return persistent_client

In [3]:

def create_instance_embedding_function() -> OpenCLIPEmbeddingFunction:
    """
    Instantiate the OpenCLIP embedding function with its default arguments.

    Returns:
        OpenCLIPEmbeddingFunction: A newly constructed instance.
    """
    clip_embedder = OpenCLIPEmbeddingFunction()
    return clip_embedder


In [4]:
# Persistent Chroma client whose database files live in the ./chroma_db folder.
client = create_chroma_client(path="./chroma_db")

In [5]:
# OpenCLIP embedding function instance used when building the collection.
embedding_function = create_instance_embedding_function()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# The client is persistent (stored on disk), so the collection survives kernel
# restarts. `create_collection` would raise on a second run because the name
# already exists; `get_or_create_collection` reuses the existing collection and
# only creates it the first time, keeping this cell safe under
# Restart Kernel -> Run All.
collection = client.get_or_create_collection(
    name='zara_men_shoes',
    embedding_function=embedding_function,
)

In [7]:
# Number of records in the collection before anything is added in this run.
collection.count()

0

In [8]:
# Folder (relative to the notebook's working directory) holding the shoe images.
image_folder_path = "data/images/iso_men_shoes"

In [9]:
def get_image_uris(image_folder_path: str) -> List[str]:
    """
    List the files in a folder as forward-slash paths, in sorted order.

    ``os.listdir`` returns entries in arbitrary order, so the names are sorted
    to keep the result (and any positional IDs derived from it) reproducible
    between runs and machines. Sub-directories (for example a Jupyter
    ``.ipynb_checkpoints`` folder) are skipped because they cannot be opened
    as image files.

    Args:
        image_folder_path (str): Folder to scan. The scan is not recursive.

    Returns:
        List[str]: One path per file, built by joining the folder and the
            file name, with every backslash replaced by "/".

    Raises:
        FileNotFoundError: If ``image_folder_path`` does not exist.
    """
    if not os.path.exists(image_folder_path):
        raise FileNotFoundError(f"The folder '{image_folder_path}' does not exist.")

    image_uris = []
    for entry in sorted(os.listdir(image_folder_path)):
        full_path = os.path.join(image_folder_path, entry)
        # Only regular files can be images; directories are ignored.
        if os.path.isfile(full_path):
            image_uris.append(full_path.replace("\\", "/"))
    return image_uris


In [10]:
# Collect the image paths once; later cells reuse this list.
list_of_image_uris = get_image_uris(image_folder_path)
# Quick sanity check: total count plus the first five paths.
print(f"Number of images found: {len(list_of_image_uris)}")
print(f"Sample image URIs: {list_of_image_uris[:5]}")

Number of images found: 156
Sample image URIs: ['data/images/iso_men_shoes/01870444-021f-4b58-a7e5-193c90c189f9.jpg', 'data/images/iso_men_shoes/020eef1c-b590-43a1-aad8-ea5ba1a4e710.jpg', 'data/images/iso_men_shoes/02a6fcc1-67dd-4cd3-9a1d-37130bbb0741.jpg', 'data/images/iso_men_shoes/05d57587-40dc-4e4e-982f-d300e9c237d4.jpg', 'data/images/iso_men_shoes/062d4222-deb5-4c79-8c8d-ee37da3f566d.jpg']


In [11]:
def generate_images_ids(list_of_image_uris: List[str]) -> List[str]:
    """
    Produce one string ID per image, numbered by position.

    Args:
        list_of_image_uris (List[str]): Image paths; only their count is used.

    Returns:
        List[str]: The strings "0", "1", ... up to one less than the number
            of paths supplied.
    """
    total = len(list_of_image_uris)
    return list(map(str, range(total)))

In [12]:
# One ID per image, in the same order as list_of_image_uris.
ids = generate_images_ids(list_of_image_uris)
print(f"Generated IDs: {ids[:5]}")

Generated IDs: ['0', '1', '2', '3', '4']


In [13]:
def image_to_ndarray(image_path: str) -> np.ndarray:
    """
    Read an image file and return its pixel data as a NumPy array.

    The file is opened in binary mode, decoded with PIL, converted to RGB
    mode and passed to ``np.asarray``.

    Args:
        image_path (str): Path to the image file.

    Returns:
        np.ndarray: Array built from the RGB-converted image.

    Raises:
        ValueError: If opening, decoding or converting the file fails for any
            reason. The original exception is chained as the cause.
    """
    try:
        with open(image_path, "rb") as handle:
            rgb_image = Image.open(handle).convert("RGB")
            pixels = np.asarray(rgb_image)
    except Exception as err:
        raise ValueError(f"Failed to load image: {image_path}") from err
    return pixels


In [14]:

def image_to_base64(image_path: str) -> str:
    """
    Read a file's raw bytes and return them base64-encoded as text.

    Args:
        image_path (str): Path of the file to encode.

    Returns:
        str: Base64 representation of the file contents, decoded as UTF-8.
    """
    with open(image_path, "rb") as handle:
        encoded_bytes = base64.b64encode(handle.read())
    return encoded_bytes.decode("utf-8")

In [15]:
# Load every image in two forms: a decoded RGB array and a base64 string.
images_ndarray = []
b64_images = []

# Both lists follow the order of list_of_image_uris.
for img in list_of_image_uris:
    images_ndarray.append(image_to_ndarray(img))
    b64_images.append(image_to_base64(img))

# Both counts are expected to equal the number of image paths found.
print(f"Number of numpy images loaded: {len(images_ndarray)}")
print(f"Number of b64 images loaded: {len(b64_images)}")

Number of numpy images loaded: 156
Number of b64 images loaded: 156


In [16]:
# Inspect the array built from the first image.
images_ndarray[0]

array([[[240, 241, 246],
        [241, 242, 247],
        [242, 243, 248],
        ...,
        [241, 242, 247],
        [241, 242, 247],
        [241, 242, 247]],

       [[240, 241, 246],
        [240, 241, 246],
        [241, 242, 247],
        ...,
        [241, 242, 247],
        [241, 242, 247],
        [241, 242, 247]],

       [[240, 241, 246],
        [240, 241, 246],
        [241, 242, 247],
        ...,
        [241, 242, 247],
        [241, 242, 247],
        [241, 242, 247]],

       ...,

       [[240, 241, 246],
        [240, 241, 246],
        [240, 241, 246],
        ...,
        [243, 244, 249],
        [242, 243, 248],
        [240, 241, 246]],

       [[241, 242, 247],
        [241, 242, 247],
        [241, 242, 247],
        ...,
        [244, 245, 250],
        [242, 243, 248],
        [240, 241, 246]],

       [[242, 243, 248],
        [242, 243, 248],
        [242, 243, 248],
        ...,
        [244, 245, 250],
        [242, 243, 248],
        [241, 242, 247]]

In [17]:
# Show only the first 30 characters of the first base64 string.
b64_images[0][:30]

'/9j/4AAQSkZJRgABAQIAdgB2AAD/4g'

In [18]:
def get_image_id(uri: str) -> str:
    """
    Return the file name of a URI with its extension stripped.

    Args:
        uri (str): Path or URI of a file.

    Returns:
        str: The last path component without its final extension.
    """
    filename = os.path.basename(uri)
    stem, _extension = os.path.splitext(filename)
    return stem

In [19]:
def get_metadata_uris(path_csv: str, list_of_image_uris: List[str]) -> List[Dict]:
    """
    Build one metadata dict per image from a CSV file.

    The CSV is read once and indexed by its ``iso_image`` column, so each
    image is resolved with a single index lookup instead of a scan of the
    whole table. When several rows share an ``iso_image`` value, the first
    one is used.

    Args:
        path_csv (str): Path to the CSV file. It must provide the columns
            ``iso_image``, ``name``, ``description``, ``price`` and
            ``category``.
        list_of_image_uris (List[str]): Image paths. Each file name without
            its extension is matched against the ``iso_image`` column.

    Returns:
        List[Dict]: One dict per path, in the same order as
            ``list_of_image_uris``, with the keys ``name``, ``description``,
            ``price`` (float), ``category``, ``iso_image`` (the image's file
            name) and ``base64_image`` (base64 text of the file contents).

    Raises:
        ValueError: If an image has no matching row in the CSV.
    """
    df = pd.read_csv(path_csv)

    # Index the table once; keep="first" selects the first row per identifier
    # when the CSV contains duplicates.
    rows_by_id = (
        df.drop_duplicates(subset="iso_image", keep="first")
        .set_index("iso_image")
    )

    metadata_list = []
    for uri in list_of_image_uris:
        identifier = get_image_id(uri)

        # Encode the file before the lookup so an unreadable file is reported
        # ahead of a missing CSV row.
        base_img = image_to_base64(uri)

        if identifier not in rows_by_id.index:
            raise ValueError(f"No metadata found for image ID: {identifier}")

        row = rows_by_id.loc[identifier]

        metadata_list.append({
            "name": row["name"],
            "description": row["description"],
            "price": float(row["price"]),
            "category": row["category"],
            # Actual file name (identifier plus the file's real extension)
            # instead of assuming every image ends in ".jpg".
            "iso_image": os.path.basename(uri),
            "base64_image": base_img
        })

    return metadata_list


In [20]:
# Metadata dicts for every image, in the same order as list_of_image_uris.
metadatas = get_metadata_uris("csv/iso_men_shoes.csv", list_of_image_uris)

In [21]:
# Sanity check: count plus the first metadata dict.
# NOTE(review): the sample includes the full base64_image string, so this
# output is very long.
print(f" Number of metadatas loaded: {len(metadatas)}")
print(f"Sample metadatas: {metadatas[:1]}")

 Number of metadatas loaded: 156
Sample metadatas: [{'name': 'lug sole buckle boots', 'description': 'Biker boots. Smooth upper. Side and front buckle detail. Chunky lug soles.', 'price': 89.9, 'category': 'boots', 'iso_image': '01870444-021f-4b58-a7e5-193c90c189f9.jpg', 'base64_image': '/9j/4AAQSkZJRgABAQIAdgB2AAD/4gIcSUNDX1BST0ZJTEUAAQEAAAIMbGNtcwIQAABtbnRyUkdCIFhZWiAH3AABABkAAwApADlhY3NwQVBQTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLWxjbXMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApkZXNjAAAA/AAAAF5jcHJ0AAABXAAAAAt3dHB0AAABaAAAABRia3B0AAABfAAAABRyWFlaAAABkAAAABRnWFlaAAABpAAAABRiWFlaAAABuAAAABRyVFJDAAABzAAAAEBnVFJDAAABzAAAAEBiVFJDAAABzAAAAEBkZXNjAAAAAAAAAANjMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB0ZXh0AAAAAEZCAABYWVogAAAAAAAA9tYAAQAAAADTLVhZWiAAAAAAAAADFgAAAzMAAAKkWFlaIAAAAAAAAG+iAAA49QAAA5BYWVogAAAAAAAAYpkAALeFAAAY2lhZWiAAAAAAAAAkoAAAD4QAALbPY3VydgAAAAAAAAAaAAAAywHJA2MFkghrC/YQPxVRGzQh8Sm

In [22]:
# Insert every image with its ID and metadata; no precomputed embeddings are
# passed, only the image arrays.
collection.add(
    ids=ids,
    images=images_ndarray,
    metadatas=metadatas
    )



In [23]:
# Number of records in the collection after the insert above.
collection.count()

156

In [24]:
# Confirm which collection this handle points to.
collection.name

'zara_men_shoes'

In [26]:
# Query the image collection with a text prompt and ask for a single result.
collection.query(query_texts=['Military Boots'],  n_results=1)
  

{'ids': [['112']],
 'embeddings': None,
 'documents': [[None]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'description': 'High shaft boots. Lacing with eight pairs of eyelets. Back pull tab for ease. Rounded shape. Chunky lug soles.',
    'price': 89.9,
    'name': 'chunky sole canvas laceup boots',
    'iso_image': 'c2741986-658b-4b7c-a4cc-f42a0cc5dbaf.jpg',
    'category': 'boots',
    'base64_image': '/9j/4AAQSkZJRgABAQIAdgB2AAD/4gIcSUNDX1BST0ZJTEUAAQEAAAIMbGNtcwIQAABtbnRyUkdCIFhZWiAH3AABABkAAwApADlhY3NwQVBQTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLWxjbXMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApkZXNjAAAA/AAAAF5jcHJ0AAABXAAAAAt3dHB0AAABaAAAABRia3B0AAABfAAAABRyWFlaAAABkAAAABRnWFlaAAABpAAAABRiWFlaAAABuAAAABRyVFJDAAABzAAAAEBnVFJDAAABzAAAAEBiVFJDAAABzAAAAEBkZXNjAAAAAAAAAANjMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB0ZXh0AAAAAEZCAABYW