In [1]:
import pandas as pd
import chromadb
import shutil
import os
import pandas as pd
from typing import List, Dict
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
import base64
import io
import numpy as np
from PIL import Image
from io import BytesIO

 Load CSV

In [2]:
df = pd.read_csv("csv/iso_men_shoes.csv")
df.head()

Unnamed: 0,name,description,price,currency,category,iso_image
0,chunky sole chelsea boots,Chelsea boots. Shaft with elastic goring on bo...,59.9,USD,boots,30d15b5c-bb33-4804-9c90-59b884a9be63
1,chunky sole sneakers,Sneakers. Upper in a combination of materials ...,49.9,USD,sneakers,a6e76df6-a31d-46f4-b44f-9c0118bf098b
2,squared toe leather boots,Leather ankle boots. Square shaped. Side zippe...,159.0,USD,boots,435cc093-08b5-48e0-91ba-cfce112d4105
3,polished leather penny loafers,Loafers. Made of leather. Smooth exterior. Orn...,119.0,USD,loafers,e1e53a03-4ecc-4925-9425-9fb8a614a097
4,monochrome chunky sole sneakers,Sneakers with tonal pieces at upper. Lacing wi...,59.9,USD,sneakers,083247f1-2158-46bc-a926-cacfd9a5537c


Reduce the size of the Dataframe

In [3]:
df_small = df.loc[:59]

Create Chromadb Client

In [4]:
def create_chroma_client(path: str) -> chromadb.Client:
    """
    Create a Chroma persistent client where database files are stored on disk.

    Args:
        path (str): Folder where ChromaDB will store its persistent files.

    Returns:
        chromadb.Client: ChromaDB client instance.
    """
    return chromadb.PersistentClient(path=path)

Create a OpenCLIP embedding function instance

In [5]:

def create_instance_embedding_function() -> OpenCLIPEmbeddingFunction:
    """Create an OpenCLIP embedding function instance."""
    return OpenCLIPEmbeddingFunction()


In [6]:
client = create_chroma_client(path="./chroma_db")

In [7]:
embedding_function = create_instance_embedding_function()

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
collection = client.create_collection(name='small_zara_men_shoes',embedding_function=embedding_function)            

In [9]:
collection.count() 

0

Create a new directory to save a small collection of images.

In [10]:
def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print(f"Directory {path} already exists.")

In [11]:
# Create the directory for 'small_iso_men_shoes' images
create_directory('data/images/small_iso_men_shoes')

Directory data/images/small_iso_men_shoes already exists.


Copy imagen from origin directory to  new directory

In [12]:

def copy_images_to_folder(df: pd.DataFrame, column:str, origin_image_path:str, destination_dir: str):
    
    """
    Copy images from a given origin directory to a destination directory based on a DataFrame column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the column with image references.
    column (str): The column name in the DataFrame containing the image references.
    origin_image_path (str): The path to the origin directory containing the images.
    destination_dir (str): The path to the destination directory where the images will be copied.

    Returns:
    None
    """

    image_references = []
    for lista in df[column]:
        image_references.append(lista)
    print(f"Total images to copy: {len(image_references)}")
    for reference in image_references:
        or_image_path = origin_image_path + reference + ".jpg"
        shutil.copy2(or_image_path, destination_dir)

In [13]:
copy_images_to_folder(df_small, 'iso_image', 'data/images/iso_men_shoes/', 'data/images/small_iso_men_shoes/')

Total images to copy: 60


In [14]:
image_folder_path = "data/images/small_iso_men_shoes"

Get list of uris of images from  'data/images/small_iso_men_shoes'

In [16]:
def get_image_uris(image_folder_path: str) -> List[str]:
    """Return a list of image URIs from a directory."""
    if not os.path.exists(image_folder_path):
        raise FileNotFoundError(f"The folder '{image_folder_path}' does not exist.")

    return [
        os.path.join(image_folder_path, img).replace("\\", "/")
        for img in os.listdir(image_folder_path)
    ]

In [17]:
list_of_image_uris = get_image_uris(image_folder_path)
print(f"Number of images found: {len(list_of_image_uris)}")
print(f"Sample image URIs: {list_of_image_uris[:5]}")

Number of images found: 60
Sample image URIs: ['data/images/small_iso_men_shoes/02a6fcc1-67dd-4cd3-9a1d-37130bbb0741.jpg', 'data/images/small_iso_men_shoes/0676dd1d-7c68-4824-b213-1b7b06002297.jpg', 'data/images/small_iso_men_shoes/083247f1-2158-46bc-a926-cacfd9a5537c.jpg', 'data/images/small_iso_men_shoes/08826527-9bed-4b99-8c78-e7aea39326ed.jpg', 'data/images/small_iso_men_shoes/1aa15975-5f98-46af-825b-2d3f3b8013d9.jpg']


Get the unique ID strings for each image

In [18]:
def generate_images_ids(list_of_image_uris: List[str]) -> List[str]:
    """Generate unique ID strings for each image."""
    return [str(i) for i in range(len(list_of_image_uris))]

In [19]:
ids = generate_images_ids(list_of_image_uris)
print(f"Generated IDs: {ids[:5]}")

Generated IDs: ['0', '1', '2', '3', '4']


Get a list of numpy array images and b64 images from my list_of_image_uris

In [20]:
def image_to_ndarray(image_path: str) -> np.ndarray:
    """Convert an image file to a NumPy ndarray.
    
    Args:
        image_path (str): Path to the image file.
    
    Returns:
        np.ndarray: The image as a NumPy ndarray.
    """
    try:
        with open(image_path, "rb") as f:
            img = Image.open(f).convert("RGB")
            return np.asarray(img)
    except Exception as e:
        raise ValueError(f"Failed to load image: {image_path}") from e

In [21]:
def image_to_base64(image_path: str) -> str:
    """
    Convert an image at the given path to a base64-encoded string.

    Args:
        image_path (str): The path to the image file.

    Returns:
        str: The base64-encoded image string.
    """
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()
        base64_image = base64.b64encode(image_data).decode("utf-8")
    return base64_image

In [22]:
images_ndarray = []
b64_images = []

for img in list_of_image_uris:
    images_ndarray.append(image_to_ndarray(img))
    b64_images.append(image_to_base64(img))

print(f"Number of numpy images loaded: {len(images_ndarray)}")
print(f"Number of b64 images loaded: {len(b64_images)}")

Number of numpy images loaded: 60
Number of b64 images loaded: 60


Get metadata and ID of each Image

In [23]:
def get_image_id(uri: str) -> str:
    """Extract the base filename (without extension) from a URI."""
    return os.path.splitext(os.path.basename(uri))[0]

In [24]:
def get_metadata_uris(path_csv: str, list_of_image_uris: List[str]) -> List[Dict]:
    """Load metadata for each image from a CSV file."""
    df = pd.read_csv(path_csv)
    metadata_list = []

    for uri in list_of_image_uris:
        identifier = get_image_id(uri)

        # Get the base64 image
        base_img = image_to_base64(uri)

        # Safe lookup
        row = df[df["iso_image"] == identifier]
        if row.empty:
            raise ValueError(f"No metadata found for image ID: {identifier}")

        row = row.iloc[0]

        metadata_list.append({
            "name": row["name"],
            "description": row["description"],
            "price": float(row["price"]),
            "category": row["category"],
            "iso_image": row["iso_image"] + ".jpg",
            "base64_image": base_img
        })

    return metadata_list


In [25]:
metadatas = get_metadata_uris("csv/iso_men_shoes.csv", list_of_image_uris)
print(f"Number of metadatas loaded: {len(metadatas)}")
print(f"Sample metadatas: {metadatas[:1]}")

Number of metadatas loaded: 60
Sample metadatas: [{'name': 'suede boat shoes', 'description': 'Wallabee style shoes. Made of leather with suede finish. Tonal seamed ridge detail. Lacing with two pairs of eyelets. Welt around upper. Contrasting chunky sole.', 'price': 69.9, 'category': 'shoes', 'iso_image': '02a6fcc1-67dd-4cd3-9a1d-37130bbb0741.jpg', 'base64_image': '/9j/4AAQSkZJRgABAQIAdgB2AAD/4gIcSUNDX1BST0ZJTEUAAQEAAAIMbGNtcwIQAABtbnRyUkdCIFhZWiAH3AABABkAAwApADlhY3NwQVBQTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLWxjbXMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApkZXNjAAAA/AAAAF5jcHJ0AAABXAAAAAt3dHB0AAABaAAAABRia3B0AAABfAAAABRyWFlaAAABkAAAABRnWFlaAAABpAAAABRiWFlaAAABuAAAABRyVFJDAAABzAAAAEBnVFJDAAABzAAAAEBiVFJDAAABzAAAAEBkZXNjAAAAAAAAAANjMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB0ZXh0AAAAAEZCAABYWVogAAAAAAAA9tYAAQAAAADTLVhZWiAAAAAAAAADFgAAAzMAAAKkWFlaIAAAAAAAAG+iAAA49QAAA5BYWVogAAAAAAAAYpkAALe

Ingestion : Add numpy array images, ids and metadata to  Chroma Collection DataBase

In [26]:
collection.add(
    ids=ids,
    images=images_ndarray,
    metadatas=metadatas
    )

In [27]:
collection.count()

60

In [28]:
collection.name

'small_zara_men_shoes'

Text Query Search

In [29]:
collection.query(query_texts=['Military Boots'],  n_results=1)

{'ids': [['20']],
 'embeddings': None,
 'documents': [[None]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'base64_image': '/9j/4AAQSkZJRgABAQIAdgB2AAD/4gIcSUNDX1BST0ZJTEUAAQEAAAIMbGNtcwIQAABtbnRyUkdCIFhZWiAH3AABABkAAwApADlhY3NwQVBQTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLWxjbXMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAApkZXNjAAAA/AAAAF5jcHJ0AAABXAAAAAt3dHB0AAABaAAAABRia3B0AAABfAAAABRyWFlaAAABkAAAABRnWFlaAAABpAAAABRiWFlaAAABuAAAABRyVFJDAAABzAAAAEBnVFJDAAABzAAAAEBiVFJDAAABzAAAAEBkZXNjAAAAAAAAAANjMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB0ZXh0AAAAAEZCAABYWVogAAAAAAAA9tYAAQAAAADTLVhZWiAAAAAAAAADFgAAAzMAAAKkWFlaIAAAAAAAAG+iAAA49QAAA5BYWVogAAAAAAAAYpkAALeFAAAY2lhZWiAAAAAAAAAkoAAAD4QAALbPY3VydgAAAAAAAAAaAAAAywHJA2MFkghrC/YQPxVRGzQh8SmQMhg7kkYFUXdd7WtwegWJsZp8rGm/fdPD6TD////bAIQABQUFBQUFBgYGBggJCAkIDAsKCgsMEg0ODQ4NEhsRFBERFBEbGB0YFhgdGCsiHh4