In [23]:
import os
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

# Locations of the persistent ChromaDB store and of the slide images
persist_directory = '../data/chromadb'
slides_path = '../data/slides'

collection_name = 'CLIP_slides_collection'

# Single source of truth for the CLIP checkpoint so model and processor cannot drift apart
clip_checkpoint = "openai/clip-vit-base-patch32"

# Open the on-disk ChromaDB store using the default settings, tenant and database
client = chromadb.PersistentClient(
    path=persist_directory,
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

# List all collections in ChromaDB
collections = client.list_collections()
print("Existing collections:")
for collection_n in collections:
    # NOTE(review): depending on the installed Chroma release, list_collections()
    # yields either plain names or Collection objects (see the Chroma migration
    # notes). Reading `.name` when present handles both without an extra
    # get_collection() call, and avoids rebinding the global `collection`.
    print(getattr(collection_n, "name", collection_n))

# Create the target collection on first run, or re-open it on later runs
collection = client.get_or_create_collection(collection_name)
print(f"Created or got collection '{collection_name}'")

# Load the CLIP model and its matching pre-processor from the same checkpoint
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)
print("Loaded CLIP model and processor")

# Function to embed an image using CLIP
def embed_image(image_path):
    """Compute the CLIP image embedding for a single image file.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        The output of ``model.get_image_features`` for this one image,
        squeezed and converted to a plain Python list so it can be handed
        to ChromaDB as an embedding.
    """
    # PIL opens files lazily and keeps the handle until the image is closed;
    # the context manager releases it once the processor has read the pixels,
    # so embedding many slides does not leak file descriptors.
    with Image.open(image_path) as image:
        inputs = processor(images=image, return_tensors="pt")
    # Inference only: skip gradient tracking
    with torch.no_grad():
        embeddings = model.get_image_features(**inputs)
    return embeddings.squeeze().tolist()

# Embed every PNG slide in the slides folder and store it in the collection.
# os.listdir() returns entries in arbitrary order, so sort to make the
# processing order (and the log below) reproducible between runs.
for filename in sorted(os.listdir(slides_path)):
    if not filename.endswith('.png'):
        continue
    # The text before the first underscore is used as the presentation name
    # (e.g. "AirBnB_Pitch_Deck_slide10.png" -> "AirBnB").
    presentation = filename.split('_')[0]
    image_path = os.path.join(slides_path, filename)
    embeddings = embed_image(image_path)
    # upsert instead of add: the filename is the record ID, so re-running this
    # cell against an already-populated collection refreshes the existing
    # records rather than colliding with them.
    collection.upsert(
        ids=[filename],  # Use filename as the unique ID
        documents=[filename],
        embeddings=[embeddings],
        metadatas=[{'presentation': presentation}]
    )
    print(f"Added {filename} to collection with presentation {presentation}")


Existing collections:
Created or got collection 'CLIP_slides_collection'
Loaded CLIP model and processor
Added AirBnB_Pitch_Deck_slide10.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide11.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide13.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide12.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide9.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide16.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide17.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide8.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide15.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide14.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide6.png to collection with presentation AirBnB
Added AirBnB_Pitch_Deck_slide18.png to collection with presentation AirBn

In [26]:
# if needed, delete the collection
# client.delete_collection(collection_name)

In [25]:
# Re-open the slide collection by name
collection = client.get_collection(collection_name)

# Look up the stored record for every PNG slide currently on disk
png_ids = [filename for filename in os.listdir(slides_path) if filename.endswith('.png')]
documents = collection.get(ids=png_ids, include=['documents', 'embeddings', 'metadatas'])

# Walk the parallel result lists and print a short summary of each record
records = zip(
    documents['ids'],
    documents['documents'],
    documents['embeddings'],
    documents['metadatas'],
)
for doc_id, doc, embedding, metadata in records:
    print(
        f"Collection: {collection_name}",
        f"ID: {doc_id}",
        f"Document: {doc}",
        f"Metadata: {metadata}",
        # Only the head of the vector is shown to keep the output readable
        f"Embedding (first 5 elements): {embedding[:5]}...",
        "-" * 50,
        sep="\n",
    )

Collection: CLIP_slides_collection
ID: AirBnB_Pitch_Deck_slide10.png
Document: AirBnB_Pitch_Deck_slide10.png
Metadata: {'presentation': 'AirBnB'}
Embedding (first 5 elements): [-0.29650497 -0.45568323  0.12390557  0.14443952 -0.04381865]...
--------------------------------------------------
Collection: CLIP_slides_collection
ID: AirBnB_Pitch_Deck_slide11.png
Document: AirBnB_Pitch_Deck_slide11.png
Metadata: {'presentation': 'AirBnB'}
Embedding (first 5 elements): [-0.31927934 -0.19493011  0.05249168  0.12463375  0.28774643]...
--------------------------------------------------
Collection: CLIP_slides_collection
ID: AirBnB_Pitch_Deck_slide13.png
Document: AirBnB_Pitch_Deck_slide13.png
Metadata: {'presentation': 'AirBnB'}
Embedding (first 5 elements): [-0.20980214 -0.07775372  0.08181545 -0.63865143  0.54446042]...
--------------------------------------------------
Collection: CLIP_slides_collection
ID: AirBnB_Pitch_Deck_slide12.png
Document: AirBnB_Pitch_Deck_slide12.png
Metadata: {'pre