In [29]:
import chromadb
from pathlib import Path
from pydantic import BaseModel  
from datetime import datetime
import json
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import os

class VideoDescriptionMetadata(BaseModel):
    """Record describing one video description stored in the vector DB.

    ``add_directory_to_vector_db`` builds one instance per video sub-folder,
    and ``write_video_description_to_vector_db`` stores the ``model_dump()``
    of that instance as the metadata of the ChromaDB entry.
    """
    # Unique key of the entry, built as "<folder_name>_<file_name>".
    id: str
    # Name of the per-video sub-folder the description was read from.
    file_name: str
    # Name of the directory that contains the per-video sub-folders.
    folder_name: str
    # Full text read from the sub-folder's video_description.txt.
    video_description: str
    # "location" value taken from the sub-folder's metadata.json.
    location: str
    # "created" value from metadata.json, converted with str().
    created: str
    # "duration" value from metadata.json (unit is not shown here; TODO confirm).
    duration: int

# Sentence-embedding model, instantiated once when this cell runs and used as
# the default `embedding_model` argument of create_embedding().
EMBEDDING_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

def create_embedding(text: str, embedding_model=EMBEDDING_MODEL):
    """Encode `text` with `embedding_model` and return it as plain Python lists.

    Args:
        text: The text to embed.
        embedding_model: Object exposing ``encode(text)`` whose result has a
            ``tolist()`` method. Defaults to the module-level
            ``EMBEDDING_MODEL``.

    Returns:
        The value of ``embedding_model.encode(text).tolist()``.
    """
    encoded = embedding_model.encode(text)
    return encoded.tolist()


# Directory that backs the on-disk ChromaDB store; created on first run.
chroma_db_path = Path("./chromadb")
chroma_db_path.mkdir(exist_ok=True)

# PersistentClient keeps its data under `chroma_db_path`
# (use chromadb.EphemeralClient() instead for an in-memory store).
chroma_client = chromadb.PersistentClient(path=str(chroma_db_path))

# NOTE(review): 0o777 makes the store directory readable/writable/traversable
# by every local user. Confirm this is required (e.g. a shared volume);
# otherwise a tighter mode would be safer.
chroma_db_path.chmod(0o777)

# Create (or get existing) collection
collection = chroma_client.get_or_create_collection(name="video_description")

def write_video_description_to_vector_db(collection, embedding: list, metadata: VideoDescriptionMetadata):
    """Add one video description, with its embedding and metadata, to `collection`.

    Args:
        collection: Object exposing ``add(ids=, documents=, embeddings=, metadatas=)``.
        embedding: Embedding vector stored for the description.
        metadata: Record whose ``id`` becomes the entry id, whose
            ``video_description`` becomes the stored document and whose
            ``model_dump()`` becomes the entry metadata.

    Returns:
        None. A confirmation line is printed after the entry is added.
    """
    entry = {
        "ids": [metadata.id],
        "documents": [metadata.video_description],
        "embeddings": [embedding],
        "metadatas": [metadata.model_dump()],
    }
    collection.add(**entry)
    print("Data successfully added to ChromaDB!")

def id_exists_in_vector_db(collection, id: str) -> bool:
    """Report whether `collection` already holds an entry with the given id.

    Args:
        collection: Object exposing ``get(ids=[...])`` that returns a mapping
            with an ``'ids'`` entry.
        id: Entry id to look up.

    Returns:
        True when the lookup returns a non-empty ``'ids'`` value, else False.
    """
    matched_ids = collection.get(ids=[id])['ids']
    return True if matched_ids else False

def add_directory_to_vector_db(collection, directory_path: Path):
    """Index every video sub-folder of `directory_path` into the vector DB.

    Each immediate sub-directory of `directory_path` is treated as one video
    and must contain:

    * ``video_description.txt`` - the text that is embedded and stored, and
    * ``metadata.json`` - a JSON object with the keys ``location``,
      ``created`` and ``duration``.

    The entry id is ``"<directory name>_<sub-folder name>"``. Sub-folders whose
    id is already present in `collection` are skipped without reading their
    files. Non-directory entries of `directory_path` are ignored.

    Args:
        collection: Vector DB collection passed on to
            ``id_exists_in_vector_db`` and
            ``write_video_description_to_vector_db``.
        directory_path: Directory whose immediate sub-directories are indexed.

    Raises:
        FileNotFoundError: If a not-yet-indexed sub-folder lacks one of the
            two expected files.
        KeyError: If ``metadata.json`` lacks one of the expected keys.
    """
    # Only sub-directories represent videos. Stray files (e.g. .DS_Store) must
    # not be opened as folders, and the progress total must match what is
    # actually iterated. Sorting makes the processing order reproducible.
    video_folders = sorted(
        entry for entry in directory_path.iterdir() if entry.is_dir()
    )
    # Every sub-folder shares the same parent, so resolve its name once.
    folder_name = directory_path.name

    for folder_path in tqdm(video_folders, total=len(video_folders)):
        file_name = folder_path.name
        entry_id = f"{folder_name}_{file_name}"

        # Check before touching the disk: already-indexed folders need no I/O.
        if id_exists_in_vector_db(collection=collection, id=entry_id):
            print(f"{entry_id} already present in vector db. Skipping the writing process.")
            continue

        # Explicit utf-8, matching metadata.json, so the result does not
        # depend on the platform's default encoding.
        video_description = (folder_path / "video_description.txt").read_text(
            encoding="utf-8"
        )

        with open(folder_path / "metadata.json", "r", encoding="utf-8") as file:
            metadata = json.load(file)

        video_description_metadata = VideoDescriptionMetadata(
            id=entry_id,
            file_name=file_name,
            folder_name=folder_name,
            video_description=video_description,
            location=metadata["location"],
            created=str(metadata["created"]),
            duration=metadata["duration"],
        )
        embedding = create_embedding(text=video_description)

        write_video_description_to_vector_db(
            collection=collection,
            embedding=embedding,
            metadata=video_description_metadata,
        )

    print(f"{directory_path.name} added to vector db")
    

In [30]:
# Index every video sub-folder found under ./save_dir/Switz into `collection`.
directory_path = Path("./save_dir/Switz")
add_directory_to_vector_db(collection=collection, directory_path=directory_path)

  0%|          | 0/1 [00:00<?, ?it/s]

Data successfully added to ChromaDB!
Switz added to vector db
