# storage

A project to explore using ChromaDB databases.

Based on the Real Python tutorial.

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from ragsc.markdown import MarkdownDirectory
from loguru import logger
import sys
import dotenv

# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read from os.environ further down.
dotenv.load_dotenv()

In [None]:
# Settings shared by the cells below.

# Directory for the persistent Chroma client (that client is currently commented out).
CHROMA_DATA_PATH = "chroma_data/"

# Model name for the SentenceTransformer embedding option (currently commented out).
EMBED_MODE = "all-MiniLM-L6-v2"

# Name of the collection that is reset, filled and queried below.
COLLECTION_NAME = "ragsc"

In [None]:
# Persistent alternative, kept for reference (would use CHROMA_DATA_PATH as its path):
# client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
#
# Use a transient client for now; the persistent variant above is disabled.
#
client = chromadb.Client()

In [None]:
import os

# Alternative embedding back-ends tried earlier, kept for reference:
# embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODE)
# embedding_func = embedding_functions.OpenAIEmbeddingFunction(model_name="text-embedding-3-large", api_key=os.environ["OPENAI_API_KEY"])

# Active choice: OpenAI "text-embedding-ada-002". The key is read straight from
# the environment, so a missing OPENAI_API_KEY raises KeyError on this line.
embedding_func = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-embedding-ada-002",
)

In [None]:
# Replace loguru's existing handlers with a single stderr sink at INFO level.
logger.remove()
logger.add(sys.stderr, level="INFO")

# Start from a clean slate: drop any existing collection with this name so the
# ingestion step below does not add to stale data.
try:
    collection = client.get_collection(COLLECTION_NAME)
    client.delete_collection(collection.name)
except ValueError:
    # Presumably raised when the collection does not exist yet; nothing to delete.
    # The name is passed as a format argument: loguru only substitutes "{}"
    # placeholders when arguments are supplied, so the previous literal
    # "{COLLECTION_NAME}" was logged verbatim.
    logger.info("No collection named {}", COLLECTION_NAME)

# (Re)create the collection with the chosen embedding function.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,  # type: ignore
    metadata={"hnsw:space": "cosine"},  # cosine distance for the HNSW index
)

In [None]:
from pathlib import Path

# Point MarkdownDirectory at the local "data" directory and push its contents
# into the Chroma collection.
# NOTE(review): max_pages=500 presumably caps how many pages are read — confirm
# against ragsc.markdown.MarkdownDirectory.
data_dir = Path("data")
folder = MarkdownDirectory(data_dir, max_pages=500)
folder.store_in_chroma(collection=collection)

In [None]:
# Ask two questions in one call. Each question gets up to 10 hits, and hits
# whose document text contains "gagnon" are excluded.
query_results = collection.query(
    query_texts=[
        "what is the status of the expansion of Gatineau ",
        "how many students will there be in next year's class",
    ],
    n_results=10,
    include=["documents", "metadatas"],
    where_document={"$not_contains": "gagnon"},
    # Disabled experiment, kept for reference (an "$or" filter on the
    # "keywords" / "present" fields):
    # where_document={"$or": [  # type: ignore
    #         {
    #             "keywords": {"$contains": "ugme"}
    #         },
    #         {
    #             "present": {"$contains": "mondou"}
    #         }
    #     ]
    # },
)

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# "documents" entry of the query result.
query_results["documents"]

In [None]:
# Print, for each entry under "metadatas", the list of "filename" values of
# its items. A missing (None) "metadatas" entry prints nothing.
metadata = query_results["metadatas"]  # type: ignore
for hits in metadata or []:
    filenames = [hit["filename"] for hit in hits]
    print(filenames)