# storage

A project to explore using ChromaDB databases.

Based on the Real Python tutorial.

In [1]:
import os
import sys
from pathlib import Path

import chromadb
import dotenv
from chromadb.utils import embedding_functions
from loguru import logger

from ragsc.markdown import MarkdownDirectory

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is what supplies OPENAI_API_KEY, which is read
# from os.environ further down — confirm the .env file defines it.
dotenv.load_dotenv()

True

In [2]:
# Notebook-wide settings; edit here rather than inside individual cells.
CHROMA_DATA_PATH: str = "chroma_data/"  # on-disk location for a persistent Chroma client
EMBED_MODE: str = "all-MiniLM-L6-v2"  # model name for the SentenceTransformer embedding option
COLLECTION_NAME: str = "ragsc"  # name of the Chroma collection used throughout

In [3]:
# Use a transient client for now, so the collection has to be rebuilt after
# every kernel restart.
#
# To keep the data between sessions, switch to the persistent client instead:
# client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
client = chromadb.Client()

In [5]:
# Embedding function handed to the Chroma collection when it is created.
# The API key is read from the environment; os.environ[...] raises KeyError
# if OPENAI_API_KEY is not set, which fails fast instead of sending an empty key.
#
# Alternatives tried earlier (kept for reference):
# embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODE)
# embedding_func = embedding_functions.OpenAIEmbeddingFunction(model_name="text-embedding-3-large", api_key=os.environ["OPENAI_API_KEY"])
embedding_func = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-ada-002",
    api_key=os.environ["OPENAI_API_KEY"],
)

In [13]:
# Send loguru output to stderr at INFO level. remove() comes first so that
# re-running this cell does not stack up duplicate handlers.
logger.remove()
logger.add(sys.stderr, level="INFO")

# Start from an empty collection: drop any existing one with this name.
try:
    collection = client.get_collection(COLLECTION_NAME)
    client.delete_collection(collection.name)
except ValueError:
    # NOTE(review): assumes this chromadb version reports a missing collection
    # with ValueError — confirm if the library is upgraded.
    # The name is passed as a loguru format argument; a plain (non-f) string
    # with no arguments would log the literal text "{COLLECTION_NAME}".
    logger.info("No collection named {}", COLLECTION_NAME)

collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,  # type: ignore
    metadata={"hnsw:space": "cosine"},  # use cosine distance for the index
)

In [14]:
# Load the Markdown notes and store them in the Chroma collection.
# NOTE(review): max_pages presumably caps how many pages are read — confirm
# against ragsc.markdown.MarkdownDirectory.
DATA_DIR = Path("data")
MAX_PAGES = 500

folder = MarkdownDirectory(DATA_DIR, max_pages=MAX_PAGES)
folder.store_in_chroma(collection=collection)

[32m2024-05-05 15:14:21.958[0m | [1mINFO    [0m | [36mragsc.markdown[0m:[36m__init__[0m:[36m171[0m - [1mRead 500 Markdown pages[0m
100%|██████████| 150/150 [00:39<00:00,  3.76it/s]
[32m2024-05-05 15:15:01.842[0m | [1mINFO    [0m | [36mragsc.markdown[0m:[36mstore_list_of_pages[0m:[36m227[0m - [1mProcessed 703 chunks out of 703 chunks[0m
100%|██████████| 150/150 [00:37<00:00,  3.96it/s]
[32m2024-05-05 15:15:39.682[0m | [1mINFO    [0m | [36mragsc.markdown[0m:[36mstore_list_of_pages[0m:[36m227[0m - [1mProcessed 817 chunks out of 817 chunks[0m
100%|██████████| 150/150 [00:36<00:00,  4.09it/s]
[32m2024-05-05 15:16:16.391[0m | [1mINFO    [0m | [36mragsc.markdown[0m:[36mstore_list_of_pages[0m:[36m227[0m - [1mProcessed 744 chunks out of 744 chunks[0m
100%|██████████| 50/50 [00:11<00:00,  4.45it/s]
[32m2024-05-05 15:16:27.643[0m | [1mINFO    [0m | [36mragsc.markdown[0m:[36mstore_list_of_pages[0m:[36m227[0m - [1mProcessed 251 chunks out o

In [36]:
# Both questions are submitted in a single query call.
query_texts = [
    "what is the status of the expansion of Gatineau ",
    "how many students will there be in next year's class",
]

# Document-text filter: exclude chunks whose text contains "gagnon".
document_filter = {"$not_contains": "gagnon"}

# Earlier experiment, not active (kept for reference):
# where_document={"$or": [{"keywords": {"$contains": "ugme"}}, {"present": {"$contains": "mondou"}}]}  # type: ignore

query_results = collection.query(
    query_texts=query_texts,
    n_results=10,
    where_document=document_filter,  # type: ignore
    include=["documents", "metadatas"],
)

In [37]:
# Display the retrieved chunk texts (a list containing one inner list of documents).
query_results["documents"]

[['advisory committee\ncan advertize\ncmq gatineau\nseptember 14\nthe folks there can handle it',
  'comments of the aire de détente in gatineau\neveryone has to find something that needs to be reported',
  'gabay-20230417.md\nmodular\nluc desbiens\ntext stefane that we are prioritized\ngot permission to negotiate\nnovember 2023\nplan b for gmf-u\nramping up the number of inscriptions\nwe are announcing that we are pulling patients ouest\npointe claire\nhopital ste-anne',
  'ministerial announcement about gmf-u\ninfrastructure piece is nto yet confirmed\nsent in our proposal just before the holidays\nlm to meet with bruno about this, but it is luc desbiens who has to make the call',
  'no strings attached funding for 2 years\ncan hire 5 people to work on his project\nalready connected to people at mcgill\ncitf\nwe are supposed to finish in march 2023\nasked for an extension to complete studies\njean-louis moreau',
  'danyèle is willing to help but has too many meetings\nmaniwaki\nmovin

In [38]:
# Print the source filenames of the hits, one list per entry in "metadatas".
metadata = query_results["metadatas"]  # type: ignore
for data in metadata or []:  # "metadatas" may be None, in which case nothing is printed
    filenames = [entry["filename"] for entry in data]
    print(filenames)

['dove-20220726.md', 'melanie-20220622.md', 'gabay-20230417.md', 'mcvey-20220117.md', 't_evans-20220920.md', 'melanie-20230510.md', 'lacombe-20220826.md', 'gilles-20221220.md', 'doresavard-20220126.md', 'regina-20230606.md']
['mann-20230213.md', 'gagnon-20220815.md', 'melanie-20220112.md', 'melanie-20220112.md', 'feldman-20220209.md', 'melanie-20220419.md', 'mcvey-20220321.md', 'farhan-20230703.md', 'demetra-20221122.md', 'opatrny-20220729.md']
