In [1]:
# Import the modules this notebook needs:
# chromadb is used to interact with the Chroma database,
# embedding_functions is used to define the embedding model,
# and pprint (standard library) pretty-prints the raw query results

import chromadb
from chromadb.utils import embedding_functions
from pprint import pprint 

In [2]:
# Build the embedding function, backed by a SentenceTransformers model

embedding_model_name = "all-MiniLM-L6-v2"
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embedding_model_name
)

In [3]:
# Create a Chroma client; the collection used in this notebook is
# created through it.
# NOTE(review): chromadb.Client() with no arguments is presumably the
# in-memory (non-persistent) client — confirm against the installed version.

client = chromadb.Client()

In [4]:
# Name of the collection this notebook works with; it is passed to the
# client when the collection is set up in a later cell

collection_name = "my_grocery_collection"

In [5]:
# Get the collection with the specified name, creating it if it does not
# exist yet. create_collection raises when the name is already taken, so
# re-running this cell against the same client would fail;
# get_or_create_collection keeps the cell idempotent.
# The collection is configured with cosine distance for its HNSW index
# and with the SentenceTransformer embedding function `ef`.

collection = client.get_or_create_collection(
    name=collection_name,
    metadata={"description": "A collection for storing grocery data"},
    configuration={
        "hnsw": {"space": "cosine"},
        "embedding_function": ef
    }
)

print(f"Collection created: {collection.name}")

Collection created: my_grocery_collection


In [6]:
# Sample grocery item descriptions used as the documents in this example

texts = [
    "fresh red apples",
    "organic bananas",
    "ripe mangoes",
    "whole wheat bread",
    "farm-fresh eggs",
    "natural yogurt",
    "frozen vegetables",
    "grass-fed beef",
    "free-range chicken",
    "fresh salmon fillet",
    "aromatic coffee beans",
    "pure honey",
    "golden apple",
    "red fruit",
]

In [7]:
# Build one unique ID per entry in 'texts', in the same order.
# IDs look like 'food_<n>' with <n> counting up from 1.

ids = [f"food_{position}" for position in range(1, len(texts) + 1)]

In [8]:
# Insert every grocery text into the collection under its matching ID.
# All items carry the same metadata, so one separate copy of it is built
# per document.
# No embeddings are passed here; ChromaDB is expected to generate them
# with the embedding function configured on the collection.

shared_metadata = {"source": "grocery_store", "category": "food"}
item_metadatas = [dict(shared_metadata) for _ in texts]

collection.add(ids=ids, documents=texts, metadatas=item_metadatas)

In [9]:
# Fetch the stored items; `get` is called with no arguments, so no ID or
# filter restricts what comes back

all_items = collection.get()

# Report how many documents were returned as a quick sanity check

stored_documents = all_items["documents"]
print(
    "Collection contents:",
    f"Number of documents: {len(stored_documents)}",
    sep="\n",
)

Collection contents:
Number of documents: 14


In [10]:
# Search terms to look up; a bare string is wrapped in a list so that
# `query_texts` always receives a list of queries

query_term = ["red", "fresh"]
query_term = [query_term] if isinstance(query_term, str) else query_term

# Ask the collection for the stored documents most similar to each term

results = collection.query(
    n_results=3,  # top 3 results per query term
    query_texts=query_term,
)
print(f"Query results for '{query_term}':")
pprint(results)

Query results for '['red', 'fresh']':
{'data': None,
 'distances': [[0.3132774829864502, 0.45399630069732666, 0.7393019199371338],
               [0.4773761034011841, 0.4854104518890381, 0.6252565979957581]],
 'documents': [['red fruit', 'fresh red apples', 'golden apple'],
               ['fresh red apples', 'farm-fresh eggs', 'pure honey']],
 'embeddings': None,
 'ids': [['food_14', 'food_1', 'food_13'], ['food_1', 'food_5', 'food_12']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[{'category': 'food', 'source': 'grocery_store'},
                {'category': 'food', 'source': 'grocery_store'},
                {'category': 'food', 'source': 'grocery_store'}],
               [{'category': 'food', 'source': 'grocery_store'},
                {'category': 'food', 'source': 'grocery_store'},
                {'category': 'food', 'source': 'grocery_store'}]],
 'uris': None}


In [11]:
# Print a readable summary of the hits for each query term.
# `results` holds one inner list per query, so the ids, distances and
# documents for a given query sit at the same position in each field.
# print is used rather than pprint: pprint shows the repr of a string,
# which wraps every output line in quotes.
# The "Score" shown is the distance returned by the query, so a lower
# value means a closer match.

top_k = 3  # number of hits shown per query term

for term, hit_ids, hit_distances, hit_texts in zip(
    query_term, results['ids'], results['distances'], results['documents']
):
    print(f'Top {top_k} similar documents to "{term}":')
    # Walk the parallel per-query lists together, keeping at most top_k hits
    for doc_id, score, text in list(zip(hit_ids, hit_distances, hit_texts))[:top_k]:
        if not text:
            # Fall back to a placeholder when no document text was returned
            print(f' - ID: {doc_id}, Text: "Text not available", Score: {score:.4f}')
        else:
            print(f' - ID: {doc_id}, Text: "{text}", Score: {score:.4f}')

'Top 3 similar documents to "red":'
' - ID: food_14, Text: "red fruit", Score: 0.3133'
' - ID: food_1, Text: "fresh red apples", Score: 0.4540'
' - ID: food_13, Text: "golden apple", Score: 0.7393'
'Top 3 similar documents to "fresh":'
' - ID: food_1, Text: "fresh red apples", Score: 0.4774'
' - ID: food_5, Text: "farm-fresh eggs", Score: 0.4854'
' - ID: food_12, Text: "pure honey", Score: 0.6253'
