In [1]:
# Setup
import warnings
warnings.filterwarnings('ignore')

import chromadb
from chromadb.utils import embedding_functions

# Sentence-Transformers model name; `ef` is handed to the collection below
# as its embedding function.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

# Chroma client built with default settings (no arguments passed).
# NOTE(review): presumably an in-memory client whose data does not outlive the
# kernel -- confirm against the Chroma client documentation.
client = chromadb.Client()

In [None]:
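# Alternative, kept for reference and not executed: create the collection by passing
# the HNSW settings and the embedding function through the `configuration` argument,
# instead of the `hnsw:*` metadata key used in the next cell.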

# collection = client.create_collection(
#     name="my_collection_name",
#     metadata={"topic": "query testing"},
#     configuration={
#         "hnsw": {
#             "space": "cosine",
#             "ef_search": 100,
#             "ef_construction": 100,
#             "max_neighbors": 16
#         },
#         "embedding_function": ef
#     }
# )

In [6]:
# Create the collection, selecting the HNSW distance metric through the
# "hnsw:space" metadata key.
collection_metadata = {"topic": "query testing", "hnsw:space": "cosine"}

collection = client.create_collection(
    name="my_collection",
    metadata=collection_metadata,
    embedding_function=ef,
    get_or_create=True,  # reuse the collection if it already exists (cell re-runs)
)

# Display the name as the cell output to confirm which collection is in use.
collection.name

'my_collection'

In [7]:
# Load the four sample documents. Every document mentions "pandas", but two are
# about the animal and two about the Python library; the "topic" metadata records
# which, so later cells can filter on it.
#
# `upsert` is used rather than `add` so this cell is safe to re-run: the collection
# is created with get_or_create=True and therefore may already hold these ids.
# `upsert` inserts ids that are missing and overwrites ids that already exist,
# leaving the collection with exactly these four records either way.
collection.upsert(
    ids=["id1", "id2", "id3", "id4"],
    documents=[
        "Giant pandas are a bear species that lives in mountainous areas.",
        "A pandas DataFrame stores two-dimensional, tabular data",
        "I think everyone agrees that pandas are some of the cutest animals on the planet",
        "A direct comparison between pandas and polars indicates that polars is a more efficient library than pandas.",
    ],
    metadatas=[
        {"topic": "animals"},
        {"topic": "data analysis"},
        {"topic": "animals"},
        {"topic": "data analysis"},
    ],
)

In [None]:
# Querying in Chroma DB
#
# n_results (10) is larger than the 4 documents stored above, so Chroma caps it
# and returns all of them, nearest first (see the notice in the cell output).
# Try "cats" as the query text to compare the ranking.
collection.query(query_texts=["dogs"], n_results=10)

Number of requested results 10 is greater than number of elements in index 4, updating n_results = 4


{'ids': [['id3', 'id1', 'id4', 'id2']],
 'distances': [[0.7642663717269897,
   0.8570343255996704,
   0.9149167537689209,
   0.9717403650283813]],
 'metadatas': [[{'topic': 'animals'},
   {'topic': 'animals'},
   {'topic': 'data analysis'},
   {'topic': 'data analysis'}]],
 'embeddings': None,
 'documents': [['I think everyone agrees that pandas are some of the cutest animals on the planet',
   'Giant pandas are a bear species that lives in mountainous areas.',
   'A direct comparison between pandas and polars indicates that polars is a more efficient library than pandas.',
   'A pandas DataFrame stores two-dimensional, tabular data']],
 'uris': None,
 'data': None}

In [11]:
# Metadata filter: restrict the search to records whose "topic" metadata is
# "animals", then return the single closest match to the query text.
collection.query(query_texts=["polar bear"], where={"topic": "animals"}, n_results=1)

{'ids': [['id1']],
 'distances': [[0.7096826434135437]],
 'metadatas': [[{'topic': 'animals'}]],
 'embeddings': None,
 'documents': [['Giant pandas are a bear species that lives in mountainous areas.']],
 'uris': None,
 'data': None}

In [12]:
# Document-content filter: exclude records whose text contains "library",
# then return the single closest match to the query text.
collection.query(
    where_document={"$not_contains": "library"},
    query_texts=["polar bear"],
    n_results=1,
)

{'ids': [['id1']],
 'distances': [[0.7096826434135437]],
 'metadatas': [[{'topic': 'animals'}]],
 'embeddings': None,
 'documents': [['Giant pandas are a bear species that lives in mountainous areas.']],
 'uris': None,
 'data': None}

In [13]:
# Combine both filters: the metadata condition (`where`) and the document-text
# condition (`where_document`) are passed in the same query.
collection.query(
    n_results=1,
    query_texts=["polar bear"],
    where_document={"$not_contains": "library"},
    where={"topic": "animals"},
)

{'ids': [['id1']],
 'distances': [[0.7096826434135437]],
 'metadatas': [[{'topic': 'animals'}]],
 'embeddings': None,
 'documents': [['Giant pandas are a bear species that lives in mountainous areas.']],
 'uris': None,
 'data': None}