In [1]:
%cd ../../
import chromadb
from chromadb.config import Settings

client = chromadb.EphemeralClient(settings=Settings(allow_reset=True))

client.reset()
col = client.get_or_create_collection("test")
docs = ["AI is increasing technological innovation pace.", 
        "This article is about telecommunication technology.",
        "RAG and fine-tuning have their strengths and weakness.", 
        "Mobile technologies are changing the world of telecommunication at a steady pace.",
        "Latest technology can be found in the spanish telenovela."]
col.upsert(ids=[f"{i}" for i in range(len(docs))], documents=docs)
col.query(query_texts=["Technological pace"], where_document={"$or": [{"$contains": "technology"}, {"$contains":"pace"}]})


/Users/tazarov/experiments/chroma-experiments/document_keyword_search


Number of requested results 10 is greater than number of elements in index 5, updating n_results = 5


{'ids': [['0', '3', '4', '1']],
 'distances': [[0.6140892505645752,
   0.8985650539398193,
   1.242105484008789,
   1.30079185962677]],
 'metadatas': [[None, None, None, None]],
 'embeddings': None,
 'documents': [['AI is increasing technological innovation pace.',
   'Mobile technologies are changing the world of telecommunication at a steady pace.',
   'Latest technology can be found in the spanish telenovela.',
   'This article is about telecommunication technology.']],
 'uris': None,
 'data': None}

The above is great and we get the results we want, but what if we don't want articles about mobile or telecommunications?
Enter keyword search with SQLite FTS5 `MATCH` queries (read more at [https://sqlite.org/fts5.html](https://sqlite.org/fts5.html), particularly section 3.6).

Let's see an example where we exclude only `mobile`:

In [2]:
# Same semantic query, now filtered with a boolean keyword expression:
# keep documents matching "technology" or "pace", drop those matching "mobile".
# Three documents are expected back.
col.query(
    query_texts=["Technological pace"],
    where_document={"$keyword": "(technology OR pace) NOT mobile"},
)

Number of requested results 10 is greater than number of elements in index 5, updating n_results = 5


{'ids': [['0', '4', '1']],
 'distances': [[0.6140892505645752, 1.242105484008789, 1.30079185962677]],
 'metadatas': [[None, None, None]],
 'embeddings': None,
 'documents': [['AI is increasing technological innovation pace.',
   'Latest technology can be found in the spanish telenovela.',
   'This article is about telecommunication technology.']],
 'uris': None,
 'data': None}

Now let's try excluding another term, `telecommunication`:

In [3]:
# Keep documents matching "technology" or "pace", and drop any that match
# "mobile" or "telecommunication". Two documents are expected back.
col.query(
    query_texts=["Technological pace"],
    where_document={
        "$keyword": "(technology OR pace) NOT mobile NOT telecommunication"
    },
)

Number of requested results 10 is greater than number of elements in index 5, updating n_results = 5


{'ids': [['0', '4']],
 'distances': [[0.6140892505645752, 1.242105484008789]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['AI is increasing technological innovation pace.',
   'Latest technology can be found in the spanish telenovela.']],
 'uris': None,
 'data': None}

Let's have some more fun and exclude only keywords that start with `telecom`. This keeps the telenovela document but excludes the telecommunication document.

In [7]:
# Keep documents matching "technology" or "pace", drop those matching "mobile",
# and drop those matched by the NEAR(telecom) group.
# Two documents are expected back, one of which is the telenovela document.
# NOTE(review): FTS5 documents `telecom*` as its prefix-query syntax, while
# NEAR(telecom) is a single-phrase NEAR group — confirm this is the intended
# way to express "starts with telecom" for the tokenizer in use.
col.query(
    query_texts=["Technological pace"],
    where_document={
        "$keyword": "(technology OR pace) NOT mobile NOT NEAR(telecom)"
    },
)

Number of requested results 10 is greater than number of elements in index 5, updating n_results = 5


{'ids': [['0', '4']],
 'distances': [[0.6140892505645752, 1.242105484008789]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['AI is increasing technological innovation pace.',
   'Latest technology can be found in the spanish telenovela.']],
 'uris': None,
 'data': None}