# Where Filtering
This notebook demonstrates how to use `where` (metadata) and `where_document` (document content) filters to restrict the data returned from `get` or `query`.

In [1]:
import chromadb

In [2]:
# Instantiate a Chroma client, passing no arguments so the library defaults apply
client = chromadb.Client()

In [3]:
# Create the example collection, or reuse it if this cell has already been run.
# get_or_create_collection keeps the cell re-runnable in a live kernel, whereas
# create_collection fails once a collection with this name already exists.
collection_name = "filter_example_collection"
collection = client.get_or_create_collection(name=collection_name)

In [4]:
# Populate the collection with eight sample records.
# Records alternate between two embeddings and between "read" / "unread" status,
# starting with the first embedding and "read" for id1.
sample_documents = [
    "A document that discusses domestic policy",
    "A document that discusses international affairs",
    "A document that discusses kittens",
    "A document that discusses dogs",
    "A document that discusses chocolate",
    "A document that is sixth that discusses government",
    "A document that discusses international affairs",
    "A document that discusses global affairs",
]
record_count = len(sample_documents)

collection.add(
    embeddings=[
        [1.1, 2.3, 3.2] if position % 2 == 0 else [4.5, 6.9, 4.4]
        for position in range(record_count)
    ],
    metadatas=[
        {"status": "read" if position % 2 == 0 else "unread"}
        for position in range(record_count)
    ],
    documents=sample_documents,
    # ids are "id1" .. "id8", in the same order as the documents
    ids=[f"id{position + 1}" for position in range(record_count)],
)

In [5]:
# Combine a metadata filter with a document filter:
# keep only records marked "read" whose text contains "affairs"
status_filter = {"status": "read"}
text_filter = {"$contains": "affairs"}
collection.get(where=status_filter, where_document=text_filter)

{'ids': ['id7'],
 'embeddings': None,
 'metadatas': [{'status': 'read'}],
 'documents': ['A document that discusses international affairs'],
 'uris': None,
 'data': None}

In [6]:
# Match documents whose text mentions either "global affairs" or "domestic policy"
wanted_phrases = ["global affairs", "domestic policy"]
collection.get(
    where_document={"$or": [{"$contains": phrase} for phrase in wanted_phrases]}
)

{'ids': ['id1', 'id8'],
 'embeddings': None,
 'metadatas': [{'status': 'read'}, {'status': 'unread'}],
 'documents': ['A document that discusses domestic policy',
  'A document that discusses global affairs'],
 'uris': None,
 'data': None}

In [7]:
# Ask for the 5 nearest neighbours of [0, 0, 0] among documents mentioning "affairs".
# Only 3 results come back because only 3 documents in the collection mention "affairs".
origin = [0, 0, 0]
collection.query(
    query_embeddings=[origin],
    where_document={"$contains": "affairs"},
    n_results=5,
)

{'ids': [['id7', 'id8', 'id2']],
 'distances': [[16.740001678466797, 87.22000122070312, 87.22000122070312]],
 'metadatas': [[{'status': 'read'},
   {'status': 'unread'},
   {'status': 'unread'}]],
 'embeddings': None,
 'documents': [['A document that discusses international affairs',
   'A document that discusses global affairs',
   'A document that discusses international affairs']],
 'uris': None,
 'data': None}

# Where Filtering With Logical Operators
This section demonstrates how one can use the logical operators in `where` filtering.

Chroma currently supports the `$and` and `$or` operators.

> Note: Logical operators can be nested

In [8]:
# Or Logical Operator Filtering
client = chromadb.Client()
collection = client.get_or_create_collection("test-where-list")

# Use upsert rather than add so that re-running this cell does not try to add
# IDs that are already present in the (possibly pre-existing) collection.
collection.upsert(
    documents=["Article by john", "Article by Jack", "Article by Jill"],
    metadatas=[{"author": "john"}, {"author": "jack"}, {"author": "jill"}],
    ids=["1", "2", "3"],
    embeddings=[
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
    ],
)

# $or: return records whose author is either "john" or "jack"
collection.get(where={"$or": [{"author": "john"}, {"author": "jack"}]})

{'ids': ['1', '2'],
 'embeddings': None,
 'metadatas': [{'author': 'john'}, {'author': 'jack'}],
 'documents': ['Article by john', 'Article by Jack'],
 'uris': None,
 'data': None}

In [9]:
# And Logical Operator Filtering
collection = client.get_or_create_collection("test-where-list")

# Four articles, each described as (id, document text, metadata, embedding);
# every metadata dict carries both an author and a category.
article_records = [
    ("1", "Article by john", {"author": "john", "category": "chroma"}, [1.1, 2.3, 3.2]),
    ("2", "Article by Jack", {"author": "jack", "category": "ml"}, [4.5, 6.9, 4.4]),
    ("3", "Article by Jill", {"author": "jill", "category": "lifestyle"}, [1.1, 2.3, 3.2]),
    ("4", "article by Mindy", {"author": "mindy", "category": "chromadb"}, [4.5, 6.9, 4.4]),
]
collection.upsert(
    documents=[text for _, text, _, _ in article_records],
    metadatas=[metadata for _, _, metadata, _ in article_records],
    ids=[record_id for record_id, _, _, _ in article_records],
    embeddings=[vector for _, _, _, vector in article_records],
)

# $and: both conditions must hold for a record to be returned
chroma_by_john = {"$and": [{"category": "chroma"}, {"author": "john"}]}
collection.get(where=chroma_by_john)

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'}],
 'documents': ['Article by john'],
 'uris': None,
 'data': None}

In [10]:
# An $and filter that matches nothing: the result lists come back empty
chroma_by_jill = {"$and": [{"category": "chroma"}, {"author": "jill"}]}
collection.get(where=chroma_by_jill)

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [11]:
# Combined And and Or Logical Operator Filtering:
# category must be "chroma" AND the author must be "john" OR "jack"
john_or_jack = {"$or": [{"author": "john"}, {"author": "jack"}]}
collection.get(where={"$and": [{"category": "chroma"}, john_or_jack]})

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'}],
 'documents': ['Article by john'],
 'uris': None,
 'data': None}

In [12]:
# Apply a document filter together with a nested metadata filter:
# the text must contain "Article", the category must be "chroma",
# and the author must be "john" or "jack"
nested_metadata_filter = {
    "$and": [
        {"category": "chroma"},
        {"$or": [{"author": "john"}, {"author": "jack"}]},
    ]
}
collection.get(
    where_document={"$contains": "Article"},
    where=nested_metadata_filter,
)

{'ids': ['1'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'}],
 'documents': ['Article by john'],
 'uris': None,
 'data': None}

# Where and WhereDocument Filtering With String Operators: `$like` and `$regex`

This section will demonstrate the usage of `$like` and `$regex` operators.

In [13]:
# Regex on document text: the [Aa] character class accepts both
# "article" and "Article"
article_pattern = "[Aa]rticle"
collection.get(where_document={"$regex": article_pattern})

{'ids': ['1', '2', '3', '4'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'},
  {'author': 'jack', 'category': 'ml'},
  {'author': 'jill', 'category': 'lifestyle'},
  {'author': 'mindy', 'category': 'chromadb'}],
 'documents': ['Article by john',
  'Article by Jack',
  'Article by Jill',
  'article by Mindy'],
 'uris': None,
 'data': None}

In [14]:
# An empty regex pattern on a metadata field returns all entries
match_everything = {"$regex": ""}
collection.get(where={"category": match_everything})

{'ids': ['1', '2', '3', '4'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'},
  {'author': 'jack', 'category': 'ml'},
  {'author': 'jill', 'category': 'lifestyle'},
  {'author': 'mindy', 'category': 'chromadb'}],
 'documents': ['Article by john',
  'Article by Jack',
  'Article by Jill',
  'article by Mindy'],
 'uris': None,
 'data': None}

In [15]:
# $like on a metadata field: the pattern "chroma%" selects
# categories beginning with "chroma"
category_prefix = {"$like": "chroma%"}
collection.get(where={"category": category_prefix})

{'ids': ['1', '4'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'},
  {'author': 'mindy', 'category': 'chromadb'}],
 'documents': ['Article by john', 'article by Mindy'],
 'uris': None,
 'data': None}

In [16]:
# The $regex operator can express the same prefix match as $like with "chroma%":
# documents belonging to categories beginning with "chroma".
# The leading ^ anchors the pattern to the start of the value; an unanchored
# "chroma.*" could also match a value that merely contains "chroma" if the
# engine searches anywhere in the string.
collection.get(where={'category': {"$regex": "^chroma.*"}})

{'ids': ['1', '4'],
 'embeddings': None,
 'metadatas': [{'author': 'john', 'category': 'chroma'},
  {'author': 'mindy', 'category': 'chromadb'}],
 'documents': ['Article by john', 'article by Mindy'],
 'uris': None,
 'data': None}