# **ChromaDB tutorial**

This is the sample notebook from the ChromaDB site.


# **How to run in Colab:**

This notebook can be run on Google Colab or in a standalone Python development environment. Click the badge below to open it in Colab:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/datariders/tutorials/blob/main/vectordb/chromadb/chromadb_tutorial.ipynb)


# **References:**

https://docs.trychroma.com/getting-started

In [1]:
!pip3 install -U chromadb

Collecting chromadb
  Downloading chromadb-0.5.4-py3-none-any.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.4/581.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.5 (from chromadb)
  Downloading chroma_hnswlib-0.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.p

In [2]:
import chromadb

# Build a client using the library defaults (no arguments are passed).
chroma_client = chromadb.Client()

# Show the client object so the reader can confirm it was created.
print(" chroma_client: ", end=" ")
print(chroma_client)

 chroma_client:  <chromadb.api.client.Client object at 0x796a16605570>


In [3]:
# `get_or_create_collection` is used in place of `create_collection` so that
# re-running this cell does not create a new collection every time.
collection = chroma_client.get_or_create_collection(name="my_collection")

# Show the collection handle that was returned.
print(" collection: ", end=" ")
print(collection)

 collection:  Collection(id=f6cc630a-f1a9-43d1-8310-475b5e413bd8, name=my_collection)


In [16]:
# `upsert` is used instead of `add` so that re-running this cell does not add
# the same documents again. Each id lines up positionally with one document.
collection.upsert(
    ids=[
        "id1",
        "id2",
        "id3",
        "id4",
        "id5",
        "id6",
        "id7",
        "id8",
        "id9",
    ],
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges",
        "This is a document about wines",
        "This is a document about almonds",
        "This is a document about potatoes",
        "This is a document about wheat",
        "This is a document about soyabeans",
        "This is a document about corn",
        "This is a document about apples",
    ],
)

In [17]:
# Ask for the 4 closest stored documents to a Hawaii-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about hawaii"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id1', 'id7', 'id2', 'id5']]
 k:  distances 	 v:  [[1.0404009819030762, 1.12140953540802, 1.2430799007415771, 1.267379879951477]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about pineapple', 'This is a document about soyabeans', 'This is a document about oranges', 'This is a document about potatoes']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']


In [18]:
# Ask for the 4 closest stored documents to a Florida-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about florida"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id2', 'id8', 'id1', 'id3']]
 k:  distances 	 v:  [[1.1462137699127197, 1.1964797973632812, 1.3015384674072266, 1.309969425201416]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about oranges', 'This is a document about corn', 'This is a document about pineapple', 'This is a document about wines']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']


In [19]:
# Ask for the 4 closest stored documents to a California-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about california"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id4', 'id9', 'id3', 'id5']]
 k:  distances 	 v:  [[1.1915949583053589, 1.2140405178070068, 1.2222235202789307, 1.2248032093048096]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about almonds', 'This is a document about apples', 'This is a document about wines', 'This is a document about potatoes']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']


In [20]:
# Ask for the 4 closest stored documents to an Idaho-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about idaho"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id5', 'id6', 'id7', 'id8']]
 k:  distances 	 v:  [[1.1280782222747803, 1.191328763961792, 1.2481242418289185, 1.2623882293701172]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about potatoes', 'This is a document about wheat', 'This is a document about soyabeans', 'This is a document about corn']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']


In [21]:
# Ask for the 4 closest stored documents to a North Dakota-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about north dakota"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id8', 'id7', 'id6', 'id5']]
 k:  distances 	 v:  [[1.113612174987793, 1.194001317024231, 1.224453091621399, 1.2415412664413452]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about corn', 'This is a document about soyabeans', 'This is a document about wheat', 'This is a document about potatoes']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']


In [22]:
# Ask for the 4 closest stored documents to an Illinois-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about illinois"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id8', 'id6', 'id5', 'id2']]
 k:  distances 	 v:  [[1.174592137336731, 1.2126613855361938, 1.3442327976226807, 1.3791215419769287]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about corn', 'This is a document about wheat', 'This is a document about potatoes', 'This is a document about oranges']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']


In [23]:
# Ask for the 4 closest stored documents to an Iowa-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about iowa"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id8', 'id6', 'id7', 'id9']]
 k:  distances 	 v:  [[1.0420067310333252, 1.0799365043640137, 1.1819730997085571, 1.1856138706207275]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about corn', 'This is a document about wheat', 'This is a document about soyabeans', 'This is a document about apples']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']


In [24]:
# Ask for the 4 closest stored documents to a Washington-themed query.
results = collection.query(
    n_results=4,  # how many results to return
    query_texts=["This is a query document about washington"],  # Chroma will embed this for you
)

# Print every field of the response, one field per line.
for k in results:
    v = results[k]
    print(" k: ", k, "\t v: ", v)

 k:  ids 	 v:  [['id6', 'id9', 'id1', 'id2']]
 k:  distances 	 v:  [[1.179661512374878, 1.2146059274673462, 1.2220364809036255, 1.2369394302368164]]
 k:  metadatas 	 v:  [[None, None, None, None]]
 k:  embeddings 	 v:  None
 k:  documents 	 v:  [['This is a document about wheat', 'This is a document about apples', 'This is a document about pineapple', 'This is a document about oranges']]
 k:  uris 	 v:  None
 k:  data 	 v:  None
 k:  included 	 v:  ['metadatas', 'documents', 'distances']
