# Intro to ChromaDB
* This notebook was created to experiment with `ChromaDB` and build a basic understanding of how it works.

## Import Libraries

In [1]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction, DefaultEmbeddingFunction
# from sentence_transformers import SentenceTransformer

## Initialize Client

In [2]:
## Initialize the Chroma client.
## Calling `chromadb.Client()` with no arguments opens the client in ephemeral
## (short-lived) mode: the data is not persisted and is reset after closing the kernel.
## NOTE(review): an earlier comment said this "starts the server"; the ephemeral
## client appears to run in-process rather than as a separate server — confirm
## against the Chroma docs for the installed version.
chroma_client = chromadb.Client()

## Create a Collection

In [3]:
## Define the embedding function used to vectorize both the stored documents
## and the query texts.
embedding_func = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

## `create_collection` raises when a collection with this name already exists,
## so re-running this cell in a live kernel used to fail.
## `get_or_create_collection` keeps the cell re-runnable: it creates the
## collection on the first run and returns the existing one afterwards.
## Note: an existing collection keeps whatever records earlier runs left in it;
## restart the kernel for a completely clean (ephemeral) state.
collection = chroma_client.get_or_create_collection(
    name="greetings_collection",
    embedding_function=embedding_func,
)

## Add documents to the collection

In [4]:
# Six short sample sentences to index.
documents = [
    "Hello world",
    "Aloha world",
    "Namaste World",
    "Mahalo World",
    "Adios World",
    "The dog says bow wow",
]

# One metadata dict per document, in document order:
# seasons 1 through 3, each with episodes 1 and 2.
metadatas = [
    {"season": season, "episode": episode}
    for season in (1, 2, 3)
    for episode in (1, 2)
]

# IDs are required when adding documents, and they must be unique strings.
# Here they are derived from each document's position: "id0", "id1", ...
collection.add(
    ids=[f"id{position}" for position, _ in enumerate(documents)],
    documents=documents,
    metadatas=metadatas,
)

## Querying with Embeddings

In [5]:
# Similarity search: look up the two stored documents closest to the query
# text, and ask for their text, metadata and distance in the response.
results = collection.query(
    include=["documents", "metadatas", "distances"],
    n_results=2,
    query_texts=["animals"],
)

print(results)

{'ids': [['id5', 'id2']], 'embeddings': None, 'documents': [['The dog says bow wow', 'Namaste World']], 'uris': None, 'data': None, 'metadatas': [[{'episode': 2, 'season': 3}, {'episode': 1, 'season': 2}]], 'distances': [[1.313508152961731, 1.6261656284332275]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


Observations:
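* By default, `ChromaDB` ranks results by squared L2 (`Euclidean`) distance between the text vectors rather than by `Cosine Similarity`, so a smaller distance means a closer match. The metric is a per-collection setting chosen at creation time (for example, cosine can be selected through the `hnsw:space` collection metadata).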

## Querying with Metadata

In [6]:
# Metadata-only lookup: `get` takes a `where` filter and no query text.
# This filter matches records whose `season` field equals 1.
metadata_query = {
    "season": {"$eq": 1},
}
results = collection.get(where=metadata_query)

print(results)

{'ids': ['id0', 'id1'], 'embeddings': None, 'documents': ['Hello world', 'Aloha world'], 'uris': None, 'data': None, 'metadatas': [{'episode': 1, 'season': 1}, {'episode': 2, 'season': 1}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


## Combining Embeddings & Metadata

In [11]:
# Similarity search limited by metadata: only records whose `season` field
# equals 3 are eligible, and the single closest match is returned.
metadata_query = {
    "season": {"$eq": 3},
}
results = collection.query(
    n_results=1,
    where=metadata_query,
    query_texts=["animals"],
)

print(results)

{'ids': [['id5']], 'embeddings': None, 'documents': [['The dog says bow wow']], 'uris': None, 'data': None, 'metadatas': [[{'episode': 2, 'season': 3}]], 'distances': [[1.313508152961731]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


## Upserting Documents

In [14]:
# Upsert two records under the IDs "id6" and "id7", each with its text and
# its season/episode metadata (lists are aligned by position).
collection.upsert(
    documents=[
        "the cat says meow",
        "Vancouver is a city in Western Canada",
    ],
    metadatas=[
        {"season": 4, "episode": 1},
        {"season": 4, "episode": 2},
    ],
    ids=["id6", "id7"],
)

In [19]:
# Run the "animals" similarity search again (top two matches) to see how the
# upserted records affect the ranking.
results = collection.query(n_results=2, query_texts=["animals"])

print(results)

{'ids': [['id5', 'id6']], 'embeddings': None, 'documents': [['The dog says bow wow', 'the cat says meow']], 'uris': None, 'data': None, 'metadatas': [[{'episode': 2, 'season': 3}, {'episode': 1, 'season': 4}]], 'distances': [[1.313508152961731, 1.3540990352630615]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


## Deleting Documents

In [20]:
# Remove the records with IDs "id5" and "id6" from the collection.
collection.delete(ids=["id5", "id6"])

In [21]:
# Run the "animals" similarity search once more (top two matches) now that
# "id5" and "id6" have been deleted.
results = collection.query(
    n_results=2,
    query_texts=["animals"],
)

print(results)

{'ids': [['id2', 'id1']], 'embeddings': None, 'documents': [['Namaste World', 'Aloha world']], 'uris': None, 'data': None, 'metadatas': [[{'episode': 1, 'season': 2}, {'episode': 2, 'season': 1}]], 'distances': [[1.6261656284332275, 1.628368854522705]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
