<a href="https://colab.research.google.com/github/deepakgarg08/llm-diary/blob/main/chromadb_CRUD_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SIMPLE EXAMPLE -  CRUD OPERATION BELOW

In [7]:
# Uncomment on a fresh environment (e.g. Colab) to install the dependency:
# !pip install chromadb

In [8]:
import chromadb
from chromadb.config import Settings

# Create Chroma client with default settings
client = chromadb.Client(Settings())

# Get the collection if it already exists, otherwise create it.
# create_collection() fails when the name is already taken, which breaks
# re-running this cell in the same kernel; get_or_create_collection() does not.
collection = client.get_or_create_collection(name="my_documents")

# Sample data
documents = [
    "Paris is the capital of France",
    "The Eiffel Tower is in Paris",
    "Berlin is the capital of Germany",
    "The Berlin Wall fell in 1989",
]

# Add data to the collection; ids are matched to `documents` by position
collection.add(
    documents=documents,
    ids=["doc1", "doc2", "doc3", "doc4"]
)

# Query the collection: ask for the two closest documents to the question
results = collection.query(
    query_texts=["What is the capital of Germany?"],
    n_results=2
)



In [9]:
# Dump the raw query result. The ids, documents and distances come back as
# nested lists; here there is one inner list for the single query text.
print(results)

{'ids': [['doc3', 'doc1']], 'embeddings': None, 'documents': [['Berlin is the capital of Germany', 'Paris is the capital of France']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[0.37425845861434937, 1.1175110340118408]]}


# CRUD

In [None]:
!pip install chromadb

Now import and configure the client

In [18]:
import chromadb
from chromadb.config import Settings

# Chroma client built from the default Settings object
default_settings = Settings()
client = chromadb.Client(default_settings)

# Reuse the named collection when it exists; otherwise it is created
COLLECTION_NAME = "my_collection2"
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# CREATE - ADD DOCUMENTS

In [19]:
# One record per document: (id, text, metadata)
capital_records = [
    ("doc1", "The capital of France is Paris", {"country": "France"}),
    ("doc2", "Berlin is the capital of Germany", {"country": "Germany"}),
    ("doc3", "Tokyo is the capital of Japan", {"country": "Japan"}),
]

# Keep the plain list of texts available under the usual name
documents = [text for _, text, _ in capital_records]

collection.add(
    documents=documents,
    ids=[doc_id for doc_id, _, _ in capital_records],
    metadatas=[meta for _, _, meta in capital_records],
)


# READ: Query Documents

In [34]:
# Ask for up to five nearest documents for a single free-text question
question = "largest city?"
result = collection.query(query_texts=[question], n_results=5)

# Only the matched texts are shown here
matched_texts = result["documents"]
print(matched_texts)


[['Berlin is the largest city in Germany', 'The capital of France is Paris']]


# UPDATE/MODIFY THE DOCUMENTS
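There is no dedicated update step in this demo: to modify a document, delete it first and then add it again. (ChromaDB also offers `collection.update()` and `collection.upsert()` for in-place changes.)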

In [29]:
# Modify doc2 by removing it and adding it back under the same id with new
# text and metadata. The "updated" flag is what the later
# `where={"updated": True}` lookup in this notebook filters on.
collection.delete(ids=["doc2"])
collection.add(
    documents=["Berlin is the largest city in Germany"],
    ids=["doc2"],
    metadatas=[{"country": "Germany", "updated": True}]
)


In [40]:
# Insert one more record under a fresh id
new_id = "doc4"
collection.add(
    ids=[new_id],
    documents=["Rome is the capital of Italy"],
    metadatas=[{"country": "Italy"}],
)

# Read the record back by id to verify the insert
doc = collection.get(ids=[new_id])
print(doc)


{'ids': ['doc4'], 'embeddings': None, 'documents': ['Rome is the capital of Italy'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'country': 'Italy'}]}


# DELETE: Remove Documents

In [39]:
# Remove a single record by id
removed_id = "doc3"
collection.delete(ids=[removed_id])

# Look the same id up again to see whether the delete took effect
doc = collection.get(ids=[removed_id])
print(doc)


### You can also delete based on metadata:

In [33]:
# Metadata filter: delete by the "country" field instead of by id
france_filter = {"country": "France"}
collection.delete(where=france_filter)


# Additional: Get All Documents (Hacky Read-All)
#### Chroma has no method named `get_all()`. Calling `collection.get()` with no arguments returns every record; if you track IDs, you can also fetch specific documents like this:

In [35]:
# Fetch the records for the ids we have been tracking
tracked_ids = ["doc1", "doc2"]
all_docs = collection.get(ids=tracked_ids)
print(all_docs["documents"])


['The capital of France is Paris', 'Berlin is the largest city in Germany']


### Or filter by metadata:

In [36]:
# Select records by a metadata flag rather than by id
updated_filter = {"updated": True}
results = collection.get(where=updated_filter)
print(results["documents"])


['Berlin is the largest city in Germany']


Optional: Build a Helper Function
You can wrap these checks in reusable functions

In [41]:
def is_doc_present(collection, doc_id: str) -> bool:
    """Return True if ``doc_id`` is stored in ``collection``.

    Args:
        collection: Object exposing a Chroma-style ``get(ids=..., include=...)``
            method whose result contains an ``"ids"`` list.
        doc_id: Identifier to look up.

    Returns:
        True when the lookup returns the id, False otherwise.
    """
    # include=[] skips loading documents/metadata; the ids are still returned,
    # so presence is decided on the ids alone rather than on the payload.
    result = collection.get(ids=[doc_id], include=[])
    return len(result["ids"]) > 0

# Usage: report whether "doc4" is currently in the collection
status = "Document exists." if is_doc_present(collection, "doc4") else "Document not found."
print(status)


Document exists.


# Some other operations

In [42]:
# Number of records currently held by the collection (shown as the cell output)
collection.count()

3

In [46]:
# # for all other types of possible operations
# https://chatgpt.com/share/6849a443-e5e8-8008-80f4-07ce750899eb

In [45]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# Chroma client built from the default Settings object
default_settings = Settings()
client = chromadb.Client(default_settings)

# Separate collection for the vectors we compute ourselves
collection = client.get_or_create_collection(
    name="my_custom_embed_collection",
)

# Sentence-transformer model used to embed both documents and queries
EMBED_MODEL_NAME = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(EMBED_MODEL_NAME)

# Example records: (id, text, metadata)
records = [
    ("doc1", "Paris is the capital of France", {"city": "Paris"}),
    ("doc2", "Berlin is the capital of Germany", {"city": "Berlin"}),
    ("doc3", "Tokyo is the capital of Japan", {"city": "Tokyo"}),
]
documents = [text for _, text, _ in records]
ids = [rec_id for rec_id, _, _ in records]

# Embed the texts ourselves and convert the result to plain Python lists
embeddings = embedder.encode(documents).tolist()

# Store texts, ids, precomputed vectors and metadata together
collection.add(
    ids=ids,
    documents=documents,
    embeddings=embeddings,
    metadatas=[meta for _, _, meta in records],
)

# Embed the user question with the same model used for the documents
query = "What city is the capital of Germany?"
query_embedding = embedder.encode([query]).tolist()

# Search by embedding; only the texts and distances are requested back
search_kwargs = {
    "query_embeddings": query_embedding,
    "n_results": 2,
    "include": ["documents", "distances"],
}
results = collection.query(**search_kwargs)

# Results are nested per query; index 0 belongs to the single query above
top_docs = results["documents"][0]
top_dists = results["distances"][0]

# Print results
print("Top matches:")
for doc, dist in zip(top_docs, top_dists):
    print(f"- {doc} (distance: {dist:.4f})")


Top matches:
- Berlin is the capital of Germany (distance: 0.3362)
- Paris is the capital of France (distance: 1.1139)
