# Chroma DB Demo

Chroma is a AI-native open-source vector database focused on developer productivity and happiness. Chroma is licensed under Apache 2.0.

https://docs.trychroma.com/guides

# Load dependencies

In [None]:
pip install langchain langchain_community langchain_chroma sentence_transformers  langchain-openai

Collecting langchain
  Downloading langchain-0.2.10-py3-none-any.whl (990 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.0/990.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_community
  Downloading langchain_community-0.2.9-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_chroma
  Downloading langchain_chroma-0.1.2-py3-none-any.whl (9.3 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.22 (from langchain)
  Downloading langchain_core-0.2.22-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.5/373.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecti

In [None]:
import langchain
import os
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_text_splitters import CharacterTextSplitter
import logging

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

logging.basicConfig(level=logging.DEBUG,
                    format='[%(levelname)s] - %(message)s ',
                    handlers=[
                        logging.FileHandler('/content/langchaindemo.log', mode='w'),
                        logging.StreamHandler(),
                    ],
                    force=True)
logger = logging.getLogger(__name__)
logger.info("Langchain Demo Initialized")

[INFO] - Langchain Demo Initialized 


# Create Documents from scratch

In [None]:
from langchain_core.documents import Document

docs = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Goldfish are popular pets for beginners, requiring relatively simple care.",
        metadata={"source": "fish-pets-doc"},
    ),
    Document(
        page_content="Parrots are intelligent birds capable of mimicking human speech.",
        metadata={"source": "bird-pets-doc"},
    ),
    Document(
        page_content="Rabbits are social animals that need plenty of space to hop around.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

In [None]:
# Split docs into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
chunks = text_splitter.split_documents(docs)

# Init embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Load them into Chroma
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_function
)


[INFO] - Use pytorch device_name: cpu 
[INFO] - Load pretrained SentenceTransformer: all-MiniLM-L6-v2 
[DEBUG] - Resetting dropped connection: huggingface.co 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.json HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json HTTP/1.1" 200 0 
[DEBUG] - https://huggingface.co:443 "HEAD /sentence-tr

In [None]:
# Prompt
query = "what are cats?"
docs = vectorstore.similarity_search(query)

print(docs[0].page_content)

Cats are independent pets that often enjoy their own space.


In [None]:
docs

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech.')]

# Persist to disk


In [30]:
# save to disk
vectorstore_disk = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")
docs = vectorstore_disk.similarity_search(query)
docs

[INFO] - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information. 
[DEBUG] - Starting component System 
[DEBUG] - Starting component Posthog 
[DEBUG] - Starting component OpenTelemetryClient 
[DEBUG] - Starting component SqliteDB 
[DEBUG] - Starting component QuotaEnforcer 
[DEBUG] - Starting component LocalSegmentManager 
[DEBUG] - Starting component SegmentAPI 
[DEBUG] - Starting component PersistentLocalHnswSegment 


[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Rabbits are social animals that need plenty of space to hop around.')]

[DEBUG] - Resetting dropped connection: us-api.i.posthog.com 


In [31]:
# load from disk
vectorstore_disk_loaded = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)
docs = vectorstore_disk.similarity_search(query)
print(docs[0].page_content)

[DEBUG] - Collection langchain already exists, returning existing collection. 


Cats are independent pets that often enjoy their own space.


[DEBUG] - https://us-api.i.posthog.com:443 "POST /batch/ HTTP/1.1" 200 15 


# Generate a Response

In [None]:
pip install -qU langchain-anthropic

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.5/865.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from langchain_anthropic import ChatAnthropic

llm = ChatAnthropic(model="claude-3-sonnet-20240229")

[DEBUG] - load_ssl_context verify=True cert=None trust_env=True http2=False 
[DEBUG] - load_verify_locations cafile='/usr/local/lib/python3.10/dist-packages/certifi/cacert.pem' 
[DEBUG] - load_ssl_context verify=True cert=None trust_env=True http2=False 
[DEBUG] - load_verify_locations cafile='/usr/local/lib/python3.10/dist-packages/certifi/cacert.pem' 


In [None]:

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)


message = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_messages([("human", message)])

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

In [None]:
response = rag_chain.invoke("tell me about cats")

print(response.content)

[DEBUG] - Request options: {'method': 'post', 'url': '/v1/messages', 'timeout': 600, 'files': None, 'json_data': {'max_tokens': 1024, 'messages': [{'role': 'user', 'content': "\nAnswer this question using the provided context only.\n\ntell me about cats\n\nContext:\n[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')]\n"}], 'model': 'claude-3-sonnet-20240229'}} 
[DEBUG] - Sending HTTP Request: POST https://api.anthropic.com/v1/messages 
[DEBUG] - connect_tcp.started host='api.anthropic.com' port=443 local_address=None timeout=600 socket_options=None 
[DEBUG] - connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x78841878f9a0> 
[DEBUG] - start_tls.started ssl_context=<ssl.SSLContext object at 0x78841890ec40> server_hostname='api.anthropic.com' timeout=600 
[DEBUG] - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x78841878fee0> 
[DEBUG] - send_request_hea

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Claude API. Please go to Plans & Billing to upgrade or purchase credits.'}}