# Vector Databases (ChromaDB) Practice

In [1]:
import chromadb

# persistent chroma client
client = chromadb.PersistentClient(path="./data/chroma_db")

### Creating a collection

In [2]:
collection1 = client.create_collection(name='collection1')

### Adding documents to the collection

In [3]:
collection1.add(ids=["id1", "id2"], 
                documents=[
                    "This is document about pineapple",
                    "This is document about oranges"
                ])

### Querying the chroma db using the document

In [4]:
collection1.query(
    query_texts=["This is a query document about hawaii"], 
    n_results=2
)

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['This is document about pineapple',
   'This is document about oranges']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.0800682306289673, 1.2615830898284912]]}

### Read the collection documents data using ID

In [5]:
collection1.get(ids=["id1"], include=["documents"])

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['This is document about pineapple'],
 'uris': None,
 'included': ['documents'],
 'data': None,
 'metadatas': None}

### Get or create collection

In [6]:
collection2 = client.get_or_create_collection(name="collection2")

### Adding embeddings, metadatas(optional) to the collection

In [7]:
collection2.add(
    ids=["id3"],
    embeddings=[[0.1, 0.2, 0.4]],
    metadatas=[{"name":"sample metadata"}]
)

### List collections

In [8]:
print("Available collections:", client.list_collections())

Available collections: [Collection(name=collection1), Collection(name=collection2)]


### Fetching the collection details using include argument

In [9]:
print("Fetching data from collection 2", collection2.get(include=["documents", "metadatas"]))

Fetching data from collection 2 {'ids': ['id3'], 'embeddings': None, 'documents': [None], 'uris': None, 'included': ['documents', 'metadatas'], 'data': None, 'metadatas': [{'name': 'sample metadata'}]}


### Updating the data in the collection

In [10]:
collection2.update(
    ids=["id3"],
    metadatas=[{"name":"updated metadata"}]
)

print(collection2.get(include=['metadatas', 'embeddings']))

{'ids': ['id3'], 'embeddings': array([[0.1       , 0.2       , 0.40000001]]), 'documents': None, 'uris': None, 'included': ['metadatas', 'embeddings'], 'data': None, 'metadatas': [{'name': 'updated metadata'}]}


### Delete Collections

In [11]:
all_collections = client.list_collections()

for collection in all_collections:
    client.delete_collection(name=collection.name)

client.list_collections

<bound method Client.list_collections of <chromadb.api.client.Client object at 0x1062052b0>>

### Generate the Ollama Embeddings and store in chromadb

In [12]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma

# creating chroma instance with ollama embeddings function
vector_db = Chroma(
        collection_name = "test-collection",
        embedding_function = OllamaEmbeddings(
            model="nomic-embed-text"
        ),
        persist_directory = "./data/ollama_embeddings"
    )

In [13]:
sample_texts = [
    "Educosys has started the second batch of the Gen-AI course",
    "Educosys has other software engineering courses like HHLD, HLD, LLD, C++",
    "Generative AI is the popular trend right now in the software engineering",
]

#### adding the texts to the vector database

In [14]:
vector_db.add_texts(sample_texts)

['b4dd171c-813d-47aa-9f20-0a6a0d3a8173',
 'd3581a27-ca20-4b0d-9b6b-b5f1e3fecb73',
 '97c8d557-c0f7-43e7-9331-bc4173041fdf']

#### creating the embeddings retriever

In [15]:
retriever = vector_db.as_retriever(search_kwargs={"k":2})

#### running the query on the vector database

In [17]:
response = retriever.invoke("What is the trending software engineering technology")

response[0].page_content

'Generative AI is the popular trend right now in the software engineering'