# Towards AI Online Course: Chroma DB - Vector Database


In [1]:
import chromadb
import csv
import logging as log
import os
import shutil

In [2]:
# Configures the root logger: INFO and above, each record rendered as
# "[LEVEL] HH:MM:SS: message".
log.basicConfig(
    level=log.INFO,
    datefmt='%H:%M:%S',
    format='[%(levelname)5s] %(asctime)s: %(message)s',
)

In [3]:
# The dataset root comes from the TAI_DATASET_ROOT environment variable.
# Check for it first so that a missing variable produces an actionable
# message instead of a bare KeyError.
assert 'TAI_DATASET_ROOT' in os.environ, \
    'Set the TAI_DATASET_ROOT environment variable to the dataset root directory.'

# Full path of the mini articles file inside the dataset root.
miniArticlesFilepath = os.path.join(
    os.environ['TAI_DATASET_ROOT'], 'rag_ai_tutor', 'mini-llama-articles.txt')
log.info(f'miniArticlesFilepath: {miniArticlesFilepath}')

[ INFO] 14:43:11: miniArticlesFilepath: /home/minguzzi/repo/towards_ai_course/dataset/rag_ai_tutor/mini-llama-articles.txt


In [4]:
# Fail early, and name the offending path, if the dataset file is missing
# (a directory at that path does not count as the dataset file).
assert os.path.isfile(miniArticlesFilepath), \
    f'Dataset file not found: {miniArticlesFilepath}'

## Load the mini articles into a single string: `text`

In [5]:
# Reads the dataset as CSV and concatenates the second column (index 1) of
# every data row into one string.
# newline="" is what the csv module requires of the file object so that line
# breaks embedded in quoted fields are parsed correctly.
with open(miniArticlesFilepath, mode="r", encoding="utf-8", newline="") as file:
    csvReader = csv.reader(file)

    # Skips the header row; the default makes this a no-op on an empty file.
    next(csvReader, None)

    # csv.reader yields [] for blank lines, which have no column to read, so
    # those rows are skipped. The pieces are joined back-to-back with no
    # separator between rows.
    text = ''.join(row[1] for row in csvReader if row)

# The number of characters in the dataset.
print(len(text))

171044


### Divide the text into chunks

In [6]:
# Peek at the first 128 characters of the concatenated text.
text[:128]

"LLM Variants and Meta's Open Source Before shedding light on four major trends, I'd share the latest Meta's Llama 2 and Code Lla"

In [7]:
# Peek at the last 44 characters of the concatenated text. A slice relative
# to the end keeps showing the tail even if the dataset length changes
# (for the 171044-character corpus this equals text[171000:]).
text[-44:]

'ode Large Language Models with Evol-Instruct'

In [8]:
# Cuts the text into consecutive, non-overlapping windows of chunkSize
# characters; only the final window can be shorter.
chunkSize = 512
chunks = [
    text[start:start + chunkSize]
    for start in range(0, len(text), chunkSize)
]

numChunks = len(chunks)
log.info(f'numChunks:{numChunks}')

[ INFO] 14:43:11: numChunks:335


In [9]:
# Length and content of the second-to-last chunk. Indexing from the end
# avoids a hard-coded position that only matches one dataset size.
(len(chunks[-2]), chunks[-2])

(512,
 "ing, including InstructCodeT5+, StarCoder-GPTeacher, and Instruct-Codegen-16B. In conclusion, WizardCoder's success is attributed to its unique dataset and the innovative use of Evol-Instruct to enhance instruction complexity, leading to its outstanding performance across various code-related tasks and benchmarks.  References YouTube: WizardCoder 34B: Complex Fine-Tuning Explained GitHub Paper: WizardLM- Empowering Large Language Models to Follow Complex Instructions Paper: WizardCoder: Empowering Code Larg")

In [10]:
# Length and content of the final chunk, which holds whatever is left over
# after the full-size chunks. Indexing from the end avoids a hard-coded
# position that only matches one dataset size.
(len(chunks[-1]), chunks[-1])

(36, 'e Language Models with Evol-Instruct')

## Using Chroma DB with LlamaIndex

In [11]:
from llama_index.core import Document
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.vector_stores.chroma import ChromaVectorStore

In [12]:
# Wraps each text chunk in a LlamaIndex Document, one Document per chunk.
documents = [Document(text=chunkText) for chunkText in chunks]

In [13]:
ChromaDbPath = './mini-dataset'

# Starts from an empty on-disk store: any directory left over from a previous
# run is deleted before the client is created.
if os.path.exists(ChromaDbPath):
    print(f'The path {ChromaDbPath} exists')
    # Deliberately no ignore_errors=True: if the old store cannot be removed,
    # the error should surface here rather than as a confusing failure when
    # the client or collection is created on top of a half-deleted directory.
    shutil.rmtree(ChromaDbPath)

# Creates the persistent client (i.e. not in memory).
chromaClient = chromadb.PersistentClient(path=ChromaDbPath)
chromaCollection = chromaClient.create_collection("MiniDataset")

[ INFO] 14:43:13: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [14]:
# Wraps the Chroma collection in a LlamaIndex vector store, then builds a
# storage context that uses that vector store.
vectorStore = ChromaVectorStore(
    chroma_collection=chromaCollection,
)
storageContext = StorageContext.from_defaults(
    vector_store=vectorStore,
)

In [15]:
# Builds the index from the documents, using the OpenAI
# "text-embedding-3-small" embedding model and the Chroma-backed storage
# context. Progress bars are shown while it runs.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storageContext,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    show_progress=True,
)

Parsing nodes:   0%|          | 0/335 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/335 [00:00<?, ?it/s]

[ INFO] 14:43:20: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[ INFO] 14:43:22: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[ INFO] 14:43:24: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[ INFO] 14:43:25: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [16]:
# Gemini model used to write the answers.
llm = Gemini(
    model="models/gemini-1.5-flash",
    temperature=1,
    max_tokens=512,
)

# Query engine over the index; similarity_top_k=5 is passed as the number of
# top matches to use per query.
queryEngine = index.as_query_engine(similarity_top_k=5, llm=llm)

In [17]:
# Asks the LlamaIndex query engine a question and prints its answer.
response = queryEngine.query(
    "How many parameters LLaMA2 model has?"
)
print(response)

[ INFO] 14:43:28: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


The Llama 2 model comes in four sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.



## Using Chroma DB with LangChain

In [30]:
from langchain_chroma import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA

In [21]:
# Wraps each chunk in a Document built with page_content and prints both
# counts so they can be compared.
# NOTE: this rebinds `documents`, which earlier held the LlamaIndex documents.
print(f'Number of Chunks:{len(chunks)}')
documents = [Document(page_content=chunkText) for chunkText in chunks]
print(f'Number of documents:{len(documents)}')

Number of Chunks:335
Number of documents:335


In [26]:
# Location and collection name of the Chroma store used by the LangChain
# section, plus the embedding model it uses.
LcChromaDbPath = './mini-chunked-dataset'
LcCollectionName = 'mini-chunked-dataset'
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Reports whether a store from a previous run is present, and deletes it if so.
if not os.path.exists(LcChromaDbPath):
    print(f'The path {LcChromaDbPath} does not exist.')
else:
    print(f'The path {LcChromaDbPath} exists.')
    shutil.rmtree(LcChromaDbPath)

The path ./mini-chunked-dataset does not exist.


In [27]:
# Creates the LangChain Chroma vector store from the documents, using the
# embedding model, persist directory and collection name defined above.
lcChromaDb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name=LcCollectionName,
    persist_directory=LcChromaDbPath,
)

[ INFO] 14:49:22: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
[ INFO] 14:49:31: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [29]:
# Gemini chat model used by the LangChain chain.
# NOTE: this rebinds `llm`, which earlier held the LlamaIndex Gemini model.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    max_tokens=512,
    temperature=0,
)

In [31]:
query = "How many parameters LLaMA2 model has?"

# Retriever over the Chroma store; "k": 4 is passed through as a search
# argument.
retriever = lcChromaDb.as_retriever(search_kwargs={"k": 4})

# RetrievalQA chain: it retrieves related pieces of text and uses the LLM to
# formulate the final answer.
chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    llm=llm,
)

# The answer text is stored under the "result" key of the response.
response = chain.invoke(query)
print(response["result"])

[ INFO] 14:51:51: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


The Llama 2 model is available in four different sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.
