### Install Chromadb and Pandas

In [None]:
pip install chromadb

In [None]:
pip install pandas

In [None]:
import chromadb #https://docs.trychroma.com/docs/overview/getting-started
import pandas as pd

In [None]:
# Point the SSL_CERT_FILE environment variable at a CA bundle file.
# NOTE(review): the path is relative, so it is resolved against the notebook's
# working directory — presumably needed behind a proxy with a custom CA; confirm.
import os
os.environ['SSL_CERT_FILE'] = 'ca-bundle-full.crt'

### Create a database

In [None]:
# In-memory alternative (use this if the data should not be persisted):
# chroma_client = chromadb.Client()

# Persistent client: the database is kept under ./chroma_db.
chroma_client = chromadb.PersistentClient(path="./chroma_db")


### Create a collection

In [None]:
# Create the collection that stores documents and their embeddings.
# NOTE(review): per the Chroma docs, create_collection fails when a collection
# named "documents" already exists (likely when re-running against the persisted
# ./chroma_db) — confirm; the next cell uses get_or_create_collection instead.
collection_documents = chroma_client.create_collection(name="documents")

In [None]:
# Return the existing "documents" collection, or create it if it does not exist yet.
collection_documents = chroma_client.get_or_create_collection(name="documents")

### Add data into the database

In [None]:
# Add two sample documents under the ids "id1" and "id2".
# No embeddings are passed, so every document is converted into a vector by the
# collection's embedding function and stored together with the document text:
# https://docs.trychroma.com/docs/embeddings/embedding-functions
# The first run also downloads the embedding model.
collection_documents.add(
    ids=["id1", "id2"],
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges"
    ]
)

### Query the data

In [None]:
from pprint import pprint
# Query the collection for the documents most similar to the query text.
results = collection_documents.query(
    query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
    n_results=2 # how many results to return
)
# Shape of the result: https://docs.trychroma.com/docs/querying-collections/query-and-get#results-shape
pprint(results)

### Filter search

In [None]:
# Add a document-content filter to the query: https://docs.trychroma.com/docs/querying-collections/metadata-filtering
# Same similarity query as before, restricted with where_document.
results = collection_documents.query(
    query_texts=["This is a query document about hawaii"],
    n_results=2,
    where_document={"$contains": "pineapple"} # only return documents whose text contains "pineapple"
)
pprint(results)

## Let's try adding a CSV file into a database

### Let's try to use a different embedding model

In [None]:
# Use an OpenAI embedding model (configured with api_type="azure") instead of
# relying on the collection's default embedding function.
# NOTE(review): "<API_KEY>" and "<ENDPOINT_URL>" look like placeholders — replace
# them with real values before running; confirm how credentials should be supplied.
import chromadb.utils.embedding_functions as embedding_functions
openai_ef = embedding_functions.OpenAIEmbeddingFunction( #https://docs.trychroma.com/integrations/embedding-models/openai
                api_key="<API_KEY>",
                api_base="<ENDPOINT_URL>",
                api_type="azure",
                api_version="2024-10-21",
                model_name="text-embedding-3-small",
                deployment_id='text-embedding-3-small' #https://platform.openai.com/docs/guides/embeddings
            )

### Load the file (Corpus Composition and Ingestion)

In [None]:
# Load the sample book data from CSV into a DataFrame and preview the first rows.
data = pd.read_csv("Data/sample_book_data.csv")
data.head()

### Is the data clean?

### Chunking data 

In [None]:
# Build one labelled text string per book; this is the text that gets embedded
# and added to the database.
# Missing values are replaced with '' and every column is cast to str first:
# with plain `+`, a single missing field would turn the whole row's string into
# NaN (which later becomes the literal text "nan" via astype(str)), and a
# non-string column would raise a TypeError.
data['concatdata'] = (
    "title:" + data['title'].fillna('').astype(str)
    + " description:" + data['description'].fillna('').astype(str)
    + " category:" + data['category'].fillna('').astype(str)
    + " summary:" + data['summary'].fillna('').astype(str)
)

In [None]:
# Preview the first concatenated strings before embedding them.
data['concatdata'].head()

### Embed the data

In [None]:
# Embed all concatenated strings by passing the full list to openai_ef in one call.
vectors = openai_ef(data['concatdata'].astype(str).tolist())
# NOTE(review): this prints every embedding vector, so the output can be very long.
print(vectors)

### Create a collection to store the book data

In [None]:
# Return the existing "books" collection, or create it if it does not exist yet.
# NOTE(review): no embedding_function is passed here; the cells that add to and
# query this collection supply embeddings from openai_ef explicitly. Confirm that
# this is intended before querying it with query_texts.
collection_books = chroma_client.get_or_create_collection(name="books")

In [None]:
# List the collections available through this client and print them.
collection_list = chroma_client.list_collections()
print(collection_list)

### Add metadata for filtering and adding context to chunks

In [None]:
# Build one metadata dict per row from the id, title and category columns.
# These are stored next to each document; "category" is later used for filtering.
metadata = data[['id', 'title', 'category']].to_dict(orient='records')

### Add the data into the collection

In [None]:
# Store every book's text, its precomputed embedding and its metadata in the
# "books" collection, keyed by sequential string ids "1", "2", ...
# API reference: https://docs.trychroma.com/reference/python/collection#add
collection_books.add(
    ids=list(map(str, range(1, len(data) + 1))),
    documents=data['concatdata'].astype(str).tolist(),
    embeddings=vectors,
    metadatas=metadata,
)

In [None]:
# Display the metadata dicts that were passed to add().
metadata

In [None]:
# Display the document strings that were passed to add().
data['concatdata'].astype(str).tolist()

In [None]:
# Display the ids that were passed to add(): "1" up to the number of rows.
[str(i+1) for i in range(len(data))]

In [None]:
# Number of records currently stored in the "books" collection.
collection_books.count()

### Let's try querying for some books

In [None]:
# Embed the query text with openai_ef (the function used to embed the books),
# then ask the collection for the 3 closest records.
query = "A book about python"
query_embedding = openai_ef([query])
results = collection_books.query(  #https://docs.trychroma.com/reference/python/collection#query
    query_embeddings=query_embedding,
    n_results=3
)
pprint(results)

### Trying filter search with metadata

In [None]:
# Same query text as before, but restricted by metadata: only records whose
# "category" metadata equals "Finance" are considered, with up to 5 results.
query = "A book about python"
query_embedding = openai_ef([query])
results = collection_books.query(
    query_embeddings=query_embedding,
    n_results=5,
    where={"category": "Finance"}
)
pprint(results)

In [None]:
# Fetch specific records directly by id (here the ids "1" and "2"), without a
# similarity search.
results = collection_books.get(ids=["1", "2"])
pprint(results)

## Let's try integrating the vector database into our AI

In [None]:
from langchain_openai import AzureChatOpenAI
# Chat model that the agent created further down is built on.
# NOTE(review): "<ENDPOINT_URL>" and "<API_KEY>" look like placeholders — replace
# them with real values before running; confirm how credentials should be supplied.
model = AzureChatOpenAI(
    openai_api_version="2024-02-01",
    deployment_name="gpt-4o-2024-08-06",
    azure_endpoint="<ENDPOINT_URL>",
    openai_api_type="azure",
    openai_api_key="<API_KEY>",

    # Generation settings: sampling parameters, response length cap, no stop sequence.
    temperature=1,
    max_tokens=500,
    top_p=0.5,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None
    )

In [None]:
from langchain_core.tools import tool

@tool
def search_book_by_content(query: str) -> dict:
    """Searches for books in the vector database that match the query.

    Args:
        query: Free-text description of the kind of book to look for.

    Returns:
        The Chroma query result for the 3 closest books: a dict of parallel
        lists (ids, documents, metadatas, distances, ...).
    """
    # NOTE: this docstring doubles as the tool description shown to the model.
    # The query is embedded with openai_ef, the same function that produced the
    # stored book embeddings, so the vectors are comparable.
    query_embedding = openai_ef([query])
    return collection_books.query(
        query_embeddings=query_embedding,
        n_results=3,
    )

@tool
def add_book_to_database(title: str, description: str, category: str, summary: str) -> str:
    """Adds a new book to the vector database.

    Args:
        title: Title of the book.
        description: Short description of the book.
        category: Category of the book, for example 'History'.
        summary: Summary of the book's content.

    Returns:
        A confirmation message containing the ID the book was stored under.
    """
    # NOTE: this docstring doubles as the tool description shown to the model.
    # Same labelled text layout as the 'concatdata' column built for the CSV rows.
    concatdata = f"title:{title} description:{description} category:{category} summary:{summary}"
    vector = openai_ef([concatdata])[0]

    # count() + 1 is only a first guess for the next ID: once IDs are no longer
    # contiguous (e.g. after a delete) it can already be taken. Per the Chroma
    # docs, add() ignores records whose ID already exists, so a collision would
    # silently drop the book while this tool still reported success. Probe
    # until a free ID is found.
    candidate = collection_books.count() + 1
    while collection_books.get(ids=[str(candidate)])["ids"]:
        candidate += 1
    new_id = str(candidate)

    collection_books.add(
        documents=[concatdata],
        embeddings=[vector],
        ids=[new_id],
        metadatas=[{'id': new_id, 'title': title, 'category': category}],
    )

    return f"Book '{title}' added with ID {new_id}."

# Tools handed to the agent: one to search the books collection, one to add to it.
tools = [search_book_by_content, add_book_to_database]

In [None]:
from langgraph.prebuilt import create_react_agent
# Build the agent from the chat model and the two book tools.
agent_executor = create_react_agent(model, tools)

In [None]:
from langchain_core.messages import HumanMessage
# Send a search request to the agent and pretty-print the latest message of
# every streamed step.
for step in agent_executor.stream(
    {"messages": [HumanMessage(content="find me a book about rome")]},
    stream_mode="values"
):
    step["messages"][-1].pretty_print()


In [None]:
# Ask the agent to add a new book and pretty-print the latest message of every
# streamed step.
for step in agent_executor.stream(
    {"messages": [HumanMessage(content="""can you add a book into the database? The title is sym,
                                description is 'a book about symbology',
                               category is 'History',
                               summary is 'This book covers the life and times of symbology'""")]},
    stream_mode="values"
):
    step["messages"][-1].pretty_print()

### Let's check whether the book was added to the database

In [None]:
# Record count after the agent run; compare with the count shown earlier.
collection_books.count()

In [None]:
# Search for the book the agent was asked to add: embed the query with
# openai_ef and fetch the 3 closest records.
query = "A book about symbology"
query_embedding = openai_ef([query])
results = collection_books.query(
    query_embeddings=query_embedding,
    n_results=3
)
pprint(results)