In [58]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.bedrock import BedrockEmbeddings

import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma

from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

In [59]:
def get_embedding_function():
    """Build the embedding function shared by indexing and querying.

    Both ``add_to_chroma`` and ``query_rag`` call this, so documents and
    queries are embedded with the same model.

    Returns:
        An ``OllamaEmbeddings`` instance configured with the 'mistral' model.
    """
    # NOTE(review): 'mistral' is also the model name used for text generation
    # in query_rag; a dedicated embedding model may retrieve better -- confirm.
    return OllamaEmbeddings(model='mistral')

In [60]:
# Directory passed to Chroma as persist_directory (and removed by clear_database).
CHROMA_PATH = "chroma_test"
# Directory handed to PyPDFDirectoryLoader; "." is the current working directory.
DATA_PATH = "."

In [61]:
def load_documents():
    """Load the documents found under ``DATA_PATH``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(DATA_PATH).load()`` returns --
        presumably a list of ``Document`` objects (TODO confirm granularity).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

def split_documents(documents: list[Document]):
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        documents: The documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # chunk_size / chunk_overlap are measured with ``len``; separators are
    # treated as plain strings rather than regular expressions.
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def add_to_chroma(chunks: list[Document]):
    """Add the chunks that are not yet stored to the Chroma DB at ``CHROMA_PATH``.

    Every chunk first gets an ID from ``calculate_chunk_ids`` (written to
    ``chunk.metadata["id"]``). Chunks whose ID is already present in the
    database are skipped, so running this again over the same documents
    does not insert them twice.

    Args:
        chunks: Chunks to index. Their ``metadata`` is mutated in place by
            ``calculate_chunk_ids`` to carry the "id" key.
    """
    # Open the database persisted at CHROMA_PATH.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    chunks_with_ids = calculate_chunk_ids(chunks)

    # include=[] asks for no payload; the IDs are always part of the result.
    existing_ids = set(db.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Keep only the chunks whose ID is not in the DB yet.
    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    if not new_chunks:
        print("No new documents to add")
        return

    print(f"Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)
    # No explicit db.persist(): the call is deprecated in LangChain (it emitted
    # a warning in this notebook's recorded output) and Chroma >= 0.4 writes to
    # persist_directory automatically.

def calculate_chunk_ids(chunks):
    """Assign an ID of the form "<source>:<page>:<chunk index>" to every chunk.

    Example: "data/monopoly.pdf:6:2" (page source : page number : chunk index).

    The chunk index is counted per "<source>:<page>" pair in encounter order.
    A per-page counter is used instead of comparing with the previous chunk
    only, so chunks of the same page that are not adjacent in ``chunks`` still
    receive distinct IDs. For input where each page's chunks are contiguous
    the IDs are the same as with a previous-chunk comparison.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict. "source" and
            "page" are read with ``.get``, so a missing key is rendered as
            "None" in the ID.

    Returns:
        The same ``chunks`` object; each element's ``metadata["id"]`` is set
        in place.
    """
    # Next free chunk index for each "<source>:<page>" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID on the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

def clear_database():
    """Delete the directory tree at ``CHROMA_PATH`` if that path exists."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)


def main(reset=False):
    """Run the ingestion pipeline: load, split, and index the documents.

    Args:
        reset: When truthy, the database directory is removed via
            ``clear_database`` before indexing.
    """
    if reset:
        print("✨ Clearing Database")
        clear_database()

    # load -> split -> add; each stage feeds directly into the next.
    add_to_chroma(split_documents(load_documents()))

# reset=True removes the directory at CHROMA_PATH first, so every chunk is
# embedded and inserted again on each run of this cell.
main(reset=True)

✨ Clearing Database




Number of existing documents in DB: 0
Adding new documents: 281


  db.persist()


In [65]:
# Prompt skeleton filled in by query_rag: {context} receives the retrieved
# chunk texts (joined with "---" separators) and {question} the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


def query_rag(query_text: str):
    """Answer ``query_text`` with the LLM, using retrieved chunks as context.

    Looks up the 5 best-matching chunks in the Chroma store at ``CHROMA_PATH``,
    fills ``PROMPT_TEMPLATE`` with their text and the question, sends the
    prompt to the Ollama "mistral" model, and prints the prompt, the response
    and the IDs of the chunks used.

    Args:
        query_text: The question to answer.

    Returns:
        The response produced by the model's ``invoke`` call.
    """
    # Open the vector store with the same embedding function used for indexing.
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Each hit is a (document, score) pair; the scores are not used further.
    hits = store.similarity_search_with_score(query_text, k=5)
    retrieved_docs = [doc for doc, _score in hits]

    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )
    print(prompt)

    response_text = Ollama(model="mistral").invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc in retrieved_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text


def main(query_text: str):
    """Run ``query_rag`` for ``query_text``; its return value is discarded.

    NOTE: this redefines ``main`` and therefore shadows the ingestion
    ``main(reset=False)`` defined in an earlier cell of this notebook.
    """
    query_rag(query_text)


# Example query: prints the assembled prompt, then the model's response and
# the IDs of the chunks that were used as context.
main("What is Ant colony optimization")

    

Human: 
Answer the question based only on the following context:

Genetic programming

---

COMP 575 –Part II
page 78
GP flowchart

---

RBFs

---

sequence of actions in the same trajectory, it magnifies the effect beyond our intention. •This destabilizes the learning process and sounds like dogs chasing their tails.

---

Mutation –Discrete

---

Answer the question based on the above context: What is Ant colony optimization

Response:  The provided context does not include any information about Ant Colony Optimization (ACO). Therefore, it's impossible to answer the question using only the given context. Ant Colony Optimization is a metaheuristic algorithm inspired by the behavior of ants in finding the shortest path between the food source and their colony. In ACO, artificial ants are used to find approximate solutions to optimization problems based on the solution constructed by other ants.
Sources: ['Part II, Evolutionary Optimisation, Chapter-5.pdf:1:1', 'Part II, Evolutionary Op