In [1]:
# Directory scanned for PDF files by load_documents (relative path).
file_path = "data/"

In [2]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

In [3]:
def load_documents():
    """Load every PDF found under ``file_path``.

    Returns:
        The result of ``PyPDFDirectoryLoader(file_path).load()``. In the
        recorded run the entries are Documents carrying ``source`` and
        ``page`` metadata.
    """
    return PyPDFDirectoryLoader(file_path).load()


In [4]:
def split_documents(documents: list[Document]):
    """Break the loaded documents into overlapping chunks.

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Sizes are measured with ``len``; separators are treated as literal
    # strings rather than regular expressions.
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)


In [5]:
documents = load_documents()

# Preview the first loaded page. When nothing was loaded the list is empty,
# so guard the index instead of letting the cell fail with IndexError.
if documents:
    print(documents[0])
else:
    print(f"No documents were loaded from {file_path!r}")

chunks = split_documents(documents)

page_content='' metadata={'source': 'data/harrypotter.pdf', 'page': 0}


In [6]:
#from langchain_community.embeddings.ollama import OllamaEmbeddings
#from langchain_community.embeddings.bedrock import BedrockEmbeddings

from langchain_ollama import OllamaEmbeddings

def get_embedding_function():
    """Build the embedding model used for both indexing and querying.

    Returns:
        An ``OllamaEmbeddings`` instance configured for ``nomic-embed-text``.
    """
    # A Bedrock-backed alternative was previously sketched here:
    #   BedrockEmbeddings(credentials_profile_name="default", region_name="us-east-1")
    return OllamaEmbeddings(model="nomic-embed-text")

In [7]:
#from get_embedding_function import get_embedding_function
#from langchain.vectorstores.chroma import Chroma
from langchain_chroma import Chroma

# Directory passed to Chroma as `persist_directory`; clear_database removes it.
CHROMA_PATH = "chroma"

In [8]:
def add_to_chroma(chunks: list[Document]):
    """Insert the chunks that are not yet stored in the Chroma database.

    Each chunk is first given a ``source:page:index`` ID by
    ``calculate_chunk_ids`` (which writes ``metadata["id"]`` on the chunks
    passed in). Only chunks whose ID is absent from the store are added.

    Args:
        chunks: Document chunks to index.
    """
    # Open the store located at CHROMA_PATH.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    chunks_with_ids = calculate_chunk_ids(chunks)

    # An empty `include` list skips the payload fields; IDs are still returned.
    existing_ids = set(db.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Keep only the chunks the store has not seen yet.
    new_chunks = [
        chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids
    ]

    if not new_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(new_chunks)}")
    db.add_documents(new_chunks, ids=[chunk.metadata["id"] for chunk in new_chunks])

    # `langchain_chroma.Chroma` exposes no `persist()` method, so calling it
    # unconditionally raised AttributeError right after the insert. Older
    # vector-store classes that still provide the method keep being flushed.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()


def calculate_chunk_ids(chunks):
    """Stamp each chunk with an ID of the form ``source:page:index``.

    Example ID: ``"data/monopoly.pdf:6:2"``. ``index`` counts consecutive
    chunks that share the same source and page, starting at 0.

    Args:
        chunks: Objects exposing a ``metadata`` dict; ``source`` and ``page``
            are read with ``.get`` and ``id`` is written.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on every item.
    """
    previous_page_key = None
    position = 0

    for chunk in chunks:
        meta = chunk.metadata
        page_key = f"{meta.get('source')}:{meta.get('page')}"

        # Numbering restarts whenever this chunk's page differs from the page
        # of the chunk immediately before it.
        position = position + 1 if page_key == previous_page_key else 0
        previous_page_key = page_key

        meta["id"] = f"{page_key}:{position}"

    return chunks


def clear_database():
    """Remove the Chroma store at ``CHROMA_PATH``; do nothing if it is absent."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [15]:
def add_to_chroma(chunks: list[Document]):
    """Index the chunks that are missing from the Chroma store.

    Chunks receive their IDs from ``calculate_chunk_ids``; those whose ID is
    already stored are skipped. Any exception raised along the way is caught
    and reported with a printed message instead of propagating.

    Args:
        chunks: Document chunks to index.
    """
    try:
        store = Chroma(
            persist_directory=CHROMA_PATH,
            embedding_function=get_embedding_function(),
        )

        labelled_chunks = calculate_chunk_ids(chunks)

        # Requesting no extra fields still returns the stored IDs.
        known_ids = set(store.get(include=[])["ids"])
        print(f"Number of existing documents in DB: {len(known_ids)}")

        # Collect the unseen chunks and their IDs in one pass.
        pending = []
        pending_ids = []
        for chunk in labelled_chunks:
            chunk_id = chunk.metadata["id"]
            if chunk_id in known_ids:
                continue
            pending.append(chunk)
            pending_ids.append(chunk_id)

        if not pending:
            print("✅ No new documents to add")
        else:
            print(f"👉 Adding new documents: {len(pending)}")
            store.add_documents(pending, ids=pending_ids)
    except Exception as exc:
        print(f"❌ Error while adding to Chroma: {str(exc)}")

def calculate_chunk_ids(chunks: list[Document]) -> list[Document]:
    """Assign a ``source:page:index`` identifier to every chunk.

    ``index`` is the chunk's position within a run of consecutive chunks that
    share the same source and page, counted from 0.

    Args:
        chunks: Chunks whose ``metadata`` provides ``source`` and ``page``.

    Returns:
        The same list, with ``metadata["id"]`` written on each chunk.
    """
    run_key = None  # "source:page" of the run currently being numbered
    run_length = 0  # how many chunks of that run have been numbered so far

    for chunk in chunks:
        metadata = chunk.metadata
        key = f"{metadata.get('source')}:{metadata.get('page')}"

        # A different page starts a new run, so its numbering begins at 0.
        if key != run_key:
            run_key, run_length = key, 0

        metadata["id"] = f"{key}:{run_length}"
        run_length += 1

    return chunks

def clear_database():
    """Delete the Chroma store at ``CHROMA_PATH`` and print the outcome."""
    if not os.path.exists(CHROMA_PATH):
        print(f"⚠️ No database found at {CHROMA_PATH} to clear.")
        return

    shutil.rmtree(CHROMA_PATH)
    print(f"✅ Database at {CHROMA_PATH} cleared.")


In [17]:
# Index the chunks built earlier; chunks whose IDs are already stored are skipped.
add_to_chroma(chunks)

Number of existing documents in DB: 10215
✅ No new documents to add


In [13]:
# Pull the embedding model named in get_embedding_function through the Ollama CLI.
# The cell displays os.system's return value (0 in the recorded run).
os.system("ollama pull nomic-embed-text")

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ 

0

In [25]:
# Pull the chat model that query_rag invokes through the Ollama CLI.
# The cell displays os.system's return value (0 in the recorded run).
os.system("ollama pull llama3.2")

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠦ 

0

In [32]:
import argparse
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

#CHROMA_PATH = "chroma"

def query_rag(query_text: str):
    """Answer a question with retrieval-augmented generation.

    The five closest chunks are fetched from the Chroma store at
    ``CHROMA_PATH``, inserted into ``PROMPT_TEMPLATE`` together with the
    question, and the resulting prompt is sent to the ``llama3.2`` Ollama
    model. Progress messages are printed at every stage.

    Args:
        query_text: The question to answer.

    Returns:
        The model's reply. A fixed message is returned instead when the search
        yields nothing, and a fixed error message when any exception occurs
        (the exception is printed, not raised).
    """
    print("🚀 Starting the query process...")

    try:
        # Open the vector store with the embedding model used for indexing.
        print("📂 Loading the Chroma database...")
        store = Chroma(
            persist_directory=CHROMA_PATH,
            embedding_function=get_embedding_function(),
        )

        # Retrieve the closest chunks for the question.
        print(f"🔍 Searching the database for relevant context to the query: '{query_text}'")
        matches = store.similarity_search_with_score(query_text, k=5)
        if not matches:
            print("⚠️ No relevant documents found in the database.")
            return "No relevant documents found to answer the query."

        # Scores are not used further; only the documents matter from here on.
        print(f"📄 Found {len(matches)} relevant documents.")
        retrieved_docs = [doc for doc, _score in matches]
        context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)

        # Fill the template with the retrieved context and the question.
        print("✍️ Preparing the prompt for the LLM...")
        template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
        prompt = template.format(context=context_text, question=query_text)

        # Ask the local Ollama model.
        print("🤖 Querying the LLM model...")
        llm = Ollama(model="llama3.2")
        response_text = llm.invoke(prompt)

        # Report the answer along with the IDs of the chunks it was based on.
        print("📑 Formatting the response...")
        sources = [doc.metadata.get("id") for doc in retrieved_docs]
        formatted_response = f"Response: {response_text}\nSources: {sources}"

        print(f"✅ Query successful! Here's the response:\n{formatted_response}")
        return response_text

    except Exception as exc:
        print(f"❌ Error occurred during the query process: {str(exc)}")
        return "An error occurred during the query."

# Prompt used by query_rag: {context} is filled with the retrieved chunks and
# {question} with the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Example question. query_rag prints its progress and the answer; because the
# call is the cell's last expression, the returned answer is displayed as well.
query_text = "How did harry parents died?"
query_rag(query_text)


🚀 Starting the query process...
📂 Loading the Chroma database...
🔍 Searching the database for relevant context to the query: 'How did harry parents died?'
📄 Found 5 relevant documents.
✍️ Preparing the prompt for the LLM...
🤖 Querying the LLM model...
📑 Formatting the response...
✅ Query successful! Here's the response:
Response: Harry's parents, James and Lily Potter, were murdered by Voldemort. According to Harry, his father was killed first, while Voldemort tried to kill him but ended up killing his mother instead, instructing her to move aside so that he could kill the baby Harry, who miraculously survived a curse from Voldemort at the age of one year old.
Sources: ['data/harrypotter.pdf:1126:1', 'data/harrypotter.pdf:1603:2', 'data/harrypotter.pdf:283:3', 'data/harrypotter.pdf:2825:0', 'data/harrypotter.pdf:573:1']


"Harry's parents, James and Lily Potter, were murdered by Voldemort. According to Harry, his father was killed first, while Voldemort tried to kill him but ended up killing his mother instead, instructing her to move aside so that he could kill the baby Harry, who miraculously survived a curse from Voldemort at the age of one year old."