# Document Loading

In [1]:
from langchain_community.document_loaders import PyMuPDFLoader

# Raw string literal: the Windows path contains backslash sequences such as
# "\C", "\P" and "\d" that are not valid escapes in a normal string literal and
# trigger an invalid-escape warning. The resulting value is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider building
# it from a configurable data directory instead.
pdf_path = r"D:\Computer Programming\Python\RAG-Playground\data\A_Simple_Guide_to_Retrieval_Augmented_Generation_(Abhinav Kimothi)_bibis.ir.pdf"
loader = PyMuPDFLoader(pdf_path)

In [2]:
import pprint

# Load the documents from the PDF
docs = loader.load()

# Print the first document's metadata to verify loading.
# (A bare `docs[0]` used to sit here; it was not the last expression of the
# cell, so it displayed nothing and has been removed.)
pprint.pprint(docs[0].metadata)

{'author': 'Abhinav Kimothi',
 'creationDate': "D:20250617062545+03'30'",
 'creator': 'Adobe InDesign 20.1 (Macintosh)(Foxit Advanced PDF Editor)',
 'file_path': 'D:\\Computer '
              'Programming\\Python\\RAG-Playground\\data\\A_Simple_Guide_to_Retrieval_Augmented_Generation_(Abhinav '
              'Kimothi)_bibis.ir.pdf',
 'format': 'PDF 1.5',
 'keywords': '',
 'modDate': "D:20250617062545+03'30'",
 'page': 0,
 'producer': 'GPL Ghostscript 9.50',
 'source': 'D:\\Computer '
           'Programming\\Python\\RAG-Playground\\data\\A_Simple_Guide_to_Retrieval_Augmented_Generation_(Abhinav '
           'Kimothi)_bibis.ir.pdf',
 'subject': '',
 'title': 'A Simple Guide to Retrieval Augmented Generation',
 'total_pages': 258,
 'trapped': ''}


# Chunking and Preprocessing

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Page-level splitter: chunks of at most 1000 characters, 200 shared with the neighbour.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(docs)

# Sanity check: report how many chunks were produced ...
print(f"Number of text chunks: {len(texts)}")
# ... and preview at most the first 500 characters of the first chunk.
print(texts[0].page_content[:500])

Number of text chunks: 740
M A N N I N G
 Abhinav Kimothi
Retrieval Augmented
Generation
A SIMPLE GUIDE TO


In [4]:
### Split the text into chunks

def split_documents(documents,chunk_size=200,chunk_overlap=200): # Kept small due to low memory
    """Split documents into smaller chunks for better RAG performance.

    Args:
        documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters neighbouring chunks may share.

    Returns:
        The list of chunks returned by the splitter.

    Warns:
        UserWarning: when ``chunk_overlap >= chunk_size``. This includes the
            current defaults (200/200), which are kept for backward
            compatibility; with such settings consecutive chunks can repeat
            most of each other's text, inflating the number of chunks.
    """
    import warnings  # local import so the function is self-contained in its cell

    if chunk_overlap >= chunk_size:
        warnings.warn(
            f"chunk_overlap ({chunk_overlap}) is not smaller than chunk_size "
            f"({chunk_size}); consecutive chunks will be largely redundant. "
            "Pass a smaller chunk_overlap.",
            stacklevel=2,
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk (skipped when nothing was produced)
    if split_docs:
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [5]:
# Split again with split_documents' default settings.
# NOTE(review): `texts` is already the output of the 1000-character splitter
# above, so the PDF is chunked twice here; confirm this is intended rather
# than split_documents(docs).
chunks=split_documents(texts)

Split 740 documents into 8702 chunks

Example chunk:
Content: M A N N I N G
 Abhinav Kimothi
Retrieval Augmented
Generation
A SIMPLE GUIDE TO...
Metadata: {'source': 'D:\\Computer Programming\\Python\\RAG-Playground\\data\\A_Simple_Guide_to_Retrieval_Augmented_Generation_(Abhinav Kimothi)_bibis.ir.pdf', 'file_path': 'D:\\Computer Programming\\Python\\RAG-Playground\\data\\A_Simple_Guide_to_Retrieval_Augmented_Generation_(Abhinav Kimothi)_bibis.ir.pdf', 'page': 0, 'total_pages': 258, 'format': 'PDF 1.5', 'title': 'A Simple Guide to Retrieval Augmented Generation', 'author': 'Abhinav Kimothi', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 20.1 (Macintosh)(Foxit Advanced PDF Editor)', 'producer': 'GPL Ghostscript 9.50', 'creationDate': "D:20250617062545+03'30'", 'modDate': "D:20250617062545+03'30'", 'trapped': ''}


# Building Vector Store and Embeddings

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
import os
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbeddingManager:
    """
    Handles document embedding generation using SentenceTransformer
    """
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initializes the embedding model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.

        Raises:
            Exception: whatever ``SentenceTransformer`` raises while loading
                (logged by ``_load_model`` and re-raised unchanged).
        """
        self.model_name = model_name
        self.model = None  # populated by _load_model()
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``."""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generates embeddings for a list of texts.

        Args:
            texts: Strings to embed.

        Returns:
            A numpy array holding the result of ``model.encode`` for ``texts``.

        Raises:
            ValueError: if no model has been loaded.
            Exception: any error raised by ``model.encode`` (logged, re-raised).
        """
        # Explicit None check: the model object's truthiness is not a reliable
        # "is loaded" signal.
        if self.model is None:
            raise ValueError("Embedding model is not loaded.")
        try:
            print(f"Generating embeddings for {len(texts)} texts.")
            # No `device=` override: the previous expression passed 'cpu' for
            # every device other than CUDA, which forces CPU when the model
            # lives on another accelerator. Without the argument, encode()
            # uses the device the model is already on.
            embeddings = np.asarray(
                self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
            )
            return embeddings
        except Exception as e:
            print(f"Error generating embeddings: {e}")
            raise

In [8]:
# Build the embedding manager; its constructor loads the default model.
embedding_manager = EmbeddingManager()
# Bare name as the cell's last expression: shows the object's repr.
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x20c8c289d10>

In [9]:
class VectorStore:
    """
    Manages a ChromaDB vector store for document embeddings.
    """
    def __init__(
        self,
        collection_name: str = "pdf_docs",
        persist_directory: str = "../data/vector_store",
        distance_metric: str = "cosine",
    ):
        """
        Initializes the ChromaDB client and collection.

        Args:
            collection_name: Name of the collection to open or create.
            persist_directory: Directory for the persistent client (created if
                missing).
            distance_metric: Value stored under the ``hnsw:space`` collection
                metadata key when the collection is created. Defaults to
                ``"cosine"`` so that a ``1 - distance`` similarity is
                meaningful; Chroma's own default space is L2.
                NOTE(review): this presumably has no effect on a collection
                that already exists on disk - delete/recreate it to switch.

        Raises:
            Exception: any error from ChromaDB during initialisation
                (logged and re-raised).
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.distance_metric = distance_metric
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """
        Creates the persistence directory, the persistent client and the
        collection, falling back to an explicit ``create_collection`` when
        ``get_or_create_collection`` fails.
        """
        collection_metadata = {
            "description": "PDF Document Embeddings",
            "hnsw:space": self.distance_metric,
        }
        try:
            # Create persistent ChromaDB client if not exists
            print(f"Initializing ChromaDB client with persistence at: {self.persist_directory}")
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Create or get collection
            try:
                self.collection = self.client.get_or_create_collection(
                    name=self.collection_name,
                    metadata=collection_metadata,
                )
            except Exception as e:
                print(f"get_or_create_collection failed or returned None: {e}")
                self.collection = None

            # If collection is None, try to create one explicitly
            if self.collection is None:
                try:
                    print(f"Attempting to create collection '{self.collection_name}' explicitly.")
                    self.collection = self.client.create_collection(
                        name=self.collection_name,
                        metadata=collection_metadata,
                    )
                except Exception as e:
                    print(f"Failed to create collection: {e}")
                    raise

            print(f"Collection '{self.collection_name}' initialized successfully.")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, docs: List[Any], embeddings: np.ndarray):
        """
        Adds documents and their embeddings to the collection in batches.

        Each stored record gets a freshly generated random id, a copy of the
        document's metadata extended with ``doc_index`` and ``content_length``,
        the document text and the embedding as a plain list.

        Args:
            docs: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding row per document. A single 1-D vector is
                accepted when exactly one document is given.

        Raises:
            ValueError: if the number of documents and embeddings differ.
            TypeError: if an individual embedding row is not an ndarray.
            Exception: any error raised by ``collection.add`` (logged, re-raised).
        """
        embeddings = np.asarray(embeddings)
        # Promote a lone 1-D vector to a 1-row matrix *before* the length
        # check; previously the check ran first and compared the number of
        # documents with the vector's dimensionality.
        if embeddings.ndim == 1 and len(docs) == 1:
            embeddings = np.expand_dims(embeddings, axis=0)

        if len(docs) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        print(f"Adding {len(docs)} documents to the vector store.")
        print(f"Embeddings shape: {embeddings.shape}")

        ids, metadatas, document_texts, embeddings_list = [], [], [], []

        for i, (doc, embedding) in enumerate(zip(docs, embeddings)):
            # Random ids: calling this twice with the same documents stores
            # them twice.
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)  # copy so the document is not mutated
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            document_texts.append(doc.page_content)

            if isinstance(embedding, np.ndarray):
                embeddings_list.append(embedding.tolist())
            else:
                raise TypeError(f"Expected embedding to be ndarray, got {type(embedding)}")

        try:
            batch_size = 500
            total = len(ids)
            for start in range(0, total, batch_size):
                end = min(start + batch_size, total)  # clamp so the log shows the real count
                self.collection.add(
                    ids=ids[start:end],
                    embeddings=embeddings_list[start:end],
                    metadatas=metadatas[start:end],
                    documents=document_texts[start:end],
                )
                print(f"Added batch {start // batch_size + 1} ({end} docs)")

            print(f"Successfully added {len(docs)} documents to vector store")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise
       

In [10]:
# Open (or create) the persistent collection using VectorStore's defaults.
vector_store = VectorStore()
# Bare name as the cell's last expression: shows the object's repr.
vector_store

Initializing ChromaDB client with persistence at: ../data/vector_store
Collection 'pdf_docs' initialized successfully.


<__main__.VectorStore at 0x20c8c53cf90>

In [None]:
# Convert the chunks to plain strings for the embedding model.
# Stored under its own name: `texts` already holds the Document chunks that
# the earlier `split_documents(texts)` cell consumes, and rebinding it to a
# list of strings would break re-running that cell.
chunk_texts = [doc.page_content for doc in chunks]

# Generate the embeddings
embeddings = embedding_manager.generate_embeddings(chunk_texts)

Generating embeddings for 8702 texts.


Batches:   0%|          | 0/272 [00:00<?, ?it/s]

: 

In [None]:
# Add the documents and embeddings to the vector store
# Add the documents and embeddings to the vector store.
# NOTE(review): add_documents generates a fresh random id per document and the
# store is persistent, so re-running this cell looks like it would insert the
# same chunks again - confirm, or clear the collection first.
vector_store.add_documents(chunks,embeddings)

Adding 8702 documents to the vector store.
Embeddings shape: (8702, 384)


# Retrieval

In [None]:
class RAGRetriever:
    """Query-time lookup: embeds a query and searches the vector store with it"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Store the two collaborators used by ``retrieve``

        Args:
            vector_store: Store whose ``collection`` is queried
            embedding_manager: Produces the embedding for each query string
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Look up the stored chunks closest to a query

        Args:
            query: The search query
            top_k: Number of results requested from the collection
            score_threshold: Results scoring below this value are dropped

        Returns:
            One dict per kept result with the keys ``id``, ``content``,
            ``metadata``, ``similarity_score``, ``distance`` and ``rank``
            (1-based position before filtering). An empty list is returned
            when nothing is found or when the lookup raises.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Embedding happens outside the try block, so embedding errors propagate
        query_vector = self.embedding_manager.generate_embeddings([query])[0]

        try:
            response = self.vector_store.collection.query(
                query_embeddings=[query_vector.tolist()],
                n_results=top_k,
            )

            # Every field is a list with one entry per query; one query was sent
            if not (response['documents'] and response['documents'][0]):
                print("No documents found")
                return []

            found_texts = response['documents'][0]
            found_metadatas = response['metadatas'][0]
            found_distances = response['distances'][0]
            found_ids = response['ids'][0]

            matches = []
            hits = zip(found_ids, found_texts, found_metadatas, found_distances)
            for rank, (doc_id, text, meta, dist) in enumerate(hits, start=1):
                # 1 - distance is a cosine similarity only if the collection
                # was created with the cosine space - TODO confirm
                score = 1 - dist
                if score >= score_threshold:
                    matches.append({
                        'id': doc_id,
                        'content': text,
                        'metadata': meta,
                        'similarity_score': score,
                        'distance': dist,
                        'rank': rank
                    })

            print(f"Retrieved {len(matches)} documents (after filtering)")
            return matches

        except Exception as e:
            # Best effort: report the problem and return no results
            print(f"Error during retrieval: {e}")
            return []

# Wire the retriever to the vector store and embedding manager created above.
rag_retriever=RAGRetriever(vector_store,embedding_manager)

In [None]:
# Smoke test: run one query with the default top_k and score threshold.
rag_retriever.retrieve("What is retrieval augmentation generation")

In [None]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# The key is looked up under GROQ_API; os.getenv yields None when it is unset.
groq_api = os.getenv('GROQ_API')

# Shared chat model used by the simple and advanced RAG helpers.
llm = ChatGroq(
    groq_api_key=groq_api,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024,
    verbose=True,
)

In [None]:
# Simple RAG Function

def rag_simple(query, retreiver, top_k = 5):
    """Answer a question from retrieved context using the module-level ``llm``.

    Args:
        query: The user's question.
        retreiver: Object with a ``retrieve(query, top_k)`` method returning
            dicts that carry a ``'content'`` key. (The parameter name is
            misspelled but kept because callers pass it by keyword.)
        top_k: Number of results to request from the retriever.

    Returns:
        The model's answer text, or ``"No relevant context found"`` when the
        retriever yields no usable content.
    """
    results = retreiver.retrieve(query, top_k)
    context = "\n\n".join(doc['content'] for doc in results) if results else ""
    if not context:
        return "No relevant context found"

    prompt = f"""use the following context to answer the question correctly
            Context:
            {context}

            Question:
            {query}
            """

    # The f-string above is already fully rendered. It used to be passed
    # through str.format() as well, which raised (KeyError/ValueError/
    # IndexError) whenever the retrieved text or the question contained a
    # literal '{' or '}'.
    response = llm.invoke([prompt])
    return response.content

In [None]:
# Ask one question through the simple pipeline (default top_k).
answer = rag_simple("What is retrieval augmentation generation", retreiver=rag_retriever)
# Bare name as the cell's last expression: displays the returned text.
answer

In [None]:
def rag_advanced(query, retriever, top_k = 5, min_score = 0.2, return_context = False):
    """
    RAG pipeline that also reports sources and a confidence value.

    Args:
        query: The user's question.
        retriever: Object with a ``retrieve(query, top_k=..., score_threshold=...)``
            method returning dicts with ``'content'``, ``'metadata'`` and
            ``'similarity_score'`` keys.
        top_k: Number of results to request from the retriever.
        min_score: Passed to the retriever as ``score_threshold``.
        return_context: When true, the joined context is added to the output
            under ``'context'``.

    Returns:
        A dict with ``'answer'``, ``'sources'`` and ``'confidence'`` (the
        highest similarity score among the results), plus ``'context'`` when
        requested. When nothing is retrieved, the plain string
        ``"No relevant answer found from the document"`` is returned instead,
        so callers must check the type before indexing.
    """
    results = retriever.retrieve(query, top_k = top_k, score_threshold=min_score)
    if not results:
        return "No relevant answer found from the document"

    context = "\n\n".join(doc['content'] for doc in results)
    sources = [{
        # Prefer 'source_file' and fall back to 'source', then 'unknown'
        'source' : doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page' : doc['metadata'].get('page', 'unknown'),
        'score' : doc['similarity_score'],
        'preview' : doc['content'][:100] + "....."
    } for doc in results]
    confidence = max(doc['similarity_score'] for doc in results)

    prompt = f"""use the following context to answer the question correctly
            Context:
            {context}

            Question:
            {query}
    """
    # The f-string is already rendered; a second str.format() pass used to
    # raise whenever the context or the question contained '{' or '}'.
    response = llm.invoke([prompt])
    output = {
        'answer' : response.content,
        'sources' : sources,
        'confidence' : confidence
    }

    if return_context:
        output['context'] = context
    return output

In [None]:
# Run the advanced pipeline and ask for the joined context as well.
# NOTE(review): rag_advanced returns a plain string when nothing is retrieved,
# in which case the key lookups below raise TypeError.
answer = rag_advanced("What is retrieval augmentation generation", retriever=rag_retriever, return_context=True)

# Only the last expression of a cell is displayed, so of these four lookups
# just answer['context'] is shown.
answer['answer']
answer['sources']
answer['confidence']
answer['context']

# Adding an Agent to the RAG Pipeline

In [None]:
# Build a new VectorStore with default arguments; this rebinds the
# `vector_store` name used by the earlier cells.
vector_store = VectorStore()
# Expose the underlying client and collection for the LangChain wrapper below.
client = vector_store.client
collection = vector_store.collection

In [None]:
from langchain_classic.tools.retriever import create_retriever_tool
from langchain_community.vectorstores import Chroma

# Wrap the existing Chroma client/collection in LangChain's Chroma vector store.
# NOTE(review): no embedding_function is passed here, so query text is not
# embedded with `embedding_manager`; presumably the collection's own default
# embedding is used instead - confirm it matches the model used at indexing.
vec_store = Chroma(
    client=client,
    collection_name=vector_store.collection_name,
    persist_directory=vector_store.persist_directory
)

# Expose the store's retriever as a tool; the positional arguments are the
# retriever, the tool name and the tool description.
retriever_tool = create_retriever_tool(
    vec_store.as_retriever(),
    "pdf_document_retriever",
    "Searches and returns information regarding the RAG document."
)

In [None]:
# Tools offered to the agent model (bound via bind_tools in `agent`).
tools = [retriever_tool]

In [None]:
from typing import Annotated, Sequence, TypedDict

from langchain_core.messages import BaseMessage

from langgraph.graph.message import add_messages


class AgentState(TypedDict):
    """State passed between graph nodes: a single ``messages`` sequence."""
    # The add_messages function defines how an update should be processed.
    # The default is to replace the value; add_messages appends the update
    # to the existing messages instead.
    messages: Annotated[Sequence[BaseMessage], add_messages]

In [None]:
from typing import Annotated, Literal, Sequence, TypedDict

from langchain_classic import hub
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field

from langgraph.prebuilt import tools_condition

### Edges


def grade_documents(state) -> Literal["generate", "rewrite"]:
    """
    Determines whether the retrieved documents are relevant to the question.

    The first message's content is used as the question and the last
    message's content as the retrieved documents.

    Args:
        state (messages): The current state

    Returns:
        str: ``"generate"`` when the grader answers yes, otherwise ``"rewrite"``
    """

    print("---CHECK RELEVANCE---")

    # Data model
    class grade(BaseModel):
        """Binary score for relevance check."""

        binary_score: str = Field(description="Relevance score 'yes' or 'no'")

    # LLM
    model = ChatGroq(
        groq_api_key=groq_api,
        verbose=True,
        model_name="llama-3.1-8b-instant",
        temperature=0.1,
        max_tokens=1024,
    )

    # LLM with tool and validation
    llm_with_tool = model.with_structured_output(grade)

    # Prompt
    prompt = PromptTemplate(
        template="""You are a grader assessing relevance of a retrieved document to a user question. \n 
        Here is the retrieved document: \n\n {context} \n\n
        Here is the user question: {question} \n
        If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
        Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""",
        input_variables=["context", "question"],
    )

    # Chain
    chain = prompt | llm_with_tool

    messages = state["messages"]
    last_message = messages[-1]

    question = messages[0].content
    docs = last_message.content

    scored_result = chain.invoke({"question": question, "context": docs})

    # Normalise before comparing: the model may answer "Yes", " yes " or
    # "yes." and an exact match against "yes" would send all of those to the
    # rewrite branch. A missing result is treated as "not relevant".
    raw_score = getattr(scored_result, "binary_score", None)
    score = "" if raw_score is None else str(raw_score).strip().strip("'\".").lower()

    if score == "yes":
        print("---DECISION: DOCS RELEVANT---")
        return "generate"

    print("---DECISION: DOCS NOT RELEVANT---")
    print(raw_score)
    return "rewrite"


### Nodes


def agent(state):
    """
    Run the tool-enabled chat model on the conversation so far. The model
    sees the retriever tool and may either request it or answer directly.

    Args:
        state (messages): The current state

    Returns:
        dict: A ``messages`` update containing only the model's response
    """
    print("---CALL AGENT---")
    history = state["messages"]
    tool_aware_model = ChatGroq(
        groq_api_key=groq_api,
        verbose=True,
        model_name="llama-3.1-8b-instant",
        temperature=0.1,
        max_tokens=1024,
    ).bind_tools(tools)
    # Wrapped in a list because the update is added to the existing messages
    return {"messages": [tool_aware_model.invoke(history)]}


def rewrite(state):
    """
    Ask the model to rephrase the first message of the conversation as a
    better question.

    Args:
        state (messages): The current state

    Returns:
        dict: A ``messages`` update containing the model's rephrased question
    """

    print("---TRANSFORM QUERY---")
    original_question = state["messages"][0].content

    rewrite_request = HumanMessage(
        content=f""" \n 
    Look at the input and try to reason about the underlying semantic intent / meaning. \n 
    Here is the initial question:
    \n ------- \n
    {original_question} 
    \n ------- \n
    Formulate an improved question: """,
    )

    # Plain chat model; no tools are bound for the rewrite step
    rewriter = ChatGroq(
        groq_api_key=groq_api,
        verbose=True,
        model_name="llama-3.1-8b-instant",
        temperature=0.1,
        max_tokens=1024,
    )
    return {"messages": [rewriter.invoke([rewrite_request])]}


def generate(state):
    """
    Generate answer

    The first message's content is used as the question and the last
    message's content as the context for the "rlm/rag-prompt" prompt.

    Args:
        state (messages): The current state

    Returns:
         dict: A ``messages`` update containing the generated answer text
    """
    print("---GENERATE---")
    messages = state["messages"]
    question = messages[0].content
    last_message = messages[-1]

    docs = last_message.content

    # Prompt (pulled from the hub on every call)
    prompt = hub.pull("rlm/rag-prompt")

    # LLM
    model = ChatGroq(
        groq_api_key=groq_api,
        verbose=True,
        model_name="llama-3.1-8b-instant",
        temperature=0.1,
        max_tokens=1024,
    )

    # Chain: prompt -> model -> plain string.
    # (An unused nested `format_docs` helper was removed; `docs` is passed
    # to the prompt as-is.)
    rag_chain = prompt | model | StrOutputParser()

    # Run
    response = rag_chain.invoke({"context": docs, "question": question})
    return {"messages": [response]}


print("*" * 20 + "Prompt[rlm/rag-prompt]" + "*" * 20)
# pretty_print() is called only for what it prints. Its result used to be
# bound to a module-level `prompt` variable, which suggested that a prompt
# object was stored there; nothing reads that variable, so it is dropped.
hub.pull("rlm/rag-prompt").pretty_print()  # Show what the prompt looks like

In [None]:
from langgraph.graph import END, StateGraph, START
from langgraph.prebuilt import ToolNode

# Define a new graph
workflow = StateGraph(AgentState)

# Define the nodes we will cycle between
workflow.add_node("agent", agent)  # agent
retrieve = ToolNode([retriever_tool])
workflow.add_node("retrieve", retrieve)  # retrieval
workflow.add_node("rewrite", rewrite)  # Re-writing the question
workflow.add_node(
    "generate", generate
)  # Generating a response after we know the documents are relevant
# Call agent node to decide to retrieve or not
workflow.add_edge(START, "agent")

# Decide whether to retrieve
workflow.add_conditional_edges(
    "agent",
    # Assess agent decision
    tools_condition,
    {
        # Translate the condition outputs to nodes in our graph
        "tools": "retrieve",
        END: END,
    },
)

# Edges taken after the `action` node is called.
workflow.add_conditional_edges(
    "retrieve",
    # Assess agent decision
    grade_documents,
)
workflow.add_edge("generate", END)
workflow.add_edge("rewrite", "agent")

# Compile
graph = workflow.compile()

In [None]:
from IPython.display import Image, display

try:
    display(Image(graph.get_graph(xray=True).draw_mermaid_png()))
except Exception as exc:
    # Rendering requires some extra dependencies and is optional, so a
    # failure must not stop the notebook - but say why the image is missing
    # instead of failing silently.
    print(f"Skipping graph rendering: {exc}")

In [None]:
import pprint

# Initial state: a single user message given as a (role, content) tuple.
inputs = {
    "messages": [
        ("user", "Can you explain what exactly is mentioned about Retrieval Augmented Generation"),
    ]
}
# Stream the run; each item iterated here maps a node name to the value that
# node produced.
for output in graph.stream(inputs):
    for key, value in output.items():
        pprint.pprint(f"Output from node '{key}':")
        pprint.pprint("---")
        pprint.pprint(value, indent=2, width=80, depth=None)
    # pprint shows the repr of a string, so this separator appears quoted
    # with escaped newlines.
    pprint.pprint("\n---\n")