# Data Ingestion

In [None]:
### Document datastructure
from langchain_core.documents import Document

In [49]:
# Build a Document by hand: `page_content` carries the text, `metadata` is a
# free-form dict stored alongside it.
doc = Document(
  page_content = "text content of the documemt",
  metadata = {
    "source":"example.txt",
    "pages":1,
    "author":"CS",
    "data_created":"2025-11-01"
  }
)
# Bare last expression: the notebook displays the object's repr.
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'CS', 'data_created': '2025-11-01'}, page_content='text content of the documemt')

In [None]:
### Create the directory that will hold the sample text files
import os
# exist_ok=True keeps this cell safe to re-run when the directory already exists
os.makedirs("../data/text_files", exist_ok=True)


In [None]:
### Write the sample corpus to disk: each key of `sample_text` is a target path,
### each value is the text stored in that file.

sample_text = {
    "../data/text_files/rag_intro.txt": (
        "RAG stands for Retrieval-Augmented Generation. It combines information retrieval \n"
        "with a language model. The retriever fetches relevant documents from a database or corpus, \n"
        "and the language model generates answers based on the retrieved documents. \n"
        "RAG helps reduce hallucinations in generated content and is widely used in chatbots, \n"
        "Q&A systems, and knowledge assistants."
    ),
    "../data/text_files/ml_intro.txt": (
        "Machine Learning (ML) is a subset of Artificial Intelligence (AI) \n"
        "that enables systems to learn from data and improve over time without being explicitly programmed. \n"
        "It involves training algorithms on historical data to make predictions or decisions. \n"
        "ML techniques include supervised learning, unsupervised learning, and reinforcement learning. \n"
        "Applications of ML are widespread, including image recognition, natural language processing, \n"
        "recommendation systems, and autonomous vehicles."
    ),
}

# One file per dict entry; an existing file at the same path is overwritten.
for file_path in sample_text:
  with open(file_path, 'w', encoding='utf-8') as handle:
    handle.write(sample_text[file_path])

print("Files in the sample_text dict are created!")


Files in the sample_text dict are created!


In [52]:
### TextLoader: load a single text file as LangChain documents
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/rag_intro.txt", encoding="utf-8")

# load() returns a list of Document objects (a single one for this file, see the printed output)
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/rag_intro.txt'}, page_content='RAG stands for Retrieval-Augmented Generation. It combines information retrieval \nwith a language model. The retriever fetches relevant documents from a database or corpus, \nand the language model generates answers based on the retrieved documents. \nRAG helps reduce hallucinations in generated content and is widely used in chatbots, \nQ&A systems, and knowledge assistants.')]


In [None]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

# Load all the text files from the directory
dir_loader = DirectoryLoader(
  "../data/text_files",
  glob = "**/*.txt", # Pattern to match files (recursive: .txt files at any depth)
  loader_cls = TextLoader, # Loader class used for every matched file
  loader_kwargs={'encoding':'utf-8'}, # Keyword arguments handed to the loader class
  show_progress=False
)

# One Document per matched file; `source` in the metadata records the file path (see output)
documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\ml_intro.txt'}, page_content='Machine Learning (ML) is a subset of Artificial Intelligence (AI) \nthat enables systems to learn from data and improve over time without being explicitly programmed. \nIt involves training algorithms on historical data to make predictions or decisions. \nML techniques include supervised learning, unsupervised learning, and reinforcement learning. \nApplications of ML are widespread, including image recognition, natural language processing, \nrecommendation systems, and autonomous vehicles.'),
 Document(metadata={'source': '..\\data\\text_files\\rag_intro.txt'}, page_content='RAG stands for Retrieval-Augmented Generation. It combines information retrieval \nwith a language model. The retriever fetches relevant documents from a database or corpus, \nand the language model generates answers based on the retrieved documents. \nRAG helps reduce hallucinations in generated content and is widely used in chatbo

In [None]:
### PDF Loader, process PDFs
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader

# NOTE(review): this rebinds `dir_loader`, replacing the text-file loader created in the
# previous cell; PyPDFLoader is imported but not used in this cell.
dir_loader = DirectoryLoader(
  "../data/pdf_files",
  glob = "**/*.pdf", # Pattern to match files
  loader_cls = PyMuPDFLoader,
  show_progress=False
)

# The displayed output carries page-level metadata ('page', 'total_pages') next to the PDF's own metadata
pdf_documents = dir_loader.load()
pdf_documents 


[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-11-24T11:30:40-06:00', 'source': '..\\data\\pdf_files\\Agentic_AI_Roadmap.pdf', 'file_path': '..\\data\\pdf_files\\Agentic_AI_Roadmap.pdf', 'total_pages': 3, 'format': 'PDF 1.7', 'title': '', 'author': 'Chetan Salotra', 'subject': '', 'keywords': '', 'moddate': '2025-11-24T11:30:40-06:00', 'trapped': '', 'modDate': "D:20251124113040-06'00'", 'creationDate': "D:20251124113040-06'00'", 'page': 0}, page_content='What Are Agentic AI Systems? \n \nAgentic AI – The Next Evolution Beyond Chatbots \n \nDefinition   \nAgentic AI refers to autonomous systems that can:   \n- Understand complex, multi-step goals   \n- Break goals into tasks   \n- Use tools (APIs, browsers, code interpreters, databases, etc.)   \n- Plan, reason, reflect, and self-correct   \n- Act in loops until the objective is achieved or gracefully fail   \n \nUnlike traditional RAG or chat m

## RAG Pipelines - Data Ingestion to Vector DB pipeline

In [55]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [None]:
### Read all the pdfs inside the directory
def process_all_pdfs(pdf_directory):
  """
  Load every PDF found under a directory (recursively) into LangChain documents.

  Args:
      pdf_directory: Directory to scan (str or Path).

  Returns:
      Flat list of the documents PyMuPDFLoader produced for every PDF that loaded
      successfully. Each document's metadata gains 'source_file' (the PDF's file
      name) and 'file_type' ('pdf'). A PDF that fails to load is reported on
      stdout and skipped; it does not abort the run.
  """
  doc_list = []
  directory_path = Path(pdf_directory)

  # rglob() is already recursive, so the pattern must not repeat the "**/" prefix.
  # Sorting keeps the processing order (and therefore the output order) deterministic.
  pdf_list = sorted(directory_path.rglob("*.pdf"))

  print(f"Total {len(pdf_list)} files found in the directory")

  for pdf_file in pdf_list:
    print(f"\nProcessing: {pdf_file.name}")
    try:
      loader = PyMuPDFLoader(str(pdf_file))
      documents = loader.load()

      # Add source information in the document metadata
      for doc in documents:
        doc.metadata['source_file'] = pdf_file.name
        doc.metadata['file_type'] = 'pdf'

      doc_list.extend(documents)
      print(f"Loaded {len(documents)} pages")

    except Exception as e:
      # Best effort: name the failing file and keep going with the remaining PDFs
      print(f"Error occurred while processing {pdf_file.name}: {e}")

  print(f"\ntotal documents loaded: {len(doc_list)}")
  return doc_list

# Scan everything under ../data (not only ../data/pdf_files) for PDFs
all_pdf_documents = process_all_pdfs("../data")

Total 2 files found in the directory

Processing: Agentic_AI_Roadmap.pdf
Loaded 3 pages

Processing: RAG_Overview.pdf
Loaded 1 pages

total documents loaded: 4


In [57]:
# Display the loaded documents (long output: full metadata and text for each one)
all_pdf_documents

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-11-24T11:30:40-06:00', 'source': '..\\data\\pdf_files\\Agentic_AI_Roadmap.pdf', 'file_path': '..\\data\\pdf_files\\Agentic_AI_Roadmap.pdf', 'total_pages': 3, 'format': 'PDF 1.7', 'title': '', 'author': 'Chetan Salotra', 'subject': '', 'keywords': '', 'moddate': '2025-11-24T11:30:40-06:00', 'trapped': '', 'modDate': "D:20251124113040-06'00'", 'creationDate': "D:20251124113040-06'00'", 'page': 0, 'source_file': 'Agentic_AI_Roadmap.pdf', 'file_type': 'pdf'}, page_content='What Are Agentic AI Systems? \n \nAgentic AI – The Next Evolution Beyond Chatbots \n \nDefinition   \nAgentic AI refers to autonomous systems that can:   \n- Understand complex, multi-step goals   \n- Break goals into tasks   \n- Use tools (APIs, browsers, code interpreters, databases, etc.)   \n- Plan, reason, reflect, and self-correct   \n- Act in loops until the objective is achiev

In [None]:
### Chunking: break the loaded documents into overlapping pieces
def split_documents(documents, chunk_size=600, chunk_overlap=60):
  """
  Cut documents into smaller chunks with a RecursiveCharacterTextSplitter.

  Args:
      documents: Documents to split.
      chunk_size: Maximum chunk length, measured with len().
      chunk_overlap: Length shared between consecutive chunks.

  Returns:
      The chunks returned by the splitter. A one-line summary is printed, followed
      by a preview of the first chunk when there is at least one.
  """
  splitter_options = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": len,
    # Tried in order: paragraph break, line break, space, then single characters
    "separators": ["\n\n", "\n", " ", ""],
  }
  chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

  print("split {} documents into {} chunks".format(len(documents), len(chunked)))

  # Nothing to preview when the splitter produced no chunks
  if not chunked:
    return chunked

  first_chunk = chunked[0]
  print("\nExample chunk:")
  print("Content: {}...".format(first_chunk.page_content[:200]))
  print("Metadata: {}".format(first_chunk.metadata))

  return chunked


In [80]:
# Chunk every loaded PDF document with split_documents' default chunk_size / chunk_overlap
chunks = split_documents(all_pdf_documents)

split 4 documents into 12 chunks

Example chunk:
Content: What Are Agentic AI Systems? 
 
Agentic AI – The Next Evolution Beyond Chatbots 
 
Definition   
Agentic AI refers to autonomous systems that can:   
- Understand complex, multi-step goals   
- Break ...
Metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-11-24T11:30:40-06:00', 'source': '..\\data\\pdf_files\\Agentic_AI_Roadmap.pdf', 'file_path': '..\\data\\pdf_files\\Agentic_AI_Roadmap.pdf', 'total_pages': 3, 'format': 'PDF 1.7', 'title': '', 'author': 'Chetan Salotra', 'subject': '', 'keywords': '', 'moddate': '2025-11-24T11:30:40-06:00', 'trapped': '', 'modDate': "D:20251124113040-06'00'", 'creationDate': "D:20251124113040-06'00'", 'page': 0, 'source_file': 'Agentic_AI_Roadmap.pdf', 'file_type': 'pdf'}


### Embeddings And VectorStoreDB 

In [81]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [82]:
class Embedder:
  """Handles document embedding generation using SentenceTransformer"""

  def __init__(self, model_name:str="all-MiniLM-L6-v2"):
    """
    Initialize the embedder and load its model immediately.

    Args:
        model_name: HuggingFace model name for sentence embeddings

    Raises:
        ValueError: if the model cannot be loaded
    """
    self.model_name = model_name
    self.model = None
    self._load_model()


  def _load_model(self):
    """
    Load the SentenceTransformer model named by `self.model_name` into `self.model`.

    Raises:
        ValueError: if loading fails; the underlying error is chained as `__cause__`
    """
    try:
      print(f"Loading the embedding model: {self.model_name}")
      self.model = SentenceTransformer(self.model_name)
      print(f"Model loaded successfully. Embedding dimension : {self.model.get_sentence_embedding_dimension()}")
    except Exception as e:
      print(f"Error loading model {self.model_name} : {e}")
      # Chain the cause so the real failure stays visible in the traceback
      raise ValueError("Model not loaded") from e


  def generate_embeddings(self, texts:List[str])-> np.ndarray:
    """
    Generate embeddings for a list of texts

    Args:
      texts: List of text strings to embed

    Returns:
      numpy array of embeddings with shape (len(texts), embedding_dim);
      an empty list yields an array of shape (0, embedding_dim)

    Raises:
      ValueError: if no model is loaded
    """
    # Compare against None explicitly: a loaded model object may define its own truthiness
    if self.model is None:
      raise ValueError("Model not loaded")

    print(f"Generating embeddings for {len(texts)} texts...")
    if not isinstance(texts, str) and len(texts) == 0:
      # Nothing to encode: still return a 2-D array so callers can rely on the shape
      embeddings = np.empty((0, self.get_embedding_dimension()), dtype=np.float32)
    else:
      embeddings = self.model.encode(texts, show_progress_bar = False)
    print(f"generating embeddings with shape: {embeddings.shape}")
    return embeddings


  def get_embedding_dimension(self)-> int:
    """
    Getting embedding dimension of the model

    Raises:
      ValueError: if no model is loaded
    """
    if self.model is None:
      raise ValueError("Model not loaded")
    return self.model.get_sentence_embedding_dimension()
  

#Initialize the Embedder
# Uses the default model name; the constructor loads the model right away (see the log lines in the output)
embedder = Embedder()
embedder

Loading the embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension : 384


<__main__.Embedder at 0x2313a9bcb90>

### Vector Store

In [None]:
class VectorStore:
  """Manages document embeddings in a persistent ChromaDB collection"""

  def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
    """
    Initialize the vector store and open (or create) its collection.

    Args:
      collection_name: name of the ChromaDB collection
      persist_directory: Directory to persist the vector store

    Raises:
      ValueError: if the store cannot be initialized
    """
    self.collection_name = collection_name
    self.persist_directory = persist_directory
    self.client = None
    self.collection = None
    self._initialize_store()


  def _initialize_store(self):
    """
    Initialize ChromaDB client and collection

    Raises:
      ValueError: if the client or collection cannot be created; the underlying
        error is chained as `__cause__`
    """
    try:
      os.makedirs(self.persist_directory, exist_ok= True)
      self.client = chromadb.PersistentClient(path=self.persist_directory)

      # Get or create collection. No distance metric is configured here, so the
      # collection uses ChromaDB's default.
      self.collection = self.client.get_or_create_collection(
        name=self.collection_name,
        metadata={"description": "PDF document embeddings for RAG"}
      )
      print(f"Vector store initialized. Collection: {self.collection_name}")
      print(f"Existing documents in the collection: {self.collection.count()}")

    except Exception as e:
      print(f"Error initializing the vector store: {e}")
      raise ValueError("Vector store could not be initialized") from e


  def add_documents(self, documents:List[Any], embeddings):
    """
    Add documents and their embeddings to the vector store.

    IDs are derived from each document's source, page and text, and rows are
    written with upsert. Storing the same chunks again (for example when the
    notebook is re-run) therefore overwrites the existing rows instead of
    appending duplicates.

    Args:
        documents: List of LangChain documents
        embeddings: Corresponding embeddings for the documents (numpy array or
          sequence of vectors)

    Raises:
        ValueError: if the number of documents and embeddings differ
        Exception: any error raised by the collection is printed and re-raised
    """

    if len(documents) != len(embeddings):
      raise ValueError("Number of documents must match number of embeddings")

    # An empty batch is a no-op rather than an error from the collection
    if len(documents) == 0:
      print("No documents to add to the Vector store")
      return

    print(f" Adding {len(documents)} to the Vector store")

    # Prepare storage lists/data for ChromaDB
    ids = []
    texts = []
    metas = []
    embeddings_list = []
    seen_keys = {}  # identity key -> how many times it appeared earlier in this batch

    for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
      # Copy so the caller's document metadata is not modified
      metadata = dict(doc.metadata)

      # Deterministic ID: the same chunk always maps to the same ID. The occurrence
      # counter keeps IDs unique when identical chunks appear within one batch.
      key = "|".join(str(part) for part in (metadata.get('source', ''), metadata.get('page', ''), doc.page_content))
      occurrence = seen_keys.get(key, 0)
      seen_keys[key] = occurrence + 1
      doc_id = f"doc_{uuid.uuid5(uuid.NAMESPACE_URL, key).hex}_{occurrence}"
      ids.append(doc_id)

      # Prepare metadata
      metadata['doc_index'] = i
      metadata['content_length'] = len(doc.page_content)
      metas.append(metadata)

      # Document content
      texts.append(doc.page_content)

      # Embedding: accept numpy rows as well as plain sequences
      embeddings_list.append(embedding.tolist() if hasattr(embedding, "tolist") else list(embedding))

    # Write to collection
    try:
      self.collection.upsert(
        ids=ids,
        embeddings=embeddings_list,
        metadatas=metas,
        documents=texts
      )
      print(f"Successfully added {len(documents)} documents")
      print(f"Total documents in collection: {self.collection.count()}")

    except Exception as e:
      print(f"Error adding documents to vector store: {e}")
      raise

# NOTE(review): this rebinds the name `VectorStore` from the class to an instance.
# Re-running this cell then calls the instance (which defines no __call__) and fails,
# and later cells can no longer reach the class. Renaming the instance (e.g. to
# `vector_store`) requires updating every later `VectorStore` reference in the
# notebook at the same time.
VectorStore = VectorStore()
VectorStore

Vector store initialized. Collection: pdf_documents
Existing documents in the collection: 55


<__main__.VectorStore at 0x2313a9c9a90>

In [None]:
#Convert text to embeddings
texts = [doc.page_content for doc in chunks]

#Generate the Embeddings
embeddings = embedder.generate_embeddings(texts)

#store in the vector database
# (`VectorStore` is the instance created in the previous cell, not the class)
VectorStore.add_documents(chunks, embeddings)


Generating embeddings for 12 texts...
generating embeddings with shape: (12, 384)
 Adding 12 to the Vector store
Successfully added 12 documents
Total documents in collection: 67


### Retriever Pipeline From VectorStore

In [85]:
class RAGRetriever:
  """Handles query-based retrieval from the vector store"""

  # The annotations are string forward references: in this notebook the name
  # `VectorStore` is rebound to an instance before this class is defined, so it
  # must not be evaluated as a type here.
  def __init__(self, vector_store: "VectorStore", embedder: "Embedder"):
    """
    Initialize the retriever

    Args:
        vector_store: Vector store whose `collection` holds the document embeddings
        embedder: Embedder used to generate the query embedding
    """
    self.vector_store = vector_store
    self.embedder = embedder


  def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
    """
    Retrieve relevant documents based on a query.

    Args:
        query: Text query string.
        top_k: Number of nearest chunks to request from the collection.
        score_threshold: Minimum similarity score to include a chunk.

    Returns:
        List of dicts, in the order returned by the collection:
        [{ "id": ..., "content": ..., "metadata": ..., "similarity_score": ...,
           "distance": ..., "rank": ... }]
        `rank` is the 1-based position before threshold filtering.
        An empty list is returned when nothing matches or when the lookup fails
        (the error is printed, not raised).
    """
    print(f"Retrieving documents for query: '{query}'")
    print(f"Top K: {top_k}, score threshold: {score_threshold}")

    #Generate embedding for the query
    query_embedding = self.embedder.generate_embeddings([query])[0]

    #Fetch from the vector store
    try:
      results = self.vector_store.collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k
      )

      #Process results
      retrieved_docs = []

      # Results are nested one list per query; a single query was sent, so read index 0
      if results['documents'] and results['documents'][0]:
        documents = results['documents'][0]
        metadatas = results['metadatas'][0]
        distances = results['distances'][0]
        ids = results['ids'][0]

        for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
          # Map the distance d reported by the collection onto a score: (2 - d) / 2.
          # NOTE(review): what this score means depends on the collection's distance
          # metric, which is not visible from this class. For squared-L2 distance on
          # unit-length embeddings it equals the cosine similarity; for cosine
          # distance it equals (1 + cos) / 2. Confirm the configured metric.
          similarity_score = (2 - distance)/2

          print("similarity", similarity_score)
          if similarity_score >= score_threshold:
            retrieved_docs.append({
              'id':doc_id,
              'content':document,
              'metadata':metadata,
              'similarity_score':similarity_score,
              'distance':distance,
              'rank':i+1
            })
        print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
      else:
        print("No documents found")

      return retrieved_docs

    except Exception as e:
      # Best effort: report the failure and hand back an empty result set
      print(f"Error during retrieval: {e}")
      return []
    

# `VectorStore` is the vector-store instance at this point in the notebook
# (the class name was rebound to it when the store was created)
rag_retriever = RAGRetriever(VectorStore, embedder)
rag_retriever

<__main__.RAGRetriever at 0x2313a485d50>

In [86]:
# Smoke-test the retriever with the default top_k and score_threshold
rag_retriever.retrieve("Agents created ")

Retrieving documents for query: 'Agents created '
Top K: 5, score threshold: 0.0
Generating embeddings for 1 texts...
generating embeddings with shape: (1, 384)
similarity 0.5494636297225952
similarity 0.5373220443725586
similarity 0.5373220443725586
similarity 0.5311039984226227
similarity 0.5311039984226227
Retrieved 5 documents (after filtering)


[{'id': 'doc_ac0c039f_1',
  'content': 'Unlike traditional RAG or chat models, agentic systems don’t just answer — they act on the world. \n \nCore Characteristics of Agentic AI \n- Long-running & stateful (memory across turns)   \n- Tool use & function calling   \n- Planning & reasoning (Chain-of-Thought, Tree-of-Thought, ReAct)   \n- Self-reflection & critique   \n- Multi-agent collaboration (in advanced setups)   \n \nPopular Agentic Frameworks (2025) \n- LangGraph (LangChain) – stateful graphs & cycles',
  'metadata': {'creationdate': '2025-11-24T11:30:40-06:00',
   'content_length': 461,
   'trapped': '',
   'creationDate': "D:20251124113040-06'00'",
   'file_type': 'pdf',
   'total_pages': 3,
   'creator': 'Microsoft® Word for Microsoft 365',
   'keywords': '',
   'moddate': '2025-11-24T11:30:40-06:00',
   'format': 'PDF 1.7',
   'file_path': '..\\data\\pdf_files\\Agentic_AI_Roadmap.pdf',
   'author': 'Chetan Salotra',
   'doc_index': 1,
   'source_file': 'Agentic_AI_Roadmap.pdf'

### Integrating the vector DB context pipeline with LLM output

In [None]:
### RAG pipeline with an LLM (the model actually built below is Gemini, not Groq)
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
import os
from dotenv import load_dotenv
load_dotenv()

### Initialize the LLM
# NOTE(review): ChatGroq is imported but not used in this cell; the chat model is
# created with ChatGoogleGenerativeAI. The key is read from the GEMINI_API_KEY
# environment variable (os.getenv returns None when it is unset).
api_key = os.getenv("GEMINI_API_KEY")

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=api_key  
)

##Simple RAG function: retrieve context + generate response
def rag_simple(query, retriever, llm, top_k=3):
  """
  Answer a question from retrieved context.

  Args:
      query: Question text.
      retriever: Object exposing retrieve(query, top_k=...) that returns dicts
        with a 'content' key.
      llm: Chat model exposing invoke(...) whose result has a `.content` attribute.
      top_k: Number of chunks to request from the retriever.

  Returns:
      The model's answer text, or a fixed fallback message when no context was
      retrieved.
  """
  ## retrieve the context
  results = retriever.retrieve(query, top_k = top_k)
  context = "\n\n".join(doc['content'] for doc in results)

  if not context:
    return "No relevant context found to answer the question"

  ##generate the answer using the LLM
  # The f-string already interpolates context and query. Calling .format() on the
  # result would parse any "{" or "}" inside the retrieved text as format fields
  # and raise, so the prompt is sent as built.
  prompt=f"""Use the following context to answer the question conscisely.
      Context: {context}

      Question: {query}

      Answer:"""

  response = llm.invoke([prompt])
  return response.content


In [88]:
# Ask one question end to end: the retriever's log lines print first, then the model's answer
answer = rag_simple("what is Agentic AI?", rag_retriever, llm)
print(answer)

Retrieving documents for query: 'what is Agentic AI?'
Top K: 3, score threshold: 0.0
Generating embeddings for 1 texts...
generating embeddings with shape: (1, 384)
similarity 0.7794758677482605
similarity 0.7765256464481354
similarity 0.7703945934772491
Retrieved 3 documents (after filtering)
Agentic AI refers to autonomous systems that can understand complex, multi-step goals, break them into tasks, and use tools (APIs, browsers, code interpreters, etc.). They plan, reason, reflect, self-correct, and act in loops until an objective is achieved or gracefully fails, distinguishing them from traditional RAG or chat models by acting on the world.


### Enhanced RAG Pipeline Features

In [None]:
### Enhanced RAG Pipeline Features
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
  """
  RAG pipeline with extra features:
  - Returns answer, sources, confidence score, and optionally full context.

  Args:
      query: Question text.
      retriever: Object exposing retrieve(query, top_k=..., score_threshold=...)
        that returns dicts with 'content', 'metadata' and 'similarity_score'.
      llm: Chat model exposing invoke(...) whose result has a `.content` attribute.
      top_k: Number of chunks to request from the retriever.
      min_score: Minimum similarity score passed to the retriever.
      return_context: When True, include the joined context under 'context'.

  Returns:
      Dict with 'answer', 'sources' (one entry per retrieved chunk: source, page,
      score, 120-character preview) and 'confidence' (highest similarity score).
      'context' is present when return_context is True, and also (as an empty
      string) in the no-results response.
  """
  results = retriever.retrieve(query, top_k=top_k, score_threshold =min_score)
  if not results:
    return {'answer': 'No relevenat context found.', 'sources':[], 'confidence':0.0, 'context':''}

  #Prepare context and sources
  context = "\n\n".join(doc['content'] for doc in results)
  sources = [{
    # Prefer the short file name; fall back to the full source path
    'source':doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
    'page':doc['metadata'].get('page', 'unknown'),
    'score':doc['similarity_score'],
    'preview':doc['content'][:120] + '...'
  } for doc in results]
  confidence = max(doc['similarity_score'] for doc in results)

  #Generate answer
  # The f-string already interpolates context and query. Calling .format() on the
  # result would parse any "{" or "}" inside the retrieved text as format fields
  # and raise, so the prompt is sent as built.
  prompt = f"""Use the following context to answer the question concisely. \n{context}\n\nQuestion: {query} \n\nAnswer:"""
  response = llm.invoke([prompt])

  output = {
    'answer':response.content,
    'sources':sources,
    'confidence':confidence
  }
  if return_context:
    output['context'] = context
  return output

#Example usage
# return_context=True is required here: the last line reads result['context']
result = rag_advanced("Give complete list of the core skills required t o build agentic AI?", rag_retriever, llm, top_k=5, min_score=0.1, return_context = True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
# Only the first 300 characters of the joined context are shown
print("Context Preview:", result['context'][:300])



  

Retrieving documents for query: 'Give complete list of the core skills required t o build agentic AI?'
Top K: 5, score threshold: 0.1
Generating embeddings for 1 texts...
generating embeddings with shape: (1, 384)
similarity 0.7320262491703033
similarity 0.7320262491703033
similarity 0.704714298248291
similarity 0.704714298248291
similarity 0.6919834911823273
Retrieved 5 documents (after filtering)
Answer: The core skills required to build agentic AI are:
*   Python proficiency (async, typing, pydantic)
*   Deep understanding of modern LLMs (GPT-4o, Claude 3.5/Opus, Grok-2, Llama-3.1/3.2 405B, DeepSeek-R1)
Sources: [{'source': 'Agentic_AI_Roadmap.pdf', 'page': 1, 'score': 0.7320262491703033, 'preview': 'Skills & Knowledge You Need to Design & Build Agentic AI \n \n1. Core Foundations (Must-Have) \n- Python proficiency (async...'}, {'source': 'Agentic_AI_Roadmap.pdf', 'page': 1, 'score': 0.7320262491703033, 'preview': 'Skills & Knowledge You Need to Design & Build Agentic AI \n \n1. Cor