# RAG Pipelines - Data Ingestion to Vector DB

In [22]:
import os
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [17]:
## Read all the PDFs inside the directory

def process_all_pdfs(pdf_directory):
    """Load every PDF found (recursively) under a directory.

    Each file is read with ``PyPDFLoader``. Every document returned by the
    loader is tagged in its metadata with ``source_file`` (the PDF's file
    name) and ``file_type`` (``'pdf'``). A file that fails to load is
    reported and skipped so the remaining files are still processed.

    Args:
        pdf_directory: Directory to search (anything ``Path`` accepts).

    Returns:
        A flat list with the documents of all successfully loaded PDFs,
        ordered by sorted file path.
    """

    all_documents = []
    pdf_dir = Path(pdf_directory)

    # glob() yields entries in filesystem order, which varies between
    # machines and runs. Sorting keeps the document order (and every index
    # derived from it downstream) reproducible.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Tag each loaded document with where it came from.
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch.
            print(f"Error: {e}")

    print(f"\n Total documents loaded: {len(all_documents)}")
    return all_documents

# Ingest every PDF found under the data directory.
all_pdf_documents = process_all_pdfs(pdf_directory="../data")

Found 1 PDF files to process

Processing: feyn_surely.pdf
Loaded 133 pages

 Total documents loaded: 133


In [6]:
# Preview only the first couple of documents; displaying the whole list
# floods the notebook output with every page of every PDF.
all_pdf_documents[:2]

[Document(metadata={'producer': 'Acrobat PDFWriter 5.0 for Windows NT', 'creator': '(ebook - RTF - Mathematics) Feynman, Richard - Surely You’re Joking Mr. Feynman.rtf - Microsoft Word', 'creationdate': 'D:20030429215337', 'title': '_ebook - RTF - Mathematics_ Feynman, Richard - Surely You’…', 'author': '-ducati996', 'source': '..\\data\\pdf\\feyn_surely.pdf', 'total_pages': 133, 'page': 0, 'page_label': '1', 'source_file': 'feyn_surely.pdf', 'file_type': 'pdf'}, page_content='"Surely You\'re Joking, Mr. Feynman!" \nAdventures of a Curious Character  \nby Richard P. Feynman  \nas told to Ralph Leighton'),
 Document(metadata={'producer': 'Acrobat PDFWriter 5.0 for Windows NT', 'creator': '(ebook - RTF - Mathematics) Feynman, Richard - Surely You’re Joking Mr. Feynman.rtf - Microsoft Word', 'creationdate': 'D:20030429215337', 'title': '_ebook - RTF - Mathematics_ Feynman, Richard - Surely You’…', 'author': '-ducati996', 'source': '..\\data\\pdf\\feyn_surely.pdf', 'total_pages': 133, 'pag

In [None]:
## Split the text into chunks

from langchain_core.documents.base import Document


def split_documents_to_chunks(documents, chunk_size=1000, chunk_overlap=200):
    """Break documents into overlapping chunks for retrieval.

    Args:
        documents: Documents to split.
        chunk_size: Chunk size handed to the splitter (measured with ``len``).
        chunk_overlap: Overlap between chunks handed to the splitter.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators handed to the splitter, coarsest first.
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    chunked_docs = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked_docs)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunked_docs:
        return chunked_docs

    # Show the first chunk as a sample of the result.
    first_chunk = chunked_docs[0]
    print("\nExample Chunk:")
    print(f"Content: {first_chunk.page_content}...")
    print(f"Metadata: {first_chunk.metadata}")

    return chunked_docs


# Chunk every loaded page using the splitter's default size and overlap.
chunks: list[Document] = split_documents_to_chunks(documents=all_pdf_documents)


Split 133 documents into 890 chunks

Example Chunk:
Content: "Surely You're Joking, Mr. Feynman!" 
Adventures of a Curious Character  
by Richard P. Feynman  
as told to Ralph Leighton...
Metadata: {'producer': 'Acrobat PDFWriter 5.0 for Windows NT', 'creator': '(ebook - RTF - Mathematics) Feynman, Richard - Surely You’re Joking Mr. Feynman.rtf - Microsoft Word', 'creationdate': 'D:20030429215337', 'title': '_ebook - RTF - Mathematics_ Feynman, Richard - Surely You’…', 'author': '-ducati996', 'source': '..\\data\\pdf\\feyn_surely.pdf', 'total_pages': 133, 'page': 0, 'page_label': '1', 'source_file': 'feyn_surely.pdf', 'file_type': 'pdf'}


## Embedding and VectorStoreDB

In [19]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""

    def __init__(self, model_name = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager and load the model immediately

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """

        self.model_name = model_name
        self.model = None
        self._load_model()

    def __repr__(self):
        """Readable representation for notebook display"""
        return f"EmbeddingManager(model_name={self.model_name!r})"

    # Protected Function
    def _load_model(self):
        """
        Load the SentenceTransformer model named by ``self.model_name``

        Raises:
            Exception: whatever the model constructor raised; it is logged
                and re-raised unchanged.
        """

        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)

        Raises:
            ValueError: if no model has been loaded
        """

        # Compare against None explicitly: truthiness of a model object can
        # depend on its __len__, so a loaded model must not be judged by bool().
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
# Build the shared embedding manager with the default sentence-embedding model.
embeddingManager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embeddingManager


Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x2bba112eba0>

## VectorStore

In [29]:
class VectorStore:
    """Manages document embeddings in a persistent ChromaDB collection"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store and open (or create) its collection

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """
        Initialize ChromaDB client and collection

        Creates the persist directory if needed, opens a persistent client on
        it and gets or creates the collection.

        Raises:
            Exception: any error raised during setup is logged and re-raised.
        """

        try:
            # Create persistent ChromaDB client
            os.makedirs(name=self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Document], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        IDs are derived from each document's source, page, content and
        position, and the data is written with ``upsert``. Re-running the
        same ingestion therefore overwrites the existing entries instead of
        storing a second copy of every chunk.

        Args:
            documents: List of LangChain documents or chunks
            embeddings: Corresponding embeddings for chunks, one row per document

        Raises:
            ValueError: if the number of documents and embeddings differ
            Exception: any error raised by the collection is logged and re-raised
        """
        # Local import keeps this class self-contained within the notebook cell.
        import hashlib

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # Nothing to write; skip the collection call for an empty batch.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for chromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Deterministic ID: identical input always maps to the same ID,
            # and the position suffix keeps IDs unique within one batch even
            # when two chunks share source, page and text.
            fingerprint_source = "\x1f".join([
                str(doc.metadata.get('source', '')),
                str(doc.metadata.get('page', '')),
                doc.page_content,
            ])
            fingerprint = hashlib.sha256(fingerprint_source.encode("utf-8")).hexdigest()[:16]
            ids.append(f"doc_{fingerprint}_{i}")

            # Document content
            documents_text.append(doc.page_content)

            # Embedding (np.asarray also accepts plain list rows)
            embeddings_list.append(np.asarray(embedding).tolist())

        # Write to collection. upsert (not add) so a repeated run replaces
        # entries with the same ID rather than duplicating them.
        # NOTE: entries stored earlier under random IDs are not removed by this.
        try:
            self.collection.upsert(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Open (or create) the persistent collection that holds the PDF chunks.
vectorstore = VectorStore(
    collection_name="pdf_documents",
    persist_directory="../data/vector_store",
)
vectorstore
        

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 890


<__main__.VectorStore at 0x2bba112dbe0>

In [30]:
## get the text from chunks
# Raw text of every chunk, in chunk order.
texts = [chunk.page_content for chunk in chunks]

# One embedding row per chunk text.
embeddings = embeddingManager.generate_embeddings(texts=texts)

# Persist the chunks together with their embeddings.
vectorstore.add_documents(documents=chunks, embeddings=embeddings)



Generating embeddings for 890 texts...


Batches: 100%|██████████| 28/28 [00:39<00:00,  1.42s/it]


Generated embeddings with shape: (890, 384)
Adding 890 documents to vector store...
Successfully added 890 documents to vector store
Total documents in collection: 1780


## Retriever Pipeline From VectorStore

In [33]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""

    def __init__(self, vectorstore: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever

        Args:
            vectorstore: vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vectorstore = vectorstore
        self.embeddingManager = embedding_manager

    @staticmethod
    def _distance_to_similarity(distance: float, space: str) -> float:
        """
        Convert a ChromaDB distance into a similarity score

        Args:
            distance: Distance reported by the collection query
            space: The collection's ``hnsw:space`` value ("l2", "cosine" or "ip")

        Returns:
            Similarity score where higher means more similar
        """
        if space == "l2":
            # Squared L2 between unit vectors equals 2 - 2*cos, so
            # cos = 1 - d/2. Exact only for L2-normalised embeddings --
            # TODO confirm for the configured embedding model.
            return 1.0 - distance / 2.0
        # "cosine" reports 1 - cos and "ip" reports 1 - dot product.
        return 1.0 - distance

    def retrieve(self, query: str, top_k: int = 5, score_threshold:float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold

        Returns:
            List of dictionaries (keys: id, content, metadata,
            similarity_score, distance, rank) for the hits that meet the
            threshold. An empty list is returned when nothing matches or
            when the query fails.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embeddings
        query_embedding = self.embeddingManager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            collection = self.vectorstore.collection

            # ChromaDB uses squared L2 unless the collection was created with
            # an explicit "hnsw:space", so read the metric instead of
            # assuming cosine distance.
            collection_metadata = getattr(collection, "metadata", None) or {}
            space = collection_metadata.get("hnsw:space", "l2")

            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            # Process Results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                # Results are nested per query; a single query was sent.
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity_score = self._distance_to_similarity(distance, space)

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            # Rank reflects the position before threshold filtering.
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No document found")

            return retrieved_docs

        except Exception as e:
            # Best effort: report the failure and hand back no results.
            print(f"Error during retrieval: {e}")
            return []
        
# Wire the retriever to the shared vector store and embedding manager.
ragRetriever = RAGRetriever(vectorstore=vectorstore, embedding_manager=embeddingManager)
ragRetriever


<__main__.RAGRetriever at 0x2bba112e7b0>

In [34]:
# Smoke-test retrieval with the default top_k and score threshold.
ragRetriever.retrieve(query="who is Richard Feynman?")

Retrieving documents for query: 'who is Richard Feynman?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 67.48it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_a9c87737_2',
  'content': "Introduction \nI hope these won't be the only memoirs of Richard Feynman. Certainly the reminiscences here give a true picture of much of his character--his \nalmost compulsive need to solve puzzles, his provocative mischievousness, his indignant impatience with pretension and hypocrisy, and his talent for \none-upping anybody who tries to one-up him! This book is great reading: outrageous, shocking, still warm and very human. \nFor all that, it only skirts the keystone of his life: science. We see it here and there, as background material in one sketch or another, but never as \nthe focus of his existence, which generations of his students and colleagues know it to be. Perhaps nothing else is possible. There may be no way to \nconstruct such a series of delightful stories about himself and his work: the challenge and frustration, the excitement that caps insight, the deep \npleasure of scientific understanding that has been the wellspring of hap

## Integrating the VectorDB Context Pipeline with LLM Output

In [35]:
import os
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from dotenv import load_dotenv
load_dotenv()

True

In [36]:
# Token comes from the environment; None when the variable is unset.
huggingface_api_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [46]:
# Hosted text-generation endpoint for the chat model.
llm_endpoint = HuggingFaceEndpoint(
    huggingfacehub_api_token=huggingface_api_key,
    task="text-generation",
    repo_id="HuggingFaceH4/zephyr-7b-beta",
)

# Chat-style wrapper around the endpoint.
llm = ChatHuggingFace(llm=llm_endpoint)

In [47]:
## Simple RAG function: retrieve context + generate response

def rag_simple(query, retriever, llm, top_k=3):
    """Answer a question from retrieved context using the given LLM.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns a list
            of dicts that each carry a ``'content'`` entry.
        llm: Object whose ``invoke(...)`` returns a response exposing
            ``.content``.
        top_k: Number of context chunks to request from the retriever.

    Returns:
        The LLM's answer text, or a fixed fallback message when no context
        was retrieved.

    Raises:
        Exception: any error raised by ``llm.invoke`` is logged and re-raised.
    """

    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."

    ## Build the prompt for the supplied LLM
    # The f-string already interpolates context and query. It must not be
    # passed through str.format() afterwards: any '{' or '}' inside the
    # retrieved text would be parsed as a placeholder and raise.
    prompt = f"""
    Use the foloowing context to answer the question concisely.
    context:
    {context}

    Question: {query}
    Answer: 
    """

    try:
        response = llm.invoke([prompt])
    except Exception as e:
        print(f"Error generating response : {e}")
        raise
    return response.content


In [48]:
# End-to-end run: retrieve context for the question, then generate an answer.
answer = rag_simple(
    query="who was richard feynman? and describe his journey from Rockaway to MIT",
    retriever=ragRetriever,
    llm=llm,
)
print(answer)

Retrieving documents for query: 'who was richard feynman? and describe his journey from Rockaway to MIT'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.87it/s]


Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
 
    Richard Feynman was a physicist known for his almost compulsive need to solve puzzles, provocative mischievousness, and indignant impatience with pretension and hypocrisy. This is revealed in his memoirs, which provide a true picture of his character. While his science is briefly discussed in some background sketches, it is not the focus of this book, perhaps due to the difficulty in constructing a series of delightful stories about himself and the excitement and deep pleasure of scientific understanding that is the wellspring of his life. Feynman's talent for one-upping others is also highlighted in these memoirs, which are described as great reading, both warm and very human. A chance encounter led to Feynman being invited to teach at Cornell, where his teaching abilities were commended by Bob Wilson, who noted that his ability was appreciated and any other expectations were a matter of luck as th