### RAG Pipeline - Data Ingestion to Vector DB

## Import Libraries

In [48]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

### Read PDFs

In [49]:
### Read all the PDFs inside the directory
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively. Files are matched on their
    extension regardless of case (``.pdf``, ``.PDF``) and are processed in
    sorted path order, so repeated runs return the documents in the same
    order on every filesystem.

    Args:
        pdf_directory: Path (``str`` or ``Path``) of the directory to search.

    Returns:
        List of the documents produced by ``PyPDFLoader(...).load()`` for each
        file. Every document gets two extra metadata keys: ``source_file``
        (the file name) and ``file_type`` (``'pdf'``). Files that fail to load
        are reported and skipped. An empty list is returned when the directory
        does not exist.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # A missing directory is reported explicitly instead of looking like an
    # empty one.
    if not pdf_dir.is_dir():
        print(f"Directory not found: {pdf_dir}")
        return all_documents

    # Find all PDF files recursively; sort for a reproducible order and match
    # the suffix case-insensitively so "*.PDF" is not missed on case-sensitive
    # filesystems.
    pdf_files = sorted(
        path
        for path in pdf_dir.rglob("*")
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Load every PDF found (recursively) under ../data; the result is the input
# to split_documents() further down.
all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: Cover_Letter_Fatma_Fendri.pdf
  ✓ Loaded 1 pages

Processing: DAX Calculation.pdf
  ✓ Loaded 1 pages

Total documents loaded: 2


In [50]:
# Display the loaded documents (metadata and page content) for inspection
all_pdf_documents

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-09-25T21:05:59+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-09-25T21:05:59+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': '..\\data\\pdf\\Cover_Letter_Fatma_Fendri.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Cover_Letter_Fatma_Fendri.pdf', 'file_type': 'pdf'}, page_content="Fatma Fendri\nCurd-Jürgens-Str. 14\n81739 München\n■ fendrifatma2@gmail.com | ■ +49 151 58333426\nCHECK24 Vergleichsportal Mietwagen GmbH\nAugsburg\nMünchen, den 25. September 2025\nApplication for (Junior) Data Scientist (m/w/d) – Fokus auf LLMs und GenAI\nDear Hiring Team,\nI am excited to apply for the position of (Junior) Data Scientist – Fokus auf Large Language\nModels (LLMs) und GenAI at CHECK24. With a strong academic background in software\nengineering and practical experience in Machine Learn

### Text splitting: break documents into chunks


In [51]:

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of chunks produced by the splitter. A short summary and a
        preview of the first chunk (if any) are printed as a side effect.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Tried in order: paragraphs, lines, words, then single characters
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(documents)
    print(f"split  {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks
    if not pieces:
        return pieces

    # Preview the first chunk: up to 200 characters of text plus its metadata
    first_piece = pieces[0]
    preview = first_piece.page_content[:200]
    print("\nExample chunk:")
    print(f"content : {preview}...")
    print(f"Metadata: {first_piece.metadata}")
    return pieces

In [52]:
# Split the loaded documents using the defaults (chunk_size=1000,
# chunk_overlap=200), then display the resulting chunks.
chunks=split_documents(all_pdf_documents)
chunks

split  2 documents into 4 chunks

Example chunk:
content : Fatma Fendri
Curd-Jürgens-Str. 14
81739 München
■ fendrifatma2@gmail.com | ■ +49 151 58333426
CHECK24 Vergleichsportal Mietwagen GmbH
Augsburg
München, den 25. September 2025
Application for (Junior) ...
Metadata: {'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-09-25T21:05:59+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-09-25T21:05:59+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': '..\\data\\pdf\\Cover_Letter_Fatma_Fendri.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Cover_Letter_Fatma_Fendri.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-09-25T21:05:59+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-09-25T21:05:59+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': '..\\data\\pdf\\Cover_Letter_Fatma_Fendri.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Cover_Letter_Fatma_Fendri.pdf', 'file_type': 'pdf'}, page_content='Fatma Fendri\nCurd-Jürgens-Str. 14\n81739 München\n■ fendrifatma2@gmail.com | ■ +49 151 58333426\nCHECK24 Vergleichsportal Mietwagen GmbH\nAugsburg\nMünchen, den 25. September 2025\nApplication for (Junior) Data Scientist (m/w/d) – Fokus auf LLMs und GenAI\nDear Hiring Team,\nI am excited to apply for the position of (Junior) Data Scientist – Fokus auf Large Language\nModels (LLMs) und GenAI at CHECK24. With a strong academic background in software\nengineering and practical experience in Machine Learn

### Embedding 

In [53]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    def __init__(self, model_name:str = "all-miniLM-L6-v2"):
        """
        Initialize the embedding manager and load the model immediately.

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """
        Load the SentenceTransformer model named by ``self.model_name``.

        Raises:
            Exception: whatever SentenceTransformer raises is logged and re-raised.
        """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}:{e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed

        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)

        Raises:
            ValueError: if no model has been loaded
        """
        # Compare against None explicitly: truthiness would consult the
        # model's __len__ and could reject a model that is actually loaded.
        if self.model is None:
            raise ValueError("Model not Loaded")

        print(f"Generating embeddings for {len(texts)} texts ...")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self) -> int:
        """
        Get the embedding dimension of the model.

        Returns:
            The value reported by the model's get_sentence_embedding_dimension()

        Raises:
            ValueError: if no model has been loaded
        """
        if self.model is None:
            raise ValueError("Model not loaded")
        return self.model.get_sentence_embedding_dimension()

In [55]:
### Initialize the embedding manager (the constructor loads the default model)
embedding_manager = EmbeddingManager()
embedding_manager


Loading embedding model: all-miniLM-L6-v2
Model Loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x264332a2a50>

### VectorStore DB

In [56]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store and open (or create) the collection.

        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """
        Initialize ChromaDB client and collection.

        Creates ``persist_directory`` if needed. Any failure is logged and
        re-raised.
        """
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store.

        Each document gets a freshly generated random ID, so adding the same
        documents again stores them a second time.

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings, one row per document. Any
                array-like accepted by ``np.asarray`` works (ndarray or list
                of lists).

        Raises:
            ValueError: if ``embeddings`` is None or its length differs from
                the number of documents.
        """

        if embeddings is None:
            raise ValueError("Embeddings array is None. Check the generate_embeddings function.")

        # Accept plain nested lists as well as ndarrays; rows are converted
        # back to lists below.
        embeddings = np.asarray(embeddings)

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # Nothing to store: skip the collection call, which would otherwise
        # receive empty lists.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID (random prefix + position in this batch)
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata on a copy so the caller's document is not mutated
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Create the store with the default collection name and persist directory
vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x264332a2ba0>

In [57]:
# Re-display the chunks that are embedded and stored in the following cells
chunks

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-09-25T21:05:59+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-09-25T21:05:59+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': '..\\data\\pdf\\Cover_Letter_Fatma_Fendri.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Cover_Letter_Fatma_Fendri.pdf', 'file_type': 'pdf'}, page_content='Fatma Fendri\nCurd-Jürgens-Str. 14\n81739 München\n■ fendrifatma2@gmail.com | ■ +49 151 58333426\nCHECK24 Vergleichsportal Mietwagen GmbH\nAugsburg\nMünchen, den 25. September 2025\nApplication for (Junior) Data Scientist (m/w/d) – Fokus auf LLMs und GenAI\nDear Hiring Team,\nI am excited to apply for the position of (Junior) Data Scientist – Fokus auf Large Language\nModels (LLMs) und GenAI at CHECK24. With a strong academic background in software\nengineering and practical experience in Machine Learn

In [58]:
### Extract the raw text of each chunk (the input passed to generate_embeddings below)
texts = [doc.page_content for doc in chunks]
texts 


['Fatma Fendri\nCurd-Jürgens-Str. 14\n81739 München\n■ fendrifatma2@gmail.com | ■ +49 151 58333426\nCHECK24 Vergleichsportal Mietwagen GmbH\nAugsburg\nMünchen, den 25. September 2025\nApplication for (Junior) Data Scientist (m/w/d) – Fokus auf LLMs und GenAI\nDear Hiring Team,\nI am excited to apply for the position of (Junior) Data Scientist – Fokus auf Large Language\nModels (LLMs) und GenAI at CHECK24. With a strong academic background in software\nengineering and practical experience in Machine Learning, NLP, and LLMs, I am eager to\ncontribute my expertise to further enhance your Mietwagenvergleich product.\nDuring my end-of-study internship at Neofacto Luxembourg, I worked extensively on the\nAira (AI Review Assistant) project, where I implemented and evaluated NLP and LLM models,\ncontributed to the transition from OpenAI/GPT to LLAMA, and applied techniques such as\nembedding, summarization, and classification. I also gained hands-on experience with',
 "contributed to the trans

In [59]:
### Generate the embeddings (one per entry in `texts`, in the same order as `chunks`)
embeddings = embedding_manager.generate_embeddings(texts)
embeddings 


Generating embeddings for 4 texts ...


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.88it/s]

Generated embeddings with shape: (4, 384)





array([[-8.5021585e-02, -4.9779918e-02, -4.7065220e-03, ...,
        -2.9942289e-02, -4.5879532e-02,  3.0200059e-02],
       [-3.5634048e-02, -7.6290078e-02, -1.5442902e-02, ...,
        -4.3754607e-02, -3.9055906e-02, -1.8587785e-02],
       [ 1.3585211e-02, -6.1329506e-02,  2.9929355e-02, ...,
         5.2718416e-02, -7.7405073e-02, -3.7696049e-02],
       [ 4.2117448e-05,  7.7661268e-02,  1.0731682e-02, ...,
         7.7019085e-04, -1.4245086e-02, -8.2083717e-02]],
      shape=(4, 384), dtype=float32)

In [60]:
### Store in the vector DB
# NOTE: add_documents generates new random IDs on every call, so re-running
# this cell adds the same chunks to the persisted collection again.
vectorstore.add_documents(chunks,embeddings)

Adding 4 documents to vector store...
Successfully added 4 documents to vector store
Total documents in collection: 4


### Retriever Pipeline from VectorStore