### RAG pipeline: data ingestion into the vector database

In [34]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

def load_pdf(pdf_dir):
    """Load every ``*.pdf`` file directly inside ``pdf_dir`` with PyPDFLoader.

    Files are visited in sorted path order so the result is reproducible
    (``Path.glob`` itself yields entries in arbitrary order). A file that
    fails to load is reported and skipped, so a single unreadable PDF does
    not discard the documents already loaded from the other files.

    Args:
        pdf_dir: Directory to scan (anything ``pathlib.Path`` accepts). Only
            the top level is searched; sub-directories are not traversed.

    Returns:
        list: The concatenated ``loader.load()`` results of every PDF that
        loaded successfully. An empty list when ``pdf_dir`` is not a valid
        directory, contains no PDFs, or no PDF could be loaded. Errors are
        printed rather than raised.
    """
    try:
        pdf_dir_path = Path(pdf_dir)
        print(f"Checking if the provided path '{pdf_dir}' is a valid directory...")
        if not pdf_dir_path.is_dir():
            raise ValueError(f"The provided path '{pdf_dir}' is not a valid directory.")
        # Sort for a deterministic load order across runs and platforms.
        pdf_files = sorted(pdf_dir_path.glob("*.pdf"))
    except Exception as e:
        # Directory-level problems keep the original contract: report and return [].
        print(f"An error occurred while loading PDFs: {e}")
        return []

    all_pdfs = []
    for pdf_file in pdf_files:
        print(f"Loading PDF: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
        except Exception as e:
            # Best effort: skip only the broken file and keep what was loaded so far.
            print(f"Skipping '{pdf_file.name}': failed to load ({e})")
            continue
        print(f"Loaded {len(documents)} documents")
        all_pdfs.extend(documents)
    print(f"length of all_pdfs: {len(all_pdfs)}")

    return all_pdfs

# Load every PDF found in ../data/pdf (relative path, resolved against the
# notebook's working directory).
loadAllPdfs = load_pdf("../data/pdf")
# Bare last expression: the notebook displays the full list of loaded documents.
loadAllPdfs

Checking if the provided path '../data/pdf' is a valid directory...
Loading PDF: building_the_future__a_business_transformation_tale_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-07-16-57.pdf
Loaded 11 documents
Loading PDF: deploy_empathy__a_practical_guide_to_interviewing_customers_michele_hansen_2025-06-18-15-12.pdf
Loaded 9 documents
Loading PDF: from_book_to_business__a_lean_startup_journey_of_transformation_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-07-17-36.pdf
Loaded 13 documents
Loading PDF: innovate_and_adapt__a_solopreneur's_journey_in_heidelberg_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-06-18-18.pdf
Loaded 10 documents
Loading PDF: innovation_in_solitude__a_solo_founder’s_journey_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-08-19-35.pdf
Loaded 13 documents
Loading PDF: navigating_the_startup_labyrinth_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-08-14-33.pdf
Loaded 13 document

[Document(metadata={'producer': 'pdfmake', 'creator': 'pdfmake', 'creationdate': '2025-08-07T16:57:58+00:00', 'source': '..\\data\\pdf\\building_the_future__a_business_transformation_tale_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-07-16-57.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Building the Future: A Business Transformation \nTale\nby Inspired by the principles of Lean Startup by Eric Ries\nTable of Contents\nGenesis of an Idea\nThe Art of Discovery\nTesting the Waters\nFrom Crisis to Clarity\nScaling New Heights\nThe Ultimate Test\nA New Dawn\nGenesis of an Idea\nIn the historic city of Heidelberg, where the ancient streets whispered tales of the past, Jack found \nhimself standing on the precipice of an uncertain future. The cobblestone roads that had seen the \npassage of time now bore witness to a man ensnared in the throes of his ambition.\nJack was a frontend architect by trade, a maestro of code, orchestrating digital symphonie

In [35]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split, e.g. the list returned by ``load_pdf``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        list: Whatever ``text_splitter.split_documents(documents)`` returns,
        or an empty list if anything raised (the error is printed, not
        re-raised).
    """
    try:
        print(f"Splitting documents into chunks with chunk_size={chunk_size} and chunk_overlap={chunk_overlap}...")
        # Separators are tried in order: paragraph, line, word, then character.
        splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", " ", ""],
            chunk_overlap=chunk_overlap,
            chunk_size=chunk_size,
        )
        pieces = splitter.split_documents(documents)
        print(f"Total number of chunks created: {len(pieces)}")
    except Exception as err:
        print(f"An error occurred while splitting documents: {err}")
        return []
    return pieces

# Chunk every loaded document using the default chunk_size=1000 / chunk_overlap=200.
splitAllPdfs = split_documents(loadAllPdfs)

Splitting documents into chunks with chunk_size=1000 and chunk_overlap=200...
Total number of chunks created: 412


In [36]:
# Display the full list of chunks produced by split_documents.
splitAllPdfs

[Document(metadata={'producer': 'pdfmake', 'creator': 'pdfmake', 'creationdate': '2025-08-07T16:57:58+00:00', 'source': '..\\data\\pdf\\building_the_future__a_business_transformation_tale_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-07-16-57.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Building the Future: A Business Transformation \nTale\nby Inspired by the principles of Lean Startup by Eric Ries\nTable of Contents\nGenesis of an Idea\nThe Art of Discovery\nTesting the Waters\nFrom Crisis to Clarity\nScaling New Heights\nThe Ultimate Test\nA New Dawn\nGenesis of an Idea\nIn the historic city of Heidelberg, where the ancient streets whispered tales of the past, Jack found \nhimself standing on the precipice of an uncertain future. The cobblestone roads that had seen the \npassage of time now bore witness to a man ensnared in the throes of his ambition.\nJack was a frontend architect by trade, a maestro of code, orchestrating digital symphonie

In [39]:
# Inspect the metadata dict attached to the first chunk.
splitAllPdfs[0].metadata

{'producer': 'pdfmake',
 'creator': 'pdfmake',
 'creationdate': '2025-08-07T16:57:58+00:00',
 'source': '..\\data\\pdf\\building_the_future__a_business_transformation_tale_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-07-16-57.pdf',
 'total_pages': 11,
 'page': 0,
 'page_label': '1'}

In [52]:
from collections import Counter
# Count how many chunks each source PDF produced. One pass over the chunks is
# enough; chunks without a "source" key are grouped under "unknown source".
# Building the Counter outside any loop also keeps `counts` defined (empty)
# when splitAllPdfs is empty.
counts = Counter(
    chunk.metadata.get("source", "unknown source") for chunk in splitAllPdfs
)
chunkCount = dict(counts)

for source, count in chunkCount.items():
    print(f"Source: {source}, Chunk Count: {count}")

Source: ..\data\pdf\building_the_future__a_business_transformation_tale_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-07-16-57.pdf, Chunk Count: 41
Source: ..\data\pdf\deploy_empathy__a_practical_guide_to_interviewing_customers_michele_hansen_2025-06-18-15-12.pdf, Chunk Count: 31
Source: ..\data\pdf\from_book_to_business__a_lean_startup_journey_of_transformation_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-07-17-36.pdf, Chunk Count: 50
Source: ..\data\pdf\innovate_and_adapt__a_solopreneur's_journey_in_heidelberg_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-06-18-18.pdf, Chunk Count: 33
Source: ..\data\pdf\innovation_in_solitude__a_solo_founder’s_journey_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-08-19-35.pdf, Chunk Count: 46
Source: ..\data\pdf\navigating_the_startup_labyrinth_inspired_by_the_principles_of_lean_startup_by_eric_ries_2025-08-08-14-33.pdf, Chunk Count: 49
Source: ..\data\pdf\navigating_the_unknow