## 1. Data ingestion pipeline


In [1]:
from pathlib import Path
from langchain_classic.document_loaders import PyMuPDFLoader

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [2]:
# read all the pdfs inside the directory
def process_all_pdfs(directory):
    """Load every PDF found under ``directory`` with ``PyMuPDFLoader``.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    loaded on its own; a file that raises is reported and skipped, so one bad
    PDF does not abort the rest of the batch.

    Args:
        directory: Folder to scan (anything ``Path`` accepts).

    Returns:
        One flat list holding the documents the loader returned for every
        file that loaded without error.
    """
    pdf_paths = [*Path(directory).glob('**/*.pdf')]
    print(f"\n====== Found {len(pdf_paths)} PDF files to process ======")

    loaded = []
    for pdf_path in pdf_paths:
        name = pdf_path.name
        print(f"\n[INFO] Processing: {name} file")

        try:
            pages = PyMuPDFLoader(str(pdf_path)).load()

            # Keep the result flat: append the items, not the list itself.
            loaded += pages

            print(f"\n✅ Successfully Loaded <{len(pages)}> pages from {name}")
            print("=" * 50)
        except Exception as err:
            # Best-effort batch: report the failure and move on to the next file.
            print(f"❌ Error processing {name}: {err}")

    print(f"\n\n[INFO] Total documents loaded: <{len(loaded)}>\n")
    return loaded

In [3]:
# Run the ingestion step over every PDF under data/pdfs.
all_pdf_docs = process_all_pdfs("data/pdfs")



[INFO] Processing: Deep Learning 101.pdf file

✅ Successfully Loaded <266> pages from Deep Learning 101.pdf

[INFO] Processing: DeepSeek_OCR_paper.pdf file

✅ Successfully Loaded <22> pages from DeepSeek_OCR_paper.pdf

[INFO] Processing: mathematics-ML.pdf file

✅ Successfully Loaded <266> pages from mathematics-ML.pdf

[INFO] Processing: ML.pdf file

✅ Successfully Loaded <169> pages from ML.pdf

[INFO] Processing: pp_report_1.pdf file

✅ Successfully Loaded <14> pages from pp_report_1.pdf

[INFO] Processing: PP_REPORT_2.pdf file

✅ Successfully Loaded <9> pages from PP_REPORT_2.pdf


[INFO] Total documents loaded: <746>



In [4]:
# Show the first loaded document (metadata and page_content) as the cell output.
all_pdf_docs[0]

Document(metadata={'producer': 'xdvipdfmx (20250205); modified using OpenPDF UNKNOWN', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-10-15T19:40:49+11:00', 'source': 'data\\pdfs\\Deep Learning 101.pdf', 'file_path': 'data\\pdfs\\Deep Learning 101.pdf', 'total_pages': 266, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-15T22:07:21+05:30', 'trapped': '', 'modDate': "D:20251015220721+05'30'", 'creationDate': "D:20251015194049+11'00'", 'page': 0}, page_content='')

## 2. Splitting documents into chunks


In [5]:
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter

In [6]:
def split_docs(documents, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        documents: Documents to split.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the value this function used to hard-code.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            200, the value this function used to hard-code.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunked_documents = text_splitter.split_documents(documents)

    print("\n✅Document Splitted successfully!")
    print(
        f"\nSplitted <{len(documents)}> documents into <{len(chunked_documents)}> chunks.")
    print("=" * 50)

    return chunked_documents

In [7]:
# Chunk the loaded PDF documents with split_docs.
chunks = split_docs(all_pdf_docs)


✅Document Splitted successfully!

Splitted <746> documents into <1445> chunks.


## 3. Creating a new vectorstore from scratch


In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.vectorstores import FAISS

In [9]:
def embed_and_store(chunks, embedding_model="BAAI/bge-small-en-v1.5",
                    vectorstore_path="faiss_index"):
    """Embed ``chunks`` into a new FAISS vectorstore and save it to disk.

    Args:
        chunks: Chunked documents to embed and index.
        embedding_model: Model name handed to ``HuggingFaceEmbeddings``.
            Defaults to the value this function used to hard-code.
        vectorstore_path: Folder handed to ``save_local``. Defaults to the
            value this function used to hard-code.

    Returns:
        The new vectorstore, or ``None`` when ``chunks`` is empty or any step
        raises (the error is printed instead of propagated).
    """
    # Nothing to index: report it up front instead of letting the library
    # fail later with a less helpful message.
    if not chunks:
        print("❌ Error during embedding and storing: no chunks were provided")
        return None

    try:
        print("\n[INFO] Embedding Initializing...")
        print("=" * 50)

        embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model, show_progress=True,
            model_kwargs={
                'device': 'cpu'
            },
            encode_kwargs={
                'batch_size': 32,
                'normalize_embeddings': True
            }

        )

        print("\n[INFO] VectorStore Initializing...")
        print("=" * 50)

        # Creates a new FAISS index from scratch.
        # NOTE(review): the loader functions in this notebook do not pass
        # distance_strategy to FAISS.load_local - confirm that a reloaded
        # store behaves the way this setting intends.
        vectorstore = FAISS.from_documents(
            documents=chunks,
            embedding=embeddings,
            distance_strategy='COSINE'  # Better for normalized embeddings
        )

        print(f"\n[INFO] Vector dimension: {vectorstore.index.d}")

        print(
            f"[INFO] Total Vectors in the store: <{vectorstore.index.ntotal}>")
        print("=" * 50)

        # Save
        vectorstore.save_local(vectorstore_path)
        print("\n✅✅ Successfully saved FAISS index locally")

        return vectorstore

    except Exception as e:
        # Best-effort: report the failure and hand back None explicitly.
        print(f"❌ Error during embedding and storing: {e}")
        return None

In [10]:
# vectorstore = embed_and_store(chunks)

## 3.1 If the vectorstore already exists, load it directly from disk


In [11]:
# loading the vectorstore from disk
def embed_and_load(embedding_model, vectorstore_path):
    """Rebuild the embedding model and load a saved FAISS vectorstore.

    Args:
        embedding_model: Model name handed to ``HuggingFaceEmbeddings``.
        vectorstore_path: Folder handed to ``FAISS.load_local``.

    Returns:
        The loaded vectorstore. If any step raises, the error is printed and
        the function falls through, so the caller receives ``None``.
    """
    divider = "=" * 50

    try:
        print("\n[INFO] Embedding Initializing...")
        print(divider)

        hf_options = dict(
            model_name=embedding_model,
            show_progress=True,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'batch_size': 32, 'normalize_embeddings': True},
        )
        embedder = HuggingFaceEmbeddings(**hf_options)

        print("\n[INFO] VectorStore Initializing...")
        print(divider)

        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # loading serialized data from disk; only point this at index folders
        # you created yourself or otherwise trust.
        store = FAISS.load_local(
            vectorstore_path,
            embeddings=embedder,
            allow_dangerous_deserialization=True,
        )

        faiss_index = store.index
        print(f"\n[INFO] Vector dimension: {faiss_index.d}")
        print(f"[INFO] Total Vectors in the store: <{faiss_index.ntotal}>")
        print(divider)

        print("\n✅✅ Successfully LOADED Embeddings and Vectorstore.")
        return store

    except Exception as err:
        print(f"❌ Error during LOADING: {err}")

In [12]:
# vectorstore = embed_and_load(embedding_model="BAAI/bge-small-en-v1.5", vectorstore_path="faiss_index")

## 3.2 If the vectorstore exists and you want to add more documents, load it and add the new chunks to it


In [13]:
# loading existing vectorstore and adding more documents/ new chunks
def load_and_add_new_docs(embedding_model, vectorstore_path, new_chunks, persist=True):
    """Load a saved FAISS vectorstore, add ``new_chunks``, and save it back.

    Previously the new chunks were only added to the in-memory store and
    never saved, so a later load from ``vectorstore_path`` would not see
    them. The updated store is now written back to the same folder unless
    ``persist`` is false.

    Args:
        embedding_model: Model name handed to ``HuggingFaceEmbeddings``.
        vectorstore_path: Folder the store is loaded from (and saved to).
        new_chunks: Chunked documents to add to the existing store.
        persist: When true (default), call ``save_local(vectorstore_path)``
            after adding. Pass ``False`` to keep the change in memory only.

    Returns:
        The updated vectorstore; the unchanged loaded store when
        ``new_chunks`` is empty; or ``None`` if any step raises (the error is
        printed instead of propagated).
    """
    try:
        print("\n[INFO] Embedding Initializing...")
        print("=" * 50)

        embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model, show_progress=True,
            model_kwargs={
                'device': 'cpu'
            },
            encode_kwargs={
                'batch_size': 32,
                'normalize_embeddings': True
            }

        )

        print("\n[INFO] VectorStore Initializing...")
        print("=" * 50)

        # loading existing vectorstore
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # loading serialized data from disk; only use trusted index folders.
        vectorstore = FAISS.load_local(
            vectorstore_path,
            embeddings=embeddings,
            allow_dangerous_deserialization=True
        )

        # Nothing to add: hand back the loaded store without touching disk.
        if not new_chunks:
            print("\n[INFO] No new chunks supplied; vectorstore left unchanged.")
            return vectorstore

        print("\n[INFO] Adding new CHUNKS to the Vectorstore...")

        # adding new documents/ chunks to existing vectorstore
        vectorstore.add_documents(new_chunks)

        print(f"\n[INFO] Vector dimension: {vectorstore.index.d}")

        print(
            f"[INFO] Total Vectors in the store: <{vectorstore.index.ntotal}>")
        print("=" * 50)

        print("\n✅✅ Successfully ADDED new chunks to the Vectorstore.")

        # Write the enlarged index back so the additions survive a reload.
        if persist:
            vectorstore.save_local(vectorstore_path)
            print(f"[INFO] Saved updated vectorstore to: {vectorstore_path}")

        return vectorstore

    except Exception as e:
        # Best-effort: report the failure and hand back None explicitly.
        print(f"❌ Error during LOADING and ADDING: {e}")
        return None

In [14]:
# vectorstore = load_and_add_new_docs(
#     embedding_model="BAAI/bge-small-en-v1.5", 
#     vectorstore_path="faiss_index", 
#     new_chunks=chunks
# )

##


## 4. Create the RAG pipeline


In [15]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_classic.prompts import ChatPromptTemplate
from dotenv import load_dotenv
load_dotenv()

True

In [16]:
def simple_rag(query, vectorstore, k=3, model_name="gemini-2.5-pro"):
    """Answer ``query`` with one retrieve-then-generate pass.

    The ``k`` most similar documents are fetched from ``vectorstore``, their
    ``page_content`` is joined into a context block, and the chat model is
    asked to answer from that context.

    Args:
        query: The user's question.
        vectorstore: Store exposing ``similarity_search(query=..., k=...)``.
        k: Number of documents to retrieve. Defaults to 3, the value this
            function used to hard-code.
        model_name: Model handed to ``ChatGoogleGenerativeAI``. Defaults to
            the value this function used to hard-code.

    Returns:
        The ``content`` attribute of the chat model's response.

    Raises:
        ValueError: If ``vectorstore`` is ``None``. The loader functions in
            this notebook return ``None`` on failure, so this turns an opaque
            ``AttributeError`` into an actionable message.
    """
    if vectorstore is None:
        raise ValueError(
            "vectorstore is None - build or load a vectorstore before calling simple_rag"
        )

    # Retrieve similar documents
    similar_docs = vectorstore.similarity_search(
        query=query,
        k=k
    )

    # Initialize the Google Generative AI chat model
    chat_model = ChatGoogleGenerativeAI(
        model=model_name
    )

    # Combine the retrieved passages into a single context block
    context = "\n\n".join(doc.page_content for doc in similar_docs)

    prompt = ChatPromptTemplate.from_template(
        '''
    Using the following context to answer the question below. 
    If the context is insufficient, please say "I don't know".
    <context>
    {context}
    </context>

    question: {query}
    '''
    )

    prompt_value = prompt.format_prompt(
        context=context,
        query=query
    )

    # Generate a response using the chat model
    response = chat_model.invoke(prompt_value)

    return response.content

In [17]:
# Question to put to the RAG pipeline.
query = "what is llm fine-tuning?"
# rag_response = simple_rag(query, vectorstore)

In [18]:
# print(rag_response)