In [1]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [2]:
# One-time setup (uncomment to run): %pip install langchain_classic langchain_community langchain_huggingface pymupdf unstructured

In [3]:
from pathlib import Path

from langchain_classic.document_loaders import PyMuPDFLoader

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [4]:
# 1. read all the pdfs inside the directory

def process_all_pdfs(directory):
    '''Load every PDF under ``directory`` (searched recursively) with PyMuPDF.

    Files are visited in sorted path order so repeated runs yield the
    documents in the same sequence, and the ``.pdf`` extension is matched
    case-insensitively. A file that fails to load is reported and skipped;
    it does not abort the run.

    Args:
        directory: Path (str or Path) of the folder to scan.

    Returns:
        A flat list with the items returned by ``PyMuPDFLoader.load()`` for
        every file that loaded. The list is empty when ``directory`` is not
        an existing folder or contains no PDFs.
    '''
    pdf_dir = Path(directory)

    # Say so explicitly instead of silently reporting "Found 0 PDF files".
    if not pdf_dir.is_dir():
        print(f"⚠️ Directory not found: {pdf_dir}")
        return []

    # Sorted for a reproducible document order; suffix compared in lower case
    # so files named e.g. "scan.PDF" are not skipped on case-sensitive systems.
    pdf_files = sorted(
        path for path in pdf_dir.rglob('*')
        if path.is_file() and path.suffix.lower() == '.pdf'
    )

    print(f"\n====== Found {len(pdf_files)} PDF files to process ======")

    all_documents = []

    for file in pdf_files:
        print(f"\n[INFO] Processing: {file.name} file")

        try:
            loader = PyMuPDFLoader(
                str(file)
            )
            documents = loader.load()

            # .extend() adds the individual items, keeping the result flat
            all_documents.extend(documents)

            print(
                f"\n✅ Successfully Loaded <{len(documents)}> pages from {file.name}")
            print("=" * 50)

        except Exception as e:
            # Best effort: one unreadable file must not stop the whole batch.
            print(f"❌ Error processing {file.name}: {e}")

    print(f"\n\n[INFO] Total PDF documents loaded: <{len(all_documents)}>\n")

    return all_documents

In [5]:
# Load the PDFs found under the relative "data" folder; the returned list of
# loaded documents is kept in `pdf_docs` for the splitting step.
pdf_docs = process_all_pdfs("data")



[INFO] Processing: 10th-defence.pdf file

✅ Successfully Loaded <58> pages from 10th-defence.pdf

[INFO] Processing: 10th-english.pdf file

✅ Successfully Loaded <202> pages from 10th-english.pdf

[INFO] Processing: 10th-geography.pdf file
MuPDF error: library error: FT_New_Memory_Face(MOLGLK+TimesNewRomanPSMT-BoldItalic): unknown file format

MuPDF error: library error: FT_New_Memory_Face(MOLGLK+Arial-Black-Black): unknown file format

MuPDF error: library error: FT_New_Memory_Face(MOLGLK+AdobeHeitiStd-Regular): unknown file format

MuPDF error: library error: FT_New_Memory_Face(MOLGLK+NirmalaUI): unknown file format


✅ Successfully Loaded <82> pages from 10th-geography.pdf

[INFO] Processing: 10th-hindi.pdf file

✅ Successfully Loaded <114> pages from 10th-hindi.pdf

[INFO] Processing: 10th-history.pdf file

✅ Successfully Loaded <110> pages from 10th-history.pdf

[INFO] Processing: 10th-marathi.pdf file

✅ Successfully Loaded <90> pages from 10th-marathi.pdf

[INFO] Processing: 1

In [6]:
"""
Splits loaded documents into smaller, embedding-friendly chunks.
"""

from langchain_classic.text_splitter import RecursiveCharacterTextSplitter


def split_docs(documents, chunk_size=1200, chunk_overlap=200):
    """
    Split loaded documents into chunks for embedding.

    Args:
        documents: Loaded documents to split. An empty (or None) value is
            reported and yields an empty list.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 1200.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 200.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``,
        or ``[]`` when there is nothing to split.
    """

    if not documents:
        print("⚠️ No documents to split.")
        return []

    # Separators are tried in order: paragraph, line, word, then character.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    chunked_documents = text_splitter.split_documents(documents)

    print("\n✅✅ Documents splited successfully!")
    print(
        f"\n[INFO] Splitted <{len(documents)}> documents into <{len(chunked_documents)}> chunks."
    )
    print("=" * 50)

    return chunked_documents


In [7]:
# Split the loaded PDF documents into chunks for embedding.
chunks = split_docs(pdf_docs)


✅✅ Documents splited successfully!

[INFO] Splitted <2418> documents into <4968> chunks.


In [8]:
'''
generating embeddings for the chunked documents for RAG system
'''

from langchain_huggingface import HuggingFaceEmbeddings
import torch


# 3.GENERATING EMBEDDINGS FOR THE CHUNKED DOCUMENTS

# HuggingFace Embeddings
def huggingface_embeddings(model_name="BAAI/bge-m3", device=None, batch_size=64):
    '''Create a HuggingFaceEmbeddings model for embedding the chunked documents.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
        device: Device to load the model on (e.g. ``"cpu"``, ``"cuda"`` or a
            ``torch.device``). When None, CUDA is used if
            ``torch.cuda.is_available()`` and the CPU otherwise.
        batch_size: Encode batch size forwarded through ``encode_kwargs``.
            Defaults to 64.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance, with
        ``normalize_embeddings`` enabled and progress display turned on.

    Raises:
        Errors raised while constructing ``HuggingFaceEmbeddings`` are not
        caught here and propagate to the caller.
    '''

    print("\n[INFO] HuggingFace Embedding model Initializing...")

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Normalise to str so a torch.device argument also works with .upper() below.
    device = str(device)

    print(f"\n[INFO] Using device: {device}")

    embeddings = HuggingFaceEmbeddings(
        model_name=model_name, show_progress=True,
        model_kwargs={
            'device': device
        },
        encode_kwargs={
            'batch_size': batch_size,
            'normalize_embeddings': True
        }

    )

    print(f"[INFO] Model loaded successfully on {device.upper()}")
    print("=" * 50)

    return embeddings


# ==========================================================================


In [9]:
# Build the embedding model, overriding the function's default model name.
# NOTE(review): the recorded run of this cell failed with an OSError (403,
# "gated repo") for google/embeddinggemma-300m — access to that repository
# must be granted on Hugging Face before this call can succeed.
embeddings = huggingface_embeddings(model_name="google/embeddinggemma-300m")


[INFO] HuggingFace Embedding model Initializing...

[INFO] Using device: cpu


No sentence-transformers model found with name google/embeddinggemma-300m. Creating a new one with mean pooling.


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/embeddinggemma-300m.
403 Client Error. (Request ID: Root=1-6920b302-5666897358c98eaa7360faf7;db0403d9-0c97-4106-9371-54644f6d2c63)

Cannot access gated repo for url https://huggingface.co/google/embeddinggemma-300m/resolve/main/config.json.
Access to model google/embeddinggemma-300m is restricted and you are not in the authorized list. Visit https://huggingface.co/google/embeddinggemma-300m to ask for access.