In [2]:
##


In [3]:
from langchain_community.document_loaders import PyMuPDFLoader,PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from pathlib import Path

In [4]:
def process_all_pdfs(pdf_dir):
    """Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader`` and every document the loader returns is tagged
    with two extra metadata keys: ``source_file`` (the PDF's file name) and
    ``file_type`` (always ``'pdf'``).

    Args:
        pdf_dir: Directory to search, given as a string or ``Path``.

    Returns:
        A list holding the documents of all PDFs that loaded successfully,
        in sorted path order. Files that fail to load are reported on stdout
        and skipped.
    """
    all_docs = []
    pdf_dir = Path(pdf_dir)

    # glob() yields files in filesystem order; sort so re-runs are reproducible.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} pdf to process")

    for pdf in pdf_files:
        print(f"processing {pdf.name}")
        try:
            loader = PyPDFLoader(str(pdf))
            document = loader.load()

            for doc in document:
                doc.metadata['source_file'] = pdf.name
                doc.metadata['file_type'] = 'pdf'

            all_docs.extend(document)

        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch,
            # but name the file so the failure can be investigated.
            print(f" Error loading {pdf.name}: {e}")

    print(f"total {len(all_docs)} loaded")
    return all_docs



 
 

In [5]:
# Load every PDF under ../data/pdf; the relative path is resolved against the current working directory.
documents = process_all_pdfs('../data/pdf')

Found 6 pdf to process
processing
processing
processing
processing
processing
processing
total 21 loaded


In [6]:
### Text splitting

def split_docs_fn(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller, overlapping chunks for better RAG performance.

    Args:
        documents: Documents to split; passed straight to the splitter's
            ``split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks, measured with ``len``.

    Returns:
        The list of chunks returned by the splitter. A short summary and a
        preview of the first chunk are printed as a side effect.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        # Forward the caller's value; it used to be hard-coded to 200.
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ordered coarsest to finest. "" (split between any characters) must
        # come last: placed before " " it matches first, so the word-boundary
        # separator would never be tried.
        separators=["\n\n", "\n", " ", ""],
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"split {len(documents)} into {len(split_docs)} chunks")

    if split_docs:
        print(f"content : {split_docs[0].page_content[:1000]}")
        print(f"Metadata {split_docs[0].metadata}")

    return split_docs

In [7]:
chunks = split_docs_fn(documents)
# Preview only the first few chunks; echoing the whole list floods the cell output.
chunks[:3]

split 21 into 42 chunks
content : CamScanner
Metadata {'producer': 'intsig.com pdf producer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'assignment', 'author': 'CamScanner', 'subject': 'assignment', 'moddate': '', 'keywords': '', 'source': '..\\data\\pdf\\assignment .pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'assignment .pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'intsig.com pdf producer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'assignment', 'author': 'CamScanner', 'subject': 'assignment', 'moddate': '', 'keywords': '', 'source': '..\\data\\pdf\\assignment .pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'assignment .pdf', 'file_type': 'pdf'}, page_content='CamScanner'),
 Document(metadata={'producer': 'intsig.com pdf producer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'assignment', 'author': 'CamScanner', 'subject': 'assignment', 'moddate': '', 'keywords': '', 'source': '..\\data\\pdf\\assignment .pdf', 'total_pages': 2, 'page': 1, 'page_label': '2', 'source_file': 'assignment .pdf', 'file_type': 'pdf'}, page_content='CamScanner'),
 Document(metadata={'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2025-11-09T13:00:10+05:45', 'author': '', 'moddate': '2025-11-09T13:00:10+05:45', 'title': 'Microsoft Word - CT304 COMPUTER NETWORKS', 'source': '..\\

### Embedding and VectorDb

In [8]:
import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer."""

    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager and load the model immediately.

        Args:
            model_name: HuggingFace model name for sentence embeddings.

        Raises:
            Exception: Any error raised while loading the model is printed
                and then re-raised unchanged.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

        On success the model is stored on ``self.model`` and its embedding
        dimension is printed. On failure the error is printed and re-raised.
        """
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded with Embedding dimension : {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name} : {e}")
            raise

    def generate_embeddings(self,texts:List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: List of text strings to embed.

        Returns:
            The array returned by the model's ``encode``; expected shape is
            (len(texts), embedding_dim).

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: an object's truthiness may be
        # defined by __len__/__bool__, so "not self.model" is not a reliable
        # "model is missing" check.
        if self.model is None:
            raise ValueError("Model not found")

        print("Generating embeddings")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


# Instantiating with no arguments loads the default model right away (__init__ calls _load_model).
embedding_manager = EmbeddingManager()

# Echo the instance as the cell's output value.
embedding_manager


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded with Embedding dimension : 384


<__main__.EmbeddingManager at 0x18ccc1dbcb0>