### DATA INGESTION

In [3]:
### document datastructure
from langchain_core.documents import Document

In [4]:
### pdfLoader
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain_community.document_loaders import DirectoryLoader
##load 
dir_Loader=DirectoryLoader(
    "../data",
    glob="**/*.pdf",
    loader_cls = PyMuPDFLoader,
    # show_progress=False
)


documents=dir_Loader.load()
documents

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-26T12:48:07+05:30', 'source': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'file_path': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'total_pages': 38, 'format': 'PDF 1.7', 'title': 'Skill Certification Policy 2025 Ver 2.03', 'author': 'Soniya David', 'subject': '', 'keywords': '', 'moddate': '2025-08-26T12:48:07+05:30', 'trapped': '', 'modDate': "D:20250826124807+05'30'", 'creationDate': "D:20250826124807+05'30'", 'page': 0}, page_content='Skill Certification Policy 2025 Ver 2.03 \n© 2025 Coforge \n1 \nSkill Certification Policy 2025 Ver \n2.03'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-26T12:48:07+05:30', 'source': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'file_path': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'total_pages': 38, 'format': 'PDF 1.7', 'title':

In [5]:
# Chunking and Embedding Pipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np

class EmbeddingPipeline:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2", chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.model = SentenceTransformer(model_name)
        print(f"[INFO] Loaded embedding model: {model_name}")

    def chunk_documents(self, documents):
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        chunks = splitter.split_documents(documents)
        print(f"[INFO] Split {len(documents)} documents into {len(chunks)} chunks.")
        return chunks

    def embed_chunks(self, chunks):
        texts = [chunk.page_content for chunk in chunks]
        print(f"[INFO] Generating embeddings for {len(texts)} chunks...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"[INFO] Embeddings shape: {embeddings.shape}")
        return embeddings

# Use the pipeline
pipeline = EmbeddingPipeline()
chunks = pipeline.chunk_documents(documents)
embeddings = pipeline.embed_chunks(chunks)


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 573.36it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


[INFO] Loaded embedding model: all-MiniLM-L6-v2
[INFO] Split 38 documents into 111 chunks.
[INFO] Generating embeddings for 111 chunks...


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.64it/s]

[INFO] Embeddings shape: (111, 384)





###embeddings and vectorDb

In [7]:
import numpy as np
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
class EmbeddingManager:
    """Handles document embedding generation using SentanceTransformer"""

    def __init__(self,model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """Load the sentenceTransformer Model"""
        try:
            print(f"Loading embeddings model:{self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Model loaded successfully.Embeddings dimension:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model{self.model_name}:{e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings for list of texts"""
        if not self.model:
            raise ValueError("Model not loaded. Call _load_model first.")
        
        print(f"Genarating embeddings for{len(texts)} texts...)")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings shape:{embeddings.shape}")
        return embeddings
    
    # def get_embedding_dimension(self) -> int:
    #     """Return the dimension of the embeddings"""
    #     if not self.model:
    #         raise ValueError("Model not loaded.")
    #     return self.model.get_sentence_embedding_dimension()

    ##initialize the embedding manger

embedding_manager = EmbeddingManager()
embedding_manager



Loading embeddings model:all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 471.88it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded successfully.Embeddings dimension:384


<__main__.EmbeddingManager at 0x2d70ce22cf0>

### VectorStore

In [None]:
class VectorStore:
    """Manages document embeddings in a chromaDB vector Store"""

    def __init__(self,collection_name:str="pdf_documents",persist_directory:str="../data/vector_store"):
        """Args:
            collection_name: name of the chroamdb collection 
            persist_directory: directory to persist the vector store"""
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()
        
    def _initialize_store(self):
        """Initialize chromadb client and collection"""
        try:
            # Create persistent chromadb client
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)

            #Get or create collection
            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized.Collection'{self.collection_name}' is ready.")
            print(f"Collection info:{self.collection.count()} documents stored.")
        except Exception as e:
            print(f"Error initializinf vector store:{e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the vector store"""
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings.")
        print(f"Added {len(documents)} documents to vector store.")
        
        #prepare data for chromaDB
        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]

        for i, (doc, embeddings) in enumerate(zip(documents, embeddings)):
            #generate unique id
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}"##takes first 8 characters of uuid
            ids.append(doc_id)

            #metadata
            metadata=dict(doc.metadata) #copy existing metadata
            metadata['doc_index'] = i #Add chunk index
            metadata['content_lenght'] = len(doc.page_content) #Add text length
            metadatas.append(metadata)
            documents_text.append(doc.page_content)
            embeddings_list.append(embeddings.tolist())
        
        ## add to collections
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to collection '{self.collection_name}'.")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store:{e}")
            raise

vectorstore=VectorStore()
vectorstore
        

    

Vector store initialized.Collection'pdf_documents' is ready.
Collection info:0 documents stored.


<__main__.VectorStore at 0x2d755529010>

In [15]:
chunks

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-26T12:48:07+05:30', 'source': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'file_path': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'total_pages': 38, 'format': 'PDF 1.7', 'title': 'Skill Certification Policy 2025 Ver 2.03', 'author': 'Soniya David', 'subject': '', 'keywords': '', 'moddate': '2025-08-26T12:48:07+05:30', 'trapped': '', 'modDate': "D:20250826124807+05'30'", 'creationDate': "D:20250826124807+05'30'", 'page': 0}, page_content='Skill Certification Policy 2025 Ver 2.03 \n© 2025 Coforge \n1 \nSkill Certification Policy 2025 Ver \n2.03'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-08-26T12:48:07+05:30', 'source': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'file_path': '..\\data\\SCP Policy_2025 2.03 2.pdf', 'total_pages': 38, 'format': 'PDF 1.7', 'title':

In [16]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]
texts

['Skill Certification Policy 2025 Ver 2.03 \n© 2025 Coforge \n1 \nSkill Certification Policy 2025 Ver \n2.03',
 'Skill Certification Policy 2025 Ver 2.03 \n \n \n© 2025 Coforge \n2 \n \n \nTable of Contents \n1.0        Objective……………………………………………………………………………………………………………………………………………………………..3 \n2.0 \nEligibility………………………………………………………………………………………………………………………………………………………………3 \n3.0 \nDisclaimer ............................................................................................................................................................. 3 \n4.0 \nNomination ............................................................................................................................................................ 3 \n5.0 \nBenefit ....................................................................................................................................... …………………………3 \n6.0 \nProcedure ........................................................................................

In [17]:
## Genearte the embeddings
embeddings=embedding_manager.generate_embeddings(texts)

##store in the vectorDB
vectorstore.add_documents(chunks,embeddings)

Genarating embeddings for111 texts...)


Batches: 100%|██████████| 4/4 [00:01<00:00,  2.06it/s]


Generated embeddings shape:(111, 384)
Added 111 documents to vector store.
Successfully added 111 documents to collection 'pdf_documents'.
Total documents in collection: 111


### Retriever pipeline from vectorstore

In [27]:
class RAGRetriever:
    """Retrieves relevant documents for a given query using vector similarity"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str,top_k: int = 5,score_threshold:float =0.0 ) -> List[Dict[str,Any]]:
        """Retrieve the most similar documents to the query
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold

        Returns:
            List of dictionaries containing document content, metadata and similarity score"""
        # Generate embedding for the query
        print(f"Retriving documents for query:'{query}'")
        print(f"Top K:{top_k},Score threshold:{score_threshold}")
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]


        try:
        # Search for most similar documents
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
        # Format results
            retrieved_docs = []
            if results['documents']and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                        'id':doc_id,
                        'similarity_score': similarity_score,
                        'content': document,
                        'metadata':metadata,
                        'distance': distance,
                        'rank':i+1
                    })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
    

            return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
        
rag_retriever = RAGRetriever(vectorstore, embedding_manager)
rag_retriever


<__main__.RAGRetriever at 0x2d719d09400>

In [30]:
rag_retriever.retrieve("AWS Certified DevOps Engineer")

Retriving documents for query:'AWS Certified DevOps Engineer'
Top K:5,Score threshold:0.0
Genarating embeddings for1 texts...)


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.00it/s]

Generated embeddings shape:(1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_92a53224_94',
  'similarity_score': 0.21073400974273682,
  'content': 'Cloud \n$200  \nDigital \n581 \nGCP: Professional Cloud DevOps Engineer \nCloud \n$200  \nDigital + CIMS \n582 \nLinuxFoundation: OpenJS Node.js Application Developer \n(JSNAD) \nCloud \n$300  \nDigital \n583 \nLinuxFoundation: Kubernetes and Cloud Native Associate \n(KCNA) \nCloud \n$250  \nDigital \n584 \nDevOps Leader(https://www.devopsinstitute.com/) \nDevOps \n$450  \nDigital + CIMS \n585 \nAWS Certified DevOps Engineer - Professional \nDevOps \n$300  \nDigital + CIMS \n586 \nMicrosoft Certified: DevOps Engineer Expert (AZ-400) \nDevOps \n$165  \nDigital + CIMS \n587 \nRed Hat Certified Specialist in Managing Automation with \nAnsible Automation Platform exam (EX467) \nDevOps \n$544  \nCIMS \n588 \nLinuxFoundation: Certified Kubernetes Application Developer \n(CKAD) \nDevOps \n$395  \nDigital + CIMS \n589 \nKong Gateway Certified Associate \nIntegration \n$275  \nDigital \n590 \nIBM certified solut

In [31]:
rag_retriever.retrieve("Chethan")

Retriving documents for query:'Chethan'
Top K:5,Score threshold:0.0
Genarating embeddings for1 texts...)


Batches: 100%|██████████| 1/1 [00:00<00:00, 14.16it/s]

Generated embeddings shape:(1, 384)
Retrieved 0 documents (after filtering)





[]