### Data Ingestion

In [6]:
from langchain_core.documents import Document
import os

os.makedirs("../Data/textfiles", exist_ok = True)

In [7]:
# Read text using Text Loader
from langchain.document_loaders import TextLoader
loader = TextLoader(r"C:\Users\DGY3KOR\Desktop\Prep\Data\textfiles\deep.txt", encoding="utf-8")
document = loader.load()
print(document)

[Document(metadata={'source': 'C:\\Users\\DGY3KOR\\Desktop\\Prep\\Data\\textfiles\\deep.txt'}, page_content='Deep learning is a powerful subset of machine learning that uses multi-layered artificial neural networks, inspired by the human brain, to automatically learn complex patterns from vast amounts of data, enabling AI to perform tasks like image/speech recognition and natural language processing with high accuracy, often without explicit programming. The "deep" part refers to the multiple hidden layers in these networks, allowing them to build intricate, hierarchical representations of data, from simple features to complex objects, as seen in self-driving cars or voice assistants. \nHow it works\nInspired by the Brain: Uses nodes (neurons) in layers to process information, mimicking how our brains work.\nLayered Learning: Data flows through an input layer, multiple "hidden" layers that extract features (e.g., edges, shapes, objects), and an output layer for the final result.\nFeatu

In [8]:
# Reading files from directory. We use directory Loader
from langchain.document_loaders import TextLoader, DirectoryLoader
loader = DirectoryLoader(
    "../Data/textfiles",
    loader_cls=TextLoader,
    show_progress=True
)

document = loader.load()
print(document)

100%|██████████| 2/2 [00:00<?, ?it/s]

[Document(metadata={'source': '..\\Data\\textfiles\\deep.txt'}, page_content='Deep learning is a powerful subset of machine learning that uses multi-layered artificial neural networks, inspired by the human brain, to automatically learn complex patterns from vast amounts of data, enabling AI to perform tasks like image/speech recognition and natural language processing with high accuracy, often without explicit programming. The "deep" part refers to the multiple hidden layers in these networks, allowing them to build intricate, hierarchical representations of data, from simple features to complex objects, as seen in self-driving cars or voice assistants. \nHow it works\nInspired by the Brain: Uses nodes (neurons) in layers to process information, mimicking how our brains work.\nLayered Learning: Data flows through an input layer, multiple "hidden" layers that extract features (e.g., edges, shapes, objects), and an output layer for the final result.\nFeature Extraction: Early layers det




In [36]:
# %pip install -r requirement.txt

In [9]:
# For PDF files
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
loader = DirectoryLoader(
    "../Data/PDFiles",
    loader_cls=PyPDFLoader,
    show_progress=True
)

document = loader.load()
print(document)

100%|██████████| 2/2 [00:10<00:00,  5.39s/it]






### Embeddings to Vector Store 

In [38]:
# %pip install chromadb

In [10]:
import sqlite3
print(sqlite3.sqlite_version)


3.45.3


### Data Ingestion and Transformation

In [12]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity
import hashlib

class EmbeddingManager:
    """ Handles document embedding generation using SentenceTransformers"""
    def __init__(self, model_name: str = "all-MiniLM-L6-V2"):
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        try:
            print(f"Loading Embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully. Embedding Dimensions: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print("Error loading model {self.model_name}: {e}")
            raise

    def Generate_Embedding(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not Loaded.")
        print(f"Generating Embeddings for {len(texts)} texts.....")
        embedding = self.model.encode(texts, show_progress_bar = True)
        print(f"Generated Embeddings: {embedding.shape}")
        return embedding
    
    # def get_sentence_embedding_dimension(self) -> int:
    #     if not self.model:
    #         raise ValueError("Model not Loaded.")
    #     return self.get_sentence_embedding_dimension()
        
# Initialize Embedding Manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading Embedding model: all-MiniLM-L6-V2
Model Loaded Successfully. Embedding Dimensions: 384


<__main__.EmbeddingManager at 0x1fd149c0080>

### Chunks

In [13]:
def chunk_text(text: str,
    chunk_size: int = 800, # 800 because my documents are well structured. so 0-800, if not 500 will be ideal one
    overlap: int = 150 # it considers from 650 - 1450
) -> List[str]:

    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length: # Loop runs until start = text_length
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap
    return chunks

def chunk_documents(documents: List[Any],chunk_size: int = 800,overlap: int = 150) -> List[Dict[str, Any]]:
    chunked_docs = []
    for doc_idx, doc in enumerate(documents):
        # Extract text
        if isinstance(doc, str): # Return whether an object is an instance of a class or of a subclass thereof.
            text = doc
            metadata = {"source_doc": doc_idx}
        else:
            text = doc.page_content
            metadata = dict(doc.metadata)
        text_chunks = chunk_text(text, chunk_size, overlap)

        for idx, chunk in enumerate(text_chunks): # The enumerate object yields pairs containing a count (from start, which defaults to zero) and a value yielded by the iterable argument.
            chunked_docs.append({
                "page_content": chunk,
                "metadata": {
                    **metadata,
                    "chunk_index": idx,
                    "chunk_size": chunk_size
                }
            })
    return chunked_docs

def split_documents(documents: List[Any],chunk_size: int = 800, overlap: int = 150) -> List[Dict[str, Any]]:
    return chunk_documents(documents, chunk_size, overlap)


chunks = split_documents(document)
chunks


[{'page_content': 'ARCH2021.1 Shapiro_Machine Learning ... 00e6 1 \nMachine Learning: what is it and what are its components? \n-- some preliminary observations1 \n \nArnold F. Shapiro \nPenn State University, Smeal College of Business, University Park, PA 16802, USA \nAbstract \nThis article focuses on conceptualizing machine learning (ML) concepts.  The general topics \ncovered are supervised learning based on regression and classification, unsupervised \nlearning based on clustering and dimensionality reduction, and reinforcement learning.  The \narticle begins with an overview of ML, including the types of ML and their components. \nThen, for background, basic concepts associated with ML are addressed. From there on the \ndiscussion turns to ML algorithms, by topic. Although insurance applications of ML are not \npur',
  'metadata': {'producer': 'Adobe PDF Library 11.0',
   'creator': 'Acrobat PDFMaker 11 for Word',
   'creationdate': '2020-11-09T20:45:24-05:00',
   'author': 'Arno

### VectorStore

In [14]:
import os
class VectorStore:
    def __init__(self, collection_files: str = "pdf_documents", directory: str = r"C:\Users\DGY3KOR\Desktop\Prep\Data\vectorstore"):
        self.files = collection_files
        self.directory = directory
        self.client = None
        self.collection = None
        self._initialize_store()
    
    def _initialize_store(self):
        "Initialize chromadb client and collection"
        try:
            # Create chromadb client
            os.makedirs(self.directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.directory) # PersistentClient is a function in chromadb
            # Get or create collection
            self.collection = self.client.get_or_create_collection(name = self.files, metadata = {"Description": "PDF document Embeddings for RAG"})
            print(f"Vector Store Initialized: collection{self.files}")
            print(f"Existing Documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error Initializing Vector store: {e}")
            raise
    
    # Add document function
    def add_document(self, document = List[Any], embeddings = np.array): # document contains metadata, source, title, author, page title, no.of pages, content etc. we are passing it here. 
        if len(document) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings.")
        print(f"Adding {len(document)} to Vector Store....")

        # Prepare data for Chromadb
        ids = []
        metadatas = []
        document_text = []
        embedding_list = []
        
        for i, (doc, embedding) in enumerate(zip(document, embeddings)):
            doc_id = hashlib.sha256(doc["page_content"].encode("utf-8")).hexdigest()
            ids.append(doc_id)

            metadata = dict(doc["metadata"])
            metadata["doc_index"] = i
            metadata["Content_Length"] = len(doc["page_content"])
            metadatas.append(metadata)

            document_text.append(doc["page_content"])
            embedding_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(ids=ids,metadatas=metadatas,embeddings=embedding_list,documents=document_text)
            print(f"Successfully added {len(document)} to vector store.....")
            print(f"Total Documents in Collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding Documents to vector Store: {e}")
            raise

vectorstore = VectorStore()
vectorstore    



Vector Store Initialized: collectionpdf_documents
Existing Documents in collection: 5082


<__main__.VectorStore at 0x1fd15ccd670>

In [15]:
### Convert text to embeddings
texts = [document["page_content"] for document in chunks]

## Initialize the Embedding Manager
embeddings = embedding_manager.Generate_Embedding(texts)

# Store in Vector Database
vectorstore.add_document(chunks, embeddings)

Generating Embeddings for 1980 texts.....


Batches: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Generated Embeddings: (1980, 384)
Adding 1980 to Vector Store....
Successfully added 1980 to vector store.....
Total Documents in Collection: 5082


### RAG Retrieval Pipeline from Vector Store

In [16]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5,score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Similarity threshold: {score_threshold}")

        # 1. Generate query embedding
        query_embedding = self.embedding_manager.Generate_Embedding([query])[0]

        try:
            # 2. Query Chroma
            results = self.vector_store.collection.query(  # query is a method and contains many parameters. I used few which are required for me.
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                include=["documents", "metadatas", "distances"]
            )

            retrieved_docs = []

            if results["documents"] and results["documents"][0]:
                documents = results["documents"][0]
                metadatas = results["metadatas"][0]
                distances = results["distances"][0]
                ids = results["ids"][0]

                for rank, (doc_id, doc, meta, dist) in enumerate(zip(ids, documents, metadatas, distances), start=1):
                    # Convert distance → similarity
                    similarity = 1 - dist

                    if similarity >= score_threshold:
                        retrieved_docs.append({
                            "id": doc_id,
                            "content": doc,
                            "metadata": meta,
                            "similarity_score": round(similarity, 4),
                            "distance": round(dist, 4),
                            "rank": rank
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

In [17]:
rag_retriever = RAGRetriever(vectorstore, embedding_manager)
rag_retriever.retrieve("What is classification")

Retrieving documents for query: 'What is classification'
Top K: 5, Similarity threshold: 0.0
Generating Embeddings for 1 texts.....


Batches: 100%|██████████| 1/1 [00:00<00:00, 57.06it/s]

Generated Embeddings: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'bf217cad4fcb51140a240d68c89bd7975a2d9d5e18ae71fac10a9d8ffa68576b',
  'content': 'used to classify the',
  'metadata': {'page': 241,
   'creationdate': "D:20210312135113Z00'00'",
   'moddate': "D:20210312135113Z00'00'",
   'source': 'C:\\Users\\DGY3KOR\\Desktop\\Prep\\Data\\PDFiles\\MachineLearningTomMitchell.pdf',
   'chunk_index': 4,
   'page_label': '242',
   'total_pages': 421,
   'doc_index': 1808,
   'chunk_size': 500,
   'producer': 'macOS Version 11.2.3 (Build 20D91) Quartz PDFContext',
   'title': 'Machine - Learning - Tom Mitchell.pdf',
   'Content_Length': 20,
   'creator': 'Preview'},
  'similarity_score': 0.3159,
  'distance': 0.6841,
  'rank': 1},
 {'id': '250f01e172837efe659d6e4a2d886166ae8e1387340518ae0801277fec7c27a4',
  'content': 'nordered set of labels. \n \nClassification is a classic example of supervised learning in insurance. Suppose we have \nmeasured 5 different properties (blood pressure, smoking status, cholesterol ratios, build, and \nfamily history

### RAG - LLM Pipeline

In [None]:
# %pip install transformers torch langchain-huggingface

In [18]:
import os
from dotenv import load_dotenv
from transformers import pipeline
load_dotenv()




True

In [19]:
def build_rag_context(retrieved_docs, max_chars: int = 4000) -> str:
    """
    Build context string from retrieved documents
    """
    context_parts = []
    total_chars = 0

    for doc in retrieved_docs:
        text = doc["content"].strip()
        if total_chars + len(text) > max_chars:
            break
        context_parts.append(text)
        total_chars += len(text)

    return "\n\n---\n\n".join(context_parts)

def build_prompt(context: str, query: str) -> str:
    return f"""
You are a knowledgeable AI assistant.

Answer the question using ONLY the provided context.
If the answer is not present in the context, say:
"I don't have enough information in the provided documents."

Context:
{context}

Question:
{query}

Answer:
""".strip()


In [20]:
import os
from huggingface_hub import InferenceClient

class DeepSeekLLM:
    def __init__(
        self,
        model_name: str = "deepseek-ai/DeepSeek-V3.2",
        temperature: float = 0.2,
        max_tokens: int = 512
    ):
        self.client = InferenceClient(
            model=model_name,
            token=os.getenv("API")
        )
        self.temperature = temperature
        self.max_tokens = max_tokens

    def generate(self, prompt: str) -> str:
        response = self.client.chat_completion(
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            max_tokens=self.max_tokens,
            temperature=self.temperature
        )

        return response.choices[0].message["content"].strip()
    
def answer_with_rag(query: str,retriever: RAGRetriever,llm: DeepSeekLLM,top_k: int = 5,score_threshold: float = 0.2) -> str:
    # 1. Retrieve relevant chunks
    retrieved_docs = retriever.retrieve(
        query=query,
        top_k=top_k,
        score_threshold=score_threshold
    )

    if not retrieved_docs:
        return "No relevant context found to answer the question."

    # 2. Build context
    context = build_rag_context(retrieved_docs)

    # 3. Build prompt
    prompt = build_prompt(context, query)

    # 4. Generate answer
    answer = llm.generate(prompt)

    return answer


In [21]:
deepseek_llm = DeepSeekLLM()

final_answer = answer_with_rag(
    query="What is Machine Learning",
    retriever=rag_retriever,
    llm=deepseek_llm,
    top_k=5,
    score_threshold=0.1
)

print(final_answer)


Retrieving documents for query: 'What is Machine Learning'
Top K: 5, Similarity threshold: 0.1
Generating Embeddings for 1 texts.....


Batches: 100%|██████████| 1/1 [00:00<00:00, 73.96it/s]

Generated Embeddings: (1, 384)
Retrieved 5 documents (after filtering)





Based solely on the provided context, Machine Learning is defined as "a field of study that gives computers the ability to learn without being explicitly programmed," a definition first conceptualized by Arthur Samuel (1959).


### Advanced RAG. Here we will return source as well

In [23]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.1, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {
            'answer': 'No relevant context found.',
            'sources': [],
            'confidence': 0.0,
            'context': ''
        }

    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])

    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]

    confidence = max(doc['similarity_score'] for doc in results)

    # Build prompt
    prompt = f"""Use the following context to answer the question concisely.
If the answer is not present in the context, say "I don't know".

Context:
{context}

Question:
{query}

Answer:
"""

    # Correct LLM call
    answer = llm.generate(prompt)

    output = {
        'answer': answer,
        'sources': sources,
        'confidence': round(confidence, 3)
    }

    if return_context:
        output['context'] = context

    return output

deepseek_llm = DeepSeekLLM()

result = rag_advanced(
    "what is machine learning",
    retriever=rag_retriever,
    llm=deepseek_llm,
    top_k=3,
    min_score=0.1,
    return_context=True
)

print("Answer:", result["answer"])
print("Confidence:", result["confidence"])
print("Sources:", result["sources"])
# print("Context Preview:", result["context"][:300])


Retrieving documents for query: 'what is machine learning'
Top K: 3, Similarity threshold: 0.1
Generating Embeddings for 1 texts.....


Batches: 100%|██████████| 1/1 [00:00<00:00, 55.97it/s]

Generated Embeddings: (1, 384)
Retrieved 3 documents (after filtering)





Answer: Machine learning is a multidisciplinary field that involves learning paradigms, algorithms, theoretical results, and applications, drawing from areas such as artificial intelligence, statistics, computational complexity, and information theory.
Confidence: 0.46
Sources: [{'source': 'C:\\Users\\DGY3KOR\\Desktop\\Prep\\Data\\PDFiles\\arch-2021-1-machine-learning.pdf', 'page': 0, 'score': 0.4602, 'preview': 'ARCH2021.1 Shapiro_Machine Learning ... 00e6 1 \nMachine Learning: what is it and what are its components? \n-- some preliminary observations1 \n \nArnold F. Shapiro \nPenn State University, Smeal College of Business, University Park, PA 16802, USA \nAbstract \nThis article focuses on conceptualizing machi...'}, {'source': 'C:\\Users\\DGY3KOR\\Desktop\\Prep\\Data\\PDFiles\\MachineLearningTomMitchell.pdf', 'page': 2, 'score': 0.4496, 'preview': 's and theory that \nform the core of machine learning. Machine learning draws on concepts and \nresults from many fields, including st