### Sentence/Dynamic Chunking technique

In [None]:
%pip install nltk
%pip install fastapi

Looking in indexes: https://anu9rng:****@rb-artifactory.bosch.com/artifactory/api/pypi/python-virtual/simple
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://anu9rng:****@rb-artifactory.bosch.com/artifactory/api/pypi/python-virtual/simple
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import nltk # It understands punctuation, abbreviations 
nltk.download("punkt")
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DGY3KOR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DGY3KOR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [1]:
import os
from pypdf import PdfReader
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import re
import hashlib

PDF_FOLDER = "../Data/PDFiles"
VECTOR_DB_PATH = "../Data/Vectorstores"
COLLECTION_NAME = "pdf_semantic_chunks_v1"
INGESTION_VERSION = "v1"

def extract_pdf_pages(pdf_path: str) -> list[tuple[int, str]]:
    reader = PdfReader(pdf_path)
    pages = []

    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text and text.strip():
            pages.append((page_number, text))

    return pages
        
def clean_text(text: str) -> str:
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Here we build semantic chunking
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("all-MiniLM-L6-v2")
# Dynamic chunking algorithm
def split_into_paragraphs(text: str) -> list[str]:
    """
    Split cleaned text into paragraph-level chunks.
    Paragraphs are better 'units of reasoning' for PDFs
    than individual sentences.
    """

    paragraphs = [
        p.strip()
        for p in text.split("\n\n")
        if len(p.strip()) > 200  # filter headers / noise
    ]

    return paragraphs

def is_definition_like(text: str) -> bool:
    """
    Detects whether a paragraph looks like a definition or explanation.
    This metadata helps retrieval later.
    """

    signals = [
        " is ",
        " refers to ",
        " defined as ",
        " means ",
        " consists of ",
        " can be described as "
    ]

    lowered = text.lower()
    return any(signal in lowered for signal in signals)

def content_hash(text: str) -> str:
    """
    Create a stable hash for deduplication.
    Prevents repeated paragraphs from poisoning retrieval.
    """

    normalized = " ".join(text.lower().split())
    return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

In [4]:
# Convert chunks to LangChain Documents
def create_documents(chunks, source_file):
    documents = []

    for chunk in chunks:
        documents.append(
            Document(
                page_content=chunk, # What gets embedded
                metadata={
                    "source": source_file # Metadata is for debugging, citations, helpful for debugging
                }
            )
        )
    return documents

In [5]:
# Data Ingestion pipeline
def ingest_pdfs():
    all_documents = []
    seen_hashes = set()

    for filename in os.listdir(PDF_FOLDER):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(PDF_FOLDER, filename)
        print(f"Ingesting: {filename}")

        pages = extract_pdf_pages(pdf_path)

        for page_number, page_text in pages:
            cleaned = clean_text(page_text)
            paragraphs = split_into_paragraphs(cleaned)

            for para in paragraphs:
                h = content_hash(para)

                # Cross-run + in-run deduplication
                if h in seen_hashes:
                    continue

                seen_hashes.add(h)

                all_documents.append(
                    Document(
                        page_content=para,
                        metadata={
                            "source": filename,
                            "page": page_number,
                            "content_hash": h,
                            "ingestion_version": INGESTION_VERSION,
                            "type": "definition" if is_definition_like(para) else "general"
                        }
                    )
                )

    return all_documents


In [6]:
# Create for Vectorstore
def build_vectorstore(documents):
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    vector_store = Chroma(
        collection_name=COLLECTION_NAME,
        persist_directory=VECTOR_DB_PATH,
        embedding_function=embedder
    )

    BATCH_SIZE = 500

    for i in range(0, len(documents), BATCH_SIZE):
        batch = documents[i : i + BATCH_SIZE]

        ids = [
            doc.metadata["content_hash"]
            for doc in batch
        ]

        vector_store.add_documents(
            documents=batch,
            ids=ids
        )

        print(f"Upserted batch {i // BATCH_SIZE + 1}")

    return vector_store

In [8]:
if __name__ == "__main__":
    documents = ingest_pdfs()

    print(f"Total chunks created: {len(documents)}")
    # print(f"Page chunks: {len(page_chunks)}")


    vector_store = build_vectorstore(documents)

    print("Vector store successfully built.")


Ingesting: arch-2021-1-machine-learning.pdf
Ingesting: MachineLearningTomMitchell.pdf
Total chunks created: 447




Upserted batch 1
Vector store successfully built.


In [9]:
results = vector_store.similarity_search(
    "What is MAchine Learning?",
    k=3
)

for r in results:
    print("----")
    print(r.page_content)


----
ARCH2021.1 Shapiro_Machine Learning ... 00e6 1 Machine Learning: what is it and what are its components? -- some preliminary observations1 Arnold F. Shapiro Penn State University, Smeal College of Business, University Park, PA 16802, USA Abstract This article focuses on conceptualizing machine learning (ML) concepts. The general topics covered are supervised learning based on regression and classification, unsupervised learning based on clustering and dimensionality reduction, and reinforcement learning. The article begins with an overview of ML, including the types of ML and their components. Then, for background, basic concepts associated with ML are addressed. From there on the discussion turns to ML algorithms, by topic. Although insurance applications of ML are not pursued in this version of the article, at the end of each section is a reference to an insurance article that applies the ML algorithm discussed in the section. The article ends with a commentary. 1 The support of

### Vector DB creation is done based on semantic meaning of sentences rather than explicit chunk size.

In [10]:
%pip show transformers

Name: transformers
Version: 4.41.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: c:\Users\DGY3KOR\AppData\Local\Programs\Python\Python312\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers
Note: you may need to restart the kernel to use updated packages.


In [10]:
from typing import TypedDict, List
from langgraph.graph import StateGraph, END
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
import os
import json

load_dotenv()

HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
assert HF_TOKEN, "Hugging Face API key not found"

# Defining agent state
class AgentState(TypedDict):
    user_query: str
    retrieved_docs: List[str]
    final_answer: str
    needs_retrieval: bool
    coverage: str

# Load Hugging Face LLM
# model_id = "microsoft/phi-2"
hf_client = InferenceClient(
    # model="deepseek-ai/DeepSeek-V3.2",
    token=HF_TOKEN
)


def hf_chat(prompt: str) -> str:
    response = hf_client.chat.completions.create(
        model="deepseek-ai/DeepSeek-V3.2",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        max_tokens=300,
    )

    return response.choices[0].message.content.strip()




# Define Embedding Model
embedder = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

# Provide the existing chroma vector store path
VECTOR_DB_PATH = "../Data/Vectorstores"
vector_store = Chroma(
    collection_name= "pdf_semantic_chunks_v1",  # Provide your collection name. Not sure of collection name, use the below code
    # print(vector_store._collection.count())
    persist_directory= VECTOR_DB_PATH,
    embedding_function= embedder
)


### Build memory state for Agent

In [11]:
def retrieve_node(state: AgentState):
    """
    This node is responsible ONLY for retrieval.
    It does not generate answers.
    """

    # Extract the user query from agent state
    query = state["user_query"]

    # Ask vector store for semantically similar chunks
    results = vector_store.similarity_search(
        query=query,
        k=3
    )

    # Extract raw text from Document objects
    retrieved_texts = [doc.page_content for doc in results]

    # Return partial state update (LangGraph merges state)
    return {
        "retrieved_docs": retrieved_texts
    }

# Now we will create coverage node. Here it will check whether the RAG itself is enough or other tool needs to be called.
def coverage_node(state: AgentState):
    context = "\n\n".join(state.get("retrieved_docs", []))

    prompt = f"""
        You are evaluating information coverage.
        User question:
        {state['user_query']}
        Context:
        {context}
        Decide ONE label:
        - DIRECT
        - PARTIAL
        - NONE
        Return ONLY one word.
        """

    response = hf_chat(prompt).upper()

    if "DIRECT" in response:
        coverage = "DIRECT"
    elif "PARTIAL" in response:
        coverage = "PARTIAL"
    else:
        coverage = "NONE"

    return {"coverage": coverage}


# If RAG provided chunks are not sufficient, then it should ask user.
def ask_user_node(state: AgentState):
    question = (
        "I found related information, but no single clear answer. "
        "Would you like me to synthesize an explanation from multiple sections?"
    )

    return {
        "final_answer": question
    }

def synthesize_node(state: AgentState):
    context = "\n\n".join(state.get("retrieved_docs", []))

    prompt = f"""
        Synthesize an answer using ONLY the context.
        Return JSON.

        Context:
        {context}

        Question:
        {state['user_query']}

        {{"answer": "..."}}
        """

    response = hf_chat(prompt)

    try:
        parsed = json.loads(response)
        answer = parsed.get("answer")
    except json.JSONDecodeError:
        answer = "I do not have enough information."

    return {"final_answer": answer}


def answer_node(state: AgentState):
    context = "\n\n".join(state.get("retrieved_docs", []))

    prompt = f"""
Answer the question using ONLY the context.
Return VALID JSON.

Context:
{context}

Question:
{state['user_query']}

JSON:
{{"answer": "..."}}
"""

    response = hf_chat(prompt)

    try:
        parsed = json.loads(response)
        answer = parsed.get("answer")
    except json.JSONDecodeError:
        answer = "I do not have enough information to answer this."

    return {"final_answer": answer}

In [12]:
# Build the LangGraph
graph = StateGraph(AgentState)

graph.add_node("retrieve", retrieve_node)
graph.add_node("coverage", coverage_node)
graph.add_node("answer", answer_node)
graph.add_node("synthesize", synthesize_node)

graph.set_entry_point("retrieve")
graph.add_edge("retrieve", "coverage")

def route_after_coverage(state: AgentState):
    if state["coverage"] == "DIRECT":
        return "answer"
    elif state["coverage"] == "PARTIAL":
        return "synthesize"
    else:
        return "answer"

graph.add_conditional_edges(
    "coverage",
    route_after_coverage,
    {
        "answer": "answer",
        "synthesize": "synthesize"
    }
)

graph.add_edge("synthesize", END)
graph.add_edge("answer", END)


<langgraph.graph.state.StateGraph at 0x16cb9f34b00>

In [16]:
# Compile and Run
agent = graph.compile()
result = agent.invoke(
    {
        "user_query": "What are the types of machine learning?"
    }
)
print(result["final_answer"])

The types of machine learning are supervised learning and reinforcement learning. Supervised learning is when the system is told what to look for (task-driven) and is trained to apply a label to data. Reinforcement learning involves trial and error, where the algorithm learns to react to the environment to discover the best sequence of actions for an optimal outcome.
