### Dependencies

In [1]:
!pip install pypdf langchain langchain_core langchain_community langchain_huggingface langchain_groq sentence_transformers transformers langchain_google_genai  langchain_openai  pymupdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.3.2-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.2-py3-none-any.whl.metadata (4.7 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.12-py3-none-any.whl.metadata (2.3 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (

### Loading PDF

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader("GSTsmartGuide.pdf")

In [3]:
docs = loader.load()

In [4]:
docs[100]

Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-02-27T11:53:51+05:30', 'source': 'GSTsmartGuide.pdf', 'file_path': 'GSTsmartGuide.pdf', 'total_pages': 1321, 'format': 'PDF 1.6', 'title': 'CHAPTER 1', 'author': 'Abha', 'subject': '', 'keywords': '', 'moddate': '2024-02-27T13:03:27+05:30', 'trapped': '', 'modDate': "D:20240227130327+05'30'", 'creationDate': "D:20240227115351+05'30'", 'page': 100}, page_content='84 \nGST Smart Guide \nChap. 5 \n \n(3) The place of supply of the following services shall be the location where \nthe services are actually performed, namely:—  \n(a) services supplied in respect of goods which are required to be made \nphysically available by the recipient of services to the supplier of \nservices, or to a person acting on behalf of the supplier of services in \norder to provide the services:  \n \nProvided that when such services are provided from a remote location by \nway of electronic means, th

In [5]:
import nltk
import re
from nltk.tokenize import sent_tokenize
from typing import List

nltk.download('punkt')  # For sentence tokenization

# Assuming `docs` is a list of strings (one per page or section)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Section-based chunking using headers and metadata


In [7]:
import re
from typing import List, Dict

# Your regex patterns to identify sections/headings
SECTION_PATTERNS = [
    r"(Section\s\d+[\.:]?)",               # e.g., Section 1:
    r"(Clause\s\d+(\.\d+)*[\.:]?)",        # e.g., Clause 2.1:
    r"(Article\s+[IVXLC]+\b[\.:]?)",       # e.g., Article III:
    r"(^[A-Z][A-Z\s]{3,}$)"                # All-caps headings
]

HEADER_REGEX = re.compile("|".join(SECTION_PATTERNS), flags=re.IGNORECASE | re.MULTILINE)

def section_based_chunking(text: str) -> List[str]:
    matches = list(HEADER_REGEX.finditer(text))
    if not matches:
        return [text.strip()]

    chunks = []
    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i+1].start() if i + 1 < len(matches) else len(text)
        chunk = text[start:end].strip()
        chunks.append(chunk)
    return chunks


In [9]:
# Extract chunks and retain metadata like page number, title, etc.
def chunk_all_documents(docs):
    all_chunks = []
    for doc in docs:
        text = doc.page_content
        meta = doc.metadata
        if not text.strip():
            continue  # Skip empty pages

        sections = section_based_chunking(text)
        for i, section in enumerate(sections):
            header_match = HEADER_REGEX.search(section)
            section_title = header_match.group(0) if header_match else f"Section {i}"

            enriched_chunk = {
                "text": section,
                "metadata": {
                    "section_title": section_title,
                    "chunk_index": i,
                    "page": meta.get("page", -1),
                    "source": meta.get("source", ""),
                    "title": meta.get("title", ""),
                }
            }
            all_chunks.append(enriched_chunk)
    return all_chunks


In [10]:
section_chunks = chunk_all_documents(docs)

In [12]:
section_chunks[100]

{'text': 'for \nrevocation \nof \ncancellation \nof \nregistration in terms of Removal of \nDifficulty Order (RoD) number 05/2019-\nCentral Tax dated 23.04.2019 — Reg. \n99/18/2019-GST, \ndated 23-4-2019 \n1012 \n4. Guidelines',
 'metadata': {'section_title': 'for \nrevocation \nof \ncancellation \nof \nregistration in terms of Removal of ',
  'chunk_index': 14,
  'page': 11,
  'source': 'GSTsmartGuide.pdf',
  'title': 'CHAPTER 1'}}