### Dependencies

In [20]:
# !pip install pypdf langchain langchain_core langchain_community langchain_huggingface langchain_groq sentence_transformers transformers langchain_google_genai  langchain_openai  pymupdf chromadb

### Loading PDF

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader("GSTsmartGuide.pdf")

In [3]:
docs = loader.load()

In [4]:
docs[100]

Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-02-27T11:53:51+05:30', 'source': 'GSTsmartGuide.pdf', 'file_path': 'GSTsmartGuide.pdf', 'total_pages': 1321, 'format': 'PDF 1.6', 'title': 'CHAPTER 1', 'author': 'Abha', 'subject': '', 'keywords': '', 'moddate': '2024-02-27T13:03:27+05:30', 'trapped': '', 'modDate': "D:20240227130327+05'30'", 'creationDate': "D:20240227115351+05'30'", 'page': 100}, page_content='84 \nGST Smart Guide \nChap. 5 \n \n(3) The place of supply of the following services shall be the location where \nthe services are actually performed, namely:—  \n(a) services supplied in respect of goods which are required to be made \nphysically available by the recipient of services to the supplier of \nservices, or to a person acting on behalf of the supplier of services in \norder to provide the services:  \n \nProvided that when such services are provided from a remote location by \nway of electronic means, th

In [5]:
import nltk
import re
from nltk.tokenize import sent_tokenize
from typing import List

nltk.download('punkt')  # For sentence tokenization

# Assuming `docs` is a list of strings (one per page or section)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Section-based chunking using headers and metadata


In [7]:
import re
from typing import List, Dict

# Your regex patterns to identify sections/headings
SECTION_PATTERNS = [
    r"(Section\s\d+[\.:]?)",               # e.g., Section 1:
    r"(Clause\s\d+(\.\d+)*[\.:]?)",        # e.g., Clause 2.1:
    r"(Article\s+[IVXLC]+\b[\.:]?)",       # e.g., Article III:
    r"(^[A-Z][A-Z\s]{3,}$)"                # All-caps headings
]

HEADER_REGEX = re.compile("|".join(SECTION_PATTERNS), flags=re.IGNORECASE | re.MULTILINE)

def section_based_chunking(text: str) -> List[str]:
    matches = list(HEADER_REGEX.finditer(text))
    if not matches:
        return [text.strip()]

    chunks = []
    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i+1].start() if i + 1 < len(matches) else len(text)
        chunk = text[start:end].strip()
        chunks.append(chunk)
    return chunks


In [9]:
# Extract chunks and retain metadata like page number, title, etc.
def chunk_all_documents(docs):
    all_chunks = []
    for doc in docs:
        text = doc.page_content
        meta = doc.metadata
        if not text.strip():
            continue  # Skip empty pages

        sections = section_based_chunking(text)
        for i, section in enumerate(sections):
            header_match = HEADER_REGEX.search(section)
            section_title = header_match.group(0) if header_match else f"Section {i}"

            enriched_chunk = {
                "text": section,
                "metadata": {
                    "section_title": section_title,
                    "chunk_index": i,
                    "page": meta.get("page", -1),
                    "source": meta.get("source", ""),
                    "title": meta.get("title", ""),
                }
            }
            all_chunks.append(enriched_chunk)
    return all_chunks


In [10]:
section_chunks = chunk_all_documents(docs)

In [12]:
section_chunks[100]

{'text': 'for \nrevocation \nof \ncancellation \nof \nregistration in terms of Removal of \nDifficulty Order (RoD) number 05/2019-\nCentral Tax dated 23.04.2019 — Reg. \n99/18/2019-GST, \ndated 23-4-2019 \n1012 \n4. Guidelines',
 'metadata': {'section_title': 'for \nrevocation \nof \ncancellation \nof \nregistration in terms of Removal of ',
  'chunk_index': 14,
  'page': 11,
  'source': 'GSTsmartGuide.pdf',
  'title': 'CHAPTER 1'}}

### Taking relevent information from documents

In [13]:
from langchain.schema import Document

documents = [
    Document(page_content=item["text"], metadata=item["metadata"])
    for item in section_chunks
]


In [14]:
documents[100]

Document(metadata={'section_title': 'for \nrevocation \nof \ncancellation \nof \nregistration in terms of Removal of ', 'chunk_index': 14, 'page': 11, 'source': 'GSTsmartGuide.pdf', 'title': 'CHAPTER 1'}, page_content='for \nrevocation \nof \ncancellation \nof \nregistration in terms of Removal of \nDifficulty Order (RoD) number 05/2019-\nCentral Tax dated 23.04.2019 — Reg. \n99/18/2019-GST, \ndated 23-4-2019 \n1012 \n4. Guidelines')

**HugginFace Embeddings**

In [15]:
from langchain_huggingface import HuggingFaceEmbeddings
huggingface_embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Using Groq LLM

In [16]:
from google.colab import userdata
api_key = userdata.get('GROQ_API_KEY')

from langchain_groq import ChatGroq
import os
llm = ChatGroq(
    model_name="Gemma2-9b-It",
    api_key=api_key
)


In [17]:
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a professional legal and tax assistant. Use the following context extracted from a GST reference guide to answer the user’s question as accurately and completely as possible.

If the answer is not found in the context, say “The document does not contain this information.”

-----------------------
Context:
{context}
-----------------------

Question: {question}

Answer:"""


)

### Vectorstore in the Chromadb

In [21]:
# Create Chroma vectorstore
from langchain.vectorstores import Chroma
vdb = Chroma.from_documents(documents, embedding=huggingface_embeddings, persist_directory="./database")
vdb.persist()

  vdb.persist()


In [24]:
# Similarity search
query = "what is The basic principles of adjudication?"
results = vdb.similarity_search(query, k=5)

In [25]:
for doc in results:
    print("📄 Match:", doc.page_content)
    print("📝 Metadata:", doc.metadata)

📄 Match: should pass the final order after thorough verification of the relevant documents 
of the case, due study of the written submission filed by the parties and 
subsequent discussion in the personal hearing.  
6. The basic principles of adjudication  
1. The adjudication proceeding is based on the principle of natural justice
📝 Metadata: {'source': 'GSTsmartGuide.pdf', 'title': 'CHAPTER 1', 'page': 743, 'chunk_index': 2, 'section_title': 'should pass the final order after thorough verification of the relevant documents '}
📄 Match: The adjudicating officers should follow the principles of judicial discipline or 
precedent judgments before passing of any judicial decision. The judicial 
discipline is a vital factor in adjudication proceedings. The adjudicating officers
📝 Metadata: {'section_title': 'The adjudicating officers should follow the principles of judicial discipline or ', 'chunk_index': 5, 'source': 'GSTsmartGuide.pdf', 'page': 744, 'title': 'CHAPTER 1'}
📄 Match: Adjudica

**Retriever**

In [26]:
retriever=vdb.as_retriever()

In [29]:
result=retriever.invoke("what is The basic principles of adjudication?")

In [30]:
result

[Document(metadata={'title': 'CHAPTER 1', 'section_title': 'should pass the final order after thorough verification of the relevant documents ', 'chunk_index': 2, 'source': 'GSTsmartGuide.pdf', 'page': 743}, page_content='should pass the final order after thorough verification of the relevant documents \nof the case, due study of the written submission filed by the parties and \nsubsequent discussion in the personal hearing.  \n6. The basic principles of adjudication  \n1. The adjudication proceeding is based on the principle of natural justice'),
 Document(metadata={'section_title': 'The adjudicating officers should follow the principles of judicial discipline or ', 'source': 'GSTsmartGuide.pdf', 'title': 'CHAPTER 1', 'page': 744, 'chunk_index': 5}, page_content='The adjudicating officers should follow the principles of judicial discipline or \nprecedent judgments before passing of any judicial decision. The judicial \ndiscipline is a vital factor in adjudication proceedings. The adju

In [31]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,  # your Groq LLM instance
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template}
)

In [33]:
query = '''what is The basic principles of adjudication?'''
result = qa_chain.run(query)

print(result)

The basic principles of adjudication are:

1. **The adjudication proceeding is based on the principle of natural justice.**  
2. **The adjudicating officers should follow the principles of judicial discipline or precedent judgments before passing of any judicial decision. The judicial discipline is a vital factor in adjudication proceedings.**  




