### Dependencies

In [1]:
# !pip install pypdf langchain langchain_core langchain_community langchain_huggingface langchain_groq sentence_transformers transformers langchain_google_genai chromadb langchain_openai grandalf pymupdf

### Loading PDF

In [23]:
from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader("GSTsmartGuide.pdf")

In [24]:
docs = loader.load()

In [25]:
import nltk
import re
from nltk.tokenize import sent_tokenize
from typing import List

nltk.download('punkt')  # For sentence tokenization

# Assuming `docs` is a list of strings (one per page or section)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Fixed-Size Token-Based Chunking

In [26]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.schema import Document

# # Assume docs is a list of Document objects
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=250
# )

# # This handles splitting across all docs and gives you Document objects
# fixed_chunks = text_splitter.split_documents(docs)

In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def fixed_token_chunking(text: str, max_tokens=1000, overlap=250):
    tokens = tokenizer.tokenize(text)
    chunks = []
    i = 0
    while i < len(tokens):
        chunk_tokens = tokens[i:i+max_tokens]
        chunk = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk)
        i += max_tokens - overlap
    return chunks

In [28]:
# docs is a list of Document objects, extract text from .page_content
fixed_chunks = [chunk for doc in docs for chunk in fixed_token_chunking(doc.page_content,max_tokens=300,overlap=100)]

Token indices sequence length is longer than the specified maximum sequence length for this model (623 > 512). Running this sequence through the model will result in indexing errors


In [30]:
fixed_chunks[100:102]

['aggregate turnover of ` 500 crore and above. this threshold has been reduced progressively over a period of time and was reduced to ` 10 crores from 01. 10. 2022. this threshold limit has been further reduced to ` 5 crore with effect from 01. 08. 2023 vide notification no. 10 / 2023 - central tax dated 10. 05. 2023. 25. 43 refund related measures. ( a ) a new functionality has been made available on the common portal which allows unregistered persons to take a temporary registration and apply for refund. also, the manner and procedure for filing of refund applications by unregistered persons in certain circumstances has been prescribed vide circular no. 188 / 20 / 2022 - gst, dated 27. 12. 2022. ( b ) manner for processing and sanction of such igst refunds, which werewith held on account of exporters being flagged as risky on the basis of risk parameters and data analytics, was prescribed vide instruction no. 04 / 2022 - gst, dated 28. 11. 2022. 25. 44 registration related measures :

### Converting list of string to Document objects

In [31]:
from langchain.schema import Document

# Convert to Document objects
docs = [Document(page_content=chunk) for chunk in fixed_chunks]


In [32]:
docs[100:103]

[Document(metadata={}, page_content='aggregate turnover of ` 500 crore and above. this threshold has been reduced progressively over a period of time and was reduced to ` 10 crores from 01. 10. 2022. this threshold limit has been further reduced to ` 5 crore with effect from 01. 08. 2023 vide notification no. 10 / 2023 - central tax dated 10. 05. 2023. 25. 43 refund related measures. ( a ) a new functionality has been made available on the common portal which allows unregistered persons to take a temporary registration and apply for refund. also, the manner and procedure for filing of refund applications by unregistered persons in certain circumstances has been prescribed vide circular no. 188 / 20 / 2022 - gst, dated 27. 12. 2022. ( b ) manner for processing and sanction of such igst refunds, which werewith held on account of exporters being flagged as risky on the basis of risk parameters and data analytics, was prescribed vide instruction no. 04 / 2022 - gst, dated 28. 11. 2022. 25.

### I am using HuggingFace Embeddings

In [33]:
from langchain_huggingface import HuggingFaceEmbeddings
huggingface_embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Storing vectors in chromadb

In [37]:
from langchain_community.vectorstores import Chroma
vectorstore=Chroma.from_documents(
    documents=docs,
    collection_name="rag",
    embedding=huggingface_embeddings
)

### Using Groq LLM

In [40]:
from google.colab import userdata
api_key = userdata.get('GROQ_API_KEY')

from langchain_groq import ChatGroq
import os
llm = ChatGroq(
    model_name="Gemma2-9b-It",
    api_key=api_key
)


In [41]:
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a professional legal and tax assistant. Use the following context extracted from a GST reference guide to answer the user’s question as accurately and completely as possible.

If the answer is not found in the context, say “The document does not contain this information.”

-----------------------
Context:
{context}
-----------------------

Question: {question}

Answer:"""


)


In [42]:
retriever=vectorstore.as_retriever()

In [43]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,  # your Groq LLM instance
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template}
)

In [44]:
query = ''' What is the effective date of composition levy?  '''
result = qa_chain.run(query)

print(result)

  result = qa_chain.run(query)


There can be three situations with respective effective dates as shown below:

* **Situation 1:** Persons who have been granted provisional registration and who opt for composition levy (intimation is filed under rule 3 (1) in form gst cmp - 01) - **1st July, 2017**.
* **Situation 2:** Persons opting for composition levy at the time of making application for new registration in the same registration application itself (the intimation under rule 3 (2) in form gst reg - 01) - **Effective date of registration**.  Intimation shall be considered only after the grant of registration and his option to pay tax under composition scheme shall be effective from the effective date of registration.
* **Situation 3:** Persons opting for composition levy after obtaining registration (the intimation is filed under rule 3 (3) in form gst cmp - 02) - **The beginning of the next financial year**. 



