### Dependencies

In [16]:
# Optional one-time setup: uncomment the line below to install the packages
# listed on it. It is left commented out so that re-running the notebook does
# not trigger an install.
# !pip install pypdf langchain langchain_core langchain_community langchain_huggingface langchain_groq sentence_transformers transformers langchain_google_genai chromadb pymupdf

### Loading PDF

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

# Build a PyMuPDF-backed loader for the GST guide; the path is relative to the
# notebook's working directory. Nothing is read from disk until .load() is called.
loader = PyMuPDFLoader(file_path="GSTsmartGuide.pdf")

In [3]:
# Read the PDF through the loader and keep the resulting Document objects in
# `docs` (the sample output further down shows per-page metadata such as 'page').
docs = loader.load()

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for chunks of size 1000 with an overlap of 250 between
# neighbouring chunks, so text cut at a chunk boundary still appears whole in
# one of the two chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=250, chunk_size=1000)

# Split every loaded Document; the result is again a list of Document objects
# (see the sample chunk displayed in the next cell).
fixed_chunks = text_splitter.split_documents(documents=docs)

In [9]:
# Spot-check one arbitrary chunk (index 100) to confirm the split output looks
# sensible: metadata is carried over and page_content holds the chunk text.
fixed_chunks[100]

Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-02-27T11:53:51+05:30', 'source': 'GSTsmartGuide.pdf', 'file_path': 'GSTsmartGuide.pdf', 'total_pages': 1321, 'format': 'PDF 1.6', 'title': 'CHAPTER 1', 'author': 'Abha', 'subject': '', 'keywords': '', 'moddate': '2024-02-27T13:03:27+05:30', 'trapped': '', 'modDate': "D:20240227130327+05'30'", 'creationDate': "D:20240227115351+05'30'", 'page': 34}, page_content='18 \nGST Smart Guide \nChap. 1 \n \n25.14 Export: \n25.15 E-Wallet Scheme shall be introduced for exporters from 1-4-2020 and \ntill then relief for exporters shall be given in form of broadly existing practice. \n25.16 Supply of services to Nepal and Bhutan shall be exempted from GST \neven if payment has not been received in foreign convertible currency - such \nsuppliers shall be eligible for input tax credit. \n25.17 Supply of services to qualify as exports, even if payment is received in \nIndian Rupees, where per

### Creating HuggingFace Embeddings

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used for both indexing the chunks and embedding queries.
# The first run downloads the model files (see the progress output below).
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Storing vectors in ChromaDB

In [8]:
from langchain_community.vectorstores import Chroma

# Embed every chunk with the HuggingFace model and index the vectors in a
# Chroma collection named "rag".
# NOTE(review): no persist_directory is passed, so the index is presumably
# in-memory only and is rebuilt on every kernel restart; re-running this cell
# in the same kernel may also add the chunks to the collection again - confirm.
vectorstore = Chroma.from_documents(
    embedding=huggingface_embeddings,
    collection_name="rag",
    documents=fixed_chunks,
)

### Using Groq LLM

In [10]:
import os

from langchain_groq import ChatGroq

# Resolve the Groq API key without hard-coding it in the notebook.
# In Google Colab the key is read from the Colab secrets store; anywhere else
# (google.colab is not importable) fall back to the GROQ_API_KEY environment
# variable so the notebook also runs locally.
try:
    from google.colab import userdata
except ImportError:
    api_key = os.environ.get("GROQ_API_KEY")
else:
    api_key = userdata.get('GROQ_API_KEY')

# Fail here with a clear message instead of at the first LLM call.
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to the Colab secrets or export it "
        "as an environment variable before running this cell."
    )

# Chat model served by Groq; used as the answer generator in the QA chain.
llm = ChatGroq(
    model_name="Gemma2-9b-It",
    api_key=api_key
)


In [11]:
from langchain.prompts import PromptTemplate

# Prompt for the answer-generation step. It has two placeholders:
#   {context}  - filled with the retrieved text
#   {question} - the user's question
# The instructions tell the model to answer from the context only and to say
# so explicitly when the context does not contain the answer.
prompt_template = PromptTemplate(
    template="""
You are a professional legal and tax assistant. Use the following context extracted from a GST reference guide to answer the user’s question as accurately and completely as possible.

If the answer is not found in the context, say “The document does not contain this information.”

-----------------------
Context:
{context}
-----------------------

Question: {question}

Answer:""",
    input_variables=["context", "question"],
)


In [12]:
# Expose the Chroma index as a retriever. No search arguments are passed, so
# the vector store's default search settings apply.
retriever=vectorstore.as_retriever()

In [13]:
from langchain.chains import RetrievalQA

# Wire retrieval and generation together: the retriever fetches chunks for a
# question, and the "stuff" chain type passes them to the Groq LLM through the
# custom prompt defined above.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
    retriever=retriever,
    llm=llm,
)

In [14]:
query = ''' What is the effective date of composition levy?  '''

# Chain.run() is deprecated and emits a deprecation warning; invoke() is the
# supported entry point. RetrievalQA takes the question under the "query" key
# and returns a dict whose "result" key holds the answer text.
result = qa_chain.invoke({"query": query})["result"]

print(result)

  result = qa_chain.run(query)


There can be three situations:

* **Situation:** Persons who have been granted provisional registration and who opt for composition levy (Intimation is filed under Rule 3(1) in FORM GST CMP-01)
   **Effective date of composition levy:** 1st July, 2017. 

* **Situation:** Persons opting for composition levy at the time of making application for new registration in the same registration application itself (The intimation under Rule 3(2) in FORM GST REG-01)
   **Effective date of composition levy:** Effective date of registration; Intimation shall be considered only after the grant of registration and his option to pay tax under Section 10 shall be effective from the effective date of registration. 


 



In [15]:
query = ''' I have a unit in the DTA and another in the SEZ; can I take a
common registration?'''

# Chain.run() is deprecated; invoke() is the supported entry point.
# RetrievalQA takes the question under the "query" key and returns a dict
# whose "result" key holds the answer text.
result = qa_chain.invoke({"query": query})["result"]

print(result)

No.  A person having unit(s) in a Special Economic Zone as well as outside the SEZ in a State, shall make a separate application for registration for SEZ unit(s) as a business vertical distinct from his other units located outside the Special Economic Zone in that State. 

