### Dependencies

In [2]:
# !pip install git+https://github.com/stanfordnlp/dspy.git
# !pip install pymupdf  torch langchain chromadb langchain_huggingface langchain-community

### Importing Necessary Libraries

In [3]:
import fitz  # PyMuPDF
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from typing import List
from google.colab import userdata
import os
import dspy

####  1. Initialize DSPy with Together AI LLM

In [4]:
# Fetch the Together AI key from Colab's `userdata` store rather than hardcoding it in the notebook.
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
# Build the DSPy language-model client for the Together AI hosted model named below.
# NOTE(review): the "-classifier" suffix on the model id looks unusual — confirm it is a valid model name.
lm = dspy.LM(
    "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
    api_key=TOGETHER_API_KEY
)
# Hand this LM to DSPy's global configuration so the DSPy modules defined later in the notebook use it.
dspy.configure(lm=lm)

#### 2. PDF Text Extraction

In [5]:
def extract_text_from_pdf(pdf_path: str) -> List[str]:
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        A list with one string per page, in page order. If `pdf_path` does
        not point to an existing file, an error is printed and an empty list
        is returned.
    """
    # Check for the file up front: PyMuPDF signals a missing file with its own
    # exception type (a RuntimeError subclass), which the builtin
    # FileNotFoundError handler would never catch.
    if not os.path.isfile(pdf_path):
        print(f"Error: File not found at {pdf_path}")
        return []

    text_per_page = []
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            text_per_page.append(page.get_text("text", flags=fitz.TEXTFLAGS_TEXT))
    return text_per_page

In [6]:
# Extract per-page text from the GST guide; the path is relative to the current working directory.
docs=extract_text_from_pdf("GSTsmartGuide.pdf")

#### 3. LangChain Hugging Face Embeddings

In [7]:
def initialize_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Build and return a LangChain `HuggingFaceEmbeddings` object for `model_name`.

    Args:
        model_name: Name of the embedding model, forwarded unchanged to
            `HuggingFaceEmbeddings`. Defaults to "all-MiniLM-L6-v2".

    Returns:
        The constructed `HuggingFaceEmbeddings` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

#### 4. Chunking

In [8]:
def chunk_text(text_per_page: List[str]) -> List[str]:
    """Split every page's text into chunks and return them as one flat list.

    Each page is split independently with a `RecursiveCharacterTextSplitter`
    configured with chunk_size=500 and chunk_overlap=100, measured with `len`.

    Args:
        text_per_page: One text string per page.

    Returns:
        All chunks from all pages, in page order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Tune as needed
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    return [
        piece
        for page_text in text_per_page
        for piece in splitter.split_text(page_text)
    ]

In [9]:
# Split the extracted page texts into chunks; these are what get embedded further down.
chunks=chunk_text(docs)

In [10]:
# Spot-check a single chunk. Index 100 is arbitrary and requires at least 101 chunks to exist.
chunks[100]

'was fragmented with too many obstacles in free movement of goods necessitated \nby procedural requirement under VAT and CST. \n5.4 In the constitutional scheme, taxation powers on goods was with Central \nGovernment but it was limited up to the stage of manufacture and production \nwhile States have powers to tax sale and purchase of goods. Centre had powers to \ntax services and States also had powers to tax certain services specified in clause'

#### 5. Chroma Vector Store

In [11]:
def create_vector_store(text_chunks: List[str], embeddings,persist_directory="chroma_db"):
    """Create a Chroma vector store from text chunks.

    Args:
        text_chunks: The texts to embed and index.
        embeddings: Embedding object handed to `Chroma.from_texts`.
        persist_directory: Directory passed to Chroma for on-disk storage.
            Defaults to "chroma_db".

    Returns:
        The vector store returned by `Chroma.from_texts`.

    NOTE(review): re-running this against an existing `persist_directory`
    presumably adds the same chunks again — confirm before re-executing.
    """
    vector_store = Chroma.from_texts(
        texts=text_chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    # `persist()` is deprecated in recent Chroma integrations and missing from
    # some of them; only call it where it still exists so this does not raise
    # AttributeError on newer versions.
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        persist()
    return vector_store

In [12]:
# Embed the chunks with the default embedding model and index them in Chroma (default directory "chroma_db").
vectorstore=create_vector_store(chunks,initialize_embedding_model())

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  vector_store.persist()


#### 6. DSPy Modules for RAG

In [13]:
# DSPy signature for the query-generation step: one input field (`question`), one output field (`query`).
# NOTE(review): DSPy is understood to send the class docstring to the LM as the task
# instruction, so it is deliberately left untouched — confirm before rewording it.
class GenerateQuery(dspy.Signature):
    """Generates a search query from a question ."""
    question = dspy.InputField()
    query = dspy.OutputField()

In [14]:
class RetrieveContext(dspy.Module):
    """Turn a question into a search query and fetch matching text from a vector store.

    Args:
        vector_store: Object exposing `similarity_search(query, k=...)`.
        k: Number of results requested from the vector store per question.
    """

    def __init__(self, vector_store, k: int = 3):
        super().__init__()
        self.generate_query = dspy.Predict(GenerateQuery)
        self.vector_store = vector_store
        self.k = k

    def forward(self, question):
        """Return the `page_content` of the top-k hits for `question`, joined by newlines."""
        prediction = self.generate_query(question=question)
        hits = self.vector_store.similarity_search(prediction.query, k=self.k)
        return "\n".join(hit.page_content for hit in hits)

In [15]:
# Build a retriever over the vector store using the default k=3.
Retriever=RetrieveContext(vectorstore)

In [16]:
# Call the module itself rather than `.forward(...)`, matching how `compiled_rag` is
# invoked later in this notebook and the calling convention DSPy recommends for modules.
Retriever(question="What is the procedure for refund of IGST on services?")

'1116 \n12. Clarification relating to export of services-\ncondition (v) of section 2(6) of the IGST \nAct, 2017. \n161/17/2021-GST, \ndated 20-9-2021 \n1141 \n13. Procedure relating to sanction, post-audit \nand review of refund claims-Reg. \nInstruction No. \n03/2022-GST \ndated 14-6-2022 \n1144 \n14. Clarification on refund related Issues \n(Inverted duty structure) \n181/13/2022-GST \ndated 10-11-2022 \n1149 \n15. Prescribes the manner of filing an applica-\ntion for refund by unregistered persons.\nAs per Section 54 of the CGST Act and read with Rule 89(1) of the CGST \nRules, provides refund of IGST in respect of export of goods or services on \npayment of tax by a registered person or in respect of supplies to a Special \nEconomic Zone unit or a SEZ developer, the application for refund shall be filed \nby the— \n \n(e) supplier of goods after such goods have been admitted in full in the SEZ \nfor authorized operations, as endorsed by the specified officer of the \nZone;\n94/13/

#### RAG

In [17]:
# DSPy signature for the answer step: inputs `context` and `question`, output `answer`.
# NOTE(review): DSPy is understood to send the class docstring to the LM as the task
# instruction, so it is deliberately left untouched — confirm before rewording it.
class GenerateAnswer(dspy.Signature):
    """Answers a question about GST given relevant context."""
    context = dspy.InputField()
    question = dspy.InputField()
    answer = dspy.OutputField()

In [18]:
class RAG(dspy.Module):
    """Retrieval-augmented question answering: retrieve context, then generate an answer.

    Args:
        vector_store: Vector store handed to `RetrieveContext`.
        k: Number of results the retriever requests per question.
        optimized_predictor: Optional replacement for the answer-generation
            step. It is called as `predictor(context=..., question=...)` and
            must return an object with an `.answer` attribute. When None
            (the default), a fresh `dspy.Predict(GenerateAnswer)` is used.
    """

    def __init__(self, vector_store, k: int = 3,optimized_predictor=None):
        super().__init__()
        self.retrieve_context = RetrieveContext(vector_store, k=k)
        # Honour a caller-supplied predictor instead of silently ignoring the
        # argument; the default path is unchanged.
        if optimized_predictor is None:
            optimized_predictor = dspy.Predict(GenerateAnswer)
        self.generate_answer = optimized_predictor

    def forward(self, question):
        """Return the generated answer string for `question`."""
        context = self.retrieve_context(question=question)
        answer = self.generate_answer(context=context, question=question).answer
        return answer

In [19]:
# Baseline (un-optimized) pipeline. Call the module itself rather than `.forward(...)`,
# matching how `compiled_rag` is invoked later and the convention DSPy recommends.
rag_system = RAG(vectorstore)
rag_system(question="What is the procedure for refund of IGST on services give me pointwise?")

'The procedure for refund of IGST on services is as follows:\n\n1. The supplier of services must have exported the services.\n2. The supplier must have paid the IGST on the services.\n3. The supplier must have obtained the necessary documents, such as the export invoice and the bill of entry.\n4. The supplier must have filed an application for refund in FORM GST RFD-01.\n5. The application must be accompanied by the necessary supporting documents, such as the export invoice and the bill of entry.\n6. The application must be filed within the time limit specified in the GST Rules.\n7. The refund will be processed and disbursed electronically.\n\nNote: The above points are based on the given context and may not be a comprehensive list of the procedures for refund of IGST on services.'

## Optimizing RAG

In [22]:
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate

In [29]:
from dspy import Example

# Hand-written (question, answer) pairs used as the training set for optimization.
_QA_PAIRS = [
    (
        "What is the GST and why is it considered the biggest tax reform in India?",
        "GST is a destination-based tax on the supply of goods and services, unifying multiple indirect taxes under a single regime. It is considered the biggest tax reform since independence due to its comprehensive and integrated approach.",
    ),
    (
        "What does Article 246A introduced by the 101st Amendment state?",
        "Article 246A gives concurrent powers to the Union and States to make laws with respect to goods and services tax.",
    ),
    (
        "What are the functions of the GST Council under Article 279A?",
        "The GST Council recommends GST rates, exemptions, thresholds, model laws, and decides when petroleum products will be brought under GST.",
    ),
    (
        "Why were petroleum products and alcohol excluded from GST?",
        "Alcohol was excluded due to its revenue significance for States. Petroleum products were temporarily excluded, pending recommendation by the GST Council for inclusion.",
    ),
    (
        "What were the key reasons for introducing GST in India?",
        "GST was introduced to eliminate the cascading effect of taxes, integrate Central and State taxes, and create a unified national market.",
    ),
    (
        "What are the threshold limits for GST registration?",
        "For goods, ₹40 lakhs in most States and ₹20 lakhs in special category States. For services, ₹20 lakhs and ₹10 lakhs in special category States.",
    ),
    (
        "What is the Reverse Charge Mechanism (RCM) under GST?",
        "Under RCM, the recipient pays the tax instead of the supplier. It applies to certain notified goods/services and supplies from unregistered persons.",
    ),
    (
        "What structural challenges did India face in implementing GST?",
        "Challenges included reconciling State vs. Central powers, destination vs. origin-based taxation, rate harmonization, and ensuring revenue protection for States.",
    ),
    (
        "What does Article 269A of the Constitution provide for?",
        "Article 269A provides for GST on inter-State supplies to be levied and collected by the Centre and apportioned between the Centre and States.",
    ),
    (
        "What is the QRMP Scheme under GST?",
        "The QRMP Scheme allows taxpayers with turnover up to ₹5 crores to file returns quarterly while paying taxes monthly.",
    ),
]

# Wrap each pair in a DSPy Example and mark `question` as the only input field.
examples = [
    Example(question=question_text, answer=answer_text).with_inputs("question")
    for question_text, answer_text in _QA_PAIRS
]


In [30]:
# Optimize a fresh RAG pipeline with BootstrapFewShot, using the hand-written examples as the training set.
# NOTE(review): BootstrapFewShot() is constructed with no arguments, so no metric is supplied here —
# presumably bootstrapped traces are then accepted without a quality check; confirm and consider passing one.
teleprompter = BootstrapFewShot()
compiled_rag = teleprompter.compile(RAG(vectorstore), trainset=examples)


 40%|████      | 4/10 [00:06<00:09,  1.53s/it]

Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.





In [31]:
# Ask the compiled pipeline the same question that was put to the baseline `rag_system`, for comparison.
output = compiled_rag(question="What is the procedure for refund of IGST on services give me pointwise?")

In [32]:
# Display the compiled pipeline's answer.
output

'The procedure for refund of IGST on services is as follows:\n\n1. The application for refund shall be filed by the supplier of goods or services.\n2. The goods or services must have been exported or supplied to a Special Economic Zone (SEZ) unit or a SEZ developer.\n3. The supplier must have paid the IGST on the export of goods or services.\n4. The application for refund shall be filed in FORM GST RFD-01.\n5. The application shall be accompanied by the required documents, including the export invoice, bill of export, and other relevant documents.\n6. The application shall be filed electronically through the GST portal.\n7. The refund shall be processed and disbursed electronically through the GST portal.\n8. The refund shall be credited to the bank account of the supplier.'