### Dependencies

In [12]:
# !pip install git+https://github.com/stanfordnlp/dspy.git
# !pip install pymupdf transformers torch langchain faiss-cpu chromadb langchain_huggingface langchain-community

### Importing Necessary library

In [122]:
import fitz  # PyMuPDF
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from typing import List
from google.colab import userdata
import os
import dspy

####  1. Initialize DSPy with Together AI LLM

In [57]:
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
lm = dspy.LM(
    "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
    api_key=TOGETHER_API_KEY
)
dspy.configure(lm=lm)

#### 2. PDF Text Extraction

In [15]:
def extract_text_from_pdf(pdf_path: str) -> List[str]:
    """Extracts text from each page of a PDF."""
    text_per_page = []
    try:
        pdf_document = fitz.open(pdf_path)
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            text = page.get_text("text", flags=fitz.TEXTFLAGS_TEXT)
            text_per_page.append(text)
    except FileNotFoundError:
        print(f"Error: File not found at {pdf_path}")
    return text_per_page

In [29]:
docs=extract_text_from_pdf("GSTsmartGuide.pdf")

#### 3. LangChain Hugging Face Embeddings

In [35]:
def initialize_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Initializes the Hugging Face embedding model from LangChain."""
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

#### 4. Chunking

In [36]:
def chunk_text(text_per_page: List[str]) -> List[str]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Tune as needed
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = []
    for page_text in text_per_page:
        chunks.extend(text_splitter.split_text(page_text))
    return chunks

In [37]:
chunks=chunk_text(docs)

In [38]:
chunks[100]

'was fragmented with too many obstacles in free movement of goods necessitated \nby procedural requirement under VAT and CST. \n5.4 In the constitutional scheme, taxation powers on goods was with Central \nGovernment but it was limited up to the stage of manufacture and production \nwhile States have powers to tax sale and purchase of goods. Centre had powers to \ntax services and States also had powers to tax certain services specified in clause'

#### 5. Chroma Vector Store

In [40]:
def create_vector_store(text_chunks: List[str], embeddings,persist_directory="chroma_db"):
    """Creates a Chroma vector store."""
    vector_store = Chroma.from_texts(
        texts=text_chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    vector_store.persist()
    return vector_store

In [41]:
vectorstore=create_vector_store(chunks,initialize_embedding_model())

#### 6. DSPy Modules for RAG

In [44]:
class GenerateQuery(dspy.Signature):
    """Generates a search query from a question ."""
    question = dspy.InputField()
    query = dspy.OutputField()

In [45]:
class RetrieveContext(dspy.Module):
    def __init__(self, vector_store, k: int = 3):
        super().__init__()
        self.generate_query = dspy.Predict(GenerateQuery)
        self.vector_store = vector_store
        self.k = k

    def forward(self, question):
        query = self.generate_query(question=question).query
        results = self.vector_store.similarity_search(query, k=self.k)
        context = "\n".join([doc.page_content for doc in results])
        return context

In [58]:
Retriever=RetrieveContext(vectorstore)

In [59]:
Retriever.forward("What is the procedure for refund of IGST on services?")

'1116 \n12. Clarification relating to export of services-\ncondition (v) of section 2(6) of the IGST \nAct, 2017. \n161/17/2021-GST, \ndated 20-9-2021 \n1141 \n13. Procedure relating to sanction, post-audit \nand review of refund claims-Reg. \nInstruction No. \n03/2022-GST \ndated 14-6-2022 \n1144 \n14. Clarification on refund related Issues \n(Inverted duty structure) \n181/13/2022-GST \ndated 10-11-2022 \n1149 \n15. Prescribes the manner of filing an applica-\ntion for refund by unregistered persons.\n1116 \n12. Clarification relating to export of services-\ncondition (v) of section 2(6) of the IGST \nAct, 2017. \n161/17/2021-GST, \ndated 20-9-2021 \n1141 \n13. Procedure relating to sanction, post-audit \nand review of refund claims-Reg. \nInstruction No. \n03/2022-GST \ndated 14-6-2022 \n1144 \n14. Clarification on refund related Issues \n(Inverted duty structure) \n181/13/2022-GST \ndated 10-11-2022 \n1149 \n15. Prescribes the manner of filing an applica-\ntion for refund by unregi

#### RAG

In [49]:
class GenerateAnswer(dspy.Signature):
    """Answers a question about GST given relevant context."""
    context = dspy.InputField()
    question = dspy.InputField()
    answer = dspy.OutputField()

In [137]:
class RAG(dspy.Module):
    def __init__(self, vector_store, k: int = 3,optimized_predictor=None):
        super().__init__()
        self.retrieve_context = RetrieveContext(vector_store, k=k)
        self.generate_answer = dspy.Predict(GenerateAnswer)

    def forward(self, question):
        context = self.retrieve_context(question=question)
        answer = self.generate_answer(context=context, question=question).answer
        return answer

In [131]:
rag_system = RAG(vectorstore)
rag_system.forward("What is the procedure for refund of IGST on services give me pointwise?")

'Based on the given context, the procedure for refund of IGST on services can be summarized as follows:\n\n1. **Eligibility**: The exporter of services is eligible for refund of IGST under the condition (v) of section 2(6) of the IGST Act, 2017.\n2. **Refund Application**: The exporter needs to file an application for refund in FORM GST RFD-01.\n3. **Supporting Documents**: The application should be accompanied by supporting documents, such as:\n\t* Invoice and bill of entry\n\t* Certificate of origin\n\t* Proof of export\n\t* Bank certificate\n4. **Post-Audit and Review**: The refund application will be subject to post-audit and review as per the procedure laid down in Instruction No. 03/2022-GST dated 14-6-2022.\n5. **Refund Sanction**: The refund will be sanctioned as per the procedure laid down in Instruction No. 03/2022-GST dated 14-6-2022.\n6. **Refund Disbursement**: The refund will be disbursed electronically through the fully electronic refund process as per the procedure laid

## Optimizing RAG