%pip install -q torch transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets langchain-community ragatouille

In [1]:
from dotenv import load_dotenv, find_dotenv
import sys
import os, getpass
from openai import OpenAI


# Add the project root directory to Python path
project_root = os.path.dirname(os.path.abspath(''))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils import set_api_key


load_dotenv(find_dotenv())  

QWEN_API_KEY = set_api_key('QWEN_API_KEY')

from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import tiktoken

# requirements (example)
# pip install requests beautifulsoup4 transformers sentence-transformers faiss-cpu langchain pillow pytesseract

import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import numpy as np
#import torch

# Hugging Face tools
#from transformers import pipeline, CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer  # for text embeddings

# LangChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
import langgraph
from langgraph.prebuilt import ToolNode
from langchain.chat_models import init_chat_model



pd.set_option("display.max_colwidth", None)

API key found in .env file for QWEN_API_KEY
API key set successfully.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxableincome",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr1-form-sahaj-faq",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr4-form-sugam-faq"

]
loader = WebBaseLoader(urls)
docs = loader.load()
assert len(docs) == 5

print(f"Total Characters: {sum([len(doc.page_content) for doc in docs])}")


USER_AGENT environment variable not set, consider setting it to identify your requests.


Total Characters: 117671


In [3]:
docs[0]

Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'description': 'ITRs', 'language': 'en'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHow to File Tax Returns | Income Tax Department\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\nCall Us\n\n\n\ne-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries\n\n\n1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700\n\n\n08:00 hrs - 20:00 hrs\n(Monday to Friday)\n\n\n\n\nTax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Update through NSDL\n\n\n+91-20-27218080\n\n\n07:00 hrs - 23:00 hrs\n(All Days)\n\n\n\n\n

In [4]:
# Split the documents
#langchain_docs = [LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]}) for doc in tqdm(ds)]


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
    separators=[ r"\n#", ".", r"\t#", " ", ""],
    length_function= lambda text: len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(text)),
    is_separator_regex=True
)

docs_processed = []
for doc in docs:
    docs_processed += text_splitter.split_documents([doc])



In [5]:
len(docs_processed)

141

In [6]:
docs_processed[:3]

[Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'description': 'ITRs', 'language': 'en', 'start_index': 34}, page_content='How to File Tax Returns | Income Tax Department\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\nCall Us\n\n\n\ne-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries\n\n\n1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700\n\n\n08:00 hrs - 20:00 hrs\n(Monday to Friday)\n\n\n\n\nTax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Update through NSDL\n\n\n+91-20-27218080\n\n\n07:00 hrs - 23:00 hrs\n(All Days)\n\n\n\n\nAIS and Reporting Portal\nQueries related to AIS

#### Creating a team of agents for different tasks
- Question Agent
- Critique Agent
- Agent as a Judge
- Answering Agent

In [10]:
MISTRAL_KEY = os.environ.get('MISTRAL_KEY')
QWEN_API_KEY = os.environ.get('QWEN_API_KEY')

In [34]:
llm  = init_chat_model("mistral-large-latest", 
                       model_provider="mistralai", 
                       timeout = 60, 
                       streaming = True,
                       api_key = MISTRAL_KEY
                       )

In [43]:
llm.invoke("Hi dude")

AIMessage(content='Hey! What\'s up? ðŸ˜Š How can I help you today? (Also, love the casual vibeâ€”"dude" works for me! ðŸ¤™)', additional_kwargs={}, response_metadata={'model_name': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run--edfed2b0-7670-4ebc-8bfd-612dc30e9464-0', usage_metadata={'input_tokens': 6, 'output_tokens': 38, 'total_tokens': 44})

In [None]:
"model": "qwen/qwen3-4b:free",

In [36]:
def openrouter_llm(url:str, model:str, api_key:str, prompt:str):
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": F"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],

        })
    )
    return response.json()['choices'][0]['message']['content']

In [37]:
response = openrouter_llm(
    url="https://openrouter.ai/api/v1/chat/completions",
    model="qwen/qwen3-4b:free",
    api_key=QWEN_API_KEY,
    prompt="Hi"
)

In [38]:
response

'Hello! How can I assist you today? ðŸ˜Š'

In [45]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [57]:
import random
N_QA_COUPLES = 20

print(f"Generating {N_QA_COUPLES} QA pairs...")

outputs = []

for sampled_context in tqdm(random.sample(docs_processed, N_QA_COUPLES)):
    prompt = QA_generation_prompt.format(context=sampled_context.page_content)
    response = openrouter_llm(
    url="https://openrouter.ai/api/v1/chat/completions",
    model="qwen/qwen3-4b:free",
    api_key=QWEN_API_KEY,
    prompt=prompt
)

    try:
        question = response.split("Factoid question:")[-1].split("Answer:")[0]
        answer = response.split("Answer:")[-1]
        assert len(answer) < 300, "Answer too long"
        outputs.append({
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source": sampled_context.metadata["source"]
        })
    except Exception as e:
        print(f"Error generating QA pair for context: {sampled_context.page_content}")
        print(f"Error message: {e}")


Generating 20 QA pairs...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20/20 [02:02<00:00,  6.12s/it]


In [58]:
display(pd.DataFrame(outputs).head(5))

Unnamed: 0,context,question,answer,source
0,"pproved Undertaking Agency\n\n\nRBI Approved Banks\n\n\n\n\nTax Deductor & Collector\n\n\nRegistration\n\n\nService Available\n\n\n\n\n\n\nDownloads\n\n\nHelp\n\n\n\n\n\n\n\n\n\n\nDo not have an account?\nRegister\nAlready have an account?\nLogin\n\n\n\n\n\n\n\n\n\n\n\n\n\ntest\n\n\nSearch\n\n\n\n\n\n\n\n\n\nHelp\n\n\n Tax payer\n \n\n Salaried Individuals for AY 2025-26\n \n\nSalaried Individuals for AY 2025-26\n\n\n\n\n\n\n\n\n\n\n\n\nÂ \nReturns and Forms Applicable for Salaried Individuals for AYÂ 2025-26\nÂ \nDisclaimer:Â The content on this page is only to give an overview and general guidance and is not exhaustive. For complete details and guidelines please refer Income Tax Act, Rules andÂ Notifications.\nÂ \nÂ \nÂ \n\n1. ITR-1 (SAHAJ) â€“ Applicable only for Individual\n\n\nThis return is applicable for a Resident (other than Not Ordinarily Resident) Individual having Total Income from any of the following sources up to â‚¹ 50 lakh\n\nSalary / Pension\n\n\nOne House Property\n\n\nOther sources (Interest, Family Pension, Dividend etc.)\n\n\nAgricultural Income up to â‚¹ 5,000\n\nÂ Capital Gain income u/s 112 A up to Rs. 125000",What is the income limit for ITR-1 (SAHAJ) applicable to salaried individuals in AY 2025-26? \n,â‚¹50 lakh.,https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions
1,"me Tax Department\n\n\nÂ \n\n4. Form 67- Statement of Income from a country or specified territory outside India and Foreign Tax Credit\n\n\n\nSubmitted by\n\n\nDetails provided in the form\n\n\nTaxpayerÂ on or before the due date specified for furnishing the ITRs u/s 139(1)\n\n\nIncome from a country or specified territory outside India and Foreign Tax Credit claimed\n\n\nÂ \n\n5.Â \n\n\n\nForm 26 AS\n\n\nÂ AIS Â (Annual information Statement)\n\n\nProvided by:\nIncome Tax Department (It is available on e-Filing Portal: \nLogin > e-File > Income Tax Return > View Form 26AS)\nDetails provided in the form:\nTax Deducted /Â Collected at Source.\n\n\nProvided by:\nIncome Tax Department (It can be accessed after logging on to Income Tax e-Filing portal)\nÂ Go to e-filing portal > login > AIS\nDetails provided in the form:\nTax Deducted /Â Collected at Source \nSFT Information\nPayment of taxes\nDemand /Â Refund\nOther information (likeÂ Pending/Completed proceedings, GSTÂ Information, Information receivedÂ from foreign governmentÂ etc)\n\n\nÂ \n\n6. Form 15G - Declaration by resident taxpayer (",What is the submission deadline for Form 67? \n,The form must be submitted on or before the due date specified for furnishing the ITRs u/s 139(1).,https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxableincome
2,"sources up to â‚¹ 50 lakh\n\nSalary / Pension\n\n\nOne House Property\n\n\nOther sources (Interest, Family Pension, Dividend etc.)\n\n\nAgricultural Income up to â‚¹ 5,000\n\nÂ Capital Gain income u/s 112 A up to Rs. 125000\nÂ \n\nÂ â€‹\n\n\n\nNote:Â ITR-1 cannot be used by a person who:(a) is a Director in a companyÂ \n\t\t\t\t\t\t\t\t\t(b) has short term capital gain\n\t\t\t\t\t\t\t\t\t(c ) has Long-term capital gain u/s 112A exceeding Rs.1.25 lakhs\n\t\t\t\t\t\t\t\t\t(d) has held any unlisted equity shares at any time during the previous yearÂ \n\t\t\t\t\t\t\t\t\t(e) has any asset (including financial interest in any entity) located outside IndiaÂ \n\t\t\t\t\t\t\t\t\t(f) has signing authority in any account located outside IndiaÂ \n\t\t\t\t\t\t\t\t\t(g) has income from any source outside IndiaÂ \n\t\t\t\t\t\t\t\t\t(h) is a person in whose case tax has been deducted u/s 194NÂ \n\t\t\t\t\t\t\t\t\t(e) is a person in whose case payment or deduction of tax has been deferred on ESOP\n\t\t\t\t\t\t\t\t\t(i) has any brought forward loss or loss to be carried forward under any head of income\n(i) has total income exceeding Rs. 50 la",What is the maximum allowable amount for agricultural income in the context? \n,"â‚¹ 5,000",https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions
3,"ng winning from Lottery and Income from Race Horses): \nInterest from Savings Account\nInterest from Deposit (Bank / Post Office / Cooperative Society)\nInterest from Income Tax Refund\nFamily Pension\nInterest received on enhanced compensation\nAny other Interest Income (e.g., Interest Income from Unsecured Loan)\n\nÂ \n2. Who is not eligible to file ITR-4 for AY 2025-26?\nITR-4 cannot be filed by an individual / HUF / Firm (Other than LLP) who:\nis a Resident but Not Ordinarily Resident (RNOR), Â or Non-Resident Indian\nhas total income exceeding â‚¹ 50 Lakh\nShort term capital gains;\nLong-term capital gain u/s 112A exceeding Rs. 1.25 lakhs\nhas agricultural income in excess of â‚¹5,000/-\nis a Director in a Company\nhas income from more than one House Property;\nhas income of the following nature: \nwinnings from lottery;\nactivity of owning and maintaining race horses;\nincome taxable at special rates Â u/s115BBDA or Section 115BBE;\n\nhas held any unlisted equity shares at any time during the previous year\nhas deferred in",What is one reason an individual is not eligible to file ITR-4 for AY 2025-26? \n,winnings from lottery,https://www.incometax.gov.in/iec/foportal/help/e-filing-itr4-form-sugam-faq
4,"ed Undertaking Agency\n\n\nRBI Approved Banks\n\n\n\n\nTax Deductor & Collector\n\n\nRegistration\n\n\nService Available\n\n\n\n\n\n\nDownloads\n\n\nHelp\n\n\n\n\n\n\n\n\n\n\nDo not have an account?\nRegister\nAlready have an account?\nLogin\n\n\n\n\n\n\n\n\n\n\n\n\n\ntest\n\n\nSearch\n\n\n\n\n\n\n\n\n\n File ITR-4 (Sugam) Online FAQs\n \n\n File ITR-4 (Sugam) Online FAQs\n \n\nFile ITR-4 (Sugam) Online FAQs\n\n\n\n\n\n\n\n\n\n\n\n\n\n1. Who is eligible to file ITR-4 for AY 2025-26?ITR-4 can be filed by a Resident Individual / HUF / Firm (other than LLP) who has:\nIncome not exceeding â‚¹50 Lakh during the FY\nIncome from Business and Profession which is computed on a presumptive basis u/s 44AD, 44ADA or 44AE\nLong-term capital gain u/s 112A not exceeding Rs.1.25 lakhs\nIncome from Salary/Pension, one House Property, Agricultural Income (up to â‚¹ 5000/-)\nOther Sources which include (excluding winning from Lottery and Income from Race Horses): \nInterest from Savings Account\nInterest from Deposit (Bank / Post Office / Cooperative Society)\nInterest from Income Tax Refund\nFamily Pension\nInter",What is the maximum income limit for filing ITR-4 in FY 2025-26? \n,â‚¹50 Lakh,https://www.incometax.gov.in/iec/foportal/help/e-filing-itr4-form-sugam-faq


##### Building critique agents: We cannot blindly trust the questions generated by the LLM, we must do a quality check to validate them.
We follow this paper - https://huggingface.co/papers/2312.10003

Criteria - 
- Groundedness: can the question be answered from the given context?
- Relevance: is the question relevant to users? For instance, "What is the date when transformers 4.29.1 was released?" is not relevant for ML practitioners.
One last failure case weâ€™ve noticed is when a function is tailored for the particular setting where the question was generated, but undecipherable by itself.
- Stand-alone: is the question understandable free of any context, for someone with domain knowledge/Internet access? 

In [None]:
groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """