%pip install -q torch transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets langchain-community ragatouille

In [1]:
from dotenv import load_dotenv, find_dotenv
import sys
import os, getpass
from openai import OpenAI
import re


# Add the project root directory to Python path
project_root = os.path.dirname(os.path.abspath(''))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils import set_api_key


load_dotenv(find_dotenv())  

QWEN_API_KEY = set_api_key('QWEN_API_KEY')
MISTRAL_KEY = set_api_key('MISTRAL_KEY')

from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import tiktoken

# requirements (example)
# pip install requests beautifulsoup4 transformers sentence-transformers faiss-cpu langchain pillow pytesseract

import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import numpy as np
#import torch

# Hugging Face tools
#from transformers import pipeline, CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer  # for text embeddings

# LangChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
import langgraph
from langgraph.prebuilt import ToolNode
from langchain.chat_models import init_chat_model



pd.set_option("display.max_colwidth", None)

API key found in .env file for QWEN_API_KEY
API key set successfully.
API key found in .env file for MISTRAL_KEY
API key set successfully.


In [2]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxableincome",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr1-form-sahaj-faq",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr4-form-sugam-faq"

]
loader = WebBaseLoader(urls)
docs = loader.load()
assert len(docs) == 5

print(f"Total Characters: {sum([len(doc.page_content) for doc in docs])}")


USER_AGENT environment variable not set, consider setting it to identify your requests.


Total Characters: 117671


In [3]:
docs[:5]

[Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'description': 'ITRs', 'language': 'en'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHow to File Tax Returns | Income Tax Department\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\nCall Us\n\n\n\ne-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries\n\n\n1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700\n\n\n08:00 hrs - 20:00 hrs\n(Monday to Friday)\n\n\n\n\nTax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Update through NSDL\n\n\n+91-20-27218080\n\n\n07:00 hrs - 23:00 hrs\n(All Days)\n\n\n\n\

In [4]:
# After looking at few examples of documents, we see that there is a lot of noise in the text in the form of extended whitespace, markdown artifacts, and other non-informative content. We need to clean this up to improve the quality of our embeddings.

def normalize_text (text: str, preserve_para:bool = True):
    text = re.sub(r'[ \t]+', ' ', text)  # Replace multiple spaces/tabs with a single space
    if preserve_para:
        return re.sub(r'\n{2,}', ' ', text).strip()
    return re.sub(r'\n\n+', '\n', text).strip()



In [5]:
langchain_docs = [LangchainDocument(page_content=normalize_text(doc.page_content), 
                                    metadata={"source": doc.metadata["source"], 
                                              "title": doc.metadata["title"],
                                              "language": doc.metadata["language"]}) for doc in tqdm(docs)]

  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
langchain_docs

[Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'language': 'en'}, page_content='How to File Tax Returns | Income Tax Department  Skip to main content\n    Call Us e-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries 1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700 08:00 hrs - 20:00 hrs\n(Monday to Friday) Tax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Update through NSDL +91-20-27218080 07:00 hrs - 23:00 hrs\n(All Days) AIS and Reporting Portal\nQueries related to AIS, TIS, SFT Preliminary response, Response to e-campaigns or e-Verification 1800 103 4215 09:30 hrs - 18:00 hrs\n(Monday to Friday) View All     Select your language\nEnglishH

In [7]:
# Split the documents
#langchain_docs = [LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]}) for doc in tqdm(ds)]

MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
    "/",
]


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    add_start_index=True,
    separators=MARKDOWN_SEPARATORS,
    #is_separator_regex=True,
    strip_whitespace=True
)

docs_processed = []
for doc in langchain_docs:
    docs_processed += text_splitter.split_documents([doc])



In [8]:
len(docs_processed)

151

In [9]:
docs_processed[:3]

[Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'language': 'en', 'start_index': 0}, page_content='How to File Tax Returns | Income Tax Department  Skip to main content\n    Call Us e-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries 1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700 08:00 hrs - 20:00 hrs\n(Monday to Friday) Tax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Update through NSDL +91-20-27218080 07:00 hrs - 23:00 hrs\n(All Days) AIS and Reporting Portal\nQueries related to AIS, TIS, SFT Preliminary response, Response to e-campaigns or e-Verification 1800 103 4215 09:30 hrs - 18:00 hrs\n(Monday to Friday) View All     Select your 

#### Creating a team of agents for different tasks
- Question Agent
- Critique Agent
- Agent as a Judge
- Answering Agent

In [10]:
llm  = init_chat_model("mistral-large-latest", 
                       model_provider="mistralai", 
                       timeout = 60, 
                       streaming = True,
                       api_key = MISTRAL_KEY
                       )

In [11]:
llm.invoke("Hi dude").content

'Hey! What\'s up? 😊 How can I help you today? (And yeah, "dude" works—casual vibes appreciated. 👌)'

In [17]:
openrouter_model= "cognitivecomputations/dolphin-mistral-24b-venice-edition:free"
url="https://openrouter.ai/api/v1/chat/completions"

In [18]:
response = requests.post(
        url=url,
        headers={
            "Authorization": F"Bearer {QWEN_API_KEY}",
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": openrouter_model,
            "messages": [
                {
                    "role": "user",
                    "content": "Hi"
                }
            ],

        })
    )
response = response.json()
response

{'id': 'gen-1756702275-2bwwHBRPpm0gPkg4nYT8',
 'provider': 'Venice',
 'model': 'cognitivecomputations/dolphin-mistral-24b-venice-edition:free',
 'object': 'chat.completion',
 'created': 1756702276,
 'choices': [{'logprobs': None,
   'finish_reason': 'stop',
   'native_finish_reason': 'stop',
   'index': 0,
   'message': {'role': 'assistant',
    'content': 'Hello! How can I assist you today?',
    'refusal': None,
    'reasoning': None}}],
 'usage': {'prompt_tokens': 758, 'completion_tokens': 10, 'total_tokens': 768}}

In [22]:
def openrouter_llm( model:str, prompt:str):
    response = requests.post(
        url=url,
        headers={
            "Authorization": F"Bearer {QWEN_API_KEY}",
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],

        })
    )
    return response.json()['choices'][0]['message']['content']

In [23]:
response = openrouter_llm(
    model=openrouter_model,
    prompt="Hi"
)
response

"Hello! How can I assist you today? Is there a specific topic you'd like to discuss or a question you need help with? Feel free to let me know!"

In [None]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
HF_KEY = os.environ.get("HF_KEY")

In [69]:
import os
from huggingface_hub import InferenceClient
#Qwen/Qwen3-235B-A22B-Instruct-2507	hyperbolic	live
client = InferenceClient(
    provider="together",
    api_key=HF_KEY,
    model="openai/gpt-oss-120b"
)

stream = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ]
)

# response = client.text_generation("Explain Langchain", max_new_tokens=200)
# print(response)
stream.choices[0].message.content

'The capital of France is **Paris**.'

In [52]:
from langchain_huggingface import HuggingFaceEndpoint
repo_id = "meta-llama/Llama-2-7b-chat-hf"
# repo_id = "google/flan-t5-small"
# Wrap a hosted HF model
llm = HuggingFaceEndpoint(
    repo_id=repo_id,  # or any other HF model
    huggingfacehub_api_token=HF_KEY,  # optional if logged in
    task="conversation",
    temperature=0.5,
    max_new_tokens=800
)

In [12]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [72]:
client = InferenceClient(
    provider="together",
    api_key=HF_KEY,
    model="openai/gpt-oss-120b"
)


In [13]:
import random
N_QA_COUPLES = 20

print(f"Generating {N_QA_COUPLES} QA pairs...")

outputs = []

for sampled_context in tqdm(random.sample(docs_processed, N_QA_COUPLES)):
    prompt = QA_generation_prompt.format(context=sampled_context.page_content)
    response = llm.invoke(prompt).content
    #response = openrouter_llm(url=url, model=openrouter_model, api_key=QWEN_API_KEY, prompt=prompt)

    try:
        question = response.split("Factoid question:")[-1].split("Answer:")[0]
        answer = response.split("Answer:")[-1]
        assert len(answer) < 300, "Answer too long"
        outputs.append({
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source": sampled_context.metadata["source"]
        })
    except Exception as e:
        print(f"Error generating QA pair for context: {sampled_context.page_content}")
        print(f"Error message: {e}")


Generating 20 QA pairs...


  0%|          | 0/20 [00:00<?, ?it/s]

In [15]:
display(pd.DataFrame(outputs).head(5))

Unnamed: 0,context,question,answer,source
0,Deductions on which I can get tax benefit Update my profile details Assisted filing Downloads Hindu Undivided Family (HUF) Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Assisted filing Downloads Company Domestic Company Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Foreign Company Guidance to file Tax return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Non-Company AOP/BOI/Trust/AJP Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Firm/LLP Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Local Authority Guidance to file Tax,**What are the tax return filing options available for a Hindu Undivided Family (HUF) on the income tax portal?**\n\n,"**Guidance to file Tax Return, Return/Forms applicable, Tax slabs, and Deductions for tax benefits**",https://www.incometax.gov.in/iec/foportal/help/e-filing-itr1-form-sahaj-faq
1,Register ‌ HomeIndividual/HUFSalaried EmployeesGuidance to file Tax ReturnReturn / Forms applicable to meTax slabsDeductions on which I can get tax benefitUpdate my profile detailsAssisted filingDownloadsBusiness/ProfessionGuidance to file Tax ReturnReturn / Forms applicable to meTax slabsDeductions on which I can get tax benefitUpdate my profile detailsAssisted filingDownloadsSenior / Super Senior CitizenGuidance to file Tax ReturnReturn / Forms applicable to me Tax slabsDeductions on which I can get tax benefitUpdate my profile detailsAssisted filingDownloadsNon ResidentGuidance to file Tax ReturnReturn / Forms applicable to meTax slabsDeductions on which I can get tax benefitUpdate my profile details Assisted filingDownloadsHindu Undivided Family (HUF)Guidance to file Tax ReturnReturn / Forms applicable to meTax slabsDeductions on which I can get tax benefitUpdate my profile detailsAssisted filingDownloadsCompanyDomestic CompanyGuidance to file Tax ReturnReturn / Forms applicable,**What are the tax filing categories available for individuals on the Indian income tax portal?**\n\n,"**Individual/HUF (Salaried Employees, Senior/Super Senior Citizen, Non-Resident), Hindu Undivided Family (HUF), Business/Profession, and Domestic Company.**",https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxableincome
2,"Nil ₹ 3,00,001 - ₹ 7,00,000** 5% above ₹ 3,00,000 Nil ₹ 5,00,001 - ₹ 10,00,000 ₹ 10,000 + 20% above ₹ 5,00,000 Nil ₹ 7,00,001 - ₹ 10,00,000 ₹ 20,000 + 10% above ₹ 7,00,000 Nil ₹ 10,00,001- ₹ 50,00,000 ₹ 1,10,000 + 30% above ₹ 10,00,000 Nil ₹ 10,00,001 - ₹ 12,00,000 ₹ 50,000 + 15% above ₹ 10,00,000 Nil ₹ 50,00,001- ₹ 100,00,000 ₹ 1,10,000 + 30% above ₹ 10,00,000 10% ₹ 12,00,001 - ₹ 15,00,000 ₹ 80,000 + 20% above ₹ 12,00,000 Nil ₹ 100,00,001- ₹ 200,00,000 ₹ 1,10,000 + 30% above ₹ 10,00,000 15% ₹ 15,00,001- ₹ 50,00,000 ₹ 1,40,000 + 30% above ₹ 15,00,000 Nil ₹ 200,00,001- ₹ 500,00,000 ₹ 1,10,000 + 30% above ₹ 10,00,000 25% ₹ 50,00,001- ₹ 100,00,000 ₹ 1,40,000 + 30% above ₹ 15,00,000 10% Above ₹ 500,00,000 ₹ 1,10,000 + 30% above ₹ 10,00,000 37% ₹ 100,00,001- ₹ 200,00,000 ₹ 1,40,000 + 30% above ₹ 15,00,000 15% Above ₹ ₹ 200,00,001 ₹ 1,40,000 + 30% above ₹ 15,00,000 25% Tax rates for Individual (resident or non-resident) 80 years of age or more anytime during the previous year are","**What is the income tax rate for individuals aged 80+ with an annual income between ₹10,00,001 and ₹12,00,000?**\n\n","**₹50,000 + 15% on income above ₹10,00,000**",https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions
3,Deductions on which I can get tax benefit Update my profile details Assisted filing Downloads Hindu Undivided Family (HUF) Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Assisted filing Downloads Company Domestic Company Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Foreign Company Guidance to file Tax return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Non-Company AOP/BOI/Trust/AJP Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Firm/LLP Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Local Authority Guidance to file Tax,**What are the tax return filing options available for a Hindu Undivided Family (HUF) on the income tax portal?**\n\n,"**Guidance to file Tax Return, Return / Forms applicable to me, Tax slabs, and Deductions on which tax benefits can be claimed.**",https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxableincome
4,I can get tax benefit Update my profile details Downloads Local Authority Guidance to file Tax Return Return / Forms applicable to me Tax slabs Deductions on which I can get tax benefit Update my profile details Downloads Tax Professionals & Others Chartered Accountants Registration Service Available e-Return Intermediaries API Specifications ERI List External Agency Central & State Government Department/Approved Undertaking Agency RBI Approved Banks Tax Deductor & Collector Registration Service Available Downloads Help Do not have an account?,"**""What are the tax deduction options available for claiming tax benefits in India?""**\n\n",**Deductions on which tax benefits can be claimed.**,https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns


In [14]:
QA_pairs = pd.DataFrame(outputs).to_csv("QA_pairs.csv", index=False)

##### Building critique agents: We cannot blindly trust the questions generated by the LLM, we must do a quality check to validate them.
We follow this paper - https://huggingface.co/papers/2312.10003

Criteria - 
- Groundedness: can the question be answered from the given context?
- Relevance: is the question relevant to users? For instance, "What is the date when transformers 4.29.1 was released?" is not relevant for ML practitioners.
One last failure case we’ve noticed is when a function is tailored for the particular setting where the question was generated, but undecipherable by itself.
- Stand-alone: is the question understandable free of any context, for someone with domain knowledge/Internet access? 

In [None]:
groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """