%pip install -q torch transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets langchain-community ragatouille

In [1]:
from dotenv import load_dotenv, find_dotenv
import sys
import os, getpass
from openai import OpenAI


# Add the project root directory to Python path
project_root = os.path.dirname(os.path.abspath(''))
if project_root not in sys.path:
    sys.path.append(project_root)

from utils import set_api_key


load_dotenv(find_dotenv())  

QWEN_API_KEY = set_api_key('QWEN_API_KEY')

from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import tiktoken

# requirements (example)
# pip install requests beautifulsoup4 transformers sentence-transformers faiss-cpu langchain pillow pytesseract

import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import numpy as np
#import torch

# Hugging Face tools
#from transformers import pipeline, CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer  # for text embeddings

# LangChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
import langgraph
from langgraph.prebuilt import ToolNode
from langchain.chat_models import init_chat_model



pd.set_option("display.max_colwidth", None)

API key found in .env file for QWEN_API_KEY
API key set successfully.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxableincome",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr1-form-sahaj-faq",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr4-form-sugam-faq"

]
loader = WebBaseLoader(urls)
docs = loader.load()
assert len(docs) == 5

print(f"Total Characters: {sum([len(doc.page_content) for doc in docs])}")


USER_AGENT environment variable not set, consider setting it to identify your requests.


Total Characters: 117671


In [3]:
docs[0]

Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'description': 'ITRs', 'language': 'en'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHow to File Tax Returns | Income Tax Department\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\nCall Us\n\n\n\ne-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries\n\n\n1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700\n\n\n08:00 hrs - 20:00 hrs\n(Monday to Friday)\n\n\n\n\nTax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Update through NSDL\n\n\n+91-20-27218080\n\n\n07:00 hrs - 23:00 hrs\n(All Days)\n\n\n\n\n

In [4]:
# Split the documents
#langchain_docs = [LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]}) for doc in tqdm(ds)]


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
    separators=[ r"\n#", ".", r"\t#", " ", ""],
    length_function= lambda text: len(tiktoken.encoding_for_model("gpt-3.5-turbo").encode(text)),
    is_separator_regex=True
)

docs_processed = []
for doc in docs:
    docs_processed += text_splitter.split_documents([doc])



In [5]:
len(docs_processed)

141

In [6]:
docs_processed[:3]

[Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'description': 'ITRs', 'language': 'en', 'start_index': 34}, page_content='How to File Tax Returns | Income Tax Department\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\nCall Us\n\n\n\ne-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries\n\n\n1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700\n\n\n08:00 hrs - 20:00 hrs\n(Monday to Friday)\n\n\n\n\nTax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Update through NSDL\n\n\n+91-20-27218080\n\n\n07:00 hrs - 23:00 hrs\n(All Days)\n\n\n\n\nAIS and Reporting Portal\nQueries related to AIS

#### Creating a team of agents for different tasks
- Question Agent
- Critique Agent
- Agent as a Judge
- Answering Agent

In [10]:
MISTRAL_KEY = os.environ.get('MISTRAL_KEY')
QWEN_API_KEY = os.environ.get('QWEN_API_KEY')

In [34]:
llm  = init_chat_model("mistral-large-latest", 
                       model_provider="mistralai", 
                       timeout = 60, 
                       streaming = True,
                       api_key = MISTRAL_KEY
                       )

In [43]:
llm.invoke("Hi dude")

AIMessage(content='Hey! What\'s up? 😊 How can I help you today? (Also, love the casual vibe—"dude" works for me! 🤙)', additional_kwargs={}, response_metadata={'model_name': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run--edfed2b0-7670-4ebc-8bfd-612dc30e9464-0', usage_metadata={'input_tokens': 6, 'output_tokens': 38, 'total_tokens': 44})

In [None]:
"model": "qwen/qwen3-4b:free",

In [36]:
def openrouter_llm(url:str, model:str, api_key:str, prompt:str):
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": F"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],

        })
    )
    return response.json()['choices'][0]['message']['content']

In [37]:
response = openrouter_llm(
    url="https://openrouter.ai/api/v1/chat/completions",
    model="qwen/qwen3-4b:free",
    api_key=QWEN_API_KEY,
    prompt="Hi"
)

In [38]:
response

'Hello! How can I assist you today? 😊'

In [45]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [46]:
import random
N_QA_COUPLES = 20

print(f"Generating {N_QA_COUPLES} QA pairs...")

outputs = []

for sampled_context in tqdm(random.sample(docs_processed, N_QA_COUPLES)):
    prompt = QA_generation_prompt.format(context=sampled_context.page_content)
    response = openrouter_llm(
    url="https://openrouter.ai/api/v1/chat/completions",
    model="qwen/qwen3-4b:free",
    api_key=QWEN_API_KEY,
    prompt=prompt
)

    try:
        question = response.split("Factoid question:")[-1].split("Answer:")[0]
        answer = response.split("Answer:")[-1]
        assert len(answer) < 300, "Answer too long"
        outputs.append({
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source": sampled_context.metadata["source"]
        })
    except Exception as e:
        print(f"Error generating QA pair for context: {sampled_context.page_content}")
        print(f"Error message: {e}")


Generating 20 QA pairs...


100%|██████████| 20/20 [02:24<00:00,  7.24s/it]


In [48]:
display(pd.DataFrame(outputs).head(2))

Unnamed: 0,context,question,answer,source
0,"dent\n• PAN of the Dependent\n• Aadhaar of the Dependent\n• Acknowledgement no. of form 10 IA filed incase of autism, cerebral palsy, or multiple disabilities.\n• UDID number (if available)\n\nSection 80DDB\n\n\n \n\n\n \nDeduction towards payments made towards Medical treatment of Self or Dependant for specified diseases\n\n\n \n\nDeduction limit of₹ 40,000\n\t\t\t\t\t\t(₹ 1,00,000 if Senior Citizen)\n\n\n \n \n \n\nSection 80E\n\n\nDeduction towards interest payments made on loan for higher education of Self or relative\n\n\n\nTotal amount paid towards interest on loan taken\n\n\nNote:\nFor claiming deduction under section 80E, below details need to be provided in ITR :\n• Loan taken from bank / institution\n• Name of the institution / bank from which the loan is taken\n• Loan Account Number of the bank / institution\n• Date of sanction of loan\n• Total Amount of loan\n• Loan outstanding as on last date of financial year\n• Interest u/s 80E\nPlease note that the deduction u/s 80E can be claimed only if the limit in section",What is the deduction limit under Section 80DDB for senior citizens? \n,"₹1,00,000",https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions
1,"income of the whole of the amount so contributed \n\n\n\n\nTax deductions in the Old Tax Regime \nSection 24(b) – Deduction from Income from House Property on interest paid on housing loan & housing improvement loan. In case of self- occupied property, the upper limit for deduction of interest paid on housing loan is ₹ 2 lakh. Interest on loan u/s 24(b) allowable is tabulated below:\n\nNature of Property\n\n\nWhen loan was taken\n\n\nPurpose of loan\n\n\nAllowable (Maximum limit)\n\nDetails Required\n\nSelf-Occupied\n\n\nOn or after 1/04/1999\n\n\nConstruction or purchase of house property \n\n\n₹ 2,00,000\n\n•Loan taken from bank / Other than bank\n\t\t\t•Name of the bank / institution / person from whom the loan is taken\n\t\t\t•Loan Account Number of the bank / institution .\n\t\t\t•Date of sanction of loan\n\t\t\t•Total Amount of loan\n\t\t\t•Loan outstanding as on last date of financial year\n\t\t\t•Interest on borrowed capital u/s 24(b)\n\nOn or after 1/04/1999\n\n\nFor Repairs of house property \n\n\n₹ 30,000\n\n\nBefore 1/04/1999\n\n\nCons",What is the maximum deduction allowed for interest paid on a housing loan for a self-occupied property under the Old Tax Regime? \n,"₹2,00,000.",https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions
