%pip install -q torch transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets langchain-community ragatouille

In [43]:
from dotenv import load_dotenv, find_dotenv
import sys
import os, getpass
from openai import OpenAI
import re
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import tiktoken
from g4f.client import Client


# pip install requests beautifulsoup4 transformers sentence-transformers faiss-cpu langchain pillow pytesseract
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os
import numpy as np
#import torch

# Hugging Face tools
#from transformers import pipeline, CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer  # for text embeddings
from transformers import AutoTokenizer
from datasets import Dataset

# LangChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document as LangchainDocument
import langgraph
from langgraph.prebuilt import ToolNode
from langchain.chat_models import init_chat_model


# Add the project root directory to Python path
project_root = os.path.dirname(os.path.abspath(''))
if project_root not in sys.path:
    sys.path.append(project_root)
from utils import set_api_key
load_dotenv(find_dotenv())  

QWEN_API_KEY = set_api_key('QWEN_API_KEY')
MISTRAL_KEY = set_api_key('MISTRAL_KEY')
GROQ_API_KEY = set_api_key('GROQ_API_KEY')

pd.set_option("display.max_colwidth", None)

API key found in .env file for QWEN_API_KEY
API key set successfully.
API key found in .env file for MISTRAL_KEY
API key set successfully.
API key found in .env file for GROQ_API_KEY
API key set successfully.


In [2]:
from langchain_community.document_loaders import WebBaseLoader

urls = [
    "https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxdeductions",
    "https://www.incometax.gov.in/iec/foportal/help/individual/return-applicable-1#taxableincome",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr1-form-sahaj-faq",
    "https://www.incometax.gov.in/iec/foportal/help/e-filing-itr4-form-sugam-faq"

]
loader = WebBaseLoader(urls)
docs = loader.load()
assert len(docs) == 5

print(f"Total Characters: {sum([len(doc.page_content) for doc in docs])}")


USER_AGENT environment variable not set, consider setting it to identify your requests.


Total Characters: 118694


In [3]:
docs[:5]

[Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'description': 'ITRs', 'language': 'en'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHow to File Tax Returns | Income Tax Department\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n \n\n  Call Us\n\n\n\ne-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries\n\n\n1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700\n\n\n08:00 hrs - 20:00 hrs\n(Monday to Friday)\n\n\n\n\nTax Information Network - NSDL\nQueries related to PAN & TAN applic

In [4]:
# After looking at few examples of documents, we see that there is a lot of noise in the text in the form of extended whitespace, 
# markdown artifacts, and other non-informative content. We need to clean this up to improve the quality of our embeddings.

def normalize_text (text: str, preserve_para:bool = True):
    text = re.sub(r'[ \t]+', ' ', text)  # Replace multiple spaces/tabs with a single space
    if preserve_para:
        return re.sub(r'\n{2,}', ' ', text).strip()  # Replace multiple newlines with a single space, if there are more than 2 newlines.
    return re.sub(r'\n\n+', '\n', text).strip()

In [5]:
def create_knowledge_base(
        chunk_size: int,
        documents: List[Document],
        tokenizer_name:str,
        normalize:bool = False,
   
) -> List[LangchainDocument]:
    
    MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
    "/",
]

    if normalize:
        for doc in documents:
            doc.page_content = normalize_text(doc.page_content)
    
    documents = [Document(page_content=doc.page_content.strip(), metadata=
                            { "source": doc.metadata["source"],
                            "length": len(doc.page_content),
                            "title": doc.metadata["title"]
                            }) 
                    for doc in documents]
    
        
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size * 0.1),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in documents:
        docs_processed.extend(
            text_splitter.split_documents([doc])
        )
    
    #remove duplicates
    seen = set()
    unique_docs = []
    for doc in docs_processed:
        if doc.page_content not in seen:
            unique_docs.append(doc)
            seen.add(doc.page_content)
    
    return unique_docs



In [6]:
RAW_KNOWLEDGE_BASE = [LangchainDocument(page_content=doc.page_content, 
                                    metadata={"source": doc.metadata["source"], 
                                              "title": doc.metadata["title"],
                                              "language": doc.metadata["language"]}) for doc in tqdm(docs)]

  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
RAW_KNOWLEDGE_BASE

[Document(metadata={'source': 'https://www.incometax.gov.in/iec/foportal/help/all-topics/tax-payer/individual/how-to-file-tax-returns', 'title': 'How to File Tax Returns | Income Tax Department', 'language': 'en'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHow to File Tax Returns | Income Tax Department\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n \n\n  Call Us\n\n\n\ne-filing and Centralized Processing Center\ne-Filing of Income Tax Return or Forms and other value added services & Intimation, Rectification, Refund and other Income Tax Processing Related Queries\n\n\n1800 103 0025 (or)\n1800 419 0025\n+91-80-46122000\n+91-80-61464700\n\n\n08:00 hrs - 20:00 hrs\n(Monday to Friday)\n\n\n\n\nTax Information Network - NSDL\nQueries related to PAN & TAN application for Issuance / Up

#### Creating a team of agents for different tasks
- Question Agent
- Critique Agent
- Agent as a Judge
- Answering Agent

In [8]:
llm  = init_chat_model("mistral-large-latest", 
                       model_provider="mistralai", 
                       timeout = 60, 
                       streaming = True,
                       api_key = MISTRAL_KEY
                       )
                       

In [9]:
# llm.invoke("Hi dude").content

In [10]:
openrouter_model= "cognitivecomputations/dolphin-mistral-24b-venice-edition:free"
url="https://openrouter.ai/api/v1/chat/completions"

In [None]:
def openrouter_llm( model:str, prompt:str):
    response = requests.post(
        url=url,
        headers={
            "Authorization": F"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],

        })
    )
    return response.json()['choices'][0]['message']['content']

# response = openrouter_llm(
#     model=openrouter_model,
#     prompt="Hi"
# )
# response

In [12]:
# from huggingface_hub import notebook_login
# notebook_login()


In [13]:
HF_KEY = os.environ.get("HF_KEY")

In [14]:

# import os
# from huggingface_hub import InferenceClient
# #Qwen/Qwen3-235B-A22B-Instruct-2507	hyperbolic	live
# client = InferenceClient(
#     provider="together",
#     api_key=HF_KEY,
#     model="openai/gpt-oss-120b",
    
# )

# stream = client.chat.completions.create(
#     model="openai/gpt-oss-120b",
#     messages=[
#         {
#             "role": "user",
#             "content": "What is the capital of France?"
#         }
#     ]
# )

# # response = client.text_generation("Explain Langchain", max_new_tokens=200)
# # print(response)
# stream.choices[0].message.content

In [15]:
from langchain_huggingface import HuggingFaceEndpoint
repo_id = "meta-llama/Llama-2-7b-chat-hf"
# repo_id = "google/flan-t5-small"
# Wrap a hosted HF model
llm = HuggingFaceEndpoint(
    repo_id=repo_id,  # or any other HF model
    huggingfacehub_api_token=HF_KEY,  # optional if logged in
    task="text-generation",
    temperature=0.5,
    max_new_tokens=800
)

In [16]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [17]:
# client = InferenceClient(
#     provider="together",
#     api_key=HF_KEY,
#     model="openai/gpt-oss-120b"
# )


In [18]:
"""
import random
N_QA_COUPLES = 20

print(f"Generating {N_QA_COUPLES} QA pairs...")

outputs = []



for sampled_context in tqdm(random.sample(docs_processed, N_QA_COUPLES)):
    prompt = QA_generation_prompt.format(context=sampled_context.page_content)
    response = llm.invoke(prompt).content
    #response = openrouter_llm(url=url, model=openrouter_model, api_key=QWEN_API_KEY, prompt=prompt)

    try:
        question = response.split("Factoid question:")[-1].split("Answer:")[0]
        answer = response.split("Answer:")[-1]
        assert len(answer) < 300, "Answer too long"
        outputs.append({
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source": sampled_context.metadata["source"]
        })
    except Exception as e:
        print(f"Error generating QA pair for context: {sampled_context.page_content}")
        print(f"Error message: {e}")
"""

'\nimport random\nN_QA_COUPLES = 20\n\nprint(f"Generating {N_QA_COUPLES} QA pairs...")\n\noutputs = []\n\n\n\nfor sampled_context in tqdm(random.sample(docs_processed, N_QA_COUPLES)):\n    prompt = QA_generation_prompt.format(context=sampled_context.page_content)\n    response = llm.invoke(prompt).content\n    #response = openrouter_llm(url=url, model=openrouter_model, api_key=QWEN_API_KEY, prompt=prompt)\n\n    try:\n        question = response.split("Factoid question:")[-1].split("Answer:")[0]\n        answer = response.split("Answer:")[-1]\n        assert len(answer) < 300, "Answer too long"\n        outputs.append({\n            "context": sampled_context.page_content,\n            "question": question,\n            "answer": answer,\n            "source": sampled_context.metadata["source"]\n        })\n    except Exception as e:\n        print(f"Error generating QA pair for context: {sampled_context.page_content}")\n        print(f"Error message: {e}")\n'

In [19]:
#display(pd.DataFrame(outputs).head(5))

In [20]:
# Offloading QA pairs to persistent storage
#QA_pairs = pd.DataFrame(outputs).to_csv("QA_pairs.csv", index=False)
# QA_pairs = pd.read_csv("QA_pairs.csv")

##### Building critique agents: We cannot blindly trust the questions generated by the LLM, we must do a quality check to validate them.
We follow this paper - https://huggingface.co/papers/2312.10003

Criteria - 
- Groundedness: can the question be answered from the given context?
- Relevance: is the question relevant to users? For instance, "What is the date when transformers 4.29.1 was released?" is not relevant for ML practitioners.
One last failure case we’ve noticed is when a function is tailored for the particular setting where the question was generated, but undecipherable by itself.
- Stand-alone: is the question understandable free of any context, for someone with domain knowledge/Internet access? 

In [21]:
groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context. Do not make anything up, stick to the context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: 
"""

In [22]:
relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to answer questions about tax department of India and its policies.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating. Answer format: a text)
Total rating: (your rating. Answer format: a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer based on their format described above.

Now here is the question.

Question: {question}\n
Answer::: 
"""


In [23]:
standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For example, if the question refers to a particular setting, like 'in the context' or 'in the document' or 'according to', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating. Answer format: a text)
Total rating: (your rating. Answer format: a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer based on their format described above.

Now here is the question.

Question: {question}\n
Answer::: """

In [24]:
model = "openai/gpt-oss-120b"
model="deepseek-v3"
from g4f.client import Client
client = Client()

In [25]:
from g4f.client import Client
client = Client()
model="deepseek-v3"
# response = client.chat.completions.create(
#     model=model,  # Try "gpt-4o", "deepseek-v3", etc.
#     messages=[{"role": "user", "content": "Hello"}],
#     web_search=False
# )
# print(response.choices[0].message.content)

In [26]:

# print("Evaluating QA pairs...")
# evals = []
# for _, pair in tqdm(QA_pairs.iterrows()):
#     question = pair["question"]
#     context = pair["context"]
    
#     evaluations = {
#         # "groundedness": llm.invoke(groundedness_critique_prompt.format(question=question, context=context)).content,
#         # "relevance": llm.invoke(relevance_critique_prompt.format(question=question)).content,
#         # "standalone": llm.invoke(standalone_critique_prompt.format(question=question)).content
#         # "groundedness": openrouter_llm(model=openrouter_model,prompt=groundedness_critique_prompt.format(question=question, context=context)),
#         # "relevance": openrouter_llm(model=openrouter_model,prompt=relevance_critique_prompt.format(question=question)),
#         # "standalone": openrouter_llm(model=openrouter_model,prompt=standalone_critique_prompt.format(question=question))
#         "groundedness": client.chat.completions.create(model=model, messages=[{"role": "user", 
#                         "content": groundedness_critique_prompt.format(question=question, context=context)}]).choices[0].message.content,
#         "relevance": client.chat.completions.create(model=model, messages=[{"role": "user", 
#                         "content": relevance_critique_prompt.format(question=question)}]).choices[0].message.content,
#         "standalone": client.chat.completions.create(model=model, messages=[{"role": "user", 
#                         "content": standalone_critique_prompt.format(question=question)}]).choices[0].message.content
#     }
#     try:
#         for criterion, response in evaluations.items():
        
#             evaluation = response.split("Evaluation:")[-1].split("Total rating:")[0].strip()
#             total_rating = int(re.findall(r'\d+', response.split("Total rating:")[-1])[0])
#             pair[f"{criterion}_evaluation"] = evaluation
#             pair[f"{criterion}_rating"] = total_rating

#     except Exception as e:
#         continue
#     evals.append(pair)


In [27]:
# pair

In [28]:
# evaluations.items()

In [29]:
# import pandas as pd
# generated_outputs = pd.DataFrame.from_dict(evals)

In [30]:
# generated_outputs.head(1)

In [31]:
generated_outputs = pd.read_csv("generated_QA_pairs_evaluated_v2.csv")
display(
    generated_outputs[
        [
            "question",
            "answer",
            "groundedness_rating",
            "relevance_rating",
            "standalone_rating",
        ]
    ]
)

Unnamed: 0,question,answer,groundedness_rating,relevance_rating,standalone_rating
0,**What are the tax filing categories available for individuals on the Indian income tax portal?**\n\n,"**Individual/HUF (Salaried Employees, Senior/Super Senior Citizen, Non-Resident), Hindu Undivided Family (HUF), Business/Profession, and Domestic Company.**",5,5,5
1,**What is the maximum deduction limit for interest on a housing loan for a self-occupied property under Section 24(b)?**\n\n,**₹2 lakh**,5,5,5
2,**What are the income thresholds for claiming marginal relief from surcharge under the old tax regime in India?**\n\n,"**₹50 lakhs, ₹1 crore, ₹2 crores, and ₹5 crores.**",5,5,5
3,**What is the minimum annual tax amount that requires payment of advance tax in India?**\n\n,"**₹10,000**",5,5,5
4,**What form must be filed to claim deduction under section 80GG from AY 2025-26?**\n\n,**Form 10BA**,5,5,5
5,**What is the maximum cash donation amount eligible for deduction under Section 80G?**\n\n,"**₹2,000**",5,4,5
6,**What is the maximum tax deduction limit for preventive health check-up under Section 80D in India?**\n\n,"**₹5,000**",5,5,5
7,**What is the threshold for mandatory ITR filing if someone has income from long-term capital gains under Section 112A?**\n\n,**Exceeding ₹1.25 lakhs**,5,5,5
8,**What is the maximum income limit to file ITR-4 (Sugam) for AY 2025-26?**\n\n,**₹50 lakh**,5,5,5
9,"**What is the income tax rate for senior citizens (60-80 years) earning between ₹3,00,001 and ₹5,00,000 under the old tax regime?**\n\n","**5% on income above ₹3,00,000**",5,5,5


In [32]:
# generated_outputs = generated_outputs.loc[(generated_outputs["groundedness_rating"] >= 4) &
#                                            (generated_outputs["relevance_rating"] >= 4) &
#                                            (generated_outputs["standalone_rating"] >= 4)]

In [33]:
# generated_outputs.to_csv("generated_QA_pairs_evaluated_v2.csv", index=False)

In [34]:
eval_dataset = Dataset.from_pandas(pd.read_csv("generated_QA_pairs_evaluated_v2.csv"))

#### Building our main RAG system

In [35]:
# Step 1 is to split the knowledge base into chunks and create embeddings for each chunk.
# Step 2 is to create vector embeddings for each chunk and store them in a vector database.


from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os

def load_embeddings(
        langchain_docs: List[LangchainDocument],
        chunk_size: int,
        embedding_model_name: Optional[str],
        normalize:bool = False,
) -> FAISS:
    
    """
    Creates a FAISS index from the given embedding model and documents. Or loads an existing index if it exists.
    Args:
        langchain_docs: list of LangchainDocument objects
        chunk_size: size of each chunk
        embedding_model_name: name of the embedding model to use

    Returns:
        FAISS index
    """

    # create an embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name,
                                            encode_kwargs={'normalize_embeddings': True} )
    
    # create an index path
    index_path = f"index_chunk_{chunk_size}_{embedding_model_name.replace('/', '_')}"
    index_folder_path = f"./data/indexes/{index_path}/"
    
    if os.path.isdir(index_folder_path):
        print(f"loading existing index from {index_folder_path}")
        vectorindex = FAISS.load_local(
            index_folder_path,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
            allow_dangerous_deserialization=True

        )
        return vectorindex

    else:
        print(f"Index not found, creating a new index")
        docs_processed = create_knowledge_base(
            chunk_size=chunk_size,
            documents=langchain_docs,
            tokenizer_name=embedding_model_name,
            normalize=normalize,
        )

        new_index = FAISS.from_documents(
            docs_processed,
            embedding=embedding_model,
            distance_strategy=DistanceStrategy.COSINE
        )
        new_index.save_local(folder_path=index_folder_path)
        return new_index



In [None]:
# openrouter_model= "cognitivecomputations/dolphin-mistral-24b-venice-edition:free"
url="https://openrouter.ai/api/v1/chat/completions"
# eval_model = "openai/gpt-oss-120b"
answer_model="deepseek/deepseek-chat-v3.1:free"
eval_model = "x-ai/grok-4-fast:free"
def openrouter_llm( model:str, prompt:str):
    response = requests.post(
        url=url,
        headers={
            "Authorization": F"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],

        })
    )
    return response.json()['choices'][0]['message']['content']

# response = openrouter_llm(
#     model=answer_model,
#     prompt="Hi"
# )
# response

In [37]:
# Step 3 is to create a retrieval-based system using the vector database and a language model.

RAG_PROMPT_TEMPLATE = """
Using the information contained in the context,
give a comprehensive, accurate answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.

Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
"""



client = Client()
"""
response = client.chat.completions.create(
    model=model,  # Try "gpt-4o", "deepseek-v3", etc.
    messages=[{"role": "user", "content": "Hello"}],
    
)
"""
#from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore

answer_model="deepseek-v3"
def answer_with_rag(
    question:str,
    answer_model:str,
    knowledge_index: VectorStore,
    # reranker: Optional[RAGPretrainedModel] = None,
    num_return_sources: int = 10,
    num_docs_final: int = 5, 

)-> Tuple[str, List[LangchainDocument]]:
    """Answer a question using RAG model with the given knowledge index and LLM."""

    relevant_docs = knowledge_index.similarity_search(
        query=question,
        k=num_return_sources
    )
    # keep only the text content of the documents
    relevant_docs = [doc.page_content for doc in relevant_docs]

    # if reranker:
    #     relevant_docs = reranker.rerank(
    #         question=question,
    #         documents=relevant_docs,
    #         top_k=num_docs_final
    #     )
    #     relevant_docs = [doc['content'] for doc in relevant_docs]
    relevant_docs = relevant_docs[:num_docs_final]


    # build the final prompt
    context = "\nExtracted documents: \n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

#     answer = openrouter_llm(
#     model=answer_model,
#     prompt=final_prompt
# )
    answer = client.chat.completions.create(
        model=answer_model,
        messages=[
            {"role": "user", "content": final_prompt}
        ]
    )
     # Extract the content from the ChatCompletion object
    answer_text = answer.choices[0].message.content
    
    return answer_text, relevant_docs  # Return the text content instead of the full object
    


#### Benchmarking the RAG system

In [38]:
# Step 4 is to evaluate the QA system using the generated QA pairs.
def run_rag_tests(
    eval_dataset,
    answer_model,
    knowledge_index:VectorStore,
    output_file: str,
    # reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None  # to document the test settings used      
):
    
    """Run RAG tests on the given evaluation dataset and save the results to the output file."""
    try: # load previous generations if they exist
        with open(output_file, "r") as f:
            results = json.load(f)
    except:
        results = []

    for row in tqdm(eval_dataset):
        question = row["question"]
        if question in [result['question'] for result in results]:
            print(f"Skipping already processed question: {question}")
            continue
        # call the reader LLM to answer the question
        answer, relevant_docs = answer_with_rag(
            question,
            answer_model,
            knowledge_index,
            # reranker
        )
        
        # if verbose, display the question, answer, ground truth answer and relevant docs
        if verbose:
            print("-------------------------------------------------")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f"Ground Truth Answer: {row['answer']}")
            print(f"Relevant Docs: {relevant_docs}")
            print("\n\n")
        # create a data structure to save the result
        result = {
            "question": question,
            "ground_truth": row["answer"],
            "source_doc": row["source"],
            "generated_answer": answer,
            "relevant_docs": [doc for doc in relevant_docs],
        }

        # add custom test settings if provided
        if test_settings:
            result["test_settings"] = test_settings
        results.append(result)

    # save the results to the output file
    with open(output_file, "w") as f:
        json.dump(results,f)


In [39]:
EVALUATION_PROMPT = """
###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

# Wrap around langchain prompt template to create prompt object
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model who follows instructions carefully."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

#### Evaluate RAG system

In [46]:
llm  = init_chat_model("mistral-large-latest", 
                       model_provider="mistralai", 
                       timeout = 60, 
                       streaming = True,
                       api_key = MISTRAL_KEY
                       )
                       

In [48]:
def evaluate_answer(
    answer_path:str,
    # eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """ Evaluate generated answers. Modifies the answer file in place to add evaluation results."""
    answers = []
    if os.path.isfile(answer_path): 
        answers = json.load(open(answer_path, "r")) # load existing answers if they exist
    
    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            print(f"Skipping already evaluated experiment: {experiment['question']}")
            continue

        eval_prompt = evaluation_prompt_template.format_prompt(
            instruction = experiment["question"],
            response = experiment["generated_answer"],
            reference_answer = experiment["ground_truth"],
        )

        # eval_result = client.chat.completions.create(
        #     model=eval_chat_model,  
        #     messages=[{"role": "user", "content": eval_prompt}],
        # )
        
        # eval_result = eval_result.choices[0].message.content
        eval_result = llm.invoke(eval_prompt).content
        feedback, score = [part.strip() for part in eval_result.split("[RESULT]") if part.strip()]
        experiment[f"eval_feedback_{evaluator_name}"] = feedback
        experiment[f"eval_score_{evaluator_name}"] = score

    with open(answer_path, "w") as f:
        json.dump(answers, f)


In [41]:
# # openrouter_model= "cognitivecomputations/dolphin-mistral-24b-venice-edition:free"
# url="https://openrouter.ai/api/v1/chat/completions"
# # eval_model = "openai/gpt-oss-120b"
# answer_model="deepseek/deepseek-chat-v3.1:free"
# eval_model = "x-ai/grok-4-fast:free"
# eval_model = "openai/gpt-oss-120b:free"
# def openrouter_llm( model:str, prompt:str):
#     response = requests.post(
#         url=url,
#         headers={
#             "Authorization": F"Bearer {QWEN_API_KEY}",
#             "Content-Type": "application/json",
#         },
#         data=json.dumps({
#             "model": model,
#             "messages": [
#                 {
#                     "role": "user",
#                     "content": prompt
#                 }
#             ],

#         })
#     )
#     return response.json()['choices'][0]['message']['content']

# # response = openrouter_llm(
# #     model=answer_model,
# #     prompt="Hi"
# # )
# # response
client = Client()
eval_model="gpt-4"

In [49]:
if not os.path.exists("./output"):
    os.makedirs("./output", exist_ok=True)


from g4f.client import Client
client = Client()


for chunk_size in [200]:
    for embeddings in ["BAAI/bge-small-en"]:
        settings_name = f"chunk_{chunk_size}_embeddings_{embeddings.replace('/', '_')}_reader_model_{answer_model}_eval_model_{eval_model}"
        output_file_name = f"./output/rag_{settings_name}.json"
        output_file_name = output_file_name.replace(":", "_").replace("/", "_")
        
        print(f"Running evaluations for {settings_name}:\n")
        knowledge_index = load_embeddings(
            langchain_docs=RAW_KNOWLEDGE_BASE,
            chunk_size=chunk_size,
            embedding_model_name=embeddings,
            normalize=True,

        )

        print("Running RAG...")
        run_rag_tests(
            eval_dataset=Dataset.from_pandas(generated_outputs),
            answer_model="deepseek-v3",
            knowledge_index=knowledge_index,
            output_file=output_file_name,
            verbose=False,
            test_settings=settings_name
        )

        print("Evaluating answers...")
        evaluate_answer(
            answer_path=output_file_name,
            # eval_chat_model="deepseek-v3",
            evaluator_name="MistralAI",
            evaluation_prompt_template=evaluation_prompt_template,
            
        )

Running evaluations for chunk_200_embeddings_BAAI_bge-small-en_reader_model_deepseek-v3_eval_model_gpt-4:

loading existing index from ./data/indexes/index_chunk_200_BAAI_bge-small-en/
Running RAG...


  0%|          | 0/13 [00:00<?, ?it/s]

Skipping already processed question:  **What are the tax filing categories available for individuals on the Indian income tax portal?**


Skipping already processed question:  **What is the maximum deduction limit for interest on a housing loan for a self-occupied property under Section 24(b)?**


Skipping already processed question:  **What are the income thresholds for claiming marginal relief from surcharge under the old tax regime in India?**


Skipping already processed question:  **What is the minimum annual tax amount that requires payment of advance tax in India?**


Skipping already processed question:  **What form must be filed to claim deduction under section 80GG from AY 2025-26?**


Skipping already processed question:  **What is the maximum cash donation amount eligible for deduction under Section 80G?**


Skipping already processed question:  **What is the maximum tax deduction limit for preventive health check-up under Section 80D in India?**


Skipping already processe

  0%|          | 0/13 [00:00<?, ?it/s]

HTTPStatusError: Error response 429 while fetching https://api.mistral.ai/v1/chat/completions: {"object":"error","message":"Service tier capacity exceeded for this model.","type":"service_tier_capacity_exceeded","param":null,"code":"3505"}