In [1]:
from dotenv import load_dotenv

load_dotenv()

from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
import random
import datetime
import os

from langchain_openai import ChatOpenAI
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Import from helpers
from rag_helpers import (
    read_jsonl,
    write_jsonl,
    write_csv,
    to_serializable,
    call_llm,
    QA_generation_prompt,
    question_groundedness_critique_prompt,
    question_relevance_critique_prompt,
    question_standalone_critique_prompt,
)

# Never truncate long text columns when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

# Shared chat model: every call_llm(llm, ...) in this notebook goes through it.
# temperature=0 keeps generations as repeatable as the API allows;
# max_tokens=500 caps the length of each completion.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=500)

# RAG Evaluation
_Authored by: [Aymeric Roucher](https://huggingface.co/m-ric)_

This notebook demonstrates how you can evaluate your RAG (Retrieval Augmented Generation) system by building a synthetic evaluation dataset and using LLM-as-a-judge to compute the accuracy of your system.

For an introduction to RAG, you can check [this other cookbook](rag_zephyr_langchain)!

RAG systems are complex: here is a RAG diagram, where we noted in blue all the possibilities for system enhancement:

<img src="https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/RAG_workflow.png" height="700">

Implementing any of these improvements can bring a huge performance boost; but changing anything is useless if you cannot monitor the impact of your changes on the system's performance!
So let's see how to evaluate our RAG system.

### Evaluating RAG performance

Since there are so many moving parts to tune with a big impact on performance, benchmarking the RAG system is crucial.

For our evaluation pipeline, we will need:
1. An evaluation dataset with question - answer couples (QA couples)
2. An evaluator to compute the accuracy of our system on the above evaluation dataset.

➡️ It turns out, we can use LLMs to help us all along the way!
1. The evaluation dataset will be synthetically generated by an LLM 🤖, and questions will be filtered out by other LLMs 🤖
2. An [LLM-as-a-judge](https://huggingface.co/papers/2306.05685) agent 🤖 will then perform the evaluation on this synthetic dataset.

__Let's dig into it and start building our evaluation pipeline!__ First, we install the required model dependencies.

In [2]:
# %pip install -q torch transformers langchain sentence-transformers tqdm openpyxl openai pandas datasets langchain-community ragatouille langchain-openai python-dotenv

In [3]:
# (Re)load the autoreload extension and switch it to mode 2, so that edits to
# imported modules (e.g. rag_helpers) are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [4]:
# NOTE(review): these imports and the display option repeat what the first cell
# of the notebook already does; they are harmless on re-run but redundant.
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets

# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.set_option("display.max_colwidth", None)

In [5]:
# Log in to the Hugging Face Hub. notebook_login() renders an interactive
# widget in the cell output, so no token has to be hardcoded in the notebook.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

### Load your knowledge base

In [6]:
# Knowledge base: fetch the "train" split of the documentation corpus from the Hub.
ds = datasets.load_dataset(path="m-ric/huggingface_doc", split="train")

# Offline alternative: reload a snapshot written by the "Save datasets" cell.
# `save_dir` is built there as a plain string (and is timestamped), so point it
# at an existing snapshot folder and join paths with os.path.join:
# ds = datasets.load_dataset(
#     "json",
#     data_files=os.path.join(save_dir, "initial_corpus.jsonl"),
#     split="train",
# )

In [7]:
# Sanity check: show the dataset summary (feature names and number of rows).
print(ds)

Dataset({
    features: ['text', 'source'],
    num_rows: 2647
})


# 1. Build a synthetic dataset for evaluation
We first build a synthetic dataset of questions and associated contexts. The method is to get elements from our knowledge base, and ask an LLM to generate questions based on these documents.

Then we set up other LLM agents to act as quality filters for the generated QA couples: each of them will act as the filter for a specific flaw.

### 1.1. Prepare source documents

In [8]:
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Wrap every knowledge-base row in a LangChain document, keeping its origin
# in the metadata so each chunk can later be traced back to its source file.
langchain_docs = [
    LangchainDocument(
        page_content=doc["text"],
        metadata={"source": doc["source"]},
    )
    for doc in tqdm(ds)
]

# Chunker: at most 2000 characters per chunk with a 200-character overlap.
# Separators are tried in order, from paragraph breaks down to single characters,
# and add_start_index records where each chunk starts in its parent document.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=2000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split the documents one at a time and collect all chunks in a single flat list.
docs_processed = []
for doc in langchain_docs:
    docs_processed.extend(text_splitter.split_documents([doc]))

  0%|          | 0/2647 [00:00<?, ?it/s]

### 1.2. Setup agents for question generation

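The original recipe uses [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) for QA couple generation because it has excellent performance in leaderboards such as [Chatbot Arena](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard). In this notebook, generation calls go through the `llm` object configured in the first code cell (`gpt-4o-mini`).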

In [9]:



# Smoke test: send a throwaway prompt through the shared LLM and show the reply.
smoke_test_reply = call_llm(llm, "This is a test context")
print(smoke_test_reply)

It looks like you're testing the context feature. How can I assist you further? If you have any specific questions or topics you'd like to discuss, feel free to let me know!


Now let's generate our QA couples.
For this example, we generate only 10 QA couples and will load the rest from the Hub.

But for your specific knowledge base, given that you want to get at least ~100 test samples, and accounting for the fact that we will filter out around half of these with our critique agents later on, you should generate many more: upwards of 200 samples.

In [None]:
# `random` and `tqdm` (from tqdm.auto) are imported in the first cell of the
# notebook; re-importing the plain `tqdm` here would replace the notebook-aware
# progress bar for every later cell.

N_GENERATIONS = 10  # keep it low for testing
QA_SAMPLING_SEED = 42  # change to draw a different set of contexts
MAX_ANSWER_CHARS = 300  # answers of this length or longer are discarded

# Markers used to locate the question and the answer in the LLM output.
QUESTION_MARKER = "Factoid question: "
ANSWER_MARKER = "Answer: "

print(f"Generating {N_GENERATIONS} QA couples...")

# Dedicated RNG: the same contexts are sampled on every run, without touching
# the global `random` state used elsewhere.
context_rng = random.Random(QA_SAMPLING_SEED)
sampled_contexts = context_rng.sample(
    docs_processed, min(N_GENERATIONS, len(docs_processed))
)

outputs = []
n_discarded = 0
for sampled_context in tqdm(sampled_contexts):
    output_QA_couple = call_llm(
        llm, QA_generation_prompt.format(context=sampled_context.page_content)
    )

    # Without both markers the splits below would silently return the whole
    # completion as the "question" or the "answer", so reject such outputs.
    if QUESTION_MARKER not in output_QA_couple or ANSWER_MARKER not in output_QA_couple:
        n_discarded += 1
        continue

    question = (
        output_QA_couple.split(QUESTION_MARKER)[-1].split(ANSWER_MARKER)[0].strip()
    )
    answer = output_QA_couple.split(ANSWER_MARKER)[-1].strip()

    # Explicit validation instead of `assert` (which is stripped under `python -O`):
    # drop empty fields and overly long answers.
    if not question or not answer or len(answer) >= MAX_ANSWER_CHARS:
        n_discarded += 1
        continue

    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source_doc": sampled_context.metadata.get("source"),
        }
    )

# Report what was dropped instead of discarding it silently.
if n_discarded:
    print(f"Discarded {n_discarded} malformed or too-long generation(s).")

Generating 30 QA couples...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:41<00:00,  1.38s/it]


In [11]:
# Preview the first five generated QA couples as a table.
qa_preview = pd.DataFrame(outputs)
display(qa_preview.head(5))

Unnamed: 0,context,question,answer,source_doc
0,"| Model | Dataset | License | Use |\n|------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------|-------------------------|\n| [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b) | [Falcon RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) | Apache-2.0 | Text Generation |\n| [SalesForce XGen 7B](https://huggingface.co/Salesforce/xgen-7b-8k-base) | Mix of C4, RedPajama and more | Apache-2.0 | Text Generation |\n| [MPT-30B](https://huggingface.co/mosaicml/mpt-30b) | Mix of C4, RedPajama and more | Apache-2.0 | Text Generation |",What is the license type for the Falcon 40B model?,Apache-2.0,huggingface/blog/blob/main/os-llms.md
1,"To load a pretrained model:\n\n```py\n>>> import timm\n>>> model = timm.create_model('tf_efficientnet_lite0', pretrained=True)\n>>> model.eval()\n```\n\nTo load and preprocess the image:\n\n```py\n>>> import urllib\n>>> from PIL import Image\n>>> from timm.data import resolve_data_config\n>>> from timm.data.transforms_factory import create_transform\n\n>>> config = resolve_data_config({}, model=model)\n>>> transform = create_transform(**config)\n\n>>> url, filename = (""https://github.com/pytorch/hub/raw/master/images/dog.jpg"", ""dog.jpg"")\n>>> urllib.request.urlretrieve(url, filename)\n>>> img = Image.open(filename).convert('RGB')\n>>> tensor = transform(img).unsqueeze(0) # transform and add batch dimension\n```\n\nTo get the model predictions:\n\n```py\n>>> import torch\n>>> with torch.no_grad():\n... out = model(tensor)\n>>> probabilities = torch.nn.functional.softmax(out[0], dim=0)\n>>> print(probabilities.shape)\n>>> # prints: torch.Size([1000])\n```\n\nTo get the top-5 predictions class names:\n\n```py\n>>> # Get imagenet class mappings\n>>> url, filename = (""https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"", ""imagenet_classes.txt"")\n>>> urllib.request.urlretrieve(url, filename)\n>>> with open(""imagenet_classes.txt"", ""r"") as f:\n... categories = [s.strip() for s in f.readlines()]\n\n>>> # Print top categories per image\n>>> top5_prob, top5_catid = torch.topk(probabilities, 5)\n>>> for i in range(top5_prob.size(0)):\n... print(categories[top5_catid[i]], top5_prob[i].item())\n>>> # prints class names and probabilities like:\n>>> # [('Samoyed', 0.6425196528434753), ('Pomeranian', 0.04062102362513542), ('keeshond', 0.03186424449086189), ('white wolf', 0.01739676296710968), ('Eskimo dog', 0.011717947199940681)]\n```\n\nReplace the model name with the variant you want to use, e.g. `tf_efficientnet_lite0`. 
You can find the IDs in the model summaries at the top of this page.",What is the shape of the probabilities tensor after getting model predictions?,torch.Size([1000]),huggingface/pytorch-image-models/blob/main/hfdocs/source/models/tf-efficientnet-lite.mdx
2,"`column_statistics` content depends on the feature type, see examples below.\n##### class_label\n\n<details><summary>example: </summary>\n<p>\n\n```python\n{\n ""column_name"": ""class_col"",\n ""column_type"": ""class_label"",\n ""column_statistics"": {\n ""nan_count"": 0,\n ""nan_proportion"": 0.0,\n ""no_label_count"": 0, # number of -1 values - special value of the `datasets` lib to encode `no label` \n ""no_label_proportion"": 0.0,\n ""n_unique"": 5, # number of unique values (excluding `no label` and nan)\n ""frequencies"": { # mapping value -> its count\n ""this"": 19834,\n ""are"": 20159,\n ""random"": 20109,\n ""words"": 20172,\n ""test"": 19726\n }\n }\n}\n```\n</p>\n</details> \n\n##### float\n\nBin size for histogram is counted as `(max_value - min_value) / DESCRIPTIVE_STATISTICS_HISTOGRAM_NUM_BINS`\n\n<details><summary>example: </summary>\n<p>\n\n```python\n{\n ""column_name"": ""delay"",\n ""column_type"": ""float"",\n ""column_statistics"": {\n ""nan_count"": 0,\n ""nan_proportion"": 0.0,\n ""min"": -10.206,\n ""max"": 8.48053,\n ""mean"": 2.10174,\n ""median"": 3.4012,\n ""std"": 3.12487,\n ""histogram"": {\n ""hist"": [\n 2,\n 34,\n 256,\n 15198,\n 9037,\n 2342,\n 12743,\n 45114,\n 14904,\n 370\n ],\n ""bin_edges"": [\n -10.206,\n -8.33734,\n -6.46869,\n -4.60004,\n -2.73139,\n -0.86273,\n 1.00592,\n 2.87457,\n 4.74322,\n 6.61188,\n 8.48053 # includes maximum value, so len is always len(hist) + 1\n ]\n }\n }\n}\n```\n</p>\n</details> \n\n##### int","What is the maximum value for the column named ""delay""?",8.48053,huggingface/datasets-server/blob/main/services/worker/README.md
3,"And, we've released two parts of this course and planning to release the third part this year. I'm really excited about the next part that we're developing right now where we're going to explore different modalities where transformers are really powerful. Most of the time we think of transformers for NLP, but likely there's been this explosion where transformers are being used in things like audio or in computer vision and we're going to be looking at these in detail. \n\n### What are some transformers applications that you're excited about? \n\n**Lewis:** So one that's kind of fun is in the course we had an event last year where we got people in the community to use the course material to build applications.\n\nAnd one of the participants in this event created a cover letter generator for jobs. So the idea is that when you apply for a job there's always this annoying thing you have to write a cover letter and it's always like a bit like you have to be witty. So this guy created a cover letter generator where you provide some information about yourself and then it generates it from that.\n\nAnd he actually used that to apply to Hugging Face.\n\n### No way?!\n\n**Lewis:** He's joining the Big Science team as an intern. So. I mean this is a super cool thing, right? When you learn something and then use that thing to apply which I thought was pretty awesome. \n\n### Where do you want to see more ML applications?",What type of generator did a participant create for job applications?,A cover letter generator.,huggingface/blog/blob/main/lewis-tunstall-interview.md
4,Weights: https://download.pytorch.org/models/resnet101-5d3b4d8f.pth\n Results:\n - Task: Image Classification\n Dataset: ImageNet\n Metrics:\n Top 1 Accuracy: 77.37%\n Top 5 Accuracy: 93.56%\n- Name: tv_resnet152\n In Collection: ResNet\n Metadata:\n FLOPs: 14857660416\n Parameters: 60190000\n File Size: 241530880\n Architecture:\n - 1x1 Convolution\n - Batch Normalization\n - Bottleneck Residual Block\n - Convolution\n - Global Average Pooling\n - Max Pooling\n - ReLU\n - Residual Block\n - Residual Connection\n - Softmax\n Tasks:\n - Image Classification\n Training Techniques:\n - SGD with Momentum\n - Weight Decay\n Training Data:\n - ImageNet\n ID: tv_resnet152\n LR: 0.1\n Epochs: 90\n Crop Pct: '0.875'\n LR Gamma: 0.1\n Momentum: 0.9\n Batch Size: 32\n Image Size: '224'\n LR Step Size: 30\n Weight Decay: 0.0001\n Interpolation: bilinear\n Code: https://github.com/rwightman/pytorch-image-models/blob/9a25fdf3ad0414b4d66da443fe60ae0aa14edc84/timm/models/resnet.py#L769\n Weights: https://download.pytorch.org/models/resnet152-b121ed2d.pth\n Results:\n - Task: Image Classification\n Dataset: ImageNet\n Metrics:\n Top 1 Accuracy: 78.32%\n Top 5 Accuracy: 94.05%\n- Name: tv_resnet34\n In Collection: ResNet\n Metadata:\n FLOPs: 4718469120\n Parameters: 21800000\n File Size: 87306240\n Architecture:\n - 1x1 Convolution\n - Batch Normalization\n - Bottleneck Residual Block\n - Convolution\n - Global Average Pooling\n - Max Pooling\n - ReLU\n - Residual Block\n - Residual Connection\n - Softmax\n Tasks:\n - Image Classification\n Training Techniques:\n - SGD with Momentum\n - Weight Decay\n Training Data:\n - ImageNet\n ID: tv_resnet34\n LR: 0.1\n Epochs: 90\n Crop Pct: '0.875'\n LR Gamma: 0.1\n Momentum: 0.9\n Batch Size: 32\n Image Size: '224'\n LR Step Size: 30\n Weight Decay: 0.0001\n Interpolation: bilinear,What is the Top 1 Accuracy of the tv_resnet152 model on the ImageNet dataset?,78.32%,huggingface/pytorch-image-models/blob/main/docs/models/resnet.md


### 1.3. Setup critique agents

The questions generated by the previous agent can have many flaws: we should do a quality check before validating these questions.

We thus build critique agents that will rate each question on several criteria, given in [this paper](https://huggingface.co/papers/2312.10003):
- **Groundedness:** can the question be answered from the given context?
- **Relevance:** is the question relevant to users? For instance, `"What is the date when transformers 4.29.1 was released?"` is not relevant for ML practitioners.

One last failure case we've noticed is when a question is tailored for the particular setting where the question was generated, but undecipherable by itself, like `"What is the name of the function used in this guide?"`.
We also build a critique agent for this criterion:
- **Stand-alone**: is the question understandable free of any context, for someone with domain knowledge/Internet access? The opposite of this would be `What is the function used in this article?` for a question generated from a specific blog article.

We systematically score questions with all these agents, and whenever the score is too low for any one of the agents, we eliminate the question from our eval dataset.

💡 ___When asking the agents to output a score, we first ask them to produce their rationale. This will help us verify scores, but most importantly, asking them to first output a rationale gives the model more tokens to think and elaborate an answer before summarizing it into a single score token.___

We now build and run these critique agents.

In [12]:
# Markers used to locate the rationale and the numeric rating in a critique.
RATING_MARKER = "Total rating: "
EVALUATION_MARKER = "Evaluation: "

print("Generating critique for each QA couple...")
n_unparsed = 0
for output in tqdm(outputs):
    # One LLM call per criterion; only the groundedness critique sees the context.
    evaluations = {
        "groundedness": call_llm(
            llm,
            question_groundedness_critique_prompt.format(
                context=output["context"], question=output["question"]
            ),
        ),
        "relevance": call_llm(
            llm,
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_llm(
            llm,
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }

    # Parse each critique independently, so one malformed answer does not
    # discard the scores of the criteria that come after it.
    for criterion, evaluation in evaluations.items():
        try:
            rating_parts = evaluation.split(RATING_MARKER)
            # Text after the last rating marker is the score ...
            score = int(rating_parts[-1].strip())
            # ... and the text just before it holds the rationale.
            rationale = rating_parts[-2].split(EVALUATION_MARKER)[1]
        except (ValueError, IndexError):
            # ValueError: the rating is not a bare integer.
            # IndexError: one of the markers is missing from the critique.
            # The row simply keeps no score for this criterion.
            n_unparsed += 1
            continue
        output.update(
            {
                f"{criterion}_score": score,
                f"{criterion}_eval": rationale,
            }
        )

# Report parsing failures instead of swallowing them silently.
if n_unparsed:
    print(f"Could not parse {n_unparsed} critique(s); those scores are left unset.")

Generating critique for each QA couple...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [02:23<00:00,  4.77s/it]


Now let us filter out bad questions based on our critique agent scores:

In [13]:
# pandas and the `display.max_colwidth` option are already set up in the first
# cell of the notebook, so they are not repeated here.

MIN_CRITIQUE_SCORE = 4  # a question must reach this score on every criterion
SCORE_COLUMNS = ["groundedness_score", "relevance_score", "standalone_score"]
SUMMARY_COLUMNS = ["question", "answer", *SCORE_COLUMNS]

generated_questions = pd.DataFrame.from_dict(outputs)

# If no critique could be parsed (or nothing was generated), some columns are
# absent; create them empty so the selection below does not raise a KeyError.
for column in SUMMARY_COLUMNS:
    if column not in generated_questions.columns:
        generated_questions[column] = float("nan")

print("Evaluation dataset before filtering:")
display(generated_questions[SUMMARY_COLUMNS])

# Keep only rows that pass on all criteria. A missing score (NaN) compares as
# False, so questions with an unparsed critique are filtered out as well.
passes_all_critiques = (generated_questions[SCORE_COLUMNS] >= MIN_CRITIQUE_SCORE).all(
    axis=1
)
generated_questions = generated_questions.loc[passes_all_critiques]

print("============================================")
print("Final evaluation dataset:")
display(generated_questions[SUMMARY_COLUMNS])

# Convert the surviving rows into a datasets.Dataset, dropping the pandas index.
eval_dataset = datasets.Dataset.from_pandas(
    generated_questions, split="train", preserve_index=False
)

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,What is the license type for the Falcon 40B model?,Apache-2.0,5,4,2
1,What is the shape of the probabilities tensor after getting model predictions?,torch.Size([1000]),5,5,2
2,"What is the maximum value for the column named ""delay""?",8.48053,5,2,1
3,What type of generator did a participant create for job applications?,A cover letter generator.,5,1,1
4,What is the Top 1 Accuracy of the tv_resnet152 model on the ImageNet dataset?,78.32%,5,2,2
5,What is the model name used in the hyperparameters for training?,facebook/bart-large-cnn,5,3,2
6,What method allows you to instantiate a model from a pretrained version?,`from_pretrained()`,5,5,3
7,What is the purpose of a neural compressor?,To optimize neural network models for efficiency and performance.,1,3,4
8,What is the famous benchmark mentioned for measuring how good models are at question answering?,SQuAD,5,4,2
9,How do you configure git to sign your commits with GPG?,Use the command `git config user.signingkey <Your GPG Key ID>` and `git config user.email <Your email on hf.co>`.,5,2,5


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
28,What command is used to install the `Accelerate` library from Hugging Face?,`pip install git+https://github.com/huggingface/accelerate`,5,4,4


Now our synthetic evaluation dataset is complete! We can evaluate different RAG systems on this evaluation dataset.

We have generated only a few QA couples here to reduce time and cost. But let's kickstart the next part by loading a pre-generated dataset:

In [14]:
# Replace the small locally generated set with the pre-generated evaluation
# dataset from the Hub ("train" split).
eval_dataset = datasets.load_dataset(path="m-ric/huggingface_doc_qa_eval", split="train")

# Offline alternative: reload a snapshot written by the "Save datasets" cell.
# `save_dir` is built there as a plain string (and is timestamped), so point it
# at an existing snapshot folder and join paths with os.path.join:
# eval_dataset = datasets.load_dataset(
#     "json",
#     data_files=os.path.join(save_dir, "eval_dataset_hf.jsonl"),
#     split="train",
# )

## Save datasets

In [15]:
# Snapshot every dataset that currently exists in the kernel into a fresh,
# timestamped folder, one JSONL file per dataset. Each step is skipped when the
# corresponding variable has not been defined.
save_dir = "datasets_local/" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
os.makedirs(save_dir, exist_ok=True)

# 1. Original corpus, reduced to its text/source fields
if "ds" in globals():
    initial_corpus = [dict(text=doc["text"], source=doc["source"]) for doc in ds]
    write_jsonl(os.path.join(save_dir, "initial_corpus.jsonl"), initial_corpus)

# 2. Chunked documents produced by the text splitter
if "docs_processed" in globals():
    write_jsonl(os.path.join(save_dir, "processed_docs.jsonl"), docs_processed)

# 3. Every generated QA couple, including its critique fields when present
if "outputs" in globals():
    write_jsonl(os.path.join(save_dir, "qa_generated_all.jsonl"), outputs)

# 4. Only the QA couples that passed the critique filter
if "generated_questions" in globals():
    outputs_filtered = generated_questions.to_dict(orient="records")
    write_jsonl(os.path.join(save_dir, "qa_generated_filtered.jsonl"), outputs_filtered)

# 5. The evaluation dataset currently bound to `eval_dataset`
if "eval_dataset" in globals():
    eval_data = list(eval_dataset)
    write_jsonl(os.path.join(save_dir, "eval_dataset_hf.jsonl"), eval_data)


# Show where the snapshot was written.
print(save_dir)

datasets_local/20250914_145157
