# RAG Evaluation on LegalBenchRAG

- evaluation on dataset legalbenchrag
- albert medium model and bge-m3 embedding
- build the Elasticsearch index with `python evalap/rag/build.py`

In [None]:
import os
import sys
import time
from io import StringIO
import concurrent.futures

import dotenv
from IPython.display import HTML
import numpy as np
import pandas as pd
import requests
from jinja2 import Template

# Load the environment variables declared in ../.env before reading the API keys below
dotenv.load_dotenv("../.env")

# Evalap API (local instance) and the bearer-token header built from its key
EVALAP_API_URL = "http://localhost:8000/v1"
EVALAP_API_KEY = os.environ.get("EVALAP_API_KEY")
headers = dict(Authorization="Bearer {}".format(EVALAP_API_KEY))

# Albert API
ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1"
ALBERT_API_KEY = os.environ.get("ALBERT_API_KEY")

# OpenAI API
OPENAI_URL = "https://api.openai.com/v1"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
# Fetch the dataset from Evalap
# --
dataset_name = "LegalBenchRAG"
response = requests.get(
    f"{EVALAP_API_URL}/dataset",
    # Let requests build and URL-encode the query string instead of interpolating
    # the dataset name into the URL by hand (names with spaces, '&', ... stay valid).
    params={"name": dataset_name, "with_df": "true"},
    headers={"Authorization": f"Bearer {EVALAP_API_KEY}"},
)
# Fail loudly on HTTP errors rather than trying to parse an error body as a dataset
response.raise_for_status()
dataset = response.json()
# The "df" field carries the dataframe serialized as a JSON string; pd.read_json
# expects a file-like object for literal JSON, hence the StringIO wrapper.
dataset_df = pd.read_json(StringIO(dataset["df"]))
dataset_df

Unnamed: 0,query,output_true,snippets,dataset_name
0,Consider the Marketing Affiliate Agreement bet...,This agreement shall begin upon the date of it...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
1,Consider the Marketing Affiliate Agreement bet...,This agreement shall begin upon the date of it...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
2,Consider the Marketing Affiliate Agreement bet...,This Agreement may be terminated by either par...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
3,Consider the Marketing Affiliate Agreement bet...,This Agreement is accepted by Company in the S...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
4,Consider the Marketing Affiliate Agreement bet...,"MA may not assign, sell, lease or otherwise tr...",[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
...,...,...,...,...
6884,Consider VELCO's Non-Disclosure Agreement; Doe...,"For purposes of this Agreement, “BCSI” shall m...",[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli
6885,Consider VELCO's Non-Disclosure Agreement; Doe...,"The foregoing notwithstanding, the Recipient m...",[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli
6886,Consider VELCO's Non-Disclosure Agreement; Doe...,5. In the event that the Recipient is required...,[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli
6887,Consider VELCO's Non-Disclosure Agreement; Doe...,"The foregoing notwithstanding, the Recipient m...",[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli


In [None]:
# Draw a fixed random subset of row positions so that evaluations run faster and cheaper
# than on the full dataset
# --
SAMPLE_SIZE = 1000  # number of dataset rows to evaluate on
SAMPLE_SEED = 42  # fixed seed so the same subset is drawn on every run

N = len(dataset_df)  # Number of rows in the full dataset
rng = np.random.default_rng(SAMPLE_SEED)
# Sampling without replacement cannot draw more rows than exist, so cap the size at N
sample = rng.choice(np.arange(N), size=min(SAMPLE_SIZE, N), replace=False)
# Convert to a plain list of Python ints: `sample` is later sent in JSON request payloads
sample = sample.tolist()

In [129]:
# Initial NORAG experiments
# --

expset_name = "LegalBenchRAG Evaluation"
expset_readme = "A extensive RAG evaluation on the LegalBenchRAG dataset. See [complete me]"

system_prompt = "Provide a clear and sound answer to the question. Use the source of information, if given, to answer."
sampling_params = {"temperature": 0.2}

# Parameters shared by every experiment of the set: dataset, metrics, generation
# settings, judge model and the row subset to evaluate on.
common_params = dict(
    dataset=dataset["name"],
    metrics=["judge_precision", "output_length"],
    model=dict(sampling_params=sampling_params, system_prompt=system_prompt),
    judge_model=dict(
        name="mistralai/Mistral-Small-3.2-24B-Instruct-2506",
        base_url=ALBERT_API_URL,
        api_key=ALBERT_API_KEY,
    ),
    sample=sample,
)

# Parameters that vary across experiments: here a single model entry
grid_params = dict(
    model=[
        dict(
            name="mistralai/Mistral-Small-3.2-24B-Instruct-2506",
            base_url=ALBERT_API_URL,
            api_key=ALBERT_API_KEY,
        ),
    ],
)

expset = dict(
    name=expset_name,
    readme=expset_readme,
    cv=dict(common_params=common_params, grid_params=grid_params, repeat=1),
)

# Create the experiment set through the Evalap API
response = requests.post(
    f"{EVALAP_API_URL}/experiment_set",
    headers={"Authorization": f"Bearer {EVALAP_API_KEY}", "Content-Encoding": "gzip"},
    json=expset,
)
resp = response.json()
if "id" not in resp:
    # No id in the response body: show the raw response for diagnosis
    print(resp)
else:
    # Keep the id: the patch cell further down targets this experiment set
    expset_id = resp["id"]
    print(f'Created expset: {resp["name"]} ({resp["id"]})')

Created expset: LegalBenchRAG Evaluation (111)


In [None]:
# Add the parent directory to the import path so that the `evalap` package imported
# just below can be resolved (assumes the notebook is run from a direct subdirectory
# of the evalap repo checkout -- TODO confirm).
# @FUTURE: !pip install evalap
sys.path.append("../")
from evalap.clients.llm import LlmClient, split_think_answer
from evalap.rag.search import SearchEngineClient


# Jinja2 template of the RAG-augmented user prompt: it renders the `text` of every
# item of `chunks` (each followed by a `---` separator) inside a <SOURCE> block,
# then the original `query` under a QUESTION heading.
rag_prompt = Template("""Use the following sources of information as a source of truth if they address the question:

<SOURCE>
{% for chunk in chunks %}
{{chunk.text}}

---

{% endfor %}
</SOURCE>

QUESTION:
{{query}}
""")

def rerank(prompt: str, hits: list[str], model=None) -> list[str]:
    """
    Rerank a list of documents by relevance to a prompt, using the Albert API rerank endpoint.

    Args:
        prompt: The query or question to rerank documents against
        hits: List of document texts to rerank
        model: Optional reranker model name; sent as-is in the request payload
            (None presumably lets the API pick its default -- verify against the API docs)

    Returns:
        The input documents reordered by decreasing score. Result items that carry
        no valid position into `hits` are skipped. An empty `hits` yields an empty
        list without any network call.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    # Nothing to rank: skip the network round-trip entirely
    if not hits:
        return []

    # API endpoint (replace with your actual endpoint)
    url = "https://albert.api.etalab.gouv.fr/v1/rerank"
    payload = {"prompt": prompt, "input": hits, "model": model}
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('ALBERT_API_KEY')}"}

    # requests waits forever by default; a stalled call would otherwise block its
    # worker thread indefinitely when this function is used from a thread pool.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    response.raise_for_status()

    # Each item of "data" is read for an "index" (position in `hits`) and a "score"
    scored_items = response.json()["data"]

    # Higher score = more relevant
    ranked_items = sorted(scored_items, key=lambda item: item.get("score", 0), reverse=True)

    reranked_hits = []
    for item in ranked_items:
        index = item.get("index")
        # Skip items without a usable index instead of silently mapping them to the
        # first document, which would duplicate it in the result.
        if isinstance(index, int) and 0 <= index < len(hits):
            reranked_hits.append(hits[index])

    return reranked_hits

# Augment a prompt with collection search
def do_rag(query, collection_name=None, limit=5, search_method="hybrid", model_embedding=None, model_rerank=None):
    """
    Build a RAG prompt: search the collection for chunks related to `query`,
    optionally rerank them, and render them with the query through `rag_prompt`.

    Args:
        query: The user question, used both as search query and in the rendered prompt
        collection_name: Name of the collection passed to the search engine client
        limit: Number of chunks kept in the final prompt
        search_method: Search method forwarded to the search engine client
        model_embedding: Embedding model name forwarded to the search engine client
        model_rerank: If set, name of the reranker model; twice as many chunks are
            fetched and the best `limit` after reranking are kept

    Returns:
        The rendered prompt string.
    """
    # Search relevant chunks; over-fetch when reranking so the reranker has more
    # candidates than the final cut.
    se_client = SearchEngineClient()
    fetch_limit = limit * 2 if model_rerank else limit
    hits = se_client.search(
        collection_name, query, limit=fetch_limit, method=search_method, model_embedding=model_embedding
    )

    if model_rerank and hits:
        # albert reranker only...
        ranked_texts = rerank(query, [h["text"] for h in hits], model=model_rerank)

        # Reorder the whole hit records rather than overwriting their "text" in place:
        # every other field of a hit stays attached to its own text, and a reranker
        # answer shorter than the input cannot leave stale or duplicated texts behind.
        remaining = list(hits)
        reordered = []
        for text in ranked_texts:
            for position, hit in enumerate(remaining):
                if hit["text"] == text:
                    reordered.append(remaining.pop(position))
                    break
        # Hits the reranker did not return keep their search order, after the ranked ones
        hits = (reordered + remaining)[:limit]

    # Render prompt
    return rag_prompt.render(query=query, chunks=hits, limit=limit)

# The LLM core generation
def generate_with_rag(prompt, model=None, system_prompt=system_prompt, with_rag=True, sampling_params=None, **rag_params):
    """
    Generate an answer to `prompt`, optionally augmenting it with retrieved chunks first.

    Args:
        prompt: The user question
        model: Model name forwarded to the LLM client
        system_prompt: Optional system message prepended to the conversation
            (defaults to the notebook-level `system_prompt` at definition time)
        with_rag: When true, the prompt is rewritten by `do_rag` before generation
        sampling_params: Optional dict of extra keyword arguments for the generation call
        **rag_params: Keyword arguments forwarded to `do_rag`

    Returns:
        The answer part of the model output, as split by `split_think_answer`.
    """
    generation_kwargs = sampling_params if sampling_params else {}
    user_content = do_rag(prompt, **rag_params) if with_rag else prompt

    # Build the conversation: optional system message first, then the user turn
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_content})

    completion = LlmClient().generate(model=model, messages=messages, **generation_kwargs)
    # Only the answer part is returned; the "think" part is discarded
    _, answer = split_think_answer(completion.choices[0].message.content)
    return answer


In [17]:
# Quick manual check: run one semantic search, print the raw hits, then display the
# same texts in the order given by the reranker.
demo_query = "CNI"
se_client = SearchEngineClient()
hits = se_client.search(
    "legalbenchrag_v2",
    demo_query,
    limit=3,
    method="semantic",
    model_embedding="BAAI/bge-m3",
)
#hits = se_client.search("legalbenchrag_v4", "CNI", limit=3, method="hybrid", model_embedding="qwen3-embedding-8b")
print(hits)
hit_texts = [hit["text"] for hit in hits]
hits = rerank(demo_query, hit_texts, model="BAAI/bge-reranker-v2-m3")
hits

[{'text': '1\n\nSource: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020', '__score': 0.7200798}, {'text': '2\n\nSource: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020', '__score': 0.7199438}, {'text': '4\n\nSource: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020', '__score': 0.71921223}]


['4\n\nSource: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020',
 '1\n\nSource: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020',
 '2\n\nSource: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020']

In [8]:
# Preview the rendered RAG prompt for a sample query (search + rerank + template)
rag_preview = do_rag(
    "CNI",
    collection_name="legalbenchrag_v3",
    model_embedding="BAAI/bge-m3",
    limit=3,
    model_rerank="BAAI/bge-reranker-v2-m3",
)
print(rag_preview)

Use the following sources of information as a source of truth if they address the question:

<SOURCE>

<intro>
Exhibit 10.1 Development Agreement This Development Agreement (the "Agreement") dated as of March 20, 2020 (the "Effective Date") is entered into by and between CNS Pharmaceuticals, Inc. ("CNS"), a Nevada corporation, having a business address of 2100 West Loop South, Suite 900, Houston, Texas 77027, and WPD Pharmaceuticals, ("WPD"), a Polish corporation, having a business address of ul. Żwirki i Wigury 101, 02-089 Warszawa. CNS and WPD are sometimes referred to herein individually as a "Party" and collectively as the "Parties." RECITALS WHEREAS, WPD is party to a sublicense agreement dated February 19, 2019 with Moleculin Biotech, Inc. ("MBI") (the "Sublicense Agreement") to research and develop, manufacture, have manufactured, use, export/import, offer to sell and/or sell certain products for use in certain territories; WHEREAS, WPD is developing certain anti-viral indicatio

In [None]:
# Async computing -- RAG generation
# --
from functools import partial

# Index versions referred to by the "lbrv*" suffix of the aliased model names.
# The k / lw / sw values are kept as originally noted; their meaning is defined by
# the index build, which is not part of this notebook.
#
# without "lbrv": simple chunking
#   k: 10, lw: 1, sw: 1.5
#
# lbrv2: simple chunking
#   k: 40, lw: 0.5, sw: 1
#
# lbrv2.5: simple chunking
#   k: 50, lw: 1, sw: 1
#
# lbrv3: simple chunking
#   k: 60, lw: 1, sw: 1.5
#
# lbrv4: header chunking
#   k: 20, lw: 1, sw: 1.5
#
# v4.5: header chunking
#   k: 2, lw: 1, sw: 1.5
#
# v4.6: header chunking
#   k: 10, lw: 1, sw: 1.5 (best)
#
# v5: header chunking + reranker
#   k: 10, lw: 1, sw: 1.5

# Earlier configurations, without reranker (kept for reference):
#{
# "aliased_name": "model_hybrid_7_qwen3_lbrv4.6",
# "model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
# "collection_name": "legalbenchrag_v4",
# "model_embedding": "qwen3-embedding-8b",
# "search_method": "hybrid",
# "limit": 7,
# "system_prompt": system_prompt,
# "sampling_params": sampling_params,
#},
#{
# "aliased_name": "model_semantic_20_qwen3_lbrv4.6",
# "model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
# "collection_name": "legalbenchrag_v4",
# "model_embedding": "qwen3-embedding-8b",
# "search_method": "semantic",
# "limit": 20,
# "system_prompt": system_prompt,
# "sampling_params": sampling_params,
#},

# The models to run: the active configurations differ only by their alias, search
# method and number of chunks, so they are generated from those three values.
models = [
    {
        "aliased_name": aliased_name,
        "model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
        "collection_name": "legalbenchrag_v4",
        "model_embedding": "qwen3-embedding-8b",
        "model_rerank": "BAAI/bge-reranker-v2-m3",
        "search_method": search_method,
        "limit": limit,
        "system_prompt": system_prompt,
        "sampling_params": sampling_params,
    }
    for aliased_name, search_method, limit in (
        ("model_hybrid_7_qwen3_lbrv5", "hybrid", 7),
        ("model_semantic_20_qwen3_lbrv5", "semantic", 20),
    )
]

# Generate the answers of every model configuration over the sampled prompts
# --
# The prompts are identical for every configuration: compute them once, outside the loop
prompts = dataset_df.iloc[sample]["query"].tolist()

# Keys of a model configuration that are forwarded to generate_with_rag
rag_param_keys = [
    "model",
    "collection_name",
    "model_embedding",
    "search_method",
    "limit",
    "system_prompt",
    "sampling_params",
    "model_rerank",
]

model_answers = []
for model in models:
    # Bind this configuration's parameters. Only absent/None keys fall back to the
    # defaults of generate_with_rag: an explicit falsy value such as
    # system_prompt="" must be forwarded, not replaced by the default prompt.
    params_to_partial = {k: model[k] for k in rag_param_keys if model.get(k) is not None}
    generate_with_rag_partial = partial(generate_with_rag, **params_to_partial)

    # Run the prompts concurrently on a thread pool; executor.map returns the
    # results in the same order as `prompts`, so answers stay aligned with `sample`.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        results = list(executor.map(generate_with_rag_partial, prompts))

    model_answers.append(results)

In [20]:
# Sanity check: number of non-blank answers per model configuration
for answers in model_answers:
    non_blank = [answer for answer in answers if answer.strip()]
    print(len(non_blank))

1000
1000


In [None]:
# Adding RAG model to the experiment set (patching)
# --
# Same shared parameters as for the experiment set creation
common_params = dict(
    dataset=dataset["name"],
    metrics=["judge_precision", "output_length"],
    model=dict(sampling_params=sampling_params, system_prompt=system_prompt),
    judge_model=dict(
        name="mistralai/Mistral-Small-3.2-24B-Instruct-2506",
        base_url=ALBERT_API_URL,
        api_key=ALBERT_API_KEY,
    ),
    sample=sample,
)

# One grid entry per RAG configuration; "output" carries the answers generated in
# this notebook for that configuration (models[idx] pairs with model_answers[idx]).
grid_params = dict(
    model=[
        dict(
            aliased_name=models[idx]["aliased_name"],
            name=models[idx]["model"],
            system_prompt=system_prompt,
            sampling_params=sampling_params,
            output=model_answers[idx],
        )
        for idx in range(len(models))
    ],
)

expset = dict(
    name="LegalBenchRAG Evaluation v1",
    readme="A extensive RAG evaluation on the LegalBenchRAG dataset. See [complete me]",
    cv=dict(common_params=common_params, grid_params=grid_params, repeat=1),
)

# Patch the experiment set created earlier (expset_id) with the new experiments
response = requests.patch(
    f"{EVALAP_API_URL}/experiment_set/{expset_id}",
    headers={"Authorization": f"Bearer {EVALAP_API_KEY}", "Content-Encoding": "gzip"},
    json=expset,
)
resp = response.json()
if "id" not in resp:
    # No id in the response body: show the raw response for diagnosis
    print(resp)
else:
    expset_id = resp["id"]
    print(f'Patch expset: {resp["name"]} ({resp["id"]})')

Patch expset: LegalBenchRAG Evaluation v1 (111)
