In [72]:
import os
import sys
import time
from io import StringIO
import concurrent.futures

import dotenv
from IPython.display import HTML
import numpy as np
import pandas as pd
import requests
from jinja2 import Template

# Load environment variables (API keys) from "../.env", resolved relative to the current working directory.
# This must run before the os.getenv() calls below.
dotenv.load_dotenv("../.env")

# Evalap API endpoint. The commented-out line is the hosted instance; the active one targets a local server.
EVALAP_API_URL = "http://localhost:8000/v1"
#EVALAP_API_URL = "https://evalap.etalab.gouv.fr/v1"
EVALAP_API_KEY = os.getenv("EVALAP_API_KEY") 
# Albert API: used further down as base_url/api_key for the evaluated model and the judge model.
ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1"
ALBERT_API_KEY = os.getenv("ALBERT_API_KEY")
# OpenAI endpoint and key (not referenced by the cells visible in this notebook section).
OPENAI_URL = "https://api.openai.com/v1"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Authorization header for Evalap requests.
# os.getenv returns None when the variable is unset, in which case this reads "Bearer None".
headers = {"Authorization": f"Bearer {EVALAP_API_KEY}"}

In [73]:
# Fetch the dataset (metadata + serialized dataframe) from Evalap
# --
dataset_name = "LegalBenchRAG"
response = requests.get(
    f"{EVALAP_API_URL}/dataset",
    # Let requests build and URL-encode the query string instead of formatting it by hand.
    params={"name": dataset_name, "with_df": "true"},
    headers=headers,
    # (connect, read) timeouts in seconds: fail fast if the server is down,
    # but leave room for a large dataframe payload.
    timeout=(10, 300),
)
# Stop here with a clear HTTP error rather than failing later on a malformed payload.
response.raise_for_status()
dataset = response.json()
# The dataframe is shipped as a JSON string under the "df" key.
dataset_df = pd.read_json(StringIO(dataset["df"]))
dataset_df

Unnamed: 0,query,output_true,snippets,dataset_name
0,Consider the Marketing Affiliate Agreement bet...,This agreement shall begin upon the date of it...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
1,Consider the Marketing Affiliate Agreement bet...,This agreement shall begin upon the date of it...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
2,Consider the Marketing Affiliate Agreement bet...,This Agreement may be terminated by either par...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
3,Consider the Marketing Affiliate Agreement bet...,This Agreement is accepted by Company in the S...,[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
4,Consider the Marketing Affiliate Agreement bet...,"MA may not assign, sell, lease or otherwise tr...",[{'file_path': 'cuad/CybergyHoldingsInc_201405...,cuad
...,...,...,...,...
6884,Consider VELCO's Non-Disclosure Agreement; Doe...,"For purposes of this Agreement, “BCSI” shall m...",[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli
6885,Consider VELCO's Non-Disclosure Agreement; Doe...,"The foregoing notwithstanding, the Recipient m...",[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli
6886,Consider VELCO's Non-Disclosure Agreement; Doe...,5. In the event that the Recipient is required...,[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli
6887,Consider VELCO's Non-Disclosure Agreement; Doe...,"The foregoing notwithstanding, the Recipient m...",[{'file_path': 'contractnli/VELCO%20NDA%20rev0...,contractnli


In [76]:
# Draw a reproducible random subset of row positions, to make the evaluation faster and cheaper
# --
SAMPLE_SIZE = 5  # number of rows to evaluate; increase (e.g. to 1000) for a fuller run
N = len(dataset_df)  # total number of rows in the dataset
rng = np.random.default_rng(43)  # fixed seed so the same subset is drawn on every run
# Valid positional indices are 0..N-1 (passing the int N samples from arange(N)).
# The size is clamped because sampling without replacement cannot exceed N.
sample = rng.choice(N, size=min(SAMPLE_SIZE, N), replace=False)
# Plain Python ints, so the list can be JSON-serialized in the API payloads.
sample = sample.tolist()

In [78]:
# Baseline experiment set: the model answers the queries without retrieval (NORAG)
# --

expset_name = "LegalBenchRAG Evaluation v1"
expset_readme = "A extensive RAG evaluation on the LegalBecnhRAG dataset. See [complete me]"

# Generation settings; both names are reused by later cells.
system_prompt = "Réponds de manière simple et direct à la question sans aucune explication. Ne cite pas les sources/contexte si donnée."
sampling_params = dict(temperature=0.2)

# Parameters shared by every experiment of the set.
common_params = dict(
    dataset=dataset["name"],
    metrics=["judge_precision", "output_length"],
    model=dict(sampling_params=sampling_params, system_prompt=system_prompt),
    judge_model=dict(
        name="mistralai/Mistral-Small-3.2-24B-Instruct-2506",
        base_url=ALBERT_API_URL,
        api_key=ALBERT_API_KEY,
    ),
    sample=sample,
)

# Parameters that vary across experiments: here a single model entry.
grid_params = dict(
    model=[
        dict(
            name="mistralai/Mistral-Small-3.2-24B-Instruct-2506",
            base_url=ALBERT_API_URL,
            api_key=ALBERT_API_KEY,
        ),
    ],
)

# Payload sent to the experiment_set endpoint.
expset = dict(
    name=expset_name,
    readme=expset_readme,
    cv=dict(common_params=common_params, grid_params=grid_params, repeat=1),
)

response = requests.post(
    f"{EVALAP_API_URL}/experiment_set",
    headers={"Authorization": f"Bearer {EVALAP_API_KEY}", "Content-Encoding": "gzip"},
    json=expset,
)
resp = response.json()
if "id" in resp:
    expset_id = resp["id"]
    print(f'Created expset: {resp["name"]} ({resp["id"]})')
else:
    print(resp)

Created expset: LegalBenchRAG Evaluation v1 (47)


In [35]:
# add the evalap repo into the workspace
# @FUTURE: !pip install evalap
# Appending "../" (resolved relative to the current working directory) to sys.path
# lets the two `evalap` imports below resolve from the parent directory.
sys.path.append("../")
from evalap.clients.llm import LlmClient, split_think_answer
from evalap.rag.search import SearchEngineClient


# Jinja2 template of the RAG-augmented user message (instruction text is in French).
# Render variables: `chunks` (iterated; the `text` of each chunk is emitted, followed by a `---`
# separator) and `query` (placed after the QUESTION marker).
rag_prompt = Template("""Utilise les sources d'informations suivantes comme source de vérité si elle adresse la question :

<SOURCE>
{% for chunk in chunks %}
{{chunk.text}}

---

{% endfor %}
</SOURCE>

QUESTION: 
{{query}}
""")

# Augment a prompt with collection search
def do_rag(query, collection_name=None, limit=5, search_method="hybrid", model_embedding=None):
    """Build the RAG prompt for `query` from the chunks found in a collection.

    Args:
        query: The user question; used both as the search query and in the prompt.
        collection_name: Name of the collection passed to the search engine.
        limit: Value forwarded to the search as `limit`.
        search_method: Value forwarded to the search as `method`.
        model_embedding: Value forwarded to the search as `model_embedding`.

    Returns:
        The rendered `rag_prompt` template as a string.
    """
    search_options = {"limit": limit, "method": search_method, "model_embedding": model_embedding}
    # A new search client is created on every call.
    retrieved = SearchEngineClient().search(collection_name, query, **search_options)
    return rag_prompt.render(query=query, chunks=retrieved, limit=limit)

# The LLM core generation
def generate_with_rag(prompt, model=None, system_prompt=system_prompt, with_rag=True, sampling_params=None, **rag_params):
    """Generate an answer to `prompt`, optionally augmenting it with retrieved context.

    Args:
        prompt: The user question.
        model: Model name forwarded to `LlmClient.generate`.
        system_prompt: Optional system message. The default is the value the module-level
            `system_prompt` had when this function was defined.
        with_rag: When true, the prompt is first rewritten by `do_rag`.
        sampling_params: Optional mapping expanded as keyword arguments of the generation call.
        **rag_params: Forwarded unchanged to `do_rag`.

    Returns:
        The answer part returned by `split_think_answer` for the first completion choice.
    """
    user_content = do_rag(prompt, **rag_params) if with_rag else prompt

    # The system message, when present, goes before the user message.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_content})

    # A new LLM client is created on every call.
    completion = LlmClient().generate(model=model, messages=messages, **(sampling_params or {}))
    raw_text = completion.choices[0].message.content
    # Only the answer is kept; the "think" part is discarded.
    _think, answer = split_think_answer(raw_text)
    return answer


In [49]:
# Smoke test: render the RAG prompt for a short query to inspect the chunks the search returns.
print(do_rag("CNI", collection_name="legalbenchrag_v1", model_embedding="BAAI/bge-m3", limit=3))

Utilise les sources d'informations suivantes comme source de vérité si elle adresse la question :

<SOURCE>

1

Source: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020

---


2

Source: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020

---


4

Source: CNS PHARMACEUTICALS, INC., 8-K, 3/26/2020

---


</SOURCE>

QUESTION: 
CNI


In [None]:
# Concurrent computing -- RAG generation
# --
from functools import partial

# The models to run: same generator, collection and embedding model for all entries;
# only the search method (and the alias derived from it) changes.
models = [
    {
        "aliased_name": f"model_{search_method}_7_bgem3",
        "model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
        "collection_name": "legalbenchrag_v1",
        "model_embedding": "BAAI/bge-m3",
        "search_method": search_method,
        "limit": 7,
        "system_prompt": system_prompt,
        "sampling_params": sampling_params,
    }
    for search_method in ("hybrid", "semantic", "lexical")
]

# Generate the answers of every model configuration over the sampled queries
# The prompts are identical for every model, so they are extracted once, before the loop.
prompts = dataset_df.iloc[sample]['query'].tolist()

model_answers = []
for model in models:
    # Forward every option that is explicitly configured. Testing against None (rather than
    # truthiness) keeps deliberate falsy values, e.g. an empty system prompt, instead of
    # silently falling back to the defaults of generate_with_rag.
    params_to_partial = {
        k: model[k]
        for k in ["model", "collection_name", "model_embedding", "search_method", "limit", "system_prompt", "sampling_params"]
        if model.get(k) is not None
    }
    generate_with_rag_partial = partial(generate_with_rag, **params_to_partial)

    # Run the generations concurrently in a thread pool. executor.map yields the results
    # in the order of `prompts`, so answers stay aligned with `sample`.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        results = list(executor.map(generate_with_rag_partial, prompts))

    # One list of answers per model, in the same order as `models`.
    model_answers.append(results)

In [70]:
# Adding RAG model to the experiment set
# --
# The pre-computed answers are matched to models by position and to dataset rows through
# `sample`. Fail early if they are out of sync (e.g. the generation cell was not re-run
# after `models` or `sample` changed).
if len(model_answers) != len(models):
    raise ValueError(
        f"Expected one answer list per model: got {len(model_answers)} lists for {len(models)} models."
    )
misaligned = [
    model["aliased_name"]
    for model, answers in zip(models, model_answers)
    if len(answers) != len(sample)
]
if misaligned:
    raise ValueError(f"Answer count does not match the sample size ({len(sample)}) for: {misaligned}")

# Parameters shared by every experiment added to the set.
common_params = {
    "dataset": dataset["name"],
    "metrics": ["judge_precision", "output_length"],
    "model": {"sampling_params" : sampling_params, "system_prompt": system_prompt},
    "judge_model": {
        "name": "mistralai/Mistral-Small-3.2-24B-Instruct-2506", "base_url": ALBERT_API_URL, "api_key": ALBERT_API_KEY
    },
    "sample": sample,
}

# One grid entry per RAG configuration, carrying the answers generated in the notebook.
grid_params = {
    "model": [
        {
            "aliased_name": model["aliased_name"],
            "name": model["model"],
            "system_prompt": system_prompt,
            "sampling_params": sampling_params,
            "output": answers,
        }
        for model, answers in zip(models, model_answers)
    ],
}

# Payload for the update: only the experiment grid is sent (no name/readme).
expset = {
    "cv": {"common_params": common_params, "grid_params": grid_params, "repeat": 2},
}

response = requests.patch(
    f"{EVALAP_API_URL}/experiment_set/{expset_id}",
    headers={"Authorization": f"Bearer {EVALAP_API_KEY}", "Content-Encoding": "gzip"},
    json=expset,
)
resp = response.json()
if "id" in resp:
    expset_id = resp["id"]
    print(f'Created expset: {resp["name"]} ({resp["id"]})')
else:
    print(resp)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)