# Demo for launching an experiment set with crossvalidation


In [None]:
import os
import sys
import time

import dotenv
from IPython.display import HTML
import numpy as np
import pandas as pd
import requests

# Load the variables from the project's .env file first: the API keys below
# are read with os.getenv and would be missed if this ran later.
dotenv.load_dotenv("../.env")
# Add the parent directory to the module search path.
sys.path.append("..")

# Base URL of the EvalAP API targeted by the requests in this notebook.
EVALAP_API_URL = "http://localhost:8000/v1"
# Alternative endpoint; swap it with the line above to use it.
#EVALAP_API_URL = "https://evalap.etalab.gouv.fr/v1"
EVALAP_API_KEY = os.getenv("EVALAP_API_KEY") 
# Endpoints and keys passed as `base_url` / `api_key` of the evaluated models.
ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1"
ALBERT_API_KEY = os.getenv("ALBERT_API_KEY")
MFS_API_URL = "https://franceservices.etalab.gouv.fr/api/v1"
MFS_API_KEY = os.getenv("MFS_API_KEY")
# Authorization header sent with every request to the EvalAP API.
headers = {"Authorization": f"Bearer {EVALAP_API_KEY}"}

In [None]:
# Various utility functions
# --
def format_metrics(row):
    """Format one row of aggregated metrics as "mean ± std" strings.

    Args:
        row: A pandas Series indexed by a two-level MultiIndex
            ``(metric, statistic)`` that holds the ``'mean'`` and ``'std'``
            statistics of every metric, e.g. one row handed out by
            ``DataFrame.apply(format_metrics, axis=1)``.

    Returns:
        A pandas Series mapping each metric name to ``"<mean> ± <std>"``,
        both values rendered with two decimals.

    Raises:
        KeyError: If a metric lacks its ``'mean'`` or ``'std'`` entry.
    """
    # The metric names are read from the row's own index rather than from a
    # `final_df` global, so the helper works with any frame that has this
    # column layout (and gives the same result for rows of `final_df`).
    metrics = {}
    for metric in row.index.levels[0]:
        mean_value = row[(metric, 'mean')]
        std_value = row[(metric, 'std')]
        metrics[metric] = f"{mean_value:.2f} ± {std_value:.2f}"
    return pd.Series(metrics)
    
def highlight_cells(s):
    """Return one CSS style per entry, marking the highest and lowest mean.

    Each entry of `s` is read as "mean ± std"; only the part before the
    '±' sign is parsed. The entry with the highest mean is styled bold
    green and the one with the lowest mean bold salmon; every other entry
    gets an empty style. When both extremes fall on the same label, the
    salmon style is used.
    """
    def leading_mean(cell):
        # Keep only the text in front of '±' and read it as a number.
        return float(str(cell).split('±')[0].strip())

    means = s.apply(leading_mean)
    top_label = means.idxmax()
    bottom_label = means.idxmin()

    styles = []
    for label in s.index:
        if label == bottom_label:
            # Checked first so that the minimum wins when both coincide.
            styles.append('font-weight: bold; color: salmon')
        elif label == top_label:
            styles.append('font-weight: bold; color: green')
        else:
            styles.append('')
    return styles

## Designing and running an experiment set with raglimit


*Objective*: **Comparing the impact of the `limit` parameter on a RAG model**

An experiment set is a collection of experiments that are part of the same evaluation scenario. 
In this notebook, we're comparing how the (maximum) number of chunks influences the model's performance.

To conduct these experiments, one approach consists of creating an empty experiment set (via POST /experiment_set) and then adding a list of experiments to it (via POST /experiment with a reference to the experimentset_id). Each experiment should have the same parameters, except for the (maximum) number of chunks.

Alternatively, the /experiment_set endpoint offers a convenient feature called cv (short for cross-validation). This feature includes two key parameters:

- **common_params**: These are the parameters that will be shared across all experiments in the set.
- **grid_params**: This allows you to specify a list of varying values for any parameter.


Both **common_params** and **grid_params** accept all the parameters defined by the ExperimentSetCreate schema.  
The experiments will be generated by combining the **common_params** with each unique set of values from the cartesian product of the lists provided in **grid_params**.

In [None]:
# Judge model settings, passed as `judge_model` in the experiment parameters.
judge_name = "gpt-4.1"
# Must be a plain string: a trailing comma after the literal would turn it
# into a 1-tuple, which is then sent as a JSON list in the request payload.
judge_api_url = "https://api.openai.com/v1"
# Read from the environment so the key never appears in the notebook.
judge_api_key = os.getenv("OPENAI_API_KEY")

In [None]:
# Experiment design
# --
# Sweep the RAG `limit` value while every other setting stays fixed.
expset_name = "mfs_rag_limit_v1"
expset_readme = "Comparing the impact of the `limit` parameters on a RAG model."
metrics = [
    "answer_relevancy",
    "judge_exactness",
    "judge_notator",
    "output_length",
    "generation_time",
]
# Settings shared by every experiment of the set.
common_params = {
    "dataset": "MFS_questions_v01",
    "model": {
        "name": "meta-llama/Llama-3.1-8B-Instruct",
        "sampling_params": {"temperature": 0.2},
        "base_url": MFS_API_URL,
        "api_key": MFS_API_KEY,
    },
    "metrics": metrics,
    "judge_model": {
        "name": judge_name,
        "base_url": judge_api_url,
        "api_key": judge_api_key,
    },
}
# One "model" entry per candidate `limit` value.
grid_params = {
    "model": [
        {"extra_params": {"rag": {"mode": "rag", "limit": limit}}}
        for limit in (1, 2, 3, 4, 5, 7, 10, 15, 20)
    ],
}

# Launching the experiment set
# --
expset = {
    "name": expset_name,
    "readme": expset_readme,
    "cv": {
        "common_params": common_params,
        "grid_params": grid_params,
    },
}
response = requests.post(EVALAP_API_URL + "/experiment_set", json=expset, headers=headers)
resp = response.json()
if "id" not in resp:
    # No id in the answer: print the raw payload for inspection.
    print(resp)
else:
    # Keep the id of the created set in `expset_id`.
    expset_id = resp["id"]
    print(f'Created expset: {resp["name"]} ({expset_id})')

------

## Designing and running an experiment set with repeat params


*Objective*: **Comparing the variability of several models**

An experiment set is a collection of experiments that are part of the same evaluation scenario. 
In this section, we're comparing how much the results of several models vary when each experiment is repeated several times.

To conduct these experiments, one approach consists of creating an empty experiment set (via POST /experiment_set) and then adding a list of experiments to it (via POST /experiment with a reference to the experimentset_id). Each experiment should have the same parameters, except for the model being evaluated.

Alternatively, the /experiment_set endpoint offers a convenient feature called cv (short for cross-validation). This feature includes two key parameters:

- **common_params**: These are the parameters that will be shared across all experiments in the set.
- **grid_params**: This allows you to specify a list of varying values for any parameter.


Both **common_params** and **grid_params** accept all the parameters defined by the ExperimentSetCreate schema.  
The experiments will be generated by combining the **common_params** with each unique set of values from the cartesian product of the lists provided in **grid_params**.

# Experiment design
# --
# Same dataset, metrics and sampling settings for every model; only the
# model entry changes from one experiment to the next.
expset_name = "mfs_variability_v3"
expset_readme = "Comparing some models variability."
metrics = [
    "answer_relevancy",
    "judge_exactness",
    "judge_notator",
    "output_length",
    "generation_time",
]
# Settings shared by every experiment of the set.
common_params = {
    "dataset": "MFS_questions_v01",
    "model": {"sampling_params": {"temperature": 0.2}},
    "metrics": metrics,
    "judge_model": "gpt-4o",
}
grid_params = {
    "model": [
        # Models reached through the Albert API.
        {"name": model_name, "base_url": ALBERT_API_URL, "api_key": ALBERT_API_KEY}
        for model_name in (
            "google/gemma-2-9b-it",
            "meta-llama/Llama-3.1-8B-Instruct",
            "meta-llama/Llama-3.2-3B-Instruct",
            "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8",
            "meta-llama/Llama-3.3-70B-Instruct",
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
        )
    ] + [
        # Models reached through the MFS API, with RAG enabled (limit of 7).
        {
            "name": model_name,
            "extra_params": {"rag": {"mode": "rag", "limit": 7}},
            "base_url": MFS_API_URL,
            "api_key": MFS_API_KEY,
        }
        for model_name in (
            "meta-llama/Llama-3.1-8B-Instruct",
            "AgentPublic/llama3-instruct-guillaumetell",
        )
    ],
}

# Launching the experiment set
# --
expset = {
    "name": expset_name,
    "readme": expset_readme,
    "cv": {
        "common_params": common_params,
        "grid_params": grid_params,
        # presumably the number of runs per experiment — confirm against the API docs
        "repeat": 5,
    },
}
response = requests.post(EVALAP_API_URL + "/experiment_set", json=expset, headers=headers)
resp = response.json()
if "id" not in resp:
    # No id in the answer: print the raw payload for inspection.
    print(resp)
else:
    # Keep the id of the created set in `expset_id`.
    expset_id = resp["id"]
    print(f'Created expset: {resp["name"]} ({expset_id})')

-----

## Designing and running an experiment set with rag metrics



*Objective*: **Comparing baseline models with RAG metrics (using the _retriever_context_)**

In [None]:
# Experiment design
# --
# Every model is queried through the MFS API with the same RAG settings and
# scored with the three contextual metrics listed below.
expset_name = "mfs_RAG_metrics_v3"
expset_readme = "Evaluation of baseline models with RAG metrics (retriever_context)."
metrics = [
    "contextual_relevancy",
    "contextual_recall",
    "contextual_precision",
]
# Settings shared by every experiment of the set.
common_params = {
    "dataset": "MFS_questions_v01",
    "model": {
        "extra_params": {"rag": {"mode": "rag", "limit": 7}},
        "sampling_params": {"temperature": 0.2},
    },
    "metrics": metrics,
    "judge_model": "gpt-4o",
}
# One "model" entry per evaluated model name.
grid_params = {
    "model": [
        {"name": model_name, "base_url": MFS_API_URL, "api_key": MFS_API_KEY}
        for model_name in (
            "meta-llama/Llama-3.2-3B-Instruct",
            "google/gemma-2-9b-it",
            "meta-llama/Llama-3.1-8B-Instruct",
            "AgentPublic/llama3-instruct-guillaumetell",
        )
    ],
}

# Launching the experiment set
# --
expset = {
    "name": expset_name,
    "readme": expset_readme,
    "cv": {
        "common_params": common_params,
        "grid_params": grid_params,
        # presumably the number of runs per experiment — confirm against the API docs
        "repeat": 3,
    },
}
response = requests.post(EVALAP_API_URL + "/experiment_set", json=expset, headers=headers)
resp = response.json()
if "id" not in resp:
    # No id in the answer: print the raw payload for inspection.
    print(resp)
else:
    # Keep the id of the created set in `expset_id`.
    expset_id = resp["id"]
    print(f'Created expset: {resp["name"]} ({expset_id})')