# Demo for launching a simple experiment in EG1

- code: https://github.com/etalab-ia/eg1
- api documentation: https://eg1.etalab.gouv.fr/redoc

In [1]:
import os
import sys
import time

import dotenv
from IPython.display import HTML
import numpy as np
import pandas as pd
import requests

dotenv.load_dotenv("../.env")
sys.path.append("..")
from api.utils import log_and_raise_for_status

# Base URL of the EG1 API this notebook talks to (local instance).
API_URL = "http://localhost:8000"
# Endpoints of the model providers used by the experiments below.
ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1"
MFS_API_URL = "https://franceservices.staging.etalab.gouv.fr/api/v1"
# Credentials are read from the environment (loaded from ../.env above) so that
# no secret is committed with the notebook. Define ALBERT_API_KEY and
# MFS_API_KEY in that file before running.
ALBERT_API_KEY = os.getenv("ALBERT_API_KEY")
MFS_API_KEY = os.getenv("MFS_API_KEY")

## Load a dataset on which your evaluation is grounded

In [2]:
# Load the evaluation set and expose the reference answers under the
# `output_true` column name, to be compliant with EG1.
mfs_dataset = (
    pd.read_csv("_data/evaluation_set_qo_qcm.csv", sep=";")
    .rename(columns={"true_answer": "output_true"})
)
mfs_dataset

Unnamed: 0,query,output_true,top_valid,operateur,thematique
0,Comment contester un PV reçu depuis l’Italie ?,"Pour contester un PV reçu depuis l'Italie, il ...",1,ANTS,Amendes
1,Quelles aides judiciaires trouver en France po...,Vous pouvez bénéficier de l'aide juridictionne...,0,ANTS,Amendes
2,Une mairie peut-elle refuser un formulaire cer...,"Non, une mairie ne peut pas refuser un formula...",1,ANTS,CNI/passeport
3,Comment renouveler une Carte Nationale d’Ident...,Pour renouveler une Carte Nationale d'Identité...,1,ANTS,CNI/passeport
4,Quel formulaire cerfa utiliser pour renouveler...,"Si vous ne faites pas la pré-demande, vous dev...",1,ANTS,CNI/passeport
5,Peut-on corriger les informations mises sur un...,"Non, Une fois validée, la pré-demande ne pourr...",1,ANTS,CNI/passeport
6,La mairie accepte-t-elle de corriger des faute...,"Non, Une fois validée, la pré-demande ne pourr...",1,ANTS,CNI/passeport
7,Le fils est né en France et a la nationalité f...,Il faut sélectionner : Vous êtes né en France ...,1,ANTS,CNI/passeport
8,Comment refaire une carte grise si l’ancien pr...,Le vendeur doit faire une déclaration de perte...,1,Ministère de l'intérieur ?,Carte grise
9,"Je suis un mineur, qui doit faire la demande d...",pour faire une demande de passeport pour un mi...,1,ANTS,CNI/Passeport


## Publish the dataset on EG1

If the dataset already exists, you'll get a DuplicateEntry error. That is normal.

In [33]:
# Publish a dataset: the payload carries the dataset name and the dataframe
# serialized as JSON.
dataset = dict(name="MFS_questions_v01", df=mfs_dataset.to_json())
response = requests.post(f"{API_URL}/v1/dataset", json=dataset)
resp = response.json()
resp

{'name': 'MFS_questions_v01',
 'id': 1,
 'has_query': True,
 'has_output': False,
 'has_output_true': True,
 'size': 39}

## List all available metrics and their information

- the `require` field indicates which fields are required in the dataset for this metric to operate.
- the `type` field is ignored for now. It will later be associated with the type of the observation you get in the result output.

In [31]:
# Show available metrics
# - every field listed in `require` must exist in the dataset used with the metric.
# - the `output` field can be generated from the query by a model (see below)
response = requests.get(f"{API_URL}/v1/metrics")
all_metrics = response.json()
df = pd.DataFrame(all_metrics)
df = df.sort_values(by=["type", "name"])
# Render as HTML without the index column.
HTML(df.to_html(index=False))

name,description,type,require
answer_relevancy,see https://docs.confident-ai.com/docs/metrics-introduction,deepeval,"[output, query]"
bias,see https://docs.confident-ai.com/docs/metrics-introduction,deepeval,"[output, query]"
faithfulness,see https://docs.confident-ai.com/docs/metrics-introduction,deepeval,"[output, query, retrieval_context]"
toxicity,see https://docs.confident-ai.com/docs/metrics-introduction,deepeval,"[output, query]"
judge_completude,[0-100] score of the completude correspondance between output and output_true,llm,"[output, output_true]"
judge_exactness,Binary correspondance between output and output_true,llm,"[output, output_true, query]"
judge_notator,[1-10] score semantic similarity between output and output_true,llm,"[output, output_true, query]"
output_length,Binary correspondance between output and output_true,llm,[output]
qcm_exactness,output and output_true binary equality,llm,"[output, output_true]"


## Launching a couple of experiments

Here we launch the experiments independently with the `/experiment` route. We will see in another notebook how to launch grouped experiments in a grid-search fashion by using the `/experimentset` route.

In [87]:
# Launch one experiment per model, each with a given **dataset**, **model**
# and set of **metrics** to compute.
# - the model generates the "output" from the "query"
# - you can also pass the "output" column instead of a model if you generated the answers yourself...

# Designing my experiments
sampling_params = {"temperature": 0.2}  # same sampling params for every model in this evaluation
models_to_test = [
    # Plain models served by the Albert API.
    *[
        {"name": model_name, "base_url": ALBERT_API_URL, "api_key": ALBERT_API_KEY}
        for model_name in (
            "AgentPublic/llama3-instruct-8b",
            "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "meta-llama/Meta-Llama-3.1-70B-Instruct",
        )
    ],
    # Model served by the MFS API with RAG parameters; "_name" is the label
    # used in experiment names and in the result table.
    {
        "_name": "mfs-rag-baseline",
        "name": "AgentPublic/llama3-instruct-8b",
        "extra_params": {"rag": {"mode": "rag", "limit": 7}},
        "base_url": MFS_API_URL,
        "api_key": MFS_API_KEY,
    },
]

# Launching the experiments
experiment_ids = []
for i, model in enumerate(models_to_test):
    # Prefer the explicit "_name" label, fall back to the model name.
    name = model.get("_name") or model["name"]
    model["sampling_params"] = sampling_params
    experiment = {
        "name": f"MFS_questions_{name}_{i}_v0",
        "dataset": "MFS_questions_v01",
        "model": model,
        "metrics": [
            "output_length",
            "judge_exactness",
            "judge_completude",
            "judge_notator",
            "bias",
            "toxicity",
            "answer_relevancy",
        ],
    }
    response = requests.post(f"{API_URL}/v1/experiment", json=experiment)
    resp = response.json()
    if "id" not in resp:
        # No id in the response: show what the API answered and move on.
        print(resp)
        continue
    experiment_ids.append(resp["id"])
    print(f'Created experiment: {resp["name"]} ({resp["id"]}), status: {resp["experiment_status"]}')

Created experiment: MFS_questions_AgentPublic/llama3-instruct-8b_0_v0 (93), status: running_answers
Created experiment: MFS_questions_meta-llama/Meta-Llama-3.1-8B-Instruct_1_v0 (94), status: running_answers
Created experiment: MFS_questions_meta-llama/Meta-Llama-3.1-70B-Instruct_2_v0 (95), status: running_answers
Created experiment: MFS_questions_mfs-rag-baseline_3_v0 (96), status: running_answers


In [None]:
# Add a metric to existing experiment(s)
# - Use this to update one or many metrics without relaunching the answer generation.
# - Disabled by default so that "Run All" does not re-trigger metric computation:
#   set UPDATE_METRICS to True to run it. (A bare `return` cannot be used as a
#   guard here: outside a function it is a SyntaxError.)
# --
UPDATE_METRICS = False

if UPDATE_METRICS:
    for exp_id in experiment_ids:
        experiment = {
            # "metrics": ["answer_relevancy"],
            "metrics": ["output_length", "judge_exactness", "judge_completude", "judge_notator", "bias", "toxicity", "answer_relevancy"],
            "skip_answers_generation": True,
        }
        response = requests.patch(f'{API_URL}/v1/experiment/{exp_id}', json=experiment)
        resp = response.json()
        if "id" in resp:
            print(f'Updated experiment: {resp["name"]} ({resp["id"]}), status: {resp["experiment_status"]}')
        else:
            print(resp)

## Reading and showing results

In [91]:
# Read results
# --
# Build one table with a row per experiment and a "mean ± std" cell per metric.


def summarize_scores(observation_table):
    """Aggregate the numeric scores of one metric's observation table.

    Args:
        observation_table: list of observation dicts, each with a "score" key.
            Observations whose score is not an int or a float (e.g. None for a
            failed observation) are left out.

    Returns:
        A dict with the keys "mean", "std", "median", "mean_std" (formatted as
        'mean ± std') and "support" (number of scores kept). When no score is
        usable, the statistics are NaN and the support is 0.
    """
    scores = np.array(
        [obs["score"] for obs in observation_table if isinstance(obs["score"], (int, float))],
        dtype=float,
    )
    if scores.size == 0:
        # Avoid numpy's "mean of empty slice" warnings; the formatted value
        # stays "nan ± nan".
        mean = std = median = np.nan
    else:
        mean, std, median = scores.mean(), scores.std(), np.median(scores)
    return {
        "mean": mean,
        "std": std,
        "median": median,
        "mean_std": f"{mean:.2f} ± {std:.2f}",
        "support": len(scores),
    }


# Models and experiment ids are matched by position: if an experiment could not
# be created, the id list is shorter and every following row would be labelled
# with the wrong model. Fail loudly instead.
if len(experiment_ids) != len(models_to_test):
    raise ValueError(
        f"Got {len(experiment_ids)} experiment ids for {len(models_to_test)} models: "
        "some experiments were not created, so results cannot be matched to models."
    )

df_all = []  # one single-row DataFrame per experiment
supports = []  # if some answer/observation failed, the support can be less than the dataset size
for model, exp_id in zip(models_to_test, experiment_ids):
    # Get an experiment result
    response = requests.get(f'{API_URL}/v1/experiment/{exp_id}?with_results=true')
    experiment = response.json()
    if "results" not in experiment:
        # Error payload or unexpected answer: report it and skip this experiment.
        print(f"Warning: no results for experiment {exp_id}: {experiment}")
        continue
    if experiment.get("experiment_status") != "finished":
        print(f"Warning: experiment {exp_id} is not finished yet...")
    # experiment["name"] # Name of the experiment
    results = experiment["results"]

    # Summarize each metric from its observation table.
    metric_summaries = {}
    metric_support = []
    for metric_results in results:
        summary = summarize_scores(metric_results["observation_table"])
        metric_summaries[metric_results["metric_name"]] = summary
        metric_support.append(summary["support"])
    supports.append(metric_support)

    # One row per experiment, labelled by the model, with metrics in alphabetical order.
    name = model.get("_name") or model["name"]
    row = {
        metric_name: metric_summaries[metric_name]["mean_std"]
        for metric_name in sorted(metric_summaries)
    }
    df_all.append(pd.DataFrame(row, index=[name]))

# pd.concat raises on an empty list; fall back to an empty table.
final_df = pd.concat(df_all) if df_all else pd.DataFrame()
#final_df["support"] = supports # for debugging
final_df

Unnamed: 0,answer_relevancy,bias,judge_completude,judge_exactness,judge_notator,output_length,toxicity
AgentPublic/llama3-instruct-8b,0.86 ± 0.24,0.00 ± 0.00,29.62 ± 21.44,0.03 ± 0.16,3.69 ± 1.99,308.44 ± 97.30,0.00 ± 0.00
meta-llama/Meta-Llama-3.1-8B-Instruct,0.94 ± 0.11,0.00 ± 0.00,29.49 ± 19.21,0.00 ± 0.00,3.51 ± 2.01,303.51 ± 107.09,0.00 ± 0.00
meta-llama/Meta-Llama-3.1-70B-Instruct,0.92 ± 0.21,0.00 ± 0.00,35.51 ± 23.14,0.05 ± 0.22,4.72 ± 2.53,254.03 ± 104.30,0.00 ± 0.00
mfs-rag-baseline,0.88 ± 0.18,0.00 ± 0.00,43.92 ± 28.44,0.18 ± 0.38,5.51 ± 2.60,117.18 ± 59.10,0.00 ± 0.00


## What is inside an experiment result ?

In [102]:
# See what's inside an experiment result
# (`results` holds the results of the last experiment read above).
# Only the last expression of the cell is rendered as output.
# --
metric_index = 4
selected_metric = results[metric_index]
observations = selected_metric["observation_table"]
len(results)  # number of metrics
list(selected_metric)  # the keys of the dict representing the result object
len(observations)  # number of observations -> one per dataset line.
observations[0]  # the actual "observation" for one metric, for one line in the dataset.

{'id': 7087,
 'created_at': '2024-11-09T01:37:35.215941',
 'score': 1.0,
 'observation': 'The score is 1.00 because the response perfectly addressed the question without any irrelevant information. Well done!',
 'num_line': 10,
 'error_msg': None,
 'execution_time': 5}