In [67]:
import os
import sys
import time

import numpy as np
import pandas as pd
import requests

sys.path.append("..")
from api.utils import log_and_raise_for_status

API_URL = "http://localhost:8000"
ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1"
ALBERT_API_KEY = os.getenv("ALBERT_API_KEY")

In [241]:
mfs_dataset = pd.read_csv("_data/evaluation_set_qo_qcm.csv", delimiter=";")
mfs_dataset.rename(columns={'true_answer': 'output_true'}, inplace=True) # to be compliant with EG1
mfs_dataset

Unnamed: 0,query,output_true,top_valid,operateur,thematique
0,Comment contester un PV reçu depuis l’Italie ?,"Pour contester un PV reçu depuis l'Italie, il ...",1,ANTS,Amendes
1,Quelles aides judiciaires trouver en France po...,Vous pouvez bénéficier de l'aide juridictionne...,0,ANTS,Amendes
2,Une mairie peut-elle refuser un formulaire cer...,"Non, une mairie ne peut pas refuser un formula...",1,ANTS,CNI/passeport
3,Comment renouveler une Carte Nationale d’Ident...,Pour renouveler une Carte Nationale d'Identité...,1,ANTS,CNI/passeport
4,Quel formulaire cerfa utiliser pour renouveler...,"Si vous ne faites pas la pré-demande, vous dev...",1,ANTS,CNI/passeport
5,Peut-on corriger les informations mises sur un...,"Non, Une fois validée, la pré-demande ne pourr...",1,ANTS,CNI/passeport
6,La mairie accepte-t-elle de corriger des faute...,"Non, Une fois validée, la pré-demande ne pourr...",1,ANTS,CNI/passeport
7,Le fils est né en France et a la nationalité f...,Il faut sélectionner : Vous êtes né en France ...,1,ANTS,CNI/passeport
8,Comment refaire une carte grise si l’ancien pr...,Le vendeur doit faire une déclaration de perte...,1,Ministère de l'intérieur ?,Carte grise
9,"Je suis un mineur, qui doit faire la demande d...",pour faire une demande de passeport pour un mi...,1,ANTS,CNI/Passeport


In [247]:
# Publish a dataset
dataset = {"name": "MFS_questions_v002", "df": mfs_dataset.to_json()}
response = requests.post(f'{API_URL}/v1/dataset', json=dataset)
resp = response.json()
resp

{'name': 'MFS_questions_v002',
 'id': 25,
 'has_query': True,
 'has_output': False,
 'has_output_true': True,
 'size': 39}

In [248]:
# Show available metrics
# - the require fields should be an existing field to the dataset used with the metric.
# - output metric can be generated from the query (see below)
response = requests.get(f'{API_URL}/v1/metrics')
response.json()

[{'name': 'judge_completude',
  'type': 'llm',
  'require': ['output', 'output_true']},
 {'name': 'judge_notator',
  'type': 'llm',
  'require': ['output', 'output_true', 'query']},
 {'name': 'qcm_exactness',
  'type': 'llm',
  'require': ['output', 'output_true']},
 {'name': 'prompt_length', 'type': 'llm', 'require': ['output']},
 {'name': 'judge_exactness',
  'type': 'llm',
  'require': ['output', 'output_true', 'query']}]

In [257]:
# Launch an experiment with a given **dataset**, **model** and set of **metrics** to compute.
# - the model generate the "output" from the "query"
# - you can also pass the "output" column instead of a model if you generated the answer by yourself...

models_to_test = ["meta-llama/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct"]
experiment_ids = []
for i, model in enumerate(models_to_test):
    experiment = {
        "name" : f"MFS_judge_{model}_{i}_v0",
        "dataset": "MFS_questions_v002",
        "model": {"name": model, "base_url": ALBERT_API_URL, "api_key": ALBERT_API_KEY},
        "metrics": ["prompt_length", "judge_exactness", "judge_completude", "judge_notator"],
    }
    response = requests.post(f'{API_URL}/v1/experiment', json=experiment)
    resp = response.json()
    if "id" in resp:
        experiment_ids.append(resp["id"])
        print(f'Experiment {resp["name"]} launched: {resp["experiment_status"]}')
    else:
        print(resp)

Experiment MFS_judge_meta-llama/Meta-Llama-3.1-8B-Instruct_0_v0 launched: running_answers
Experiment MFS_judge_meta-llama/Meta-Llama-3.1-70B-Instruct_1_v0 launched: running_answers


In [220]:
# Relaunch measures of an existing experiment
# (FOR THE EXAMPLE)
experiment = {
    "metrics": ["qcm_exactness", "prompt_length"],
    "skip_answers_generation": True,
}
response = requests.patch(f'{API_URL}/v1/experiment/{exp_id}', json=experiment)
response.json()

{'name': 'MFS_judge_v0',
 'metrics': ['prompt_length',
  'judge_exactness',
  'judge_completude',
  'judge_notator'],
 'readme': None,
 'experiment_set_id': None,
 'skip_answers_generation': True,
 'id': 6,
 'created_at': '2024-11-04T00:04:13.568872',
 'experiment_status': 'running_metrics',
 'num_try': 39,
 'num_success': 39,
 'dataset_id': 10,
 'model_id': 6}

In [259]:
# Read results
# --
df_all = []
for model, exp_id in zip(models_to_test, experiment_ids):
    # Get an experiment result
    response = requests.get(f'{API_URL}/v1/experiment/{exp_id}?with_results=true')
    results = response.json()["results"]
    
    # Build a result dataframe
    # --
    # build frontend measure from the observation_table (mean stc etc)
    df_metrics = {}
    for metric_results in results: 
        metric_name = metric_results["metric_name"]
        arr = np.array([x["score"] for x in metric_results["observation_table"] if (isinstance(x["score"], (int, float)))])
        df = pd.DataFrame([[
                np.mean(arr),
                np.std(arr),
                np.median(arr),
                f"{arr.mean():.2f} ± {arr.std():.2f}"  # Formatting as 'mean±std'
            ]], columns=["mean", "std", "median", "mean_std"])
        df_metrics[metric_name] = df
    
    
    # Stack the mean_std final measure
    # --
    # Create a DataFrame for mean±std
    df = pd.DataFrame({metric_name:df["mean_std"].iloc[0] for metric_name, df in df_metrics.items()}, index=[model])
    df_all.append(df)

pd.concat(df_all)

Unnamed: 0,prompt_length,judge_completude,judge_exactness,judge_notator
meta-llama/Meta-Llama-3.1-8B-Instruct,295.03 ± 107.24,26.67 ± 20.89,0.08 ± 0.27,3.51 ± 1.99
meta-llama/Meta-Llama-3.1-70B-Instruct,242.87 ± 106.95,34.74 ± 23.51,0.21 ± 0.40,4.79 ± 2.85


In [255]:
# See what's inside an experiment result
results

[{'metric_name': 'judge_completude',
  'id': 35,
  'created_at': '2024-11-04T00:35:36.216793',
  'observation_table': [{'id': 1167,
    'created_at': '2024-11-04T00:37:16.344579',
    'score': 50.0,
    'observation': '50',
    'num_line': 8,
    'error_msg': None},
   {'id': 1169,
    'created_at': '2024-11-04T00:37:16.484487',
    'score': 0.0,
    'observation': '0',
    'num_line': 4,
    'error_msg': None},
   {'id': 1170,
    'created_at': '2024-11-04T00:37:16.916563',
    'score': 20.0,
    'observation': '20',
    'num_line': 7,
    'error_msg': None},
   {'id': 1171,
    'created_at': '2024-11-04T00:37:16.972361',
    'score': 20.0,
    'observation': '20',
    'num_line': 5,
    'error_msg': None},
   {'id': 1173,
    'created_at': '2024-11-04T00:37:16.919882',
    'score': 70.0,
    'observation': '70',
    'num_line': 3,
    'error_msg': None},
   {'id': 1174,
    'created_at': '2024-11-04T00:37:17.423462',
    'score': 10.0,
    'observation': '10',
    'num_line': 1,
    