In [1]:
# ruff: noqa: T201, T203
from __future__ import annotations

import os
from pathlib import Path

# If the kernel was started inside a directory named "notebooks", step up one
# level before the project imports below run.
# NOTE(review): presumably this makes `src.*` / `config.*` and the relative
# data/output paths resolve from the repository root - confirm.
if Path.cwd().name == "notebooks":
    os.chdir("..")

import logging

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

import src.prompts.system as system_prompts
from config.config import RUN_DIR
from config.runs_vars import SUBSET_TO_DATASET_MAP
from src.constants import PAIRS_SEPARATOR
from src.evaluate import (
    analyze_results,
    get_predictions_with_gt,
    plot_usage_histograms,
    read_run_metrics_df,
    store_run_metrics_df,
)
from src.formatting import (
    format_combined_metrics_path,
    format_oracle_pairs_filepath,
    format_oracle_pairs_precomputed_dir,
    format_predictions_run_path,
    format_storing_pathes_from_run_path,
    format_subsets_ontologies_paths,
)
from src.LLM_servers.openai import OpenAIServer
from src.onto_access import OntologyAccess
from src.onto_object import OntologyEntryAttr
from src.processing import parallel_samples_process, save_oracle_pairs_with_prompts, try_load_precomputed_oracle_pairs
from src.prompts.prompts import (
    prompt_direct_entity,
    prompt_direct_entity_ontological,
    prompt_direct_entity_with_synonyms,
    prompt_sequential_hierarchy,
    prompt_sequential_hierarchy_ontological,
    prompt_sequential_hierarchy_with_synonyms,
)
from src.utils import read_oracle_pairs, save_run_results

# Never truncate rows when a DataFrame is displayed.
pd.set_option("display.max_rows", None)
# Raise the root logger threshold so only WARNING and above are emitted.
logging.getLogger().setLevel(logging.WARNING)
# Load variables from a .env file; the API keys read from os.environ in the
# experiment loop presumably come from there - confirm.
load_dotenv()

%load_ext autoreload
%autoreload 2

In [2]:
# Registry of prompt builders keyed by their short name, i.e. the function name
# with "prompt_" stripped out. Insertion order is the order in which the
# prompts are previewed and executed by the cells below.
PROMPT_FUNCTIONS_MAP = {
    builder.__name__.replace("prompt_", ""): builder
    for builder in (
        prompt_direct_entity,
        prompt_direct_entity_ontological,
        prompt_sequential_hierarchy,
        prompt_sequential_hierarchy_ontological,
        prompt_direct_entity_with_synonyms,
        prompt_sequential_hierarchy_with_synonyms,
    )
}
# ruff: noqa: ERA001
# SYSPROMPTS_MAP = {
#     "base": system_prompts.BASELINE_INITIALIZATION_MESSAGE,
#     "natural_language": system_prompts.INTUITIVE_NATURAL_LANGUAGE_JUDGEMENT_MESSAGE,
#     "ontology_aware": system_prompts.ONTOLOGY_AWARE_REASONING_MESSAGE,
#     "synonym_aware": system_prompts.SYNONYM_AWARE_MESSAGE,
#     "none": None,
# }

In [3]:
# Dataset / subset pair inspected by the pre-validation cells below.
DATASET = "anatomy"
SUBSET = "human-mouse"

# Resolve the two ontology files for that pair and echo them as the cell output.
# NOTE(review): the recorded output lists mouse.owl first and human.owl second
# for "human-mouse" - confirm that this source/target order is intended.
src_onto_path, tgt_onto_path = format_subsets_ontologies_paths(DATASET, SUBSET)
(src_onto_path, tgt_onto_path)

(PosixPath('data/anatomy/human-mouse/mouse.owl'),
 PosixPath('data/anatomy/human-mouse/human.owl'))

### Pre-Validation

In [None]:
# Open the source ontology first, then the target one, both with
# annotate_on_init enabled.
onto_src, onto_tgt = (
    OntologyAccess(onto_path, annotate_on_init=True) for onto_path in (src_onto_path, tgt_onto_path)
)

In [None]:
# Take the first oracle pair of the selected subset and wrap each side in an
# OntologyEntryAttr bound to its own ontology.
oracle_pairs_path = format_oracle_pairs_filepath(DATASET, SUBSET)
first_src, first_tgt = read_oracle_pairs(oracle_pairs_path)[0]
src_entity = OntologyEntryAttr(first_src, onto_src)
tgt_entity = OntologyEntryAttr(first_tgt, onto_tgt)

print(src_entity)
print(tgt_entity)

# Render every registered prompt for that single pair so the wording can be
# checked by eye before any model is called.
for prompt_name, build_prompt in PROMPT_FUNCTIONS_MAP.items():
    print(f"Prompt function: {prompt_name}")
    print(build_prompt(src_entity, tgt_entity))
    print("=" * 50)

In [None]:
# For each prompt variant, load its precomputed oracle pairs and print the
# items of the first entry, one per line.
for prompt_name, build_prompt in PROMPT_FUNCTIONS_MAP.items():
    print(f"Prompt function: {prompt_name}")
    precomputed_pairs = try_load_precomputed_oracle_pairs(DATASET, SUBSET, build_prompt)
    first_entry = precomputed_pairs[0]
    print(*first_entry, sep="\n")


### Pre-Processing

In [None]:
# Dataset -> subsets whose oracle pairs are processed in this cell.
# Commented-out entries are kept as ready-to-enable alternatives.
ALL_DATASET_NAMES = {
    # "anatomy": ["human-mouse"],
    # "bioml-2024": ["snomed-fma.body", "snomed-ncit.neoplas", "snomed-ncit.pharm", "ncit-doid", "omim-ordo"],
    "largebio": ["fma-nci", "fma-snomed", "snomed-nci"],
}
prompt_functions = list(PROMPT_FUNCTIONS_MAP.values())

for dataset_name, set_names in ALL_DATASET_NAMES.items():
    for set_name in set_names:
        # Best effort: a failure for one subset is reported and the loop moves
        # on to the next subset instead of aborting the whole cell.
        try:
            # Load every ontology of the subset, in the order returned by
            # format_subsets_ontologies_paths, with a progress bar.
            ontologies = [
                OntologyAccess(onto_path, annotate_on_init=True)
                for onto_path in tqdm(
                    format_subsets_ontologies_paths(dataset_name, set_name),
                    desc=f"Loading ontologies: {dataset_name} - {set_name}",
                )
            ]

            pairs_path = format_oracle_pairs_filepath(dataset_name, set_name)
            oracle_pairs_dir = format_oracle_pairs_precomputed_dir(dataset_name, set_name)

            # The return value is not needed here.
            # NOTE(review): presumably the results are persisted under
            # oracle_pairs_dir - confirm against save_oracle_pairs_with_prompts.
            save_oracle_pairs_with_prompts(
                pairs_path, *ontologies, prompt_functions, oracle_pairs_dir, PAIRS_SEPARATOR, max_workers=2
            )
        except Exception as e:  # noqa: BLE001
            print(f"Error processing {dataset_name} - {set_name}: {e}")

### Run experiments


In [None]:
# Worker count passed to parallel_samples_process in the experiment loop.
MAX_WORKERS = 100
# Models to evaluate; "gpt-4o-mini" is the other option used so far.
MODELS = ["gemini-2.0-flash-lite"]

# Dataset -> subsets included in this experiment.
DATASETS_MAP = {
    "anatomy": ["human-mouse"],
    # Other bioml-2024 subsets: "snomed-fma.body", "snomed-ncit.neoplas", "snomed-ncit.pharm", "ncit-doid"
    "bioml-2024": ["omim-ordo"],
    # Other largebio subsets: "fma-snomed", "snomed-nci"
    "largebio": ["fma-nci"],
}
EXPERIMENT_TYPE = "determinism"
# One entry per repetition ("0".."3"), each mapped to its own empty dict.
EXP_NAMES_MAP = {str(repetition): {} for repetition in range(4)}

# The experiment loop passes these through as None.
# NOTE(review): this presumably relies on precomputed prompts being available
# so the ontologies are never accessed - confirm.
onto_src = onto_tgt = None

In [5]:
# Full experiment grid: repetition x dataset x subset x model x prompt variant.
# The loop variables DATASET / SUBSET / MODEL intentionally keep their
# upper-case names because the progress line below prints them via `{NAME=}`.
for exp_name in EXP_NAMES_MAP:
    for DATASET, subsets in DATASETS_MAP.items():
        for SUBSET in subsets:
            # Load the ontologies here, if there are no precomputed prompts
            for MODEL in MODELS:
                # A new client is built for every (repetition, dataset, subset,
                # model) combination. Gemini models get the Gemini key and an
                # explicit base URL; everything else uses the OpenAI key.
                if MODEL.startswith("gemini"):
                    llm_oracle = OpenAIServer(
                        api_key=os.environ["GEMINI_API_KEY"],
                        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
                    )
                else:
                    llm_oracle = OpenAIServer(api_key=os.environ["OPENAI_API_KEY"])
                llm_oracle.add_system_context(system_prompts.INTUITIVE_NATURAL_LANGUAGE_JUDGEMENT_MESSAGE)

                run_path = format_predictions_run_path(DATASET, SUBSET, MODEL, EXPERIMENT_TYPE, exp_spec=exp_name)
                print(f"{run_path=} | {DATASET=} | {SUBSET=} | {MODEL=} | {exp_name=}")

                for prompt_name, prompt_function in PROMPT_FUNCTIONS_MAP.items():
                    # 1) Query the model for every precomputed candidate pair.
                    oracle_candidate_pairs = try_load_precomputed_oracle_pairs(DATASET, SUBSET, prompt_function)
                    results, tokens_usage, confidences = parallel_samples_process(
                        oracle_candidate_pairs, llm_oracle, onto_src, onto_tgt, MODEL, MAX_WORKERS, prompt_function
                    )

                    # 2) Store the raw predictions and pass the usage data on
                    #    with plotting and printing switched off.
                    prediction_path, stats_path, diagram_path = format_storing_pathes_from_run_path(
                        run_path, SUBSET, MODEL, prompt_name, suffix=""
                    )
                    save_run_results(results, prediction_path, columns=["Source", "Target", "Prediction", "Confidence"])
                    plot_usage_histograms(
                        tokens_usage, confidences, do_plot=False, do_print=False, suptitle=prompt_name
                    )

                    # 3) Evaluate against the ground truth. A failure here is
                    #    only reported so the remaining prompts still run.
                    try:
                        predictions = get_predictions_with_gt(run_path, DATASET, SUBSET, MODEL, prompt_name, suffix="")
                        analyze_results(
                            predictions,
                            print_results=False,
                            plot_confusion_matrix=False,
                            subtitle=f"{SUBSET}: {MODEL} {prompt_name} | ",
                            cm_save_path=diagram_path,
                            stats_path=stats_path,
                        )
                    except Exception as err:  # noqa: BLE001
                        print(f"Error: {err}")

                # Called once per (repetition, dataset, subset, model), after
                # all prompt variants have been processed.
                store_run_metrics_df(PROMPT_FUNCTIONS_MAP, run_path, DATASET, SUBSET, MODEL)

run_path=PosixPath('outputs/anatomy/human-mouse/gemini-2.0-flash-lite/determinism/0') | DATASET='anatomy' | SUBSET='human-mouse' | MODEL='gemini-2.0-flash-lite' | exp_name='0'


Processing Lines prompt_direct_entity: 100%|██████████| 398/398 [00:10<00:00, 37.38it/s] 
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 398/398 [00:11<00:00, 35.76it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 398/398 [00:10<00:00, 38.98it/s] 
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 398/398 [00:10<00:00, 37.45it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 398/398 [00:09<00:00, 41.61it/s] 
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 398/398 [00:10<00:00, 38.78it/s]


run_path=PosixPath('outputs/bioml-2024/omim-ordo/gemini-2.0-flash-lite/determinism/0') | DATASET='bioml-2024' | SUBSET='omim-ordo' | MODEL='gemini-2.0-flash-lite' | exp_name='0'


Processing Lines prompt_direct_entity: 100%|██████████| 1464/1464 [00:37<00:00, 39.18it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1464/1464 [00:39<00:00, 36.72it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1464/1464 [00:36<00:00, 39.66it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1464/1464 [00:38<00:00, 37.88it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1464/1464 [00:39<00:00, 37.15it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1464/1464 [00:37<00:00, 39.24it/s]


run_path=PosixPath('outputs/largebio/fma-nci/gemini-2.0-flash-lite/determinism/0') | DATASET='largebio' | SUBSET='fma-nci' | MODEL='gemini-2.0-flash-lite' | exp_name='0'


Processing Lines prompt_direct_entity: 100%|██████████| 1258/1258 [00:35<00:00, 35.75it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1258/1258 [00:33<00:00, 38.04it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1258/1258 [00:34<00:00, 36.95it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1258/1258 [00:33<00:00, 37.60it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1258/1258 [00:33<00:00, 37.74it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1258/1258 [00:32<00:00, 38.55it/s]


run_path=PosixPath('outputs/anatomy/human-mouse/gemini-2.0-flash-lite/determinism/1') | DATASET='anatomy' | SUBSET='human-mouse' | MODEL='gemini-2.0-flash-lite' | exp_name='1'


Processing Lines prompt_direct_entity: 100%|██████████| 398/398 [00:09<00:00, 42.40it/s] 
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 398/398 [00:10<00:00, 36.99it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 398/398 [00:10<00:00, 36.73it/s] 
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 398/398 [00:10<00:00, 38.08it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 398/398 [00:10<00:00, 38.00it/s] 
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 398/398 [00:11<00:00, 35.30it/s]


run_path=PosixPath('outputs/bioml-2024/omim-ordo/gemini-2.0-flash-lite/determinism/1') | DATASET='bioml-2024' | SUBSET='omim-ordo' | MODEL='gemini-2.0-flash-lite' | exp_name='1'


Processing Lines prompt_direct_entity: 100%|██████████| 1464/1464 [00:38<00:00, 38.39it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1464/1464 [00:37<00:00, 38.61it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1464/1464 [00:37<00:00, 38.91it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1464/1464 [00:40<00:00, 36.28it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1464/1464 [00:40<00:00, 35.81it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1464/1464 [00:36<00:00, 40.31it/s]


run_path=PosixPath('outputs/largebio/fma-nci/gemini-2.0-flash-lite/determinism/1') | DATASET='largebio' | SUBSET='fma-nci' | MODEL='gemini-2.0-flash-lite' | exp_name='1'


Processing Lines prompt_direct_entity: 100%|██████████| 1258/1258 [00:33<00:00, 38.09it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1258/1258 [00:31<00:00, 39.42it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1258/1258 [00:33<00:00, 37.69it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1258/1258 [00:34<00:00, 36.70it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1258/1258 [00:34<00:00, 36.31it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1258/1258 [00:28<00:00, 43.39it/s]


run_path=PosixPath('outputs/anatomy/human-mouse/gemini-2.0-flash-lite/determinism/2') | DATASET='anatomy' | SUBSET='human-mouse' | MODEL='gemini-2.0-flash-lite' | exp_name='2'


Processing Lines prompt_direct_entity: 100%|██████████| 398/398 [00:10<00:00, 37.30it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 398/398 [00:10<00:00, 38.22it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 398/398 [00:10<00:00, 36.90it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 398/398 [00:10<00:00, 38.00it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 398/398 [00:10<00:00, 37.91it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 398/398 [00:11<00:00, 36.04it/s]


run_path=PosixPath('outputs/bioml-2024/omim-ordo/gemini-2.0-flash-lite/determinism/2') | DATASET='bioml-2024' | SUBSET='omim-ordo' | MODEL='gemini-2.0-flash-lite' | exp_name='2'


Processing Lines prompt_direct_entity: 100%|██████████| 1464/1464 [00:38<00:00, 37.55it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1464/1464 [00:38<00:00, 37.75it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1464/1464 [00:38<00:00, 37.55it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1464/1464 [00:44<00:00, 33.21it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1464/1464 [00:47<00:00, 30.87it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1464/1464 [00:45<00:00, 32.05it/s]


run_path=PosixPath('outputs/largebio/fma-nci/gemini-2.0-flash-lite/determinism/2') | DATASET='largebio' | SUBSET='fma-nci' | MODEL='gemini-2.0-flash-lite' | exp_name='2'


Processing Lines prompt_direct_entity: 100%|██████████| 1258/1258 [00:32<00:00, 38.64it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1258/1258 [00:35<00:00, 35.51it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1258/1258 [00:37<00:00, 33.57it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1258/1258 [00:36<00:00, 34.60it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1258/1258 [00:34<00:00, 35.96it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1258/1258 [00:34<00:00, 36.18it/s]


run_path=PosixPath('outputs/anatomy/human-mouse/gemini-2.0-flash-lite/determinism/3') | DATASET='anatomy' | SUBSET='human-mouse' | MODEL='gemini-2.0-flash-lite' | exp_name='3'


Processing Lines prompt_direct_entity: 100%|██████████| 398/398 [00:11<00:00, 34.64it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 398/398 [00:12<00:00, 33.09it/s] 
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 398/398 [00:10<00:00, 38.64it/s] 
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 398/398 [00:11<00:00, 33.94it/s] 
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 398/398 [00:10<00:00, 37.76it/s] 
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 398/398 [00:11<00:00, 34.00it/s]


run_path=PosixPath('outputs/bioml-2024/omim-ordo/gemini-2.0-flash-lite/determinism/3') | DATASET='bioml-2024' | SUBSET='omim-ordo' | MODEL='gemini-2.0-flash-lite' | exp_name='3'


Processing Lines prompt_direct_entity: 100%|██████████| 1464/1464 [00:37<00:00, 39.28it/s]
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1464/1464 [00:39<00:00, 36.93it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1464/1464 [00:39<00:00, 36.74it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1464/1464 [00:44<00:00, 32.96it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1464/1464 [00:47<00:00, 30.77it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1464/1464 [00:37<00:00, 39.08it/s]


run_path=PosixPath('outputs/largebio/fma-nci/gemini-2.0-flash-lite/determinism/3') | DATASET='largebio' | SUBSET='fma-nci' | MODEL='gemini-2.0-flash-lite' | exp_name='3'


Processing Lines prompt_direct_entity: 100%|██████████| 1258/1258 [00:35<00:00, 35.21it/s] 
Processing Lines prompt_direct_entity_ontological: 100%|██████████| 1258/1258 [00:34<00:00, 36.18it/s]
Processing Lines prompt_sequential_hierarchy: 100%|██████████| 1258/1258 [00:36<00:00, 34.67it/s]
Processing Lines prompt_sequential_hierarchy_ontological: 100%|██████████| 1258/1258 [00:37<00:00, 33.93it/s]
Processing Lines prompt_direct_entity_with_synonyms: 100%|██████████| 1258/1258 [00:31<00:00, 39.35it/s]
Processing Lines prompt_sequential_hierarchy_with_synonyms: 100%|██████████| 1258/1258 [00:35<00:00, 35.68it/s]


## Analysis


In [None]:
# Build one (run directory, model, subset, dataset) tuple per run directory,
# visiting directories in reverse name order. Subset and model are parsed from
# the first two "_"-separated parts of a file name inside the directory.
runs_array = []

for directory in sorted(RUN_DIR.iterdir(), key=lambda x: x.name, reverse=True):
    if not directory.is_dir():
        continue

    # Reset per directory: without this, a directory that has no "_"-named file
    # would re-use the values parsed for the previous directory (or raise
    # NameError if it is the very first one).
    run_entry = None
    for file in directory.iterdir():
        if "_" not in file.name:
            continue
        subset_name, model_name = file.name.split("_")[:2]
        # .get() yields None for subsets missing from SUBSET_TO_DATASET_MAP.
        dataset_name = SUBSET_TO_DATASET_MAP.get(subset_name)
        # Later matches overwrite earlier ones, so the last matching file wins.
        run_entry = (directory.name, model_name, subset_name, dataset_name)

    # Directories without any matching file are skipped.
    if run_entry is not None:
        runs_array.append(run_entry)

len(runs_array), runs_array

In [None]:
# Runs whose metrics are combined below, as
# (run sub-directory, model, subset, dataset) tuples - unpacked in that order
# by the aggregation loop that follows. Entries are listed newest first.
runs = [
    ('2025-05-01_11-47-16', 'gemini-2.0-flash', 'snomed-nci', 'largebio'),
    ('2025-05-01_11-37-30', 'gemini-2.0-flash-lite', 'snomed-nci', 'largebio'),
    ('2025-05-01_11-24-31', 'gemini-2.0-flash', 'fma-snomed', 'largebio'),
    ('2025-05-01_11-19-48', 'gemini-2.0-flash-lite', 'fma-snomed', 'largebio'),
    ('2025-05-01_11-15-05', 'gemini-2.0-flash', 'snomed-ncit.pharm', 'bioml-2024'),
    ('2025-05-01_11-12-35', 'gemini-2.0-flash-lite', 'snomed-ncit.pharm', 'bioml-2024'),
    ('2025-05-01_11-07-00', 'gemini-2.0-flash', 'snomed-ncit.neoplas', 'bioml-2024'),
    ('2025-05-01_11-06-39', 'gemini-2.0-flash-lite', 'snomed-ncit.neoplas', 'bioml-2024'),
    ('2025-05-01_10-56-46', 'gemini-2.0-flash', 'snomed-fma.body', 'bioml-2024'),
    ('2025-05-01_10-50-33', 'gemini-2.0-flash-lite', 'snomed-fma.body', 'bioml-2024'),
    ('2025-05-01_10-13-02', 'gemini-2.0-flash-lite', 'fma-snomed', 'largebio'),
    ('2025-04-10_22-11-36', 'gemini-2.0-flash', 'ncit-doid', 'bioml-2024'),
    ('2025-04-10_00-33-19', 'gemini-2.0-flash-lite', 'ncit-doid', 'bioml-2024'),
    ('2025-04-06_22-05-02', 'gemini-2.0-flash-lite', 'omim-ordo', 'bioml-2024'),
    ('2025-04-06_19-27-41', 'gemini-2.0-flash', 'omim-ordo', 'bioml-2024'),
    ('2025-04-06_14-51-00', 'gemini-2.0-flash-lite', 'fma-nci', 'largebio'),
    ('2025-04-06_14-18-46', 'gemini-2.0-flash', 'fma-nci', 'largebio'),
    ('2025-04-04_19-41-28', 'gemini-2.0-flash', 'human-mouse', 'anatomy'),
    ('2025-04-04_19-23-47', 'gemini-2.0-flash-lite', 'human-mouse', 'anatomy'),
    ('2025-04-04_00-43-16', 'gemini-2.0-flash', 'fma-snomed', 'largebio'),
    ('2025-03-28_10-15-04', 'gpt-4o-mini', 'fma-nci', 'largebio'),
    ('2025-03-19_23-32-20', 'gpt-4o-mini', 'omim-ordo', 'bioml-2024'),
    ('2025-03-19_22-17-52', 'gpt-4o-mini', 'ncit-doid', 'bioml-2024'),
    ('2025-03-19_19-48-41', 'gpt-4o-mini', 'human-mouse', 'anatomy'),
    ('2025-03-14_00-40-55', 'gpt-4o-mini', 'ncit-doid', 'bioml-2024'),
    ('2025-03-12_17-21-53', 'gpt-4o-mini', 'human-mouse', 'anatomy'),
    ('2025-03-12_14-43-32', 'gpt-4o-mini', 'omim-ordo', 'bioml-2024')
 ]

# Suffix handed to read_run_metrics_df and format_combined_metrics_path.
suffix = "_reduced"

# Read the metrics table of every listed run and make sure each one carries
# the identifying columns before they are stacked.
dfs = []
for run_subdir, model, set_name, dataset in runs:
    metrics_df = read_run_metrics_df(run_subdir, suffix)
    fallback_values = {"Dataset": dataset, "SubSet": set_name, "Model": model}
    for column, fallback in fallback_values.items():
        # Columns already present in the stored metrics are left untouched.
        if column in metrics_df.columns:
            continue
        metrics_df[column] = fallback
    metrics_df["Run"] = run_subdir
    dfs.append(metrics_df)

# Stack all runs, order them for reading, write the combined CSV and display
# the table as the cell output.
results_df = pd.concat(dfs, ignore_index=True).sort_values(by=["Dataset", "SubSet", "Model"])

results_df.to_csv(format_combined_metrics_path(suffix), index=False)
results_df

### Rerun evaluation

In [None]:
# Runs to re-evaluate, as (run sub-directory, model, subset, dataset) tuples -
# unpacked in that order by the loop in the next cell. This redefines `runs`
# with the same entries as the list in the Analysis section.
# NOTE(review): presumably duplicated so this section can be run on its own -
# keep the two lists in sync or define the list once.
runs = [
    ('2025-05-01_11-47-16', 'gemini-2.0-flash', 'snomed-nci', 'largebio'),
    ('2025-05-01_11-37-30', 'gemini-2.0-flash-lite', 'snomed-nci', 'largebio'),
    ('2025-05-01_11-24-31', 'gemini-2.0-flash', 'fma-snomed', 'largebio'),
    ('2025-05-01_11-19-48', 'gemini-2.0-flash-lite', 'fma-snomed', 'largebio'),
    ('2025-05-01_11-15-05', 'gemini-2.0-flash', 'snomed-ncit.pharm', 'bioml-2024'),
    ('2025-05-01_11-12-35', 'gemini-2.0-flash-lite', 'snomed-ncit.pharm', 'bioml-2024'),
    ('2025-05-01_11-07-00', 'gemini-2.0-flash', 'snomed-ncit.neoplas', 'bioml-2024'),
    ('2025-05-01_11-06-39', 'gemini-2.0-flash-lite', 'snomed-ncit.neoplas', 'bioml-2024'),
    ('2025-05-01_10-56-46', 'gemini-2.0-flash', 'snomed-fma.body', 'bioml-2024'),
    ('2025-05-01_10-50-33', 'gemini-2.0-flash-lite', 'snomed-fma.body', 'bioml-2024'),
    ('2025-05-01_10-13-02', 'gemini-2.0-flash-lite', 'fma-snomed', 'largebio'),
    ('2025-04-10_22-11-36', 'gemini-2.0-flash', 'ncit-doid', 'bioml-2024'),
    ('2025-04-10_00-33-19', 'gemini-2.0-flash-lite', 'ncit-doid', 'bioml-2024'),
    ('2025-04-06_22-05-02', 'gemini-2.0-flash-lite', 'omim-ordo', 'bioml-2024'),
    ('2025-04-06_19-27-41', 'gemini-2.0-flash', 'omim-ordo', 'bioml-2024'),
    ('2025-04-06_14-51-00', 'gemini-2.0-flash-lite', 'fma-nci', 'largebio'),
    ('2025-04-06_14-18-46', 'gemini-2.0-flash', 'fma-nci', 'largebio'),
    ('2025-04-04_19-41-28', 'gemini-2.0-flash', 'human-mouse', 'anatomy'),
    ('2025-04-04_19-23-47', 'gemini-2.0-flash-lite', 'human-mouse', 'anatomy'),
    ('2025-04-04_00-43-16', 'gemini-2.0-flash', 'fma-snomed', 'largebio'),
    ('2025-03-28_10-15-04', 'gpt-4o-mini', 'fma-nci', 'largebio'),
    ('2025-03-19_23-32-20', 'gpt-4o-mini', 'omim-ordo', 'bioml-2024'),
    ('2025-03-19_22-17-52', 'gpt-4o-mini', 'ncit-doid', 'bioml-2024'),
    ('2025-03-19_19-48-41', 'gpt-4o-mini', 'human-mouse', 'anatomy'),
    ('2025-03-14_00-40-55', 'gpt-4o-mini', 'ncit-doid', 'bioml-2024'),
    ('2025-03-12_17-21-53', 'gpt-4o-mini', 'human-mouse', 'anatomy'),
    ('2025-03-12_14-43-32', 'gpt-4o-mini', 'omim-ordo', 'bioml-2024')
 ]

In [None]:
# Re-run the evaluation for the stored predictions of every listed run and
# refresh the per-run metrics table. No model is queried in this cell.
suffix = "_reduced"
for run_subdir, model, set_name, dataset_name in tqdm(runs):
    run_path = RUN_DIR / run_subdir
    print(f"Run path: {run_path}, dataset: {dataset_name}, set: {set_name}, model: {model}")

    # Only the prompt names are needed here; iterate the mapping's keys
    # directly (the former disabled tqdm wrapper had no visible effect).
    for prompt_name in PROMPT_FUNCTIONS_MAP:
        # Best effort per prompt: a failure is reported and the remaining
        # prompts of the run are still evaluated.
        try:
            # The prediction path is not used in this cell; only the stats and
            # diagram paths are passed on to analyze_results.
            _prediction_path, stats_path, diagram_path = format_storing_pathes_from_run_path(
                run_path, set_name, model, prompt_name, suffix
            )
            predictions = get_predictions_with_gt(run_path, dataset_name, set_name, model, prompt_name, suffix)
            analyze_results(
                predictions,
                print_results=False,
                plot_confusion_matrix=False,
                subtitle=f"{set_name}: {model} {prompt_name} | ",
                cm_save_path=diagram_path,
                stats_path=stats_path,
            )
        except Exception as e:  # noqa: BLE001
            print(f"Error: {e!s}")

    # Called once per run after all prompts were attempted, even if some of
    # them failed above.
    store_run_metrics_df(PROMPT_FUNCTIONS_MAP, run_path, dataset_name, set_name, model, suffix)