In [None]:
# ruff: noqa: T201, T203
from __future__ import annotations

import os
from pathlib import Path

# When the kernel starts inside a directory named "notebooks", move one level up.
# Presumably this makes the `src` / `config` imports and relative data paths below
# resolve from the project root — TODO confirm. The name check keeps a re-run of
# this cell from climbing a second level.
if Path.cwd().name == "notebooks":
    os.chdir("..")

import logging

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

import src.prompts.system as system_prompts
from config.config import RUN_DIR
from config.runs_vars import SUBSET_TO_DATASET_MAP
from src.constants import PAIRS_SEPARATOR
from src.evaluate import (
    get_predictions_with_gt,
    plot_usage_histograms,
    read_run_metrics_df,
    save_analysis_results,
    store_run_metrics_df,
)
from src.formatting import (
    format_combined_metrics_path,
    format_oracle_pairs_filepath,
    format_oracle_pairs_precomputed_dir,
    format_run_path,
    format_storing_pathes_from_run_path,
    format_subsets_ontologies_paths,
)
from src.LLM_servers.openai import OpenAIServer
from src.onto_access import OntologyAccess
from src.onto_object import OntologyEntryAttr
from src.processing import parallel_samples_process, save_oracle_pairs_with_prompts, try_load_precomputed_oracle_pairs
from src.prompts.prompts import (
    prompt_direct_entity,
    prompt_direct_entity_ontological,
    prompt_direct_entity_with_synonyms,
    prompt_sequential_hierarchy,
    prompt_sequential_hierarchy_ontological,
    prompt_sequential_hierarchy_with_synonyms,
)
from src.utils import read_oracle_pairs, save_run_results

# Do not truncate rows when a DataFrame is displayed.
pd.set_option("display.max_rows", None)
# Set the root logger's level to WARNING.
logging.getLogger().setLevel(logging.WARNING)
# Load environment variables via python-dotenv; the API keys used further down
# are read from os.environ.
load_dotenv()

# IPython autoreload in mode 2: re-import modules before executing each cell.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Registry of prompt builders keyed by short name: the function's `__name__`
# with every "prompt_" occurrence removed. Insertion order follows the tuple below.
PROMPT_FUNCTIONS_MAP = dict(
    (builder.__name__.replace("prompt_", ""), builder)
    for builder in (
        prompt_direct_entity,
        prompt_direct_entity_ontological,
        prompt_sequential_hierarchy,
        prompt_sequential_hierarchy_ontological,
        prompt_direct_entity_with_synonyms,
        prompt_sequential_hierarchy_with_synonyms,
    )
)
# ruff: noqa: ERA001
# SYSPROMPTS_MAP = {
#     "base": system_prompts.BASELINE_INITIALIZATION_MESSAGE,
#     "natural_language": system_prompts.INTUITIVE_NATURAL_LANGUAGE_JUDGEMENT_MESSAGE,
#     "ontology_aware": system_prompts.ONTOLOGY_AWARE_REASONING_MESSAGE,
#     "synonym_aware": system_prompts.SYNONYM_AWARE_MESSAGE,
#     "none": None,
# }

In [3]:
# Dataset / subset pair used by the pre-validation cells below.
DATASET = "largebio_small"
SUBSET = "fma-nci"

# Resolve the source and target ontology files for that pair and display them.
onto_paths = format_subsets_ontologies_paths(DATASET, SUBSET)
src_onto_path, tgt_onto_path = onto_paths
src_onto_path, tgt_onto_path

(PosixPath('data/largebio_small/fma-nci/oaei_FMA_small_overlapping_nci.owl'),
 PosixPath('data/largebio_small/fma-nci/oaei_NCI_small_overlapping_fma.owl'))

### Pre-Validation

In [None]:
# Build both ontology accessors (source first, then target) with `annotate_on_init=True`.
onto_src, onto_tgt = (
    OntologyAccess(path, annotate_on_init=True) for path in (src_onto_path, tgt_onto_path)
)

In [None]:
# Take the first oracle pair of DATASET/SUBSET and wrap each side as an entry of its ontology.
oracle_pairs_path = format_oracle_pairs_filepath(DATASET, SUBSET)
first_src, first_tgt = read_oracle_pairs(oracle_pairs_path)[0]
src_entity = OntologyEntryAttr(first_src, onto_src)
tgt_entity = OntologyEntryAttr(first_tgt, onto_tgt)

for entity in (src_entity, tgt_entity):
    print(entity)

# Render the same pair with every registered prompt function for a visual check.
separator = "=" * 50
for name, prompt_function in PROMPT_FUNCTIONS_MAP.items():
    print(f"Prompt function: {name}")
    print(prompt_function(src_entity, tgt_entity))
    print(separator)

In [None]:
# For each prompt function, load its precomputed oracle pairs and print the
# first record, one element per line.
for name, prompt_function in PROMPT_FUNCTIONS_MAP.items():
    print(f"Prompt function: {name}")
    oracle_candidate_pairs = try_load_precomputed_oracle_pairs(DATASET, SUBSET, prompt_function)
    leading_record = oracle_candidate_pairs[0]
    print(*leading_record, sep="\n")


### Pre-Processing

In [None]:
# Dataset -> subsets whose oracle pairs get their prompts precomputed and stored.
ALL_DATASET_NAMES = {
    "anatomy": ["human-mouse"],
    "bioml-2024": ["snomed-fma.body", "snomed-ncit.neoplas", "snomed-ncit.pharm", "ncit-doid", "omim-ordo"],
    "largebio": ["fma-snomed", "snomed-nci", "fma-nci"],
    # NOTE(review): this list used to contain "fma-nci" twice, which only repeated the
    # same (slow) preprocessing. "fma-snomed" may have been intended as the third
    # entry — confirm the data exists before adding it.
    "largebio_small": ["fma-nci", "snomed-nci"],
}

prompt_functions = list(PROMPT_FUNCTIONS_MAP.values())

for dataset_name, set_names in ALL_DATASET_NAMES.items():
    for set_name in set_names:
        try:
            # Load the subset's ontologies in the order returned by the path helper.
            ontologies = []
            for onto_path in tqdm(
                format_subsets_ontologies_paths(dataset_name, set_name),
                desc=f"Loading ontologies: {dataset_name} - {set_name}",
            ):
                ontologies.append(OntologyAccess(onto_path, annotate_on_init=True))

            pairs_path = format_oracle_pairs_filepath(dataset_name, set_name)
            oracle_pairs_dir = format_oracle_pairs_precomputed_dir(dataset_name, set_name)

            pairs_with_prompts_df = save_oracle_pairs_with_prompts(
                pairs_path, *ontologies, prompt_functions, oracle_pairs_dir, PAIRS_SEPARATOR, max_workers=2
            )
        except Exception as e:
            # Report which dataset/subset failed, then abort the sweep. A bare `raise`
            # re-raises the active exception unchanged; the `continue` that used to
            # follow the raise could never run and has been removed.
            print(f"Error processing {dataset_name} - {set_name}: {e}")
            raise

Adding prompts:   0%|          | 0/799 [00:00<?, ?it/s]

Adding prompts: 100%|██████████| 799/799 [01:23<00:00,  9.52it/s]


In [None]:
# Load the ontologies of one dataset/subset and resolve its oracle-pair locations,
# without running the full preprocessing sweep.
prompt_functions = list(PROMPT_FUNCTIONS_MAP.values())
dataset_name = "largebio"
set_name = "fma-snomed"

ontologies = [
    OntologyAccess(onto_path, annotate_on_init=True)
    for onto_path in tqdm(
        format_subsets_ontologies_paths(dataset_name, set_name),
        desc=f"Loading ontologies: {dataset_name} - {set_name}",
    )
]

pairs_path = format_oracle_pairs_filepath(dataset_name, set_name)
oracle_pairs_dir = format_oracle_pairs_precomputed_dir(dataset_name, set_name)

Loading ontologies: largebio - fma-snomed: 100%|██████████| 2/2 [18:47<00:00, 563.73s/it]


In [None]:
# Unpack the ontologies loaded in the previous cell (source first, then target).
onto_src, onto_tgt = ontologies
# Load the precomputed pairs for the same dataset/subset the ontologies were loaded
# for. This previously used the global DATASET (set for the pre-validation cells) and
# a hard-coded subset name, which could pair entities with ontologies of a different
# dataset.
pairs = try_load_precomputed_oracle_pairs(dataset_name, set_name, prompt_function=None)
# Wrap the source side of the first pair as an entry of the source ontology.
src_entity = OntologyEntryAttr(pairs[0][0], onto_src)

In [None]:
# Wrap the target side of the first loaded pair as an entry of the target ontology.
head_pair = pairs[0]
tgt_entity = OntologyEntryAttr(head_pair[1], onto_tgt)

  http://www.ihtsdo.org/snomed#Cinchona_alkaloid
  http://www.ihtsdo.org/snomed#Cinchona_antimalarial

  http://www.ihtsdo.org/snomed#alpha_Thalassemia

  http://www.ihtsdo.org/snomed#beta_Thalassemia



### Run experiments


In [32]:
# Worker count handed to `parallel_samples_process` in the experiment loop.
MAX_WORKERS = 10
# Models to evaluate; names starting with "gemini" select the Gemini endpoint in the loop.
MODELS = ["gemini-2.0-flash"]  # "gpt-4o-mini"

# Dataset -> subsets covered by the experiment sweep.
DATASETS_MAP = {
    "anatomy": ["human-mouse"],
    "bioml-2024": ["snomed-fma.body", "snomed-ncit.neoplas", "snomed-ncit.pharm", "ncit-doid", "omim-ordo"],
    # "largebio": ["snomed-nci", "fma-nci", "fma-snomed"]
}
EXPERIMENT_TYPE = "prompts"
# Despite the name this is a list: the sweep runs once per entry.
EXP_NAMES_MAP = [""]

# The sweep reads precomputed prompts, so no ontologies are loaded; None is passed on as-is.
onto_src, onto_tgt = None, None

# PROMPT_FUNCTIONS_MAP is deliberately not rebuilt here. The identical mapping is
# defined once in the cell right after the imports, and a second copy only invites
# the two definitions drifting apart.

In [None]:
# Sweep order: experiment name -> dataset -> subset -> model -> prompt function.
for exp_name in EXP_NAMES_MAP:
    for DATASET, dataset_subsets in DATASETS_MAP.items():
        for SUBSET in dataset_subsets:
            # Load the ontologies here, if there are no precomputed prompts
            for MODEL in MODELS:
                # Models whose name starts with "gemini" get the Gemini key and base URL;
                # every other model uses the OpenAI key with the client's default endpoint.
                if MODEL.startswith("gemini"):
                    llm_oracle = OpenAIServer(
                        api_key=os.environ["GEMINI_API_KEY"],
                        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
                    )
                else:
                    llm_oracle = OpenAIServer(api_key=os.environ["OPENAI_API_KEY"])
                llm_oracle.add_system_context(system_prompts.INTUITIVE_NATURAL_LANGUAGE_JUDGEMENT_MESSAGE)

                # run_path = format_predictions_run_path(DATASET, SUBSET, MODEL, EXPERIMENT_TYPE, exp_spec=exp_name)
                run_path = format_run_path()
                print(f"{run_path=} | {DATASET=} | {SUBSET=} | {MODEL=} | {exp_name=}")

                for prompt_name, prompt_function in PROMPT_FUNCTIONS_MAP.items():
                    # Output locations for this (subset, model, prompt) combination.
                    storing_paths = format_storing_pathes_from_run_path(
                        run_path, SUBSET, MODEL, prompt_name, suffix=""
                    )
                    prediction_path, stats_path, diagram_path = storing_paths

                    oracle_candidate_pairs = try_load_precomputed_oracle_pairs(
                        DATASET, SUBSET, prompt_function, suffix=""
                    )
                    results, tokens_usage, confidences = parallel_samples_process(
                        oracle_candidate_pairs, llm_oracle, onto_src, onto_tgt, MODEL, MAX_WORKERS, prompt_function
                    )
                    save_run_results(results, prediction_path, columns=["Source", "Target", "Prediction", "Confidence"])
                    plot_usage_histograms(
                        tokens_usage, confidences, do_plot=False, do_print=False, suptitle=prompt_name
                    )

                    # Evaluation is best-effort: a failure is printed and the sweep moves on.
                    try:
                        predictions = get_predictions_with_gt(run_path, DATASET, SUBSET, MODEL, prompt_name, suffix="")
                        analysis_subtitle = f"{SUBSET}: {MODEL} {prompt_name} | "
                        save_analysis_results(
                            predictions,
                            print_results=False,
                            plot_confusion_matrix=False,
                            subtitle=analysis_subtitle,
                            cm_save_path=diagram_path,
                            stats_path=stats_path,
                        )
                    except Exception as e:  # noqa: BLE001
                        print(f"Error: {e}")

                # Store the metrics table for this run once all prompts were processed.
                store_run_metrics_df(PROMPT_FUNCTIONS_MAP, run_path, DATASET, SUBSET, MODEL)

## Analysis


In [34]:
# Collect one (run directory, model, subset, dataset) tuple per run directory,
# ordered by directory name descending.
runs_array = []

for directory in sorted(RUN_DIR.iterdir(), key=lambda x: x.name, reverse=True):
    if not directory.is_dir():
        continue

    # Reset for every directory. Previously a directory without any matching file
    # re-used the subset/model of the directory processed before it (or raised
    # NameError when it was the first one).
    run_info = None
    for file in directory.iterdir():
        if "_" not in file.name:
            continue
        # File names are expected to start with "<subset>_<model>"; as before, the
        # last matching file of the directory determines the values.
        subset_name, model_name = file.name.split("_")[:2]
        dataset_name = SUBSET_TO_DATASET_MAP.get(subset_name)
        run_info = (directory.name, model_name, subset_name, dataset_name)

    # Skip directories that contain no file to derive the subset/model from.
    if run_info is not None:
        runs_array.append(run_info)

len(runs_array), runs_array

(64,
 [('2025-05-11_00-15-12', 'gemini-1.5-flash', 'omim-ordo', 'bioml-2024'),
  ('2025-05-11_00-12-18', 'gemini-1.5-flash', 'ncit-doid', 'bioml-2024'),
  ('2025-05-11_00-06-56',
   'gemini-1.5-flash',
   'snomed-ncit.pharm',
   'bioml-2024'),
  ('2025-05-11_00-02-13',
   'gemini-1.5-flash',
   'snomed-ncit.neoplas',
   'bioml-2024'),
  ('2025-05-10_23-57-23', 'gemini-1.5-flash', 'snomed-fma.body', 'bioml-2024'),
  ('2025-05-10_23-56-32', 'gemini-1.5-flash', 'human-mouse', 'anatomy'),
  ('2025-05-10_23-41-12', 'gemini-1.5-flash', 'fma-snomed', 'largebio'),
  ('2025-05-10_23-38-28', 'gemini-1.5-flash', 'fma-nci', 'largebio'),
  ('2025-05-10_23-26-32', 'gemini-1.5-flash', 'snomed-nci', 'largebio'),
  ('2025-05-10_21-38-48',
   'gemini-2.5-flash-preview-04-17',
   'fma-snomed',
   'largebio'),
  ('2025-05-10_21-26-58',
   'gemini-2.5-flash-preview-04-17',
   'fma-nci',
   'largebio'),
  ('2025-05-10_20-48-45',
   'gemini-2.5-flash-preview-04-17',
   'snomed-nci',
   'largebio'),
  ('2025-

In [39]:
# Runs included in the combined metrics table, one tuple per run:
# (run directory name, model, subset, dataset).
# Commented-out entries are left out of the aggregation below.
runs = [
    ("2025-05-11_00-15-12", "gemini-1.5-flash", "omim-ordo", "bioml-2024"),
    ("2025-05-11_00-12-18", "gemini-1.5-flash", "ncit-doid", "bioml-2024"),
    ("2025-05-11_00-06-56", "gemini-1.5-flash", "snomed-ncit.pharm", "bioml-2024"),
    ("2025-05-11_00-02-13", "gemini-1.5-flash", "snomed-ncit.neoplas", "bioml-2024"),
    ("2025-05-10_23-57-23", "gemini-1.5-flash", "snomed-fma.body", "bioml-2024"),
    ("2025-05-10_23-56-32", "gemini-1.5-flash", "human-mouse", "anatomy"),
    ("2025-05-10_23-41-12", "gemini-1.5-flash", "fma-snomed", "largebio"),
    ("2025-05-10_23-38-28", "gemini-1.5-flash", "fma-nci", "largebio"),
    ("2025-05-10_23-26-32", "gemini-1.5-flash", "snomed-nci", "largebio"),
    ("2025-05-10_21-38-48", "gemini-2.5-flash-preview-04-17", "fma-snomed", "largebio"),
    ("2025-05-10_21-26-58", "gemini-2.5-flash-preview-04-17", "fma-nci", "largebio"),
    ("2025-05-10_20-48-45", "gemini-2.5-flash-preview-04-17", "snomed-nci", "largebio"),
    ("2025-05-10_20-31-31", "gemini-2.0-flash", "omim-ordo", "bioml-2024"),
    ("2025-05-10_20-26-50", "gemini-2.0-flash", "ncit-doid", "bioml-2024"),
    ("2025-05-10_20-24-42", "gemini-2.5-flash-preview-04-17", "omim-ordo", "bioml-2024"),
    ("2025-05-10_20-18-58", "gemini-2.0-flash", "snomed-ncit.pharm", "bioml-2024"),
    ("2025-05-10_20-12-48", "gemini-2.5-flash-preview-04-17", "ncit-doid", "bioml-2024"),
    ("2025-05-10_20-12-05", "gemini-2.0-flash", "snomed-ncit.neoplas", "bioml-2024"),
    ("2025-05-10_20-04-10", "gemini-2.0-flash", "snomed-fma.body", "bioml-2024"),
    ("2025-05-10_20-02-37", "gemini-2.0-flash", "human-mouse", "anatomy"),
    ("2025-05-10_19-59-37", "gemini-2.5-flash-preview-04-17", "snomed-ncit.pharm", "bioml-2024"),
    ("2025-05-10_19-58-03", "gemini-2.0-flash-lite", "omim-ordo", "bioml-2024"),
    ("2025-05-10_19-54-15", "gemini-2.0-flash-lite", "ncit-doid", "bioml-2024"),
    ("2025-05-10_19-47-56", "gemini-2.0-flash-lite", "snomed-ncit.pharm", "bioml-2024"),
    ("2025-05-10_19-47-03", "gemini-2.5-flash-preview-04-17", "snomed-ncit.neoplas", "bioml-2024"),
    ("2025-05-10_19-42-52", "gemini-2.0-flash-lite", "snomed-ncit.neoplas", "bioml-2024"),
    ("2025-05-10_19-37-02", "gemini-2.0-flash-lite", "snomed-fma.body", "bioml-2024"),
    ("2025-05-10_19-35-50", "gemini-2.0-flash-lite", "human-mouse", "anatomy"),
    ("2025-05-10_19-32-25", "gemini-2.5-flash-preview-04-17", "snomed-fma.body", "bioml-2024"),
    ("2025-05-10_19-29-40", "gemini-2.5-flash-preview-04-17", "human-mouse", "anatomy"),
    ("2025-05-10_16-09-58", "gemini-2.0-flash", "fma-snomed", "largebio"),
    ("2025-05-10_16-07-22", "gemini-2.0-flash-lite", "fma-snomed", "largebio"),
    ("2025-05-10_16-05-15", "gemini-2.0-flash", "fma-nci", "largebio"),
    ("2025-05-10_16-03-43", "gemini-2.0-flash-lite", "fma-nci", "largebio"),
    ("2025-05-10_15-46-10", "gemini-2.0-flash-lite", "snomed-nci", "largebio"),
    ("2025-05-10_15-44-53", "gemini-2.0-flash", "snomed-nci", "largebio"),
    # ("2025-05-08_15-29-28", "gemini-2.0-flash", "fma-nci", "largebio"),
    # ("2025-05-01_11-47-16", "gemini-2.0-flash", "snomed-nci", "largebio"),
    # ("2025-05-01_11-37-30", "gemini-2.0-flash-lite", "snomed-nci", "largebio"),
    # ("2025-05-01_11-24-31", "gemini-2.0-flash", "fma-snomed", "largebio"),
    # ("2025-05-01_11-19-48", "gemini-2.0-flash-lite", "fma-snomed", "largebio"),
    # ("2025-05-01_11-15-05", "gemini-2.0-flash", "snomed-ncit.pharm", "bioml-2024"),
    # ("2025-05-01_11-12-35", "gemini-2.0-flash-lite", "snomed-ncit.pharm", "bioml-2024"),
    # ("2025-05-01_11-07-00", "gemini-2.0-flash", "snomed-ncit.neoplas", "bioml-2024"),
    # ("2025-05-01_11-06-39", "gemini-2.0-flash-lite", "snomed-ncit.neoplas", "bioml-2024"),
    # ("2025-05-01_10-56-46", "gemini-2.0-flash", "snomed-fma.body", "bioml-2024"),
    # ("2025-05-01_10-50-33", "gemini-2.0-flash-lite", "snomed-fma.body", "bioml-2024"),
    # ("2025-05-01_10-13-02", "gemini-2.0-flash-lite", "fma-snomed", "largebio"),
    # ("2025-04-10_22-11-36", "gemini-2.0-flash", "ncit-doid", "bioml-2024"),
    # ("2025-04-10_00-33-19", "gemini-2.0-flash-lite", "ncit-doid", "bioml-2024"),
    # ("2025-04-06_22-05-02", "gemini-2.0-flash-lite", "omim-ordo", "bioml-2024"),
    # ("2025-04-06_19-27-41", "gemini-2.0-flash", "omim-ordo", "bioml-2024"),
    # ("2025-04-06_14-51-00", "gemini-2.0-flash-lite", "fma-nci", "largebio"),
    # ("2025-04-06_14-18-46", "gemini-2.0-flash", "fma-nci", "largebio"),
    # ("2025-04-04_19-41-28", "gemini-2.0-flash", "human-mouse", "anatomy"),
    # ("2025-04-04_19-23-47", "gemini-2.0-flash-lite", "human-mouse", "anatomy"),
    # ("2025-04-04_00-43-16", "gemini-2.0-flash", "fma-snomed", "largebio"),
    ("2025-03-28_10-15-04", "gpt-4o-mini", "fma-nci", "largebio"),
    ("2025-03-19_23-32-20", "gpt-4o-mini", "omim-ordo", "bioml-2024"),
    ("2025-03-19_22-17-52", "gpt-4o-mini", "ncit-doid", "bioml-2024"),
    ("2025-03-19_19-48-41", "gpt-4o-mini", "human-mouse", "anatomy"),
    ("2025-03-14_00-40-55", "gpt-4o-mini", "ncit-doid", "bioml-2024"),
    ("2025-03-12_17-21-53", "gpt-4o-mini", "human-mouse", "anatomy"),
    ("2025-03-12_14-43-32", "gpt-4o-mini", "omim-ordo", "bioml-2024"),
]

# Suffix passed both to the per-run metrics reader and to the combined-metrics path helper.
suffix = "_reduced"

per_run_metrics = []
for run_subdir, model, set_name, dataset in runs:
    metrics_df = read_run_metrics_df(run_subdir, suffix)

    # Add the identifying columns only where the stored metrics do not already have them.
    for col, value in (("Dataset", dataset), ("SubSet", set_name), ("Model", model)):
        if col in metrics_df.columns:
            continue
        metrics_df[col] = value

    # Metrics that carry an "Experiment" column get it renamed to "Prompt".
    if "Experiment" in metrics_df.columns:
        metrics_df = metrics_df.rename(columns={"Experiment": "Prompt"})

    metrics_df["Run"] = run_subdir
    per_run_metrics.append(metrics_df)

# Stack all runs into one table and order it by dataset, subset and model.
results_df = pd.concat(per_run_metrics, ignore_index=True).sort_values(by=["Dataset", "SubSet", "Model"])

results_df.to_csv(format_combined_metrics_path(suffix), index=False)
results_df

Unnamed: 0,Prompt,Accuracy,Precision,Recall,F1 Score,Specificity,Sensitivity,Youden's index,Dataset,SubSet,Model,Run
30,direct_entity,0.747,0.72,0.9939,0.8351,0.3,0.9939,0.2939,anatomy,human-mouse,gemini-1.5-flash,2025-05-10_23-56-32
31,direct_entity_ontological,0.747,0.72,0.9939,0.8351,0.3,0.9939,0.2939,anatomy,human-mouse,gemini-1.5-flash,2025-05-10_23-56-32
32,sequential_hierarchy,0.7312,0.7111,0.9816,0.8247,0.2778,0.9816,0.2594,anatomy,human-mouse,gemini-1.5-flash,2025-05-10_23-56-32
33,sequential_hierarchy_ontological,0.7115,0.694,0.9877,0.8152,0.2111,0.9877,0.1988,anatomy,human-mouse,gemini-1.5-flash,2025-05-10_23-56-32
34,direct_entity_with_synonyms,0.7549,0.7265,0.9939,0.8394,0.3222,0.9939,0.3161,anatomy,human-mouse,gemini-1.5-flash,2025-05-10_23-56-32
35,sequential_hierarchy_with_synonyms,0.7431,0.7188,0.9877,0.832,0.3,0.9877,0.2877,anatomy,human-mouse,gemini-1.5-flash,2025-05-10_23-56-32
114,direct_entity,0.7668,0.7407,0.9816,0.8443,0.3778,0.9816,0.3594,anatomy,human-mouse,gemini-2.0-flash,2025-05-10_20-02-37
115,direct_entity_ontological,0.7273,0.708,0.9816,0.8226,0.2667,0.9816,0.2483,anatomy,human-mouse,gemini-2.0-flash,2025-05-10_20-02-37
116,sequential_hierarchy,0.7905,0.7644,0.9755,0.8571,0.4556,0.9755,0.431,anatomy,human-mouse,gemini-2.0-flash,2025-05-10_20-02-37
117,sequential_hierarchy_ontological,0.7945,0.7681,0.9755,0.8595,0.4667,0.9755,0.4421,anatomy,human-mouse,gemini-2.0-flash,2025-05-10_20-02-37


### Rerun evaluation

In [None]:
# Runs to re-evaluate: by default every run already listed in `runs` by the Analysis
# section. The former `runs = ...` placeholder bound `runs` to Ellipsis, so the loop
# in the next cell failed with a TypeError on `runs[:]`. To re-evaluate only some
# runs, replace the right-hand side with an explicit list of
# (run_subdir, model, subset, dataset) tuples.
runs = list(runs)

In [None]:
# Recompute the analysis outputs for every selected run and prompt, then store
# the run's metrics table again.
suffix = "_reduced"
for run_subdir, model, set_name, dataset_name in tqdm(list(runs)):
    run_path = RUN_DIR / run_subdir
    print(f"Run path: {run_path}, dataset: {dataset_name}, set: {set_name}, model: {model}")

    for prompt_name in PROMPT_FUNCTIONS_MAP:
        # Best-effort per prompt: a failure is printed and the remaining prompts still run.
        try:
            storing_paths = format_storing_pathes_from_run_path(run_path, set_name, model, prompt_name, suffix)
            prediction_path, stats_path, diagram_path = storing_paths
            predictions = get_predictions_with_gt(run_path, dataset_name, set_name, model, prompt_name, suffix)
            analysis_subtitle = f"{set_name}: {model} {prompt_name} | "
            save_analysis_results(
                predictions,
                print_results=False,
                plot_confusion_matrix=False,
                subtitle=analysis_subtitle,
                cm_save_path=diagram_path,
                stats_path=stats_path,
            )
        except Exception as e:  # noqa: BLE001
            print(f"Error: {e!s}")

    store_run_metrics_df(PROMPT_FUNCTIONS_MAP, run_path, dataset_name, set_name, model, suffix)