# EDA of parents in anatomy dataset

In [1]:
# ruff: noqa: T201, T203
from __future__ import annotations

import os
from pathlib import Path

# When the kernel starts inside a directory named "notebooks", move one level up.
# Presumably this lets the project-local imports below (`config`, `src`) and any
# relative data paths resolve from the repository root — confirm against the repo layout.
if Path.cwd().name == "notebooks":
    os.chdir("..")

import logging

from dotenv import load_dotenv

from config.config import DATA_DIR
from src.formatting import (
    format_oracle_pairs_filepath,
)
from src.onto_access import OntologyAccess
from src.onto_object import OntologyEntryAttr
from src.utils import read_oracle_pairs

# Set the root logger threshold to WARNING.
logging.getLogger().setLevel(logging.WARNING)
# Load environment variables through python-dotenv (presumably from a local .env file).
load_dotenv()

# Enable IPython's autoreload extension in mode 2 so edited modules are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
# Active configuration: which dataset / alignment set to inspect and which
# ontology plays the source and the target role.
DATASET_NAME = "anatomy"
SET_NAME = "human-mouse"
SOURCE_ONTOLOGY = "mouse"
TARGET_ONTOLOGY = "human"

# Alternative configuration, kept for quick switching.
# DATASET_NAME = "bioml-2024"
# SET_NAME = "omim-ordo" #"ncit-doid"
# SOURCE_ONTOLOGY, TARGET_ONTOLOGY = "omim", "ordo" #"ncit", "doid"

# Both .owl files sit in the same <dataset>/<set> directory and are named after the ontology.
onto_dir = DATA_DIR / DATASET_NAME / SET_NAME
src_onto_path = onto_dir / f"{SOURCE_ONTOLOGY}.owl"
tgt_onto_path = onto_dir / f"{TARGET_ONTOLOGY}.owl"

# Load both ontologies; `annotate_on_init=True` is passed through to OntologyAccess unchanged.
onto_src = OntologyAccess(src_onto_path, annotate_on_init=True)
onto_tgt = OntologyAccess(tgt_onto_path, annotate_on_init=True)

* Owlready2 * Creating new ontology human <data/anatomy/human-mouse/human.owl#>.
* Owlready2 * ADD TRIPLE data/anatomy/human-mouse/human.owl http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#Ontology
* Owlready2 *     ...loading ontology human from data/anatomy/human-mouse/human.owl...
* Owlready2 *     ...8 properties found: ObsoleteProperty, UNDEFINED_part_of, label, hasRelatedSynonym, hasDefaultNamespace, savedBy, hasDate, hasDefinition


* Owlready2 * Reseting property oboInOwl.ObsoleteProperty: new triples are now available.
* Owlready2 * Reseting property oboInOwl.hasRelatedSynonym: new triples are now available.
* Owlready2 * Reseting property oboInOwl.hasDefaultNamespace: new triples are now available.
* Owlready2 * Reseting property oboInOwl.savedBy: new triples are now available.
* Owlready2 * Reseting property oboInOwl.hasDate: new triples are now available.


### Utility functions

In [3]:
def get_name_string(name_set):
    """Render one or several names as a single display string.

    A ``set`` or ``list`` of strings is collapsed into one comma-separated
    string; any other value is converted with ``str``.
    """
    is_collection = isinstance(name_set, (set, list))
    return ", ".join(name_set) if is_collection else str(name_set)

def get_single_name(name_set):
    """Pick one name out of a name collection.

    For a ``set`` or ``list`` the first element yielded by iteration is
    returned (``None`` when the collection is empty). Any other value is
    returned unchanged.
    """
    if not isinstance(name_set, (set, list)):
        return name_set
    for candidate in name_set:
        return candidate
    return None

def select_best_direct_entity_names(
    src_entity: OntologyEntryAttr, tgt_entity: OntologyEntryAttr
) -> list[str | None]:
    """Return display names for both entities and for one direct parent of each.

    Only a single direct parent is used per entity: whichever one the
    collection returned by ``get_direct_parents()`` yields first.

    Args:
        src_entity: Entry from the source ontology.
        tgt_entity: Entry from the target ontology.

    Returns:
        ``[src_parent_name, tgt_parent_name, src_entity_name, tgt_entity_name]``.
        Each name is the comma-joined preferred names of the entry. A parent
        slot is ``None`` when the entity has no direct parent.
    """
    # Use a default so an entity without direct parents yields None instead of
    # raising StopIteration; the comprehension below already handles None.
    src_parent = next(iter(src_entity.get_direct_parents()), None)
    tgt_parent = next(iter(tgt_entity.get_direct_parents()), None)

    return [
        get_name_string(entry.get_preffered_names()) if entry else None
        for entry in (src_parent, tgt_parent, src_entity, tgt_entity)
    ]

def format_hierarchy(
    hierarchy_dict: dict[int, set[OntologyEntryAttr]], no_level: bool = False, add_thing: bool = True
) -> str | list[str]:
    """Format a level -> entries mapping as text, or as a list of per-level names.

    Levels are processed in ascending key order. The entries of one level are
    rendered as their preferred names joined with ", ".

    Args:
        hierarchy_dict: Mapping from level number to the ontology entries on that level.
        no_level: When true, return the list of per-level name strings. Otherwise
            return one newline-joined string with a tab-indented "Level N: ..." line per level.
        add_thing: When false, skip any level whose rendered name is exactly "Thing".

    Returns:
        A ``list[str]`` if ``no_level`` is true, otherwise a ``str``.
    """
    formatted = []
    for level, parents in sorted(hierarchy_dict.items()):
        parent_name = get_name_string([get_name_string(i.get_preffered_names()) for i in parents])

        if parent_name == "Thing" and not add_thing:
            continue

        formatted.append(parent_name if no_level else f"\tLevel {level}: {parent_name}")

    # The return type depends on the mode; callers index the list form directly.
    return formatted if no_level else "\n".join(formatted)

def select_best_direct_entity_names_with_synonyms(src_entity: OntologyEntryAttr, tgt_entity: OntologyEntryAttr, add_thing: bool = True) -> list:
    """Collect parent names, preferred names and synonyms for a source/target pair.

    Relies on the notebook-level ``onto_src`` / ``onto_tgt`` ontologies to look up
    each entity's class.

    Returns:
        ``[src_parent_name, tgt_parent_name, src_entity_name, tgt_entity_name,
        src_synonyms, tgt_synonyms]``. The first four items are strings (a parent
        name may be ``None``); the last two are lists of strings.
    """

    def parent_label(entity):
        # Only one direct parent is considered: the first one the collection yields.
        first_parent = next(iter(entity.get_direct_parents()), None)
        if not first_parent:
            return None
        label = get_name_string(first_parent.get_preffered_names())
        if not add_thing and label == "Thing":
            return None
        return label

    def informative_synonyms(entity, ontology):
        found = list(entity.get_synonyms())
        # Last dot-separated component of the class repr, e.g. the "MA_0001771" in "mouse.MA_0001771".
        class_id = str(ontology.getClassByURI(entity.annotation["uri"])).split(".")[-1]
        # A lone synonym equal to the class identifier carries no information; drop it.
        if len(found) == 1 and found[0] == class_id:
            return []
        return found

    return [
        parent_label(src_entity),
        parent_label(tgt_entity),
        get_name_string(src_entity.get_preffered_names()),
        get_name_string(tgt_entity.get_preffered_names()),
        informative_synonyms(src_entity, onto_src),
        informative_synonyms(tgt_entity, onto_tgt),
    ]


def select_best_sequential_hierarchy_with_synonyms(src_parents_by_levels, tgt_parents_by_levels):
    """Gather synonyms for levels 0-3 of the source and the target hierarchy.

    For every level only one entry is inspected: the first one yielded by that
    level's collection. Relies on the notebook-level ``onto_src`` / ``onto_tgt``.

    Returns:
        A list of eight items: the synonyms for source levels 0..3 followed by
        the synonyms for target levels 0..3. A slot is ``[]`` when the level is
        absent or when the only synonym is the entry's own class identifier.
    """

    def lookup(levels, level, ontology):
        # Levels beyond the available depth contribute an empty placeholder.
        if level >= len(levels):
            return [], ""
        entry = next(iter(levels[level]))
        synonyms = entry.get_synonyms() if hasattr(entry, "get_synonyms") else []
        class_id = str(ontology.getClassByURI(entry.annotation["uri"])).split(".")[-1]
        return synonyms, class_id

    def drop_placeholder(synonyms, class_id):
        only_class_id = len(synonyms) == 1 and next(iter(synonyms)) == class_id
        return [] if only_class_id else synonyms

    src_raw = [lookup(src_parents_by_levels, level, onto_src) for level in range(4)]
    tgt_raw = [lookup(tgt_parents_by_levels, level, onto_tgt) for level in range(4)]

    return [drop_placeholder(synonyms, class_id) for synonyms, class_id in src_raw + tgt_raw]


### Ontological prompts

In [73]:
def prompt_direct_entity_ontological(src_entity: OntologyEntryAttr, tgt_entity: OntologyEntryAttr) -> str:
    """Build an ontology-worded True/False prompt from each entity and one direct parent."""
    names = select_best_direct_entity_names(src_entity, tgt_entity)
    src_parent, tgt_parent, src_entity_names, tgt_entity_names = names

    preamble = (
        "Analyze the following entities, each originating from a distinct biomedical ontology.",
        "Your task is to assess whether they represent the **same ontological concept**, considering both their semantic meaning and hierarchical position.",
    )
    source_section = (
        f'\n1. Source entity: "{src_entity_names}"',
        f"\t- Direct ontological parent: {src_parent}",
    )
    target_section = (
        f'\n2. Target entity: "{tgt_entity_names}"',
        f"\t- Direct ontological parent: {tgt_parent}",
    )
    question = '\nAre these entities **ontologically equivalent** within their respective ontologies? Respond with "True" or "False".'

    return "\n".join([*preamble, *source_section, *target_section, question])

def prompt_sequential_hierarchy_ontological(src_entity: OntologyEntryAttr, tgt_entity: OntologyEntryAttr) -> str:
    """Build an ontology-worded True/False prompt from each entity's parent lineage.

    Each lineage comes from ``get_parents_by_levels(max_level=3)`` and is rendered
    by ``format_hierarchy`` as "Level N: ..." lines in ascending level order, so
    Level 0 (the entity itself) comes first and broader ancestors follow.
    """
    src_hierarchy = format_hierarchy(src_entity.get_parents_by_levels(max_level=3))
    tgt_hierarchy = format_hierarchy(tgt_entity.get_parents_by_levels(max_level=3))

    prompt_lines = [
        "Analyze the following entities, each originating from a distinct biomedical ontology.",
        # The lineage is listed from Level 0 upwards, so the stated direction must be specific -> general.
        "Each is represented by its **ontological lineage**, capturing its hierarchical placement from the most specific (Level 0) to the most general level.",
        f"\n1. Source entity ontological lineage:\n{src_hierarchy}",
        f"\n2. Target entity ontological lineage:\n{tgt_hierarchy}",
        "\nBased on their **ontological positioning, hierarchical relationships, and semantic alignment**, do these entities represent the **same ontological concept**? Respond with \"True\" or \"False\".",
    ]
    return "\n".join(prompt_lines)

### Natural language prompts

In [83]:
def prompt_direct_entity(src_entity: OntologyEntryAttr, tgt_entity: OntologyEntryAttr) -> str:
    """Build a plain-language True/False prompt from each entity and one direct parent.

    The "broader category" clause is left out for an entity whose parent name is falsy.
    """
    src_parent, tgt_parent, src_entity_names, tgt_entity_names = select_best_direct_entity_names(src_entity, tgt_entity)

    first_sentence = f'The first one is "{src_entity_names}"'
    if src_parent:
        first_sentence += f', which belongs to the broader category "{src_parent}"'

    second_sentence = f'The second one is "{tgt_entity_names}"'
    if tgt_parent:
        second_sentence += f', which belongs to the broader category "{tgt_parent}"'

    return "\n".join(
        (
            "We have two entities from different biomedical ontologies.",
            first_sentence,
            second_sentence,
            '\nDo they mean the same thing? Respond with "True" or "False".',
        )
    )

def prompt_sequential_hierarchy(src_entity: OntologyEntryAttr, tgt_entity: OntologyEntryAttr) -> str:
    """Build a plain-language True/False prompt from each entity and up to two ancestor levels.

    The hierarchy comes from ``get_parents_by_levels(max_level=3)`` rendered as a
    list of per-level names. Index 0 names the entity, index 1 its broader
    category and index 2 the next broader one. A clause is emitted only when
    that level actually exists.
    """
    src_hierarchy = format_hierarchy(src_entity.get_parents_by_levels(max_level=3), True)
    tgt_hierarchy = format_hierarchy(tgt_entity.get_parents_by_levels(max_level=3), True)

    def describe(opening: str, hierarchy: list[str]) -> str:
        sentence = f'{opening} "{hierarchy[0]}"'
        # Index i exists only when len > i. The previous guards (>= 1 and >= 2)
        # were off by one and raised IndexError on hierarchies of one or two entries.
        if len(hierarchy) > 1:
            sentence += f', which belongs to the broader category "{hierarchy[1]}"'
        if len(hierarchy) > 2:
            sentence += f', under the even broader category "{hierarchy[2]}"'
        return sentence

    prompt_lines = [
        "We have two entities from different biomedical ontologies.",
        describe("The first one is", src_hierarchy),
        describe("The second one is", tgt_hierarchy),
        '\nDo they mean the same thing? Respond with "True" or "False".',
    ]

    return "\n".join(prompt_lines)



### Natural language prompts with synonyms

In [84]:
def prompt_direct_entity_with_synonyms(src_entity: OntologyEntryAttr, tgt_entity: OntologyEntryAttr) -> str:
    """Build a plain-language True/False prompt that also lists each entity's synonyms.

    For each entity the sentence contains its preferred name, an optional
    "also known as" list of quoted synonyms, and an optional "falls under the
    category" clause. The category clause is omitted when no parent name is
    available, rather than printing the category "None".
    """
    src_parent, tgt_parent, src_entity_names, tgt_entity_names, src_synonyms, tgt_synonyms = (
        select_best_direct_entity_names_with_synonyms(src_entity, tgt_entity)
    )

    def describe(opening, entity_names, synonyms, parent):
        sentence = f'{opening} "{entity_names}"'
        if synonyms:
            # Build the quoted list outside the outer f-string: backslash escapes
            # inside a replacement field are a SyntaxError before Python 3.12.
            quoted = ", ".join(f'"{s}"' for s in synonyms)
            sentence += f", also known as {quoted}"
        if parent:
            sentence += f', which falls under the category "{parent}"'
        return sentence + "."

    prompt_lines = [
        "We have two entities from different biomedical ontologies.",
        describe("The first one is", src_entity_names, src_synonyms, src_parent),
        describe("The second one is", tgt_entity_names, tgt_synonyms, tgt_parent),
        '\nDo they mean the same thing? Respond with "True" or "False".'
    ]
    return "\n".join(prompt_lines)


def prompt_sequential_hierarchy_with_synonyms(src_entity: OntologyEntryAttr,
                                               tgt_entity: OntologyEntryAttr) -> str:
    """Build a plain-language True/False prompt from names, ancestor levels and synonyms.

    Each entity is described by its own name (hierarchy index 0) followed by up
    to three broader categories. Every name may be followed by a quoted
    "also known as" list when synonyms are available for that level.
    """
    # Query each hierarchy once and reuse it for both the names and the synonyms.
    src_levels = src_entity.get_parents_by_levels(max_level=3)
    tgt_levels = tgt_entity.get_parents_by_levels(max_level=3)

    src_hierarchy = format_hierarchy(src_levels, True)
    tgt_hierarchy = format_hierarchy(tgt_levels, True)

    # Eight slots: source levels 0..3, then target levels 0..3.
    synonym_slots = select_best_sequential_hierarchy_with_synonyms(src_levels, tgt_levels)
    src_syns, *src_parent_syns = synonym_slots[:4]
    tgt_syns, *tgt_parent_syns = synonym_slots[4:]

    labels = ("belongs to broader category", "under the even broader category", "under the even broader category")

    def quote_all(names) -> str:
        return ', '.join(f'"{s}"' for s in names)

    def describe_entity(hierarchy: list[str],
                        entity_syns: list[str],
                        parent_syns: list[list[str]]) -> str:
        # Base name and its synonyms
        name_part = f'"{hierarchy[0]}"'
        if entity_syns:
            name_part += f', also known as {quote_all(entity_syns)}'

        parts = [name_part]

        # Pair each ancestor with its label and synonyms; zip stops at the shortest
        # sequence, so a short hierarchy simply yields fewer clauses.
        for label, parent_name, syns in zip(labels, hierarchy[1:], parent_syns):
            text = f'{label} "{parent_name}"'
            if syns:
                text += f' (also known as {quote_all(syns)})'
            parts.append(text)

        return ', '.join(parts)

    src_desc = describe_entity(src_hierarchy, src_syns, src_parent_syns)
    tgt_desc = describe_entity(tgt_hierarchy, tgt_syns, tgt_parent_syns)

    prompt_lines = [
        "We have two entities from different biomedical ontologies.",
        f"The first one is {src_desc}.",
        f"The second one is {tgt_desc}.",
        '\nDo they mean the same thing? Respond with "True" or "False".'
    ]
    return "\n".join(prompt_lines)


In [85]:
# Position of the oracle pair inspected in the cells below.
PAIR_INDEX = 39  # 34

oracle_pairs = read_oracle_pairs(format_oracle_pairs_filepath(DATASET_NAME, SET_NAME))
# Keep the raw pair members under their own names so the wrapped entries below
# never share a name with the values they were built from.
first_raw, second_raw = oracle_pairs[PAIR_INDEX]
try:
    src_entity, tgt_entity = OntologyEntryAttr(first_raw, onto_src), OntologyEntryAttr(second_raw, onto_tgt)
except AssertionError:
    # Wrapping failed in the listed order; retry with the two members swapped
    # (presumably the pair is stored target-first — confirm against the oracle file).
    src_entity, tgt_entity = OntologyEntryAttr(second_raw, onto_src), OntologyEntryAttr(first_raw, onto_tgt)

# New prompt instances

Ontological prompts

In [86]:
# Show the ontology-worded prompt built from each entity and one direct parent.
print(prompt_direct_entity_ontological(src_entity, tgt_entity))

Analyze the following entities, each originating from a distinct biomedical ontology.
Your task is to assess whether they represent the **same ontological concept**, considering both their semantic meaning and hierarchical position.

1. Source entity: "alveolus epithelium"
	- Direct ontological parent: lung epithelium

2. Target entity: "Alveolar_Epithelium"
	- Direct ontological parent: Epithelium

Are these entities **ontologically equivalent** within their respective ontologies? Respond with "True" or "False".


In [87]:
# Show the ontology-worded prompt built from each entity's level-by-level lineage.
print(prompt_sequential_hierarchy_ontological(src_entity, tgt_entity))

Analyze the following entities, each originating from a distinct biomedical ontology.
Each is represented by its **ontological lineage**, capturing its hierarchical placement from the most general to the most specific level.

1. Source entity ontological lineage:
	Level 0: alveolus epithelium
	Level 1: lung epithelium
	Level 2: respiratory system epithelium
	Level 3: Thing

2. Target entity ontological lineage:
	Level 0: Alveolar_Epithelium
	Level 1: Epithelium
	Level 2: Epithelial_Tissue, Normal_Tissue
	Level 3: Tissue

Based on their **ontological positioning, hierarchical relationships, and semantic alignment**, do these entities represent the **same ontological concept**? Respond with "True" or "False".


Natural language prompts

In [88]:
# Show the plain-language prompt built from each entity and one direct parent.
print(prompt_direct_entity(src_entity, tgt_entity))

We have two entities from different biomedical ontologies.
The first one is "alveolus epithelium", which belongs to the broader category "lung epithelium"
The second one is "Alveolar_Epithelium", which belongs to the broader category "Epithelium"

Do they mean the same thing? Respond with "True" or "False".


In [89]:
# Show the plain-language prompt built from each entity and its broader categories.
print(prompt_sequential_hierarchy(src_entity, tgt_entity))

We have two entities from different biomedical ontologies.
The first one is "alveolus epithelium", which belongs to the broader category "lung epithelium", under the even broader category "respiratory system epithelium"
The second one is "Alveolar_Epithelium", which belongs to the broader category "Epithelium", under the even broader category "Epithelial_Tissue, Normal_Tissue"

Do they mean the same thing? Respond with "True" or "False".


Synonym prompts

In [90]:
# Show the plain-language direct-parent prompt with synonyms included.
print(prompt_direct_entity_with_synonyms(src_entity, tgt_entity))

We have two entities from different biomedical ontologies.
The first one is "alveolus epithelium", which falls under the category "lung epithelium".
The second one is "Alveolar_Epithelium", also known as "Lung Alveolar Epithelia", "Alveolar Epithelium", "Epithelia of lung alveoli. The layer of cells covering the lining of the tiny air sacs at the end of the bronchioles.", which falls under the category "Epithelium".

Do they mean the same thing? Respond with "True" or "False".


In [91]:
# Show the plain-language hierarchy prompt with synonyms included at every level.
print(prompt_sequential_hierarchy_with_synonyms(src_entity, tgt_entity))

We have two entities from different biomedical ontologies.
The first one is "alveolus epithelium", belongs to broader category "lung epithelium", under the even broader category "respiratory system epithelium" (also known as "respiratory system mucosa"), under the even broader category "Thing".
The second one is "Alveolar_Epithelium", also known as "Lung Alveolar Epithelia", "Alveolar Epithelium", "Epithelia of lung alveoli. The layer of cells covering the lining of the tiny air sacs at the end of the bronchioles.", belongs to broader category "Epithelium" (also known as "epithelium", "Epithelium"), under the even broader category "Epithelial_Tissue, Normal_Tissue" (also known as "Epithelial Tissue"), under the even broader category "Tissue" (also known as "Tissue Types", "Normal Tissue", "An aggregate of cells with similar or identical specialized characteristics, contributing to the performance of a specific function.  Tissues are parts of organs.", "tissue", "Tissues", "Tissue").

D

Other

In [92]:
# Source lineage as "Level N: ..." lines, keeping the "Thing" level.
src_levels_preview = src_entity.get_parents_by_levels(max_level=3)
print(format_hierarchy(src_levels_preview, no_level=False, add_thing=True))

	Level 0: alveolus epithelium
	Level 1: lung epithelium
	Level 2: respiratory system epithelium
	Level 3: Thing


In [93]:
# Target lineage as "Level N: ..." lines, keeping the "Thing" level.
tgt_levels_preview = tgt_entity.get_parents_by_levels(max_level=3)
print(format_hierarchy(tgt_levels_preview, no_level=False, add_thing=True))

	Level 0: Alveolar_Epithelium
	Level 1: Epithelium
	Level 2: Epithelial_Tissue, Normal_Tissue
	Level 3: Tissue


In [94]:
# Dump what is known about the source entity: annotation, names, parents, children, synonyms.
print(src_entity)
print(src_entity.get_all_entity_names())
print(src_entity.get_parents_preferred_names())
print(src_entity.get_children_preferred_names())
print(src_entity.get_synonyms())
print(src_entity.get_direct_parents())
# Peek at one direct parent without removing it: `.pop()` would mutate the returned
# set, which corrupts later cells and re-runs if that set is shared with the entry.
print(next(iter(src_entity.get_direct_parents())).get_direct_children())

{'class': mouse.MA_0001771, 'uri': 'http://mouse.owl#MA_0001771', 'preffered_names': {'alveolus epithelium'}, 'synonyms': {'MA_0001771'}, 'all_names': {'alveolus epithelium'}, 'parents': {mouse.MA_0001823, mouse.MA_0001783, owl.Thing}, 'children': set()}
{'alveolus epithelium'}
[{'respiratory system epithelium'}, {'lung epithelium'}, {'Thing'}]
[]
{'MA_0001771'}
{mouse.MA_0001783}
{mouse.MA_0001771, mouse.MA_0001772}


In [95]:
# Dump what is known about the target entity: annotation, names, parents, children, synonyms.
print(tgt_entity)
print(tgt_entity.get_all_entity_names())
print(tgt_entity.get_parents_preferred_names())
print(tgt_entity.get_children_preferred_names())
print(tgt_entity.get_synonyms())
print(tgt_entity.get_direct_parents())
# Peek at one direct parent without removing it: `.pop()` would mutate the returned
# set, which corrupts later cells and re-runs if that set is shared with the entry.
print(next(iter(tgt_entity.get_direct_parents())).get_direct_children())

{'class': human.NCI_C12867, 'uri': 'http://human.owl#NCI_C12867', 'preffered_names': {'Alveolar_Epithelium'}, 'synonyms': {'Lung Alveolar Epithelia', 'Alveolar Epithelium', 'Epithelia of lung alveoli. The layer of cells covering the lining of the tiny air sacs at the end of the bronchioles.'}, 'all_names': {'Lung Alveolar Epithelia', 'Alveolar Epithelium', 'Epithelia of lung alveoli. The layer of cells covering the lining of the tiny air sacs at the end of the bronchioles.', 'Alveolar_Epithelium'}, 'parents': {human.NCI_C12219, human.NCI_C12710, human.NCI_C33177, owl.Thing, human.NCI_C33904, human.NCI_C12801, human.NCI_C45714, human.NCI_C21599}, 'children': set()}
{'Lung Alveolar Epithelia', 'Alveolar Epithelium', 'Epithelia of lung alveoli. The layer of cells covering the lining of the tiny air sacs at the end of the bronchioles.', 'Alveolar_Epithelium'}
[{'Anatomic_Structure_System_or_Substance'}, {'Normal_Tissue'}, {'Epithelium'}, {'Thing'}, {'Other_Anatomic_Concept'}, {'Tissue'}, {