# EDA of parents in anatomy dataset

In [None]:
# ruff: noqa: T201, T203
from __future__ import annotations

import os
from pathlib import Path

if Path.cwd().name == "notebooks":
    os.chdir("..")

import logging

import pandas as pd
from dotenv import load_dotenv

from config.config import DATA_DIR
from src.formatting import format_oracle_pairs_filepath
from src.onto_access import OntologyAccess
from src.onto_object import OntologyEntryAttr
from src.utils import read_oracle_pairs

logging.getLogger().setLevel(logging.WARNING)
load_dotenv()

%load_ext autoreload
%autoreload 2

In [4]:
DATASET_NAME = "anatomy"
SET_NAME = "human-mouse"
SOURCE_ONTOLOGY = "mouse"
TARGET_ONTOLOGY = "human"

src_onto_path = DATA_DIR / DATASET_NAME / SET_NAME / f"{SOURCE_ONTOLOGY}.owl"
tgt_onto_path = DATA_DIR / DATASET_NAME / SET_NAME / f"{TARGET_ONTOLOGY}.owl"

onto_src = OntologyAccess(src_onto_path, annotate_on_init=True)
onto_tgt = OntologyAccess(tgt_onto_path, annotate_on_init=True)

* Owlready2 * Creating new ontology human <data/anatomy/human-mouse/human.owl#>.
* Owlready2 * ADD TRIPLE data/anatomy/human-mouse/human.owl http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#Ontology
* Owlready2 *     ...loading ontology human from data/anatomy/human-mouse/human.owl...


* Owlready2 * Reseting property oboInOwl.ObsoleteProperty: new triples are now available.
* Owlready2 * Reseting property oboInOwl.hasRelatedSynonym: new triples are now available.
* Owlready2 * Reseting property oboInOwl.hasDefaultNamespace: new triples are now available.
* Owlready2 * Reseting property oboInOwl.savedBy: new triples are now available.
* Owlready2 * Reseting property oboInOwl.hasDate: new triples are now available.


* Owlready2 *     ...8 properties found: ObsoleteProperty, UNDEFINED_part_of, label, hasRelatedSynonym, hasDefaultNamespace, savedBy, hasDate, hasDefinition


In [5]:
candidate_pairs_lines = pd.DataFrame(
    read_oracle_pairs(format_oracle_pairs_filepath(DATASET_NAME, SET_NAME)), columns=["source", "target"]
)

In [6]:
candidate_pairs_lines.head()

Unnamed: 0,source,target
0,http://human.owl#NCI_C32658,http://mouse.owl#MA_0002684
1,http://human.owl#NCI_C33075,http://mouse.owl#MA_0001249
2,http://human.owl#NCI_C33748,http://mouse.owl#MA_0001221
3,http://human.owl#NCI_C12262,http://mouse.owl#MA_0001615
4,http://human.owl#NCI_C53050,http://mouse.owl#MA_0002234


In [7]:
for col in candidate_pairs_lines.columns:
    print(f"# Total elements {col}: {candidate_pairs_lines[col].shape[0]}")
    print(f"# Duplicates {col}: {candidate_pairs_lines[col].duplicated().sum()}")
    print(f"# Unique elements {col}: {candidate_pairs_lines[col].nunique()}")

# Total elements source: 398
# Duplicates source: 25
# Unique elements source: 373
# Total elements target: 398
# Duplicates target: 20
# Unique elements target: 378


### Restructure source - target ontologies

In [None]:
def get_df(rows: list[str], onto: OntologyAccess) -> pd.DataFrame:
    data = [
        {
            "entry": row,
            "all_entity_names": OntologyEntryAttr(row, onto).get_all_entity_names(),
            "parents_preferred_names": OntologyEntryAttr(row, onto).get_parents_preferred_names(),
            "direct_parents": OntologyEntryAttr(row, onto).get_direct_parents(),
        }
        for row in rows
    ]
    data_df = pd.DataFrame(data)
    data_df["parents_preferred_names"] = data_df["parents_preferred_names"].apply(lambda x: list(map(str, x)))
    data_df["direct_parents"] = data_df["direct_parents"].astype(str)
    return data_df


tgt_df = get_df(candidate_pairs_lines.iloc[:, 0], onto_tgt)
src_df = get_df(candidate_pairs_lines.iloc[:, 1], onto_src)

In [86]:
src_df.head()

Unnamed: 0,entry,all_entity_names,parents_preferred_names,direct_parents
0,http://mouse.owl#MA_0002684,{stomach muscularis mucosa},[{'Thing'}],{owl.Thing}
1,http://mouse.owl#MA_0001249,"{palpebral gland, tarsal gland, meibomian gland}","[{'eye gland'}, {'Thing'}]",{mouse.MA_0000267}
2,http://mouse.owl#MA_0001221,{tensor tympani},"[{'tympanic cavity muscle'}, {'Thing'}]",{mouse.MA_0000256}
3,http://mouse.owl#MA_0001615,{stomach greater curvature},"[{'Thing'}, {'stomach region'}]",{mouse.MA_0002561}
4,http://mouse.owl#MA_0002234,{testicular vein},"[{'vein'}, {'venous blood vessel'}, {'blood ve...",{mouse.MA_0000067}


In [83]:
tgt_df.head()

Unnamed: 0,entry,all_entity_names,parents_preferred_names,direct_parents
0,http://human.owl#NCI_C32658,"{Gastric Muscularis Mucosa, Gastric_Muscularis...","{'Mucosa'}, {'Thing'}, {'Anatomic_Structure_Sy...","{human.NCI_C33149, human.NCI_C32656}"
1,http://human.owl#NCI_C33075,"{Meibomian Gland, Meibomian_Gland}","{'Sebaceous_Gland'}, {'Thing'}, {'Anatomic_Str...",{human.NCI_C33519}
2,http://human.owl#NCI_C33748,"{Tensor Tympani, Tensor_Tympani}","{'Body_Part'}, {'Thing'}, {'Anatomic_Structure...",{human.NCI_C33148}
3,http://human.owl#NCI_C12262,"{Greater Curvature, Greater Curvature of the S...","{'Anatomic_Structure_System_or_Substance'}, {'...",{human.NCI_C25763}
4,http://human.owl#NCI_C53050,"{Testicular Vein, Spermatic Vein, Internal Spe...","{'Body_Part'}, {'Thing'}, {'Anatomic_Structure...",{human.NCI_C12814}


<b><i> Example ontology</i></b>:

<b> Number </b>: 315

<b> Column name </b>: http://mouse.owl#MA_0000293

<b> All entity names </b>: {'pelvic girdle bone'}

<b> Parents preffered names </b>: "[{'pelvis bone'}, {'trunk bone'}, {'bone'}, {'abdomen/pelvis/perineum bone'}, {'girdle bone'}, {owl.Thing}]"

<b> Direct parents </b>: "{mouse.MA_0000291, mouse.MA_0000532}"

In [14]:
mouse_example = OntologyEntryAttr("http://mouse.owl#MA_0000293", onto_src)

parent = next(iter(mouse_example.get_direct_parents()))
child = next(iter(parent.get_direct_children()))

mouse_example, parent, child

(mouse.MA_0000293, mouse.MA_0000532, mouse.MA_0000293)