In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import ujson
import sys
import os

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from typing import Optional, Union

from bigbio.dataloader import BigBioConfigHelpers

sys.path.append('../../../..')
from umls_utils import UmlsMappings
from bigbio_utils import CUIS_TO_REMAP, CUIS_TO_EXCLUDE, DATASET_NAMES, VALIDATION_DOCUMENT_IDS
from bigbio_utils import dataset_to_documents, dataset_to_df, resolve_abbreviation, get_left_context, get_right_context

conhelps = BigBioConfigHelpers()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `d

In [None]:
from bioel.ontology import BiomedicalOntology

test_cases = [{'filepath': 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo',
        'name': 'disease ontology',
        'prefix_to_keep': None,
        'entity_type': 'DISEASE',
        'abbrev': 'DOID'},
        {'filepath': 'http://purl.obolibrary.org/obo/cl.obo',
        'name': 'cell ontology',
        'prefix_to_keep': 'CL',
        'entity_type': 'CELL_TYPE',
        'abbrev': 'CL'},
        {'filepath': 'http://purl.obolibrary.org/obo/uberon.obo',
        'name': 'uberon',
        'prefix_to_keep': 'UBERON',
        'entity_type': 'TISSUE',
        'abbrev': 'UBERON'},
        {'filepath': 'http://purl.obolibrary.org/obo/obi.obo',
        'name': 'ontology of biological investigations',
        'prefix_to_keep': 'OBI',
        'entity_type': 'EXP_ASSAY',
        'abbrev': 'OBI'},
        {'filepath': 'https://ftp.expasy.org/databases/cellosaurus/cellosaurus.obo',
        'name': 'cellosaurus',
        'prefix_to_keep': None,
        'entity_type': 'CELL_LINE',
        'abbrev': 'CVCL'},
        {'filepath': 'http://purl.obolibrary.org/obo/go.obo',
        'name': 'gene ontology',
        'prefix_to_keep': None,
        'entity_type': 'SUBCELLULAR',
        'abbrev': 'GO'},
        {'filepath': 'http://purl.obolibrary.org/obo/po.obo',
        'name': 'plant ontology',
        'prefix_to_keep': "PO",
        'entity_type': 'TISSUE',
        'abbrev': 'PO'},
        {'filepath': 'http://purl.obolibrary.org/obo/chebi.obo',
        'name': 'ChEBI',
        'prefix_to_keep': None,
        'entity_type': 'SMALL_MOLECULE',
        'abbrev': 'CHEBI'},
        ]

for obo_dict in test_cases:
    ontology = BiomedicalOntology.load_obo(**obo_dict)
    display(ontology.entities[:10])

## Get the ontology

In [5]:
from bioel.ontology import BiomedicalOntology

obo_dict = {'filepath': 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo',
        'name': 'disease ontology',
        'prefix_to_keep': None,
        'entity_type': 'DISEASE',
        'abbrev': 'DOID'}

ontology = BiomedicalOntology.load_obo(**obo_dict)

[2024-04-01 14:51:14] [ontology.py] [INFO] Reading OBO ontology from https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo
100%|██████████| 11537/11537 [00:00<00:00, 52108.79it/s]


In [9]:
ontology.entities[0]

BiomedicalEntity(cui='DOID:0001816', name='angiosarcoma', types=['DISEASE'], aliases=['hemangiosarcoma'], definition='A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.', equivalant_cuis=['DOID:267', 'DOID:4508'], taxonomy=None, metadata=None)

In [8]:
ontology.entities[0].cui

'DOID:0001816'

In [3]:
print("Attributes and Methods : ", dir(ontology))

Attributes and Methods :  ['__annotations__', '__class__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'abbrev', 'entities', 'get_aliases', 'get_canonical_name', 'get_definition', 'get_types', 'load_csv', 'load_json', 'load_mesh', 'load_ncbi_taxon', 'load_obo', 'load_umls', 'metadata', 'name', 'types']


## Get the entities in the ontology

In [4]:
ontology_entities = ontology.entities

In [5]:
display(ontology.entities[0])

BiomedicalEntity(cui='DOID:0001816', name='angiosarcoma', types=['DISEASE'], aliases=['hemangiosarcoma'], definition='A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.', equivalant_cuis=['DOID:267', 'DOID:4508'], taxonomy=None, metadata=None)

In [6]:
# Create a list of dictionaries, each representing a row in the DataFrame
data = [{
    'cui': entity.cui,
    'name': entity.name,
    'types': entity.types,
    'aliases': entity.aliases if hasattr(entity, 'aliases') else [],
    'definition': entity.definition,
    'equivalent_cuis': entity.equivalent_cuis if hasattr(entity, 'equivalent_cuis') and entity.equivalent_cuis is not None else [],
    'taxonomy': entity.taxonomy,
    'metadata': entity.metadata
} for entity in ontology_entities]

# Create the pandas DataFrame
ontology_df = pd.DataFrame(data)

# Display the DataFrame
ontology_df.head()  # This prints the first few rows of the DataFrame

    

Unnamed: 0,cui,name,types,aliases,definition,equivalent_cuis,taxonomy,metadata
0,DOID:0001816,angiosarcoma,[DISEASE],[hemangiosarcoma],A vascular cancer that derives_from the cells ...,[],,
1,DOID:0002116,pterygium,[DISEASE],[surfer's eye],A corneal disease that is characterized by a t...,[],,
2,DOID:0014667,disease of metabolism,[DISEASE],[metabolic disease],A disease that involving errors in metabolic p...,[],,
3,DOID:0040001,shrimp allergy,[DISEASE],[],A crustacean allergy that has_allergic_trigger...,[],,
4,DOID:0040002,aspirin allergy,[DISEASE],"[acetylsalicylic acid allergy, ASA allergy]",A drug allergy that has_allergic_trigger acety...,[],,


### Check if the dataset has any null definition

In [7]:
null_definitions = ontology_df['definition'].isnull()

# Get the index of rows with null definitions
null_definitions_indices = ontology_df[ontology_df['definition'].isnull()].index
print(f"Indices of rows with null definitions: {null_definitions_indices.tolist()}")


Indices of rows with null definitions: [5978, 5989, 5992, 6012, 6016, 6017, 6026, 6029, 6030, 6060, 6080, 6082, 6085, 6086, 6088, 6093, 6094, 6102, 6103, 6104, 6106, 6107, 6109, 6110, 6111, 6121, 6123, 6124, 6125, 6126, 6135, 6140, 6145, 6150, 6154, 6157, 6158, 6164, 6166, 6167, 6171, 6172, 6173, 6175, 6176, 6177, 6178, 6180, 6188, 6192, 6193, 6195, 6199, 6200, 6202, 6204, 6207, 6208, 6218, 6219, 6221, 6223, 6224, 6226, 6228, 6231, 6232, 6238, 6239, 6243, 6244, 6246, 6247, 6249, 6251, 6256, 6261, 6266, 6278, 6282, 6283, 6285, 6290, 6291, 6292, 6294, 6295, 6297, 6298, 6300, 6306, 6307, 6308, 6309, 6310, 6311, 6312, 6313, 6314, 6318, 6322, 6328, 6330, 6338, 6340, 6343, 6344, 6345, 6346, 6347, 6348, 6349, 6354, 6361, 6365, 6366, 6369, 6371, 6378, 6382, 6388, 6390, 6392, 6393, 6400, 6401, 6402, 6403, 6404, 6405, 6406, 6407, 6409, 6419, 6420, 6421, 6422, 6423, 6426, 6428, 6429, 6430, 6432, 6434, 6438, 6444, 6445, 6447, 6448, 6449, 6451, 6452, 6453, 6455, 6457, 6458, 6459, 6463, 6467, 6468, 

### Get canonical name of entities in the ontology

In [8]:
def get_canonical_name(df):
    # Extract tuples of CUI and canonical names (aliases) from the DataFrame
    canonical_names = {row['cui']: row['name'] for index, row in df.iterrows()}
    return canonical_names

In [9]:
ontology_to_name = get_canonical_name(ontology_df)
print("\nThis mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term")
print('\nThe first 5 elements are:')
print(list(ontology_to_name.items())[:5])


This mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term

The first 5 elements are:
[('DOID:0001816', 'angiosarcoma'), ('DOID:0002116', 'pterygium'), ('DOID:0014667', 'disease of metabolism'), ('DOID:0040001', 'shrimp allergy'), ('DOID:0040002', 'aspirin allergy')]


### Get aliases of entities in the ontology

In [10]:
def get_aliases(df):
    # aliases are collected into a flat list instead of a list of lists
    def flatten_aliases(alias_lists):
        # Flatten a list of lists into a single list
        return [alias for sublist in alias_lists for alias in sublist]
    
    # Group by 'cui', aggregate aliases into a list, and then flatten the list of lists
    grouped = df.groupby('cui')['aliases'].agg(flatten_aliases).to_dict()

    return grouped

In [11]:
ontology_to_alias = get_aliases(ontology_df)
print(list(ontology_to_alias.items())[:5])

[('DOID:0001816', ['hemangiosarcoma']), ('DOID:0002116', ["surfer's eye"]), ('DOID:0014667', ['metabolic disease']), ('DOID:0040001', []), ('DOID:0040002', ['acetylsalicylic acid allergy', 'ASA allergy'])]


### Get definition of entities in the ontology

In [12]:
def get_definition(df):
    # Filter out rows where the definition column is null
    df_non_null_definitions = df.dropna(subset=['definition'])
    
    # Extract tuples of CUI and definition from the DataFrame
    definitions_dict = {row['cui']: row['definition'] for index, row in df_non_null_definitions.iterrows()}
    
    return definitions_dict

In [13]:
ontology_cui2definition = get_definition(ontology_df)
print('\nThe first 5 elements are:')
print(list(ontology_cui2definition.items())[:5])


The first 5 elements are:
[('DOID:0001816', 'A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.'), ('DOID:0002116', 'A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation.'), ('DOID:0014667', 'A disease that involving errors in metabolic processes of building or degradation of molecules.'), ('DOID:0040001', 'A crustacean allergy that has_allergic_trigger shrimp.'), ('DOID:0040002', 'A drug allergy that has_allergic_trigger acetylsalicylic acid.')]


### Get types of entities in the ontology

In [14]:
def get_types(df):
    '''
    Get type of entities in the ontology
    df: panda dataframe
    '''
    # Extract tuples of CUI and types from the DataFrame
    types = {row['cui']: row['types'] for index, row in df.iterrows()}
    return types

In [15]:
ontology_cui2tui = get_types(ontology_df)
print('\nThe first 5 elements are:')
print(list(ontology_cui2tui.items())[:5])


The first 5 elements are:
[('DOID:0001816', ['DISEASE']), ('DOID:0002116', ['DISEASE']), ('DOID:0014667', ['DISEASE']), ('DOID:0040001', ['DISEASE']), ('DOID:0040002', ['DISEASE'])]


# Function

In [16]:
def process_obo_ontology(ontology, 
                     dataset,
                     model):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - dataset : str
    Name of the dataset
    - ontology : str (only umls for now)
    Ontology associated with the dataset
    '''
    
    # Entities in the ontology
    ontology_entities = ontology.entities
    # Create a list of dictionaries, each representing a row in the DataFrame
    data = [{
    'cui': entity.cui,
    'name': entity.name,
    'types': entity.types,
    'aliases': entity.aliases if hasattr(entity, 'aliases') else [],
    'definition': entity.definition,
    'equivalent_cuis': entity.equivalent_cuis if hasattr(entity, 'equivalent_cuis') and entity.equivalent_cuis is not None else [],
    'taxonomy': entity.taxonomy,
    'metadata': entity.metadata
    } for entity in ontology_entities]

    # Create the pandas DataFrame
    ontology_df = pd.DataFrame(data)
    
    # Get canonical name of entities in the ontology
    ontology_to_name = get_canonical_name(ontology_df)
    # Get aliases of entities in the ontology
    ontology_to_alias = get_aliases(ontology_df)
    # Get definition of entities in the ontology
    ontology_cui2definition = get_definition(ontology_df)
    # Get types of entities in the ontology
    ontology_cui2tui = get_types(ontology_df)


    # Check if the directory exists, and create it if it does not
    directory_path = f'../data/{model}/{dataset}/'
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

    ontology_entities = []
    for cui, name in tqdm(ontology_to_name.items()):
        d = {}
        ent_type = ontology_cui2tui[cui]
        d['type'] = ent_type
        other_aliases = [x for x in ontology_to_alias[cui] if x != name]
        joined_aliases = ' ; '.join(other_aliases)
        # d['cui'] = f"ontology_name:{cui}"
        d['cui'] = f"ontology:{cui}"
        d['title'] = name
        if cui in ontology_cui2definition:
            definition = ontology_cui2definition[cui]
        else:
            definition = None

        if len(other_aliases) > 0:
            if definition is not None:
                d['description'] = f"{name} ( {ent_type} : {joined_aliases} ) [ {definition} ]"
            else:
                d['description'] = f"{name} ( {ent_type} : {joined_aliases} )"
        else:
            if definition is not None:
                d['description'] = f"{name} ( {ent_type} ) [ {definition} ]"
            else:
                d['description'] = f"{name} ( {ent_type} )"

        ontology_entities.append(d)

    pickle.dump(ontology_entities, open(f'../data/{model}/{dataset}/dictionary.pickle', 'wb'))
    entities = pickle.load(open(f'../data/{model}/{dataset}/dictionary.pickle', 'rb'))
    return entities


## Test

In [17]:
obo_dict = {'filepath': 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo',
        'name': 'disease ontology',
        'prefix_to_keep': None,
        'entity_type': 'DISEASE',
        'abbrev': 'DOID'}

obo_dict2 = {'filepath': 'http://purl.obolibrary.org/obo/chebi.obo',
        'name': 'ChEBI',
        'prefix_to_keep': None,
        'entity_type': 'SMALL_MOLECULE',
        'abbrev': 'CHEBI'}

obo_dict3 = {'filepath': 'http://purl.obolibrary.org/obo/po.obo',
        'name': 'plant ontology',
        'prefix_to_keep': "PO",
        'entity_type': 'TISSUE',
        'abbrev': 'PO'}
        
obo_dict4 = {'filepath': 'http://purl.obolibrary.org/obo/cl.obo',
        'name': 'cell ontology',
        'prefix_to_keep': 'CL',
        'entity_type': 'CELL_TYPE',
        'abbrev': 'CL'}

obo_dict5 = {'filepath': 'http://purl.obolibrary.org/obo/uberon.obo',
        'name': 'uberon',
        'prefix_to_keep': 'UBERON',
        'entity_type': 'TISSUE',
        'abbrev': 'UBERON'}

obo_dict6 = {'filepath': 'http://purl.obolibrary.org/obo/obi.obo',
        'name': 'ontology of biological investigations',
        'prefix_to_keep': 'OBI',
        'entity_type': 'EXP_ASSAY',
        'abbrev': 'OBI'}

obo_dict7 = {'filepath': 'https://ftp.expasy.org/databases/cellosaurus/cellosaurus.obo',
        'name': 'cellosaurus',
        'prefix_to_keep': None,
        'entity_type': 'CELL_LINE',
        'abbrev': 'CVCL'}

obo_dict8 = {'filepath': 'http://purl.obolibrary.org/obo/go.obo',
        'name': 'gene ontology',
        'prefix_to_keep': None,
        'entity_type': 'SUBCELLULAR',
        'abbrev': 'GO'}

In [18]:
ontology = BiomedicalOntology.load_obo(**obo_dict)
dataset = "test"
model = "test"

[2024-03-16 14:19:58] [ontology.py] [INFO] Reading OBO ontology from https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo
100%|██████████| 11511/11511 [00:00<00:00, 263488.80it/s]


In [19]:
process_obo_ontology(ontology, 
                     dataset,
                     model)

entities = pickle.load(open(f'../data/{model}/{dataset}/dictionary.pickle', 'rb'))

100%|██████████| 11511/11511 [00:00<00:00, 82474.60it/s]


In [20]:
entities

[{'type': ['DISEASE'],
  'cui': 'ontology:DOID:0001816',
  'title': 'angiosarcoma',
  'description': "angiosarcoma ( ['DISEASE'] : hemangiosarcoma ) [ A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels. ]"},
 {'type': ['DISEASE'],
  'cui': 'ontology:DOID:0002116',
  'title': 'pterygium',
  'description': "pterygium ( ['DISEASE'] : surfer's eye ) [ A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation. ]"},
 {'type': ['DISEASE'],
  'cui': 'ontology:DOID:0014667',
  'title': 'disease of metabolism',
  'description': "disease of metabolism ( ['DISEASE'] : metabolic disease ) [ A disease that involving errors in metabolic processes of building or degradation of molecules. ]"},
 {'type': ['DISEASE'],
  'cui': 'ontology:DOID:0040001',
  'title': 'shrimp allergy',
  'description': "shrimp allergy ( ['DISEASE'] ) [ A crust

# REDO EVERYTHING BUT WITHOUT PANDA DATAFRAME

## Get the ontology

In [42]:
from bioel.ontology import BiomedicalOntology

obo_dict = {'filepath': 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo',
        'name': 'disease ontology',
        'prefix_to_keep': None,
        'entity_type': 'DISEASE',
        'abbrev': 'DOID'}

ontology = BiomedicalOntology.load_obo(**obo_dict)

[2024-03-11 20:27:08] [ontology.py] [INFO] Reading OBO ontology from https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo
100%|██████████| 11511/11511 [00:00<00:00, 48925.32it/s]


### list_entities

In [43]:
ontology_entities = ontology.entities
display(ontology.entities[0])

BiomedicalEntity(cui='DOID:0001816', name='angiosarcoma', types=['DISEASE'], aliases=['hemangiosarcoma'], definition='A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.', equivalant_cuis=['DOID:267', 'DOID:4508'], taxonomy=None, metadata=None)

In [44]:
def entities2list(ontology):
    ontology_entities = ontology.entities
    # Create a list of dictionaries, each representing a row in the DataFrame
    data = [{
        'cui': entity.cui,
        'name': entity.name,
        'types': entity.types,
        'aliases': entity.aliases if hasattr(entity, 'aliases') else [],
        'definition': entity.definition,
        'equivalent_cuis': entity.equivalent_cuis if hasattr(entity, 'equivalent_cuis') and entity.equivalent_cuis is not None else [],
        'taxonomy': entity.taxonomy,
        'metadata': entity.metadata
    } for entity in ontology_entities]

    return data

In [45]:
data = entities2list(ontology)
data

[{'cui': 'DOID:0001816',
  'name': 'angiosarcoma',
  'types': ['DISEASE'],
  'aliases': ['hemangiosarcoma'],
  'definition': 'A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.',
  'equivalent_cuis': [],
  'taxonomy': None,
  'metadata': None},
 {'cui': 'DOID:0002116',
  'name': 'pterygium',
  'types': ['DISEASE'],
  'aliases': ["surfer's eye"],
  'definition': 'A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation.',
  'equivalent_cuis': [],
  'taxonomy': None,
  'metadata': None},
 {'cui': 'DOID:0014667',
  'name': 'disease of metabolism',
  'types': ['DISEASE'],
  'aliases': ['metabolic disease'],
  'definition': 'A disease that involving errors in metabolic processes of building or degradation of molecules.',
  'equivalent_cuis': [],
  'taxonomy': None,
  'metadata': None},
 {'cui': 'DOID:0040001',
  'name': '

### Check if the dataset has any null definition

In [46]:
# Get the index of rows with null definitions
null_definitions_indices = [i for i, entity in enumerate(data) if entity.get('definition') is None]

print(f"Indices of rows with null definitions: {null_definitions_indices}")


Indices of rows with null definitions: [5978, 5989, 5992, 6012, 6016, 6017, 6026, 6029, 6030, 6060, 6080, 6082, 6085, 6086, 6088, 6093, 6094, 6102, 6103, 6104, 6106, 6107, 6109, 6110, 6111, 6121, 6123, 6124, 6125, 6126, 6135, 6140, 6145, 6150, 6154, 6157, 6158, 6164, 6166, 6167, 6171, 6172, 6173, 6175, 6176, 6177, 6178, 6180, 6188, 6192, 6193, 6195, 6199, 6200, 6202, 6204, 6207, 6208, 6218, 6219, 6221, 6223, 6224, 6226, 6228, 6231, 6232, 6238, 6239, 6243, 6244, 6246, 6247, 6249, 6251, 6256, 6261, 6266, 6278, 6282, 6283, 6285, 6290, 6291, 6292, 6294, 6295, 6297, 6298, 6300, 6306, 6307, 6308, 6309, 6310, 6311, 6312, 6313, 6314, 6318, 6322, 6328, 6330, 6338, 6340, 6343, 6344, 6345, 6346, 6347, 6348, 6349, 6354, 6361, 6365, 6366, 6369, 6371, 6378, 6382, 6388, 6390, 6392, 6393, 6400, 6401, 6402, 6403, 6404, 6405, 6406, 6407, 6409, 6419, 6420, 6421, 6422, 6423, 6426, 6428, 6429, 6430, 6432, 6434, 6438, 6444, 6445, 6447, 6448, 6449, 6451, 6452, 6453, 6455, 6457, 6458, 6459, 6463, 6467, 6468, 

### Get canonical name of entities in the ontology

In [47]:
def get_canonical_name(data):
    '''
    Get name of entities in the ontology
    data: list of dict
    '''
    # Extract tuples of CUI and canonical names from the data
    canonical_names = {entity.get('cui'): entity.get('name') for entity in data}
    return canonical_names

In [48]:
entity_to_name = get_canonical_name(data)
print("\nThis mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term")
print('\nThe first 5 elements are:')
print(list(entity_to_name.items())[:5])


This mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term

The first 5 elements are:
[('DOID:0001816', 'angiosarcoma'), ('DOID:0002116', 'pterygium'), ('DOID:0014667', 'disease of metabolism'), ('DOID:0040001', 'shrimp allergy'), ('DOID:0040002', 'aspirin allergy')]


### Get aliases of entities in the ontology

In [49]:
def get_aliases(data):
    '''
    Get aliases of entities in the ontology
    data: list of dict
    '''
    # Extract tuples of CUI and aliases from the Data
    aliases = {entity.get('cui'): entity.get('aliases') for entity in data}
    return aliases

In [50]:
entity_to_aliases = get_aliases(data)
print("\nThis mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term")
print('\nThe first 5 elements are:')
print(list(entity_to_aliases.items())[:5])


This mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term

The first 5 elements are:
[('DOID:0001816', ['hemangiosarcoma']), ('DOID:0002116', ["surfer's eye"]), ('DOID:0014667', ['metabolic disease']), ('DOID:0040001', []), ('DOID:0040002', ['acetylsalicylic acid allergy', 'ASA allergy'])]


### Get definition of entities in the ontology

In [51]:
def get_definition(data):
    '''
    Get definition of entities in the ontology
    data: list of dict
    '''
    # Extract tuples of CUI and definition from the Data
    definitions_dict = {entity['cui']: entity['definition'] for entity in data if entity['definition'] is not None}
    return definitions_dict

In [52]:
cui2definition = get_definition(data)
print("\nThis mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term")
print('\nThe first 5 elements are:')
print(list(cui2definition.items())[:5])


This mapping is a specific application of the canonical name concept within the context of UMLS, linking each unique concept (identified by its CUI) to its preferred term

The first 5 elements are:
[('DOID:0001816', 'A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.'), ('DOID:0002116', 'A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation.'), ('DOID:0014667', 'A disease that involving errors in metabolic processes of building or degradation of molecules.'), ('DOID:0040001', 'A crustacean allergy that has_allergic_trigger shrimp.'), ('DOID:0040002', 'A drug allergy that has_allergic_trigger acetylsalicylic acid.')]


### Get types of entities in the ontology

In [53]:
def get_types(data):
    '''
    Get type of entities in the ontology
    data: list of dict
    '''
    # Extract tuples of CUI and types from the Data
    types = {entity['cui']: entity['types'] for entity in data}
    return types

In [54]:
cui2tui = get_types(data)
print('\nThe first 5 elements are:')
print(list(cui2tui.items())[:5])


The first 5 elements are:
[('DOID:0001816', ['DISEASE']), ('DOID:0002116', ['DISEASE']), ('DOID:0014667', ['DISEASE']), ('DOID:0040001', ['DISEASE']), ('DOID:0040002', ['DISEASE'])]


# BEST VERSION : process obo ontology

In [21]:
import os
import pickle
from tqdm import tqdm

def process_obo_ontology(ontology, data_path, obo_dict):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
        Ontology associated with the dataset
    - data_path : str
        Path where to load and save dictionary.pickle
    - obo_dict : dict
        dictionary with the parameter expected by load_obo method : 
        filepath, name, prefix_to_keep, entity_type, abbrev
    '''
    
    ontology = BiomedicalOntology.load_obo(**obo_dict)
    
    equivalent_cuis = False
    if ontology.entities[0].equivalant_cuis is not None : 
        equivalent_cuis = True
        
    'If dictionary already processed, load it else process and load it'
    entity_dictionary_pkl_path = os.path.join(data_path, 'dictionary.pickle')
    
    if os.path.isfile(entity_dictionary_pkl_path): 
        print("Loading stored processed entity dictionary...")
        with open(entity_dictionary_pkl_path, 'rb') as read_handle:
            entities = pickle.load(read_handle)
        
        return entities, equivalent_cuis
    
    ontology_entities = []
    for entity in tqdm(ontology.entities):      
        if entity.aliases != "":
            if entity.definition != "":
                new_entity = {
                    'type': entity.types,
                    'cui': entity.cui,
                    'title': entity.name,
                    'description': f"{entity.name} ( {entity.types} : {entity.aliases} ) [{entity.definition}]"
                }
            else : 
                new_entity = {
                    'type': entity.types,
                    'cui': entity.cui,
                    'title': entity.name,
                    'description': f"{entity.name} ( {entity.types} : {entity.aliases} )"
                }
                
        else : 
            if entity.definition != "":
                new_entity = {
                        'type': entity.types,
                        'cui': entity.cui,
                        'title': entity.name,
                        'description': f"{entity.name} ( {entity.types} ) [{entity.definition}]"
                    }
            else : 
                new_entity = {
                        'type': entity.types,
                        'cui': entity.cui,
                        'title': entity.name,
                        'description': f"{entity.name} ( {entity.types} )"
                    }
        ontology_entities.append(new_entity)
        
    # Check if the directory exists, and create it if it does not
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    # Save entities to pickle file
    with open(os.path.join(data_path, "dictionary.pickle"), "wb") as f:
        pickle.dump(ontology_entities, f)

    entities = pickle.load(open(os.path.join(data_path, "dictionary.pickle"), "rb"))
    return entities, equivalent_cuis



In [22]:
from bioel.ontology import BiomedicalOntology

obo_dict = {'filepath': 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo',
        'name': 'disease ontology',
        'prefix_to_keep': None,
        'entity_type': 'DISEASE',
        'abbrev': 'DOID'}

ontology = f"{obo_dict['name']}"
model = "arboel"
dataset = "random"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

entities = process_obo_ontology(ontology = ontology,
                                    data_path= data_path,
                                    obo_dict = obo_dict, )

[2024-04-02 17:55:08] [ontology.py] [INFO] Reading OBO ontology from https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo


/home2/cye73/data/arboel/random


100%|██████████| 11537/11537 [00:00<00:00, 52864.51it/s]
100%|██████████| 11537/11537 [00:00<00:00, 515782.53it/s]


In [24]:
entities[0]

{'type': ['DISEASE'],
 'cui': 'DOID:0001816',
 'title': 'angiosarcoma',
 'description': "angiosarcoma ( ['DISEASE'] : ['hemangiosarcoma'] ) [A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.]"}

In [58]:
entities

[{'type': ['DISEASE'],
  'cui': 'DOID:0001816',
  'title': 'angiosarcoma',
  'description': "angiosarcoma ( ['DISEASE'] : hemangiosarcoma ) [ A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels. ]"},
 {'type': ['DISEASE'],
  'cui': 'DOID:0002116',
  'title': 'pterygium',
  'description': "pterygium ( ['DISEASE'] : surfer's eye ) [ A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation. ]"},
 {'type': ['DISEASE'],
  'cui': 'DOID:0014667',
  'title': 'disease of metabolism',
  'description': "disease of metabolism ( ['DISEASE'] : metabolic disease ) [ A disease that involving errors in metabolic processes of building or degradation of molecules. ]"},
 {'type': ['DISEASE'],
  'cui': 'DOID:0040001',
  'title': 'shrimp allergy',
  'description': "shrimp allergy ( ['DISEASE'] ) [ A crustacean allergy that has_allergic_trig

# THIRD VERSION : INTEGRATING WITH THE CLASS (look above for latest version)

In [21]:
def process_obo_ontology(ontology, 
                     data_path):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
    Ontology associated with the dataset
    - data_path : str
    Path where to load and save dictionary.pickle
    '''
    
    # Get canonical name of entities in the ontology
    cui2name = ontology.get_canonical_name()
    # Get aliases of entities in the ontology
    cui2alias = ontology.get_aliases()
    # Get definition of entities in the ontology
    cui2definition = ontology.get_definition()
    # Get types of entities in the ontology
    cui2tui = ontology.get_types()


    # Check if the directory exists, and create it if it does not
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    ontology_entities = []
    for cui, name in tqdm(cui2name.items()):
        d = {}
        ent_type = cui2tui[cui]
        d['type'] = ent_type
        other_aliases = [x for x in cui2alias[cui] if x != name]
        joined_aliases = ' ; '.join(other_aliases)
        d['cui'] = f"{cui}"
        d['title'] = name
        if cui in cui2definition:
            definition = cui2definition[cui]
        else:
            definition = None

        if len(other_aliases) > 0:
            if definition is not None:
                d['description'] = f"{name} ( {ent_type} : {joined_aliases} ) [ {definition} ]"
            else:
                d['description'] = f"{name} ( {ent_type} : {joined_aliases} )"
        else:
            if definition is not None:
                d['description'] = f"{name} ( {ent_type} ) [ {definition} ]"
            else:
                d['description'] = f"{name} ( {ent_type} )"

        ontology_entities.append(d)

    pickle.dump(ontology_entities, open(os.path.join(data_path, 'dictionary.pickle'), 'wb'))
    entities = pickle.load(open(os.path.join(data_path, 'dictionary.pickle'), 'rb'))
    
    # pickle.dump(ontology_entities, open(f'../data/{model}/{dataset}/dictionary.pickle', 'wb'))
    # entities = pickle.load(open(f'../data/{model}/{dataset}/dictionary.pickle', 'rb'))
    return entities



In [22]:
ontology = BiomedicalOntology.load_obo(**obo_dict)
dataset = "test2"
model = "test2"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)

entities = process_obo_ontology(
    ontology = ontology,
    data_path = data_path 
    )

entities

[2024-03-16 14:20:13] [ontology.py] [INFO] Reading OBO ontology from https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo
100%|██████████| 11511/11511 [00:00<00:00, 262340.57it/s]
100%|██████████| 11511/11511 [00:00<00:00, 455258.63it/s]


[{'type': ['DISEASE'],
  'cui': 'DOID:0001816',
  'title': 'angiosarcoma',
  'description': "angiosarcoma ( ['DISEASE'] : hemangiosarcoma ) [ A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels. ]"},
 {'type': ['DISEASE'],
  'cui': 'DOID:0002116',
  'title': 'pterygium',
  'description': "pterygium ( ['DISEASE'] : surfer's eye ) [ A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation. ]"},
 {'type': ['DISEASE'],
  'cui': 'DOID:0014667',
  'title': 'disease of metabolism',
  'description': "disease of metabolism ( ['DISEASE'] : metabolic disease ) [ A disease that involving errors in metabolic processes of building or degradation of molecules. ]"},
 {'type': ['DISEASE'],
  'cui': 'DOID:0040001',
  'title': 'shrimp allergy',
  'description': "shrimp allergy ( ['DISEASE'] ) [ A crustacean allergy that has_allergic_trig