# Create MEDIC entity dictionary

In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import pickle
import ujson
import sys
import os
import csv

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from typing import Optional, Union

from bigbio.dataloader import BigBioConfigHelpers

sys.path.append('../../../..')
sys.path.append('..')
from DataModule import process_mention_dataset, process_umls_ontology, process_obo_ontology
from umls_utils import UmlsMappings
from bigbio_utils import CUIS_TO_REMAP, CUIS_TO_EXCLUDE, DATASET_NAMES, VALIDATION_DOCUMENT_IDS
from bigbio_utils import dataset_to_documents, dataset_to_df, resolve_abbreviation, get_left_context, get_right_context
from bioel.ontology import BiomedicalOntology

conhelps = BigBioConfigHelpers()

In [19]:
import csv

# Specify the path to your TSV file
file_path = '/mitchell/entity-linking/kbs/medic.tsv'

key_dict = ["DiseaseName", 
            "DiseaseID", 
            "AltDiseaseIDs", 
            "Definition", 
            "ParentIDs", 
            "TreeNumbers", 
            "ParentTreeNumbers",
            "Synonyms",
            "SlimMappings"]
# Open the TSV file
with open(file_path, newline='') as tsvfile:
    # Create a CSV reader specifying the delimiter as a tab character
    reader = csv.reader(tsvfile, delimiter='\t')
    
    # Initialize a counter
    counter = 0
    
    ontology = []
    # Iterate over the rows in the file
    for row in reader:
        dict = {}
        # Print the current row
        if counter > 28 :
            for i, elements in enumerate(row) :
                dict[key_dict[i]] = elements
            disease_ids = [dict["DiseaseID"]]  # Start with DiseaseID in the list
            # If AltDiseaseIDs exists and is not empty, extend the list with its elements
            if "AltDiseaseIDs" in dict and dict["AltDiseaseIDs"]:
                # Split AltDiseaseIDs by comma and extend the disease_ids list
                disease_ids.extend(dict["AltDiseaseIDs"].split(','))
            # Replace DiseaseID with the combined list
            dict["DiseaseID"] = disease_ids
            
            ontology.append(dict)
        # Increment the counter
        counter += 1

In [20]:
[ontology[i] for i in range(2,5)]

[{'DiseaseName': '15q24 Microdeletion',
  'DiseaseID': ['MESH:C579849', 'DO:DOID:0060395'],
  'AltDiseaseIDs': 'DO:DOID:0060395',
  'Definition': '',
  'ParentIDs': 'MESH:D002872|MESH:D008607|MESH:D025063',
  'TreeNumbers': 'C10.597.606.360/C579849|C16.131.260/C579849|C16.320.180/C579849|C23.550.210.050.500.500/C579849|C23.888.592.604.646/C579849|F03.625.539/C579849',
  'ParentTreeNumbers': 'C10.597.606.360|C16.131.260|C16.320.180|C23.550.210.050.500.500|C23.888.592.604.646|F03.625.539',
  'Synonyms': '15q24 Deletion|15q24 Microdeletion Syndrome|Interstitial Deletion of Chromosome 15q24',
  'SlimMappings': 'Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms'},
 {'DiseaseName': '16p11.2 Deletion Syndrome',
  'DiseaseID': ['MESH:C579850'],
  'AltDiseaseIDs': '',
  'Definition': '',
  'ParentIDs': 'MESH:D001321|MESH:D002872|MESH:D008607|MESH:D025063',
  'TreeNumbers': 'C10.597.606.360/C579850|C16.131.260/C579850|C1

In [21]:
# import csv

# # Specify the path to your TSV file
# file_path = '/mitchell/entity-linking/kbs/medic.tsv'

# # Open the TSV file
# with open(file_path, newline='') as tsvfile:
#     # Create a CSV reader specifying the delimiter as a tab character
#     reader = csv.reader(tsvfile, delimiter='\t')
    
#     # Initialize a counter
#     counter = 0
    
#     # Iterate over the rows in the file
#     for row in reader:
#         # Print the current row
#         if counter > 28: 
#             print(row)
        
#         # Increment the counter
#         counter += 1
        
#         # Check if we've printed the first 5 rows
#         if counter == 31:
#             break


# Medic ontology

In [26]:
def load_medic(medic_dir):
    "This is needed because it's neither obo nor umls, it's a sort of preprocessing"

    key_dict = ["DiseaseName", 
                "DiseaseID", 
                "AltDiseaseIDs", 
                "Definition", 
                "ParentIDs", 
                "TreeNumbers", 
                "ParentTreeNumbers",
                "Synonyms",
                "SlimMappings"]
    # Open the TSV file
    with open(medic_dir, newline='') as tsvfile:
        # Create a CSV reader specifying the delimiter as a tab character
        reader = csv.reader(tsvfile, delimiter='\t')
        
        # Initialize a counter
        counter = 0
        
        ontology = []
        # Iterate over the rows in the file
        for row in reader:
            dict = {}
            # Print the current row
            if counter > 28 :
                for i, elements in enumerate(row) :
                    dict[key_dict[i]] = elements
                
                ontology.append(dict)
            # Increment the counter
            counter += 1
    
    return ontology

In [6]:
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
ontology = load_medic(ontology_dir)
[ontology[i] for i in range(2,5)]

[{'DiseaseName': '15q24 Microdeletion',
  'DiseaseID': 'MESH:C579849',
  'AltDiseaseIDs': 'DO:DOID:0060395',
  'Definition': '',
  'ParentIDs': 'MESH:D002872|MESH:D008607|MESH:D025063',
  'TreeNumbers': 'C10.597.606.360/C579849|C16.131.260/C579849|C16.320.180/C579849|C23.550.210.050.500.500/C579849|C23.888.592.604.646/C579849|F03.625.539/C579849',
  'ParentTreeNumbers': 'C10.597.606.360|C16.131.260|C16.320.180|C23.550.210.050.500.500|C23.888.592.604.646|F03.625.539',
  'Synonyms': '15q24 Deletion|15q24 Microdeletion Syndrome|Interstitial Deletion of Chromosome 15q24',
  'SlimMappings': 'Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms'},
 {'DiseaseName': '16p11.2 Deletion Syndrome',
  'DiseaseID': 'MESH:C579850',
  'AltDiseaseIDs': '',
  'Definition': '',
  'ParentIDs': 'MESH:D001321|MESH:D002872|MESH:D008607|MESH:D025063',
  'TreeNumbers': 'C10.597.606.360/C579850|C16.131.260/C579850|C16.320.180/C579850|C23.5

In [7]:
def medic_get_canonical_name(entities):
    '''
    Get name of entities in the ontology
    data: list of dict
    '''
    canonical_names = {entity['DiseaseID']: entity['DiseaseName'] for entity in entities}
    return canonical_names

def medic_get_aliases(entities):
    '''
    Get aliases of entities in the ontology
    data: list of dict
    '''
    aliases = {entity['DiseaseID']: entity['Synonyms'] for entity in entities}
    return aliases

def medic_get_definition(entities):
    '''
    Get definition of entities in the ontology
    data: list of dict
    '''
    definitions_dict = {entity['DiseaseID']: entity['Definition'] for entity in entities if entity['Definition'] is not None}
    return definitions_dict

def medic_get_types(entities):
    '''
    Get type of entities in the ontology
    data: list of dict
    '''
    # Extract tuples of CUI and types from the Data
    types = {entity['DiseaseID']: entity['SlimMappings'] for entity in entities}
    return types

In [8]:
cui2name = medic_get_canonical_name(ontology)
print(list(cui2name.items())[0:4])

[('MESH:C538288', '10p Deletion Syndrome (Partial)'), ('MESH:C535484', '13q deletion syndrome'), ('MESH:C579849', '15q24 Microdeletion'), ('MESH:C579850', '16p11.2 Deletion Syndrome')]


In [9]:
cui2alias = medic_get_aliases(ontology)
print(list(cui2alias.items())[:5])

[('MESH:C538288', 'Chromosome 10, 10p- Partial|Chromosome 10, monosomy 10p|Chromosome 10, Partial Deletion (short arm)|Monosomy 10p'), ('MESH:C535484', "Chromosome 13q deletion|Chromosome 13q deletion syndrome|Chromosome 13q monosomy|Chromosome 13q syndrome|Deletion 13q|Deletion 13q syndrome|Monosomy 13q|Monosomy 13q syndrome|Orbeli's syndrome|Orbeli syndrome"), ('MESH:C579849', '15q24 Deletion|15q24 Microdeletion Syndrome|Interstitial Deletion of Chromosome 15q24'), ('MESH:C579850', ''), ('MESH:C567076', '17-Alpha-Hydroxylase-17,20-Lyase Deficiency, Combined Complete|17-Alpha-Hydroxylase-17,20-Lyase Deficiency, Combined Partial')]


In [10]:
cui2def = medic_get_definition(ontology)
print(list(cui2def.items())[:5])

[('MESH:C538288', ''), ('MESH:C535484', ''), ('MESH:C579849', ''), ('MESH:C579850', ''), ('MESH:C567076', '')]


In [11]:
cui2type = medic_get_types(ontology)
print(list(cui2type.items())[:5])

[('MESH:C538288', 'Congenital abnormality|Genetic disease (inborn)|Pathology (process)'), ('MESH:C535484', 'Congenital abnormality|Genetic disease (inborn)|Pathology (process)'), ('MESH:C579849', 'Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms'), ('MESH:C579850', 'Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms'), ('MESH:C567076', 'Congenital abnormality|Endocrine system disease|Genetic disease (inborn)|Metabolic disease|Urogenital disease (female)|Urogenital disease (male)')]


In [12]:
def process_medic_ontology(ontology,
                           data_path, 
                           ontology_dir, 
                        ):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
    Ontology associated with the dataset
    - data_path : str
    Path where to load and save dictionary.pickle
    - ontology_dir : str
    Path to medic data
    '''
    
    entities = load_medic(ontology_dir)
    
    # Get canonical name of entities in the ontology
    cui2name = medic_get_canonical_name(entities)
    # Get aliases of entities in the ontology
    cui2alias = medic_get_aliases(entities)
    # Get definition of entities in the ontology
    cui2definition = medic_get_definition(entities)
    # Get types of entities in the ontology
    cui2tui = medic_get_types(entities)


    # Check if the directory exists, and create it if it does not
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    ontology_entities = []
    for cui, name in tqdm(cui2name.items()):
        d = {}
        ent_type = cui2tui[cui]
        d['type'] = ent_type
        # other_aliases = [x for x in cui2alias[cui] if x != name]
        # joined_aliases = ' ; '.join(other_aliases)
        d['cui'] = f"{cui}"
        d['title'] = name
        if cui2definition[cui] != "":
            definition = cui2definition[cui]
        else:
            definition = None

        if cui2alias[cui] is not None:
            if definition is not None:
                d['description'] = f"{name} ( {ent_type} : {cui2alias[cui]} ) [ {definition} ]"
            else:
                d['description'] = f"{name} ( {ent_type} : {cui2alias[cui]} )"
        else:
            if definition is not None:
                d['description'] = f"{name} ( {ent_type} ) [ {definition} ]"
            else:
                d['description'] = f"{name} ( {ent_type} )"

        ontology_entities.append(d)

    pickle.dump(ontology_entities, open(os.path.join(data_path, 'dictionary.pickle'), 'wb'))
    entities = pickle.load(open(os.path.join(data_path, 'dictionary.pickle'), 'rb'))
    return entities


In [13]:
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

new_entities = process_medic_ontology(ontology = ontology,
                                    data_path= data_path,
                                    ontology_dir = ontology_dir, )

/home2/cye73/data/arboel/ncbi_disease


100%|██████████| 13189/13189 [00:00<00:00, 708977.46it/s]


In [31]:
new_entities[0:5]

[{'type': 'Congenital abnormality|Genetic disease (inborn)|Pathology (process)',
  'cui': 'MESH:C538288',
  'title': '10p Deletion Syndrome (Partial)',
  'description': '10p Deletion Syndrome (Partial) ( Congenital abnormality|Genetic disease (inborn)|Pathology (process) : Chromosome 10, 10p- Partial|Chromosome 10, monosomy 10p|Chromosome 10, Partial Deletion (short arm)|Monosomy 10p )'},
 {'type': 'Congenital abnormality|Genetic disease (inborn)|Pathology (process)',
  'cui': 'MESH:C535484',
  'title': '13q deletion syndrome',
  'description': "13q deletion syndrome ( Congenital abnormality|Genetic disease (inborn)|Pathology (process) : Chromosome 13q deletion|Chromosome 13q deletion syndrome|Chromosome 13q monosomy|Chromosome 13q syndrome|Deletion 13q|Deletion 13q syndrome|Monosomy 13q|Monosomy 13q syndrome|Orbeli's syndrome|Orbeli syndrome )"},
 {'type': 'Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms',
 

# Function for preparing the mentions in the dataset into the right format for each model

In [14]:

def process_mention_dataset(ontology,
                            dataset,
                            data_path,
                            ontology_type,
                            ontology_dir: Optional[str] = None,
                            mention_id: Optional[bool] = True,
                            context_doc_id: Optional[bool] = True,
                            label: Optional[bool] = True
                            ): 
    '''
    This function prepares the mentions data :  Creates the train.jsonl, valid.jsonl, test.jsonl
    Each .jsonl contains data in the following format : 
    {'mention': mention, 
    'mention_id': ID of the mention, (optional)
    'context_left': context before mention,
    'context_right': context after mention, 
    'context_doc_id': ID of the doc, (optional)
    'type': type
    'label_id': label_id,
    'label': entity description, (optional)
    'label_title': entity title
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
    Ontology associated with the dataset
    - dataset : str
    Name of the dataset
    - data_path : str
    Path where to load and save dictionary.pickle
    - ontology_type : str
    'obo' or 'umls' and possibly others
    - umls_dir : str
    Path to the ontology (umls, medic etc...)
    '''
    data = conhelps.for_config_name(f'{dataset}_bigbio_kb').load_dataset()
    exclude = CUIS_TO_EXCLUDE[dataset]
    remap = CUIS_TO_REMAP[dataset]

    'If dictionary already processed, load it else process and load it'
    entity_dictionary_pkl_path = os.path.join(data_path, 'dictionary.pickle')
    
    if os.path.isfile(entity_dictionary_pkl_path): 
        print("Loading stored processed entity dictionary...")
        with open(entity_dictionary_pkl_path, 'rb') as read_handle:
            entities = pickle.load(read_handle)
    else :
        if ontology_type == "obo" :
            entities = process_obo_ontology(ontology, data_path)
        elif ontology_type == "medic" : 
            entities = process_medic_ontology(ontology, data_path, ontology_dir)
        elif ontology_type == "umls" : 
            entities = process_umls_ontology(ontology, data_path, ontology_dir)
        else : 
            print("ERROR!")

    entity_dictionary = {d['cui']:d for d in tqdm(entities)} #CC1

    # if dataset == 'ncbi_disease': #CC2
    #     # Need to redo this since we have multiple synonymous CUIs for ncbi_disease
    #     entity_dictionary = {cui:d for d in tqdm(entities) for cui in d['cui']}
    #     cui_synsets = {}
    #     for subdict in tqdm(entities): 
    #         for cui in subdict['cui']:
    #             if cui in subdict:
    #                 print(cui, cui_synsets[cui], subdict['cui'])
    #             cui_synsets[cui] = subdict['cui'] 
    #     with open(os.path.join(data_path, 'cui_synsets.json'), 'w') as f:
    #         f.write(ujson.dumps(cui_synsets, indent=2))

    if dataset in VALIDATION_DOCUMENT_IDS:
        validation_pmids = VALIDATION_DOCUMENT_IDS[dataset]
    else:
        print("ERROR!!!")
        
    # Convert BigBio dataset to pandas DataFrame
    df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude, val_split_ids=validation_pmids)
    print(df)
    # print(df)
    # Return dictionary of documents in BigBio dataset
    docs = dataset_to_documents(data)
    label_len = df['db_ids'].map(lambda x: len(x)).max()
    print("Max labels on one doc:", label_len)

    for split in df.split.unique():
        print(split)

        ents_in_split = []
        for d in tqdm(df.query("split == @split").to_dict(orient='records'),
                      desc=f"Creating correct mention format for {split} dataset"):
            abbrev_resolved = False
            offsets = d['offsets']
            doc_id = d['document_id']
            doc = docs[doc_id]
            mention = d['text']
            
            # Get offsets and context
            start = offsets[0][0] # start on the mention
            end = offsets[-1][-1] # end of the mention
            before_context = doc[:start] # left context
            after_context = doc[end:] # right context
            
            
            # ArboEL can't handle multi-labels, so we randomly choose one.
            if len(d['db_ids']) == 1:
                label_id = d['db_ids'][0]

            # ncbi_disease is a special case that requires extra care
            # elif dataset == 'ncbi_disease':
            #     labels = []
            #     used_cuis = set([])
            #     choosable_ids = []
            #     for db_id in d['db_ids']:
            #         if db_id in used_cuis:
            #             continue
            #         else:
            #             used_cuis.update(set(entity_dictionary[db_id]['cuis']))
            #         choosable_ids.append(db_id)

            #     label_id = np.random.choice(choosable_ids)
            
            else:
                label_id = np.random.choice(d['db_ids'])

            # Check if we missed something
            if label_id not in entity_dictionary:
                # print(label_id)
                continue
            

            output = [{
                'mention': mention, 
                'context_left': before_context,
                'context_right': after_context, 
                'type': d['type'][0],
                'label_id': label_id,
                'label_title': entity_dictionary[label_id]['title'],
            }]
            
            # print("I am here :", output)
            
            if mention_id:
                output[0]['mention_id'] = d.get('mention_id', None)
        
            if context_doc_id:
                output[0]['context_doc_id'] = d.get('document_id', None)
                
            if context_doc_id:
                output[0]['label'] = d.get(entity_dictionary[label_id]['description'], None)
            
            ents_in_split.extend(output)
            

        split_name = split
        if split =='validation':
            split_name = 'valid'
        with open(os.path.join(data_path, f'{split_name}.jsonl'), 'w') as f:
            f.write('\n'.join([ujson.dumps(x) for x in ents_in_split]))
        return ents_in_split
            

In [15]:
ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data_test"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

ontology_type = "medic"
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'

# ontology = "MeSH"
# model = "arboel"
# dataset = "bc5cdr"
# abs_path = "/home2/cye73/data"
# data_path = os.path.join(abs_path, model, dataset)
# print(data_path)

# ontology_type = "umls"
# ontology_dir = "/mitchell/entity-linking/2017AA/META/"

mentions = process_mention_dataset(ontology = ontology,
                        dataset = dataset,
                        data_path = data_path,
                        ontology_type = ontology_type,
                        ontology_dir = ontology_dir, 
                        mention_id = True,
                        context_doc_id = True,
                        label = True)

/home2/cye73/data_test/arboel/ncbi_disease


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading stored processed entity dictionary...


100%|██████████| 13189/13189 [00:00<00:00, 3142392.38it/s]


     document_id         offsets                                     text  \
2       10021369      [[43, 76]]        adenomatous polyposis coli tumour   
3       10021369     [[93, 132]]  adenomatous polyposis coli (APC) tumour   
1       10021369    [[357, 372]]                          colon carcinoma   
4       10021369    [[955, 970]]                          colon carcinoma   
0       10021369  [[1090, 1096]]                                   cancer   
...          ...             ...                                      ...   
6804     9988281   [[996, 1015]]                      breast malignancies   
6794     9988281  [[1123, 1147]]                 invasive lobular cancers   
6795     9988281  [[1152, 1179]]              low-grade ductal carcinomas   
6797     9988281  [[1269, 1286]]                        ductal carcinomas   
6798     9988281  [[1387, 1410]]                  sporadic breast cancers   

                   type          db_ids  split  mention_id  
2            [

Creating correct mention format for train dataset: 100%|██████████| 5065/5065 [00:00<00:00, 260160.05it/s]


In [34]:
mentions[0]

{'mention': 'adenomatous polyposis coli tumour',
 'context_left': 'Identification of APC2, a homologue of the ',
 'context_right': ' suppressor.\nThe adenomatous polyposis coli (APC) tumour-suppressor protein controls the Wnt signalling pathway by forming a complex with glycogen synthase kinase 3beta (GSK-3beta), axin/conductin and betacatenin. Complex formation induces the rapid degradation of betacatenin. In colon carcinoma cells, loss of APC leads to the accumulation of betacatenin in the nucleus, where it binds to and activates the Tcf-4 transcription factor (reviewed in [1] [2]). Here, we report the identification and genomic structure of APC homologues. Mammalian APC2, which closely resembles APC in overall domain structure, was functionally analyzed and shown to contain two SAMP domains, both of which are required for binding to conductin. Like APC, APC2 regulates the formation of active betacatenin-Tcf complexes, as demonstrated using transient transcriptional activation assays

In [35]:
import json

abs_path2 = "/home2/cye73/data_test/arboel/ncbi_disease/train.jsonl"
train_data = []

# Open the file in text mode ('r'), not binary mode ('rb'), since we're reading text data
with open(abs_path2, 'r') as read_handle:
    for line in read_handle:
        # Each line is a complete JSON object
        train_data.append(json.loads(line))

# Now, train_data is a list of dictionaries, where each dictionary is a line from your jsonl file
for i in range(5) :
    print("------") 
    for key, value in train_data[i].items():
        print(f"{key}: {value}")


------
mention: adenomatous polyposis coli tumour
context_left: Identification of APC2, a homologue of the 
context_right:  suppressor.
The adenomatous polyposis coli (APC) tumour-suppressor protein controls the Wnt signalling pathway by forming a complex with glycogen synthase kinase 3beta (GSK-3beta), axin/conductin and betacatenin. Complex formation induces the rapid degradation of betacatenin. In colon carcinoma cells, loss of APC leads to the accumulation of betacatenin in the nucleus, where it binds to and activates the Tcf-4 transcription factor (reviewed in [1] [2]). Here, we report the identification and genomic structure of APC homologues. Mammalian APC2, which closely resembles APC in overall domain structure, was functionally analyzed and shown to contain two SAMP domains, both of which are required for binding to conductin. Like APC, APC2 regulates the formation of active betacatenin-Tcf complexes, as demonstrated using transient transcriptional activation assays in APC -/

# TEST

In [16]:
def load_medic(ontology_dir):
    "This is needed because it's neither obo nor umls, it's a sort of preprocessing"

    key_dict = ["DiseaseName", 
                "DiseaseID", 
                "AltDiseaseIDs", 
                "Definition", 
                "ParentIDs", 
                "TreeNumbers", 
                "ParentTreeNumbers",
                "Synonyms",
                "SlimMappings"]
    # Open the TSV file
    with open(ontology_dir, newline='') as tsvfile:
        # Create a CSV reader specifying the delimiter as a tab character
        reader = csv.reader(tsvfile, delimiter='\t')
        
        # Initialize a counter
        counter = 0
        
        ontology = []
        # Iterate over the rows in the file
        for row in reader:
            dict = {}
            # Print the current row
            if counter > 28 :
                for i, elements in enumerate(row) :
                    dict[key_dict[i]] = elements
                disease_ids = [dict["DiseaseID"]]
                'Put the alternative disease IDs in DiseaseID (they are also valid cuis)'
                # If AltDiseaseIDs exists and is not empty, extend the list with its elements
                if "AltDiseaseIDs" in dict and dict["AltDiseaseIDs"]:
                    # Split AltDiseaseIDs by comma and extend the disease_ids list
                    disease_ids.extend(dict["AltDiseaseIDs"].split(','))
                # Replace DiseaseID with the combined list
                dict["DiseaseID"] = disease_ids
                    
                ontology.append(dict)
            # Increment the counter
            counter += 1
    
    return ontology

In [17]:
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
ontology = load_medic(ontology_dir)
[ontology[i] for i in range(6,9)]

[{'DiseaseName': '18-Hydroxylase deficiency',
  'DiseaseID': ['MESH:C537806', 'OMIM:203400|OMIM:610600'],
  'AltDiseaseIDs': 'OMIM:203400|OMIM:610600',
  'Definition': '',
  'ParentIDs': 'MESH:D006994',
  'TreeNumbers': 'C19.053.500.480/C537806',
  'ParentTreeNumbers': 'C19.053.500.480',
  'Synonyms': '18-alpha hydroxylase deficiency|18-HYDROXYLASE DEFICIENCY|18-Oxidase Deficiency|Aldosterone deficiency 1|Aldosterone deficiency due to defect in 18-hydroxylase|ALDOSTERONE DEFICIENCY DUE TO DEFECT IN STEROID 18-HYDROXYLASE|ALDOSTERONE DEFICIENCY DUE TO DEFICIENCY OF STEROID 18-OXIDASE|ALDOSTERONE DEFICIENCY I|ALDOSTERONE DEFICIENCY II|Aldosterone Deficiency Type I|Aldosterone Deficiency Type II|CMO I Deficiency|CMO II Deficiency|Corticosterone methyloxidase type 1 deficiency|Corticosterone Methyloxidase Type I Deficiency|Corticosterone Methyloxidase Type II Deficiency|FHHA1A|FHHA1B|HYPERRENINEMIC HYPOALDOSTERONISM, FAMILIAL, 1|Hyperreninemic Hypoaldosteronism, Familial, Type I|Steroid 18

In [18]:
def medic_get_canonical_name(entities):
    '''
    Get name of entities in the ontology
    entities: list of dict, where each dict represents an entity and
    'DiseaseID' is expected to be a list of IDs.
    '''
    canonical_names = {}
    for entity in entities:
        for disease_id in entity['DiseaseID']:
            canonical_names[disease_id] = entity['DiseaseName']
    return canonical_names

def medic_get_aliases(entities):
    '''
    Get aliases of entities in the ontology
    data: list of dict
    '''
    aliases = {}
    for entity in entities:
        for disease_id in entity['DiseaseID']:
            aliases[disease_id] = entity['Synonyms']
    return aliases


def medic_get_definition(entities):
    '''
    Get definition of entities in the ontology
    entities: list of dict
    '''
    definitions_dict = {}
    for entity in entities:
        if entity['Definition']:
            for disease_id in entity['DiseaseID']:
                definitions_dict[disease_id] = entity['Definition']
    return definitions_dict

def medic_get_types(entities):
    '''
    Get type of entities in the ontology
    entities: list of dict
    '''
    types = {}
    for entity in entities:
        for disease_id in entity['DiseaseID']:
            types[disease_id] = entity['SlimMappings']
    return types


In [19]:
import os
import pickle
from tqdm import tqdm

def process_medic2_ontology(ontology, data_path, ontology_dir):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
        Ontology associated with the dataset
    - data_path : str
        Path where to load and save dictionary.pickle
    - ontology_dir : str
        Path to medic data
    '''
    
    entities = load_medic(ontology_dir)  # Assuming this returns a list of dictionaries like the provided example
    
    ontology_entities = []
    for entity in tqdm(entities):
        # Combining 'DiseaseID' and 'AltDiseaseIDs' into a single list, ensuring no duplicates
        cui_list = [entity['DiseaseID']]
        alt_ids = entity['AltDiseaseIDs'].split('|') if entity['AltDiseaseIDs'] else []
        for alt_id in alt_ids:
            if alt_id not in cui_list:
                cui_list.append(alt_id)
                
        if entity['Synonyms'] != "":
            if entity['Definition'] != "":
                new_entity = {
                    'type': entity['SlimMappings'],
                    'cui': cui_list,
                    'title': entity['DiseaseName'],
                    'description': f"{entity['DiseaseName']} ( {entity['SlimMappings']} : {entity['Synonyms']} ) [{entity['Definition']}]"
                }
            else : 
                new_entity = {
                    'type': entity['SlimMappings'],
                    'cui': cui_list,
                    'title': entity['DiseaseName'],
                    'description': f"{entity['DiseaseName']} ( {entity['SlimMappings']} : {entity['Synonyms']} )"
                }
                
        else : 
            if entity['Definition'] != "":
                new_entity = {
                        'type': entity['SlimMappings'],
                        'cui': cui_list,
                        'title': entity['DiseaseName'],
                        'description': f"{entity['DiseaseName']} ( {entity['SlimMappings']}) [{entity['Definition']}]"
                    }
            else : 
                new_entity = {
                        'type': entity['SlimMappings'],
                        'cui': cui_list,
                        'title': entity['DiseaseName'],
                        'description': f"{entity['DiseaseName']} ( {entity['SlimMappings']})"
                    }
                
            
        ontology_entities.append(new_entity)

    # Save entities to pickle file
    with open(os.path.join(data_path, 'dictionary.pickle'), 'wb') as f:
        pickle.dump(ontology_entities, f)

    # Optional: Load and return to confirm save was successful
    with open(os.path.join(data_path, 'dictionary.pickle'), 'rb') as f:
        entities = pickle.load(f)
        
    return entities


In [20]:
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

entities = process_medic2_ontology(ontology = ontology,
                                    data_path= data_path,
                                    ontology_dir = ontology_dir, )

/home2/cye73/data/arboel/ncbi_disease


100%|██████████| 13189/13189 [00:00<00:00, 70474.68it/s]


In [21]:
entities[2]

{'type': 'Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms',
 'cui': [['MESH:C579849', 'DO:DOID:0060395'], 'DO:DOID:0060395'],
 'title': '15q24 Microdeletion',
 'description': '15q24 Microdeletion ( Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms : 15q24 Deletion|15q24 Microdeletion Syndrome|Interstitial Deletion of Chromosome 15q24 )'}

In [22]:
#path_entity = '/home2/cye73/data_test2/arboel/ncbi_disease/dictionary.pickle'
path_entity = '/home2/cye73/data/arboel/ncbi_disease/dictionary.pickle'
path_entity2 = '/home2/cye73/arboEL2/data/arboel/ncbi_disease/dictionary.pickle'
with open(path_entity, 'rb') as read_handle:
    dict = pickle.load(read_handle)
with open(path_entity2, 'rb') as read_handle:
    dict2 = pickle.load(read_handle)

print("dict :\n", dict[2])
print("dict2 :\n", dict2[2])

dict :
 {'type': 'Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms', 'cui': [['MESH:C579849', 'DO:DOID:0060395'], 'DO:DOID:0060395'], 'title': '15q24 Microdeletion', 'description': '15q24 Microdeletion ( Congenital abnormality|Genetic disease (inborn)|Mental disorder|Nervous system disease|Pathology (process)|Signs and symptoms : 15q24 Deletion|15q24 Microdeletion Syndrome|Interstitial Deletion of Chromosome 15q24 )'}
dict2 :
 {'type': 'Disease', 'cui': 'MESH:C579849', 'title': '15q24 Microdeletion', 'cuis': ['MESH:C579849'], 'description': '15q24 Microdeletion ( Disease : 15q24 Deletion ; 15q24 Microdeletion Syndrome ; Interstitial Deletion of Chromosome 15q24 )'}


In [95]:
cui_synsets = {}
for subdict in tqdm(entities): 
    for cui in subdict['cui']:
        if cui in subdict:
            print(cui, cui_synsets[cui], subdict['cui'])
        cui_synsets[cui] = subdict['cui'] 
with open(os.path.join(data_path, 'cui_synsets.json'), 'w') as f:
    f.write(ujson.dumps(cui_synsets, indent=2))

100%|██████████| 13189/13189 [00:00<00:00, 575834.32it/s]


In [1]:

def process_mention2_dataset(ontology,
                            dataset,
                            data_path,
                            ontology_type,
                            ontology_dir: Optional[str] = None,
                            mention_id: Optional[bool] = True,
                            context_doc_id: Optional[bool] = True,
                            label: Optional[bool] = True
                            ): 
    '''
    This function prepares the mentions data :  Creates the train.jsonl, valid.jsonl, test.jsonl
    Each .jsonl contains data in the following format : 
    {'mention': mention, 
    'mention_id': ID of the mention, (optional)
    'context_left': context before mention,
    'context_right': context after mention, 
    'context_doc_id': ID of the doc, (optional)
    'type': type
    'label_id': label_id,
    'label': entity description, (optional)
    'label_title': entity title
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
    Ontology associated with the dataset
    - dataset : str
    Name of the dataset
    - data_path : str
    Path where to load and save dictionary.pickle
    - ontology_type : str
    'obo' or 'umls' and possibly others
    - umls_dir : str
    Path to the ontology (umls, medic etc...)
    '''
    data = conhelps.for_config_name(f'{dataset}_bigbio_kb').load_dataset()
    exclude = CUIS_TO_EXCLUDE[dataset]
    remap = CUIS_TO_REMAP[dataset]

    'If dictionary already processed, load it else process and load it'
    entity_dictionary_pkl_path = os.path.join(data_path, 'dictionary.pickle')
    
    if os.path.isfile(entity_dictionary_pkl_path): 
        print("Loading stored processed entity dictionary...")
        with open(entity_dictionary_pkl_path, 'rb') as read_handle:
            entities = pickle.load(read_handle)
    else :
        if ontology_type == "obo" :
            entities = process_obo_ontology(ontology, data_path)
        elif ontology_type == "medic" : 
            entities = process_medic2_ontology(ontology, data_path, ontology_dir)
        elif ontology_type == "umls" : 
            entities = process_umls_ontology(ontology, data_path, ontology_dir)
        else : 
            print("ERROR!")

    entity_dictionary = {d['cui']:d for d in tqdm(entities)} #CC1

    if dataset == 'ncbi_disease': #CC2
        # Need to redo this since we have multiple synonymous CUIs for ncbi_disease
        entity_dictionary = {cui:d for d in tqdm(entities) for cui in d['cui']}
        cui_synsets = {}
        for subdict in tqdm(entities): 
            for cui in subdict['cui']:
                if cui in subdict:
                    print(cui, cui_synsets[cui], subdict['cui'])
                cui_synsets[cui] = subdict['cui'] 
        with open(os.path.join(data_path, 'cui_synsets.json'), 'w') as f:
            f.write(ujson.dumps(cui_synsets, indent=2))

    if dataset in VALIDATION_DOCUMENT_IDS:
        validation_pmids = VALIDATION_DOCUMENT_IDS[dataset]
    else:
        print("ERROR!!!")
        
    # Convert BigBio dataset to pandas DataFrame
    df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude, val_split_ids=validation_pmids)
    # Return dictionary of documents in BigBio dataset
    docs = dataset_to_documents(data)
    label_len = df['db_ids'].map(lambda x: len(x)).max()
    print("Max labels on one doc:", label_len)

    for split in df.split.unique():
        print(split)

        ents_in_split = []
        for d in tqdm(df.query("split == @split").to_dict(orient='records'),
                      desc=f"Creating correct mention format for {split} dataset"):
            abbrev_resolved = False
            offsets = d['offsets']
            doc_id = d['document_id']
            doc = docs[doc_id]
            mention = d['text']
            
            # Get offsets and context
            start = offsets[0][0] # start on the mention
            end = offsets[-1][-1] # end of the mention
            before_context = doc[:start] # left context
            after_context = doc[end:] # right context
            
            
            # ArboEL can't handle multi-labels, so we randomly choose one.
            if len(d['db_ids']) == 1:
                label_id = d['db_ids'][0]

            # ncbi_disease is a special case that requires extra care
            elif dataset == 'ncbi_disease':
                labels = []
                used_cuis = set([])
                choosable_ids = []
                for db_id in d['db_ids']:
                    if db_id in used_cuis:
                        continue
                    else:
                        used_cuis.update(set(entity_dictionary[db_id]['cuis']))
                    choosable_ids.append(db_id)

                label_id = np.random.choice(choosable_ids)
            
            else:
                label_id = np.random.choice(d['db_ids'])

            # Check if we missed something
            if label_id not in entity_dictionary:
                # print(label_id)
                continue
            

            
            output = [{
                'mention': mention, 
                'context_left': before_context,
                'context_right': after_context, 
                'type': d['type'][0],
                'label_id': label_id,
                'label_title': entity_dictionary[label_id]['title'],
            }]
            
            if mention_id:
                output[0]['mention_id'] = d.get('mention_id', None)
        
            if context_doc_id:
                output[0]['context_doc_id'] = d.get('document_id', None)
                
            if context_doc_id:
                output[0]['label'] = d.get(entity_dictionary[label_id]['description'], None)

            ents_in_split.extend(output)

        split_name = split
        if split =='validation':
            split_name = 'valid'
        with open(os.path.join(data_path, f'{split_name}.jsonl'), 'w') as f:
            f.write('\n'.join([ujson.dumps(x) for x in ents_in_split]))
        return output
            

NameError: name 'Optional' is not defined

In [None]:
ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data_test"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

ontology_type = "medic"
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
# ontology_dir="/mitchell/entity-linking/2017AA/META/"

entity_dictionary = process_mention2_dataset(ontology = ontology,
                        dataset = dataset,
                        data_path = data_path,
                        ontology_type = ontology_type,
                        ontology_dir = ontology_dir, 
                        mention_id = True,
                        context_doc_id = True,
                        label = True)

/home2/cye73/data_test/arboel/ncbi_disease


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading stored processed entity dictionary...


100%|██████████| 13189/13189 [00:00<00:00, 2602986.80it/s]
100%|██████████| 13189/13189 [00:00<00:00, 1408818.71it/s]
100%|██████████| 13189/13189 [00:00<00:00, 716776.70it/s]


Max labels on one doc: 5
train


Creating correct mention format for train dataset:   0%|          | 14/5065 [00:00<00:00, 213528.20it/s]


KeyError: 'MESH:D016889'

In [2]:
import os
import pickle 
import json

ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data_test"
data_path = os.path.join(abs_path, model, dataset)

mentions = []

with open(os.path.join(data_path, "train.jsonl"), 'r')  as read_handle :
    for line in read_handle:
        mentions.append(json.loads(line))

for i in range(5) :
    print("------") 
    for key, value in mentions[i].items():
        print(f"{key}: {value}")

------
mention: adenomatous polyposis coli tumour
context_left: Identification of APC2, a homologue of the 
context_right:  suppressor.
The adenomatous polyposis coli (APC) tumour-suppressor protein controls the Wnt signalling pathway by forming a complex with glycogen synthase kinase 3beta (GSK-3beta), axin/conductin and betacatenin. Complex formation induces the rapid degradation of betacatenin. In colon carcinoma cells, loss of APC leads to the accumulation of betacatenin in the nucleus, where it binds to and activates the Tcf-4 transcription factor (reviewed in [1] [2]). Here, we report the identification and genomic structure of APC homologues. Mammalian APC2, which closely resembles APC in overall domain structure, was functionally analyzed and shown to contain two SAMP domains, both of which are required for binding to conductin. Like APC, APC2 regulates the formation of active betacatenin-Tcf complexes, as demonstrated using transient transcriptional activation assays in APC -/

# TEST 2
## Only this one works, rest is just testing !!

In [14]:
def load_medic3(medic_dir):
    "This is needed because it's neither obo nor umls, it's a sort of preprocessing"

    key_dict = ["DiseaseName", 
                "DiseaseID", 
                "AltDiseaseIDs", 
                "Definition", 
                "ParentIDs", 
                "TreeNumbers", 
                "ParentTreeNumbers",
                "Synonyms",
                "SlimMappings"]
    # Open the TSV file
    with open(medic_dir, newline='') as tsvfile:
        # Create a CSV reader specifying the delimiter as a tab character
        reader = csv.reader(tsvfile, delimiter='\t')
        
        # Initialize a counter
        counter = 0
        
        ontology = []
        # Iterate over the rows in the file
        for row in reader:
            dict = {}
            # Print the current row
            if counter > 28 :
                for i, elements in enumerate(row) :
                    dict[key_dict[i]] = elements
                
                ontology.append(dict)
            # Increment the counter
            counter += 1
    
    return ontology

In [61]:
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
ontology = load_medic3(ontology_dir)
[ontology[i] for i in range(6,9)]

[{'DiseaseName': '18-Hydroxylase deficiency',
  'DiseaseID': 'MESH:C537806',
  'AltDiseaseIDs': 'OMIM:203400|OMIM:610600',
  'Definition': '',
  'ParentIDs': 'MESH:D006994',
  'TreeNumbers': 'C19.053.500.480/C537806',
  'ParentTreeNumbers': 'C19.053.500.480',
  'Synonyms': '18-alpha hydroxylase deficiency|18-HYDROXYLASE DEFICIENCY|18-Oxidase Deficiency|Aldosterone deficiency 1|Aldosterone deficiency due to defect in 18-hydroxylase|ALDOSTERONE DEFICIENCY DUE TO DEFECT IN STEROID 18-HYDROXYLASE|ALDOSTERONE DEFICIENCY DUE TO DEFICIENCY OF STEROID 18-OXIDASE|ALDOSTERONE DEFICIENCY I|ALDOSTERONE DEFICIENCY II|Aldosterone Deficiency Type I|Aldosterone Deficiency Type II|CMO I Deficiency|CMO II Deficiency|Corticosterone methyloxidase type 1 deficiency|Corticosterone Methyloxidase Type I Deficiency|Corticosterone Methyloxidase Type II Deficiency|FHHA1A|FHHA1B|HYPERRENINEMIC HYPOALDOSTERONISM, FAMILIAL, 1|Hyperreninemic Hypoaldosteronism, Familial, Type I|Steroid 18-Hydroxylase Deficiency|Stero

In [15]:
import os
import pickle
from tqdm import tqdm

def process_medic3_ontology(ontology, data_path, ontology_dir):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
        Ontology associated with the dataset
    - data_path : str
        Path where to load and save dictionary.pickle
    - ontology_dir : str
        Path to medic data
    '''
    
    entities = load_medic3(ontology_dir)  # Assuming this returns a list of dictionaries like the provided example
    
    ontology_entities = []
    for entity in tqdm(entities):
        # Combining 'DiseaseID' and 'AltDiseaseIDs' into a single list, ensuring no duplicates
        cui_list = [entity['DiseaseID']]
        alt_ids = entity['AltDiseaseIDs'].split('|') if entity['AltDiseaseIDs'] else []
        for alt_id in alt_ids:
            if alt_id not in cui_list and alt_id[:2] != "DO":
                cui_list.append(alt_id)
                
        if entity['Synonyms'] != "":
            if entity['Definition'] != "":
                new_entity = {
                    'type': 'Disease',
                    'cui': entity['DiseaseID'],
                    'title': entity['DiseaseName'],
                    'cuis': cui_list,
                    'description': f"{entity['DiseaseName']} ( Disease : {entity['Synonyms']} ) [{entity['Definition']}]"
                }
            else : 
                new_entity = {
                    'type': 'Disease',
                    'cui': entity['DiseaseID'],
                    'title': entity['DiseaseName'],
                    'cuis': cui_list,
                    'description': f"{entity['DiseaseName']} ( Disease : {entity['Synonyms']} )"
                }
                
        else : 
            if entity['Definition'] != "":
                new_entity = {
                        'type': 'Disease',
                        'cui': entity['DiseaseID'],
                        'title': entity['DiseaseName'],
                        'cuis': cui_list,
                        'description': f"{entity['DiseaseName']} ( Disease) [{entity['Definition']}]"
                    }
            else : 
                new_entity = {
                        'type': 'Disease',
                        'cui': entity['DiseaseID'],
                        'title': entity['DiseaseName'],
                        'cuis': cui_list,
                        'description': f"{entity['DiseaseName']} ( Disease)"
                    }
                
            
        ontology_entities.append(new_entity)

    # Save entities to pickle file
    with open(os.path.join(data_path, 'dictionary.pickle'), 'wb') as f:
        pickle.dump(ontology_entities, f)
        
    return ontology_entities



In [16]:
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

entities = process_medic3_ontology(ontology = ontology,
                                    data_path= data_path,
                                    ontology_dir = ontology_dir, )

entities[0]

/home2/cye73/data/arboel/ncbi_disease


100%|██████████| 13189/13189 [00:00<00:00, 540458.95it/s]


{'type': 'Disease',
 'cui': 'MESH:C538288',
 'title': '10p Deletion Syndrome (Partial)',
 'cuis': ['MESH:C538288'],
 'description': '10p Deletion Syndrome (Partial) ( Disease : Chromosome 10, 10p- Partial|Chromosome 10, monosomy 10p|Chromosome 10, Partial Deletion (short arm)|Monosomy 10p )'}

In [97]:
# entities[0]

In [103]:
#path_entity = '/home2/cye73/data_test2/arboel/ncbi_disease/dictionary.pickle'
path_entity = f'/home2/cye73/data/arboel/{dataset}/dictionary.pickle'
path_entity2 = f'/home2/cye73/arboEL2/data/arboel/{dataset}/dictionary.pickle'
with open(path_entity, 'rb') as read_handle:
    dict = pickle.load(read_handle)
with open(path_entity2, 'rb') as read_handle:
    dict2 = pickle.load(read_handle)

print("dict :\n", dict[6])
print("dict2 :\n", dict2[6])

dict :
 {'type': 'Disease', 'cui': 'MESH:C537806', 'title': '18-Hydroxylase deficiency', 'cuis': ['MESH:C537806', 'OMIM:203400', 'OMIM:610600'], 'description': '18-Hydroxylase deficiency ( Disease : 18-alpha hydroxylase deficiency|18-HYDROXYLASE DEFICIENCY|18-Oxidase Deficiency|Aldosterone deficiency 1|Aldosterone deficiency due to defect in 18-hydroxylase|ALDOSTERONE DEFICIENCY DUE TO DEFECT IN STEROID 18-HYDROXYLASE|ALDOSTERONE DEFICIENCY DUE TO DEFICIENCY OF STEROID 18-OXIDASE|ALDOSTERONE DEFICIENCY I|ALDOSTERONE DEFICIENCY II|Aldosterone Deficiency Type I|Aldosterone Deficiency Type II|CMO I Deficiency|CMO II Deficiency|Corticosterone methyloxidase type 1 deficiency|Corticosterone Methyloxidase Type I Deficiency|Corticosterone Methyloxidase Type II Deficiency|FHHA1A|FHHA1B|HYPERRENINEMIC HYPOALDOSTERONISM, FAMILIAL, 1|Hyperreninemic Hypoaldosteronism, Familial, Type I|Steroid 18-Hydroxylase Deficiency|Steroid 18-Oxidase Deficiency )'}
dict2 :
 {'type': 'Disease', 'cui': 'MESH:C5378

In [81]:
# print(dict2[0].keys())

In [104]:
for i in range(10):
    print("dict2 :\n", dict2[i])

dict2 :
 {'type': 'Disease', 'cui': 'MESH:C538288', 'title': '10p Deletion Syndrome (Partial)', 'cuis': ['MESH:C538288'], 'description': '10p Deletion Syndrome (Partial) ( Disease : Chromosome 10, 10p- Partial ; Chromosome 10, monosomy 10p ; Chromosome 10, Partial Deletion (short arm) ; Monosomy 10p )'}
dict2 :
 {'type': 'Disease', 'cui': 'MESH:C535484', 'title': '13q deletion syndrome', 'cuis': ['MESH:C535484'], 'description': "13q deletion syndrome ( Disease : Chromosome 13q deletion ; Chromosome 13q deletion syndrome ; Chromosome 13q monosomy ; Chromosome 13q syndrome ; Deletion 13q ; Deletion 13q syndrome ; Monosomy 13q ; Monosomy 13q syndrome ; Orbeli's syndrome ; Orbeli syndrome )"}
dict2 :
 {'type': 'Disease', 'cui': 'MESH:C579849', 'title': '15q24 Microdeletion', 'cuis': ['MESH:C579849'], 'description': '15q24 Microdeletion ( Disease : 15q24 Deletion ; 15q24 Microdeletion Syndrome ; Interstitial Deletion of Chromosome 15q24 )'}
dict2 :
 {'type': 'Disease', 'cui': 'MESH:C579850'

In [105]:
for i in range(10):
    print("dict :\n", dict[i])

dict :
 {'type': 'Disease', 'cui': 'MESH:C538288', 'title': '10p Deletion Syndrome (Partial)', 'cuis': ['MESH:C538288'], 'description': '10p Deletion Syndrome (Partial) ( Disease : Chromosome 10, 10p- Partial|Chromosome 10, monosomy 10p|Chromosome 10, Partial Deletion (short arm)|Monosomy 10p )'}
dict :
 {'type': 'Disease', 'cui': 'MESH:C535484', 'title': '13q deletion syndrome', 'cuis': ['MESH:C535484'], 'description': "13q deletion syndrome ( Disease : Chromosome 13q deletion|Chromosome 13q deletion syndrome|Chromosome 13q monosomy|Chromosome 13q syndrome|Deletion 13q|Deletion 13q syndrome|Monosomy 13q|Monosomy 13q syndrome|Orbeli's syndrome|Orbeli syndrome )"}
dict :
 {'type': 'Disease', 'cui': 'MESH:C579849', 'title': '15q24 Microdeletion', 'cuis': ['MESH:C579849'], 'description': '15q24 Microdeletion ( Disease : 15q24 Deletion|15q24 Microdeletion Syndrome|Interstitial Deletion of Chromosome 15q24 )'}
dict :
 {'type': 'Disease', 'cui': 'MESH:C579850', 'title': '16p11.2 Deletion Syn

In [126]:

def process_mention3_dataset(ontology,
                            dataset,
                            data_path,
                            ontology_type,
                            ontology_dir: Optional[str] = None,
                            mention_id: Optional[bool] = True,
                            context_doc_id: Optional[bool] = True,
                            label: Optional[bool] = True
                            ): 
    '''
    This function prepares the mentions data :  Creates the train.jsonl, valid.jsonl, test.jsonl
    Each .jsonl contains data in the following format : 
    {'mention': mention, 
    'mention_id': ID of the mention, (optional)
    'context_left': context before mention,
    'context_right': context after mention, 
    'context_doc_id': ID of the doc, (optional)
    'type': type
    'label_id': label_id,
    'label': entity description, (optional)
    'label_title': entity title
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
    Ontology associated with the dataset
    - dataset : str
    Name of the dataset
    - data_path : str
    Path where to load and save dictionary.pickle
    - ontology_type : str
    'obo' or 'umls' and possibly others
    - umls_dir : str
    Path to the ontology (umls, medic etc...)
    '''
    data = conhelps.for_config_name(f'{dataset}_bigbio_kb').load_dataset()
    exclude = CUIS_TO_EXCLUDE[dataset]
    remap = CUIS_TO_REMAP[dataset]

    'If dictionary already processed, load it else process and load it'
    entity_dictionary_pkl_path = os.path.join(data_path, 'dictionary.pickle')
    
    if os.path.isfile(entity_dictionary_pkl_path): 
        print("Loading stored processed entity dictionary...")
        with open(entity_dictionary_pkl_path, 'rb') as read_handle:
            entities = pickle.load(read_handle)
    else :
        if ontology_type == "obo" :
            entities = process_obo_ontology(ontology, data_path)
        elif ontology_type == "medic" : 
            entities = process_medic3_ontology(ontology, data_path, ontology_dir)
        elif ontology_type == "umls" : 
            entities = process_umls_ontology(ontology, data_path, ontology_dir)
        else : 
            print("ERROR!")
        
    entity_dictionary = {d['cui']:d for d in tqdm(entities)} #CC1

    if dataset == 'ncbi_disease': #CC2
        # Need to redo this since we have multiple synonymous CUIs for ncbi_disease
        entity_dictionary = {cui:d for d in tqdm(entities) for cui in d['cuis']}
        cui_synsets = {}
        for subdict in tqdm(entities): 
            for cui in subdict['cuis']:
                if cui in subdict:
                    print(cui, cui_synsets[cui], subdict['cuis'])
                cui_synsets[cui] = subdict['cuis'] 
        with open(os.path.join(data_path, 'cui_synsets.json'), 'w') as f:
            f.write(ujson.dumps(cui_synsets, indent=2))

    if dataset in VALIDATION_DOCUMENT_IDS:
        validation_pmids = VALIDATION_DOCUMENT_IDS[dataset]
    else:
        print("ERROR!!!")
        
    # Convert BigBio dataset to pandas DataFrame
    df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude, val_split_ids=validation_pmids)
    # Return dictionary of documents in BigBio dataset
    docs = dataset_to_documents(data)
    label_len = df['db_ids'].map(lambda x: len(x)).max()
    print("Max labels on one doc:", label_len)

    for split in df.split.unique():
        print(split)

        ents_in_split = []
        for d in tqdm(df.query("split == @split").to_dict(orient='records'),
                      desc=f"Creating correct mention format for {split} dataset"):
            abbrev_resolved = False
            offsets = d['offsets']
            doc_id = d['document_id']
            doc = docs[doc_id]
            mention = d['text']
            
            # Get offsets and context
            start = offsets[0][0] # start on the mention
            end = offsets[-1][-1] # end of the mention
            before_context = doc[:start] # left context
            after_context = doc[end:] # right context
            
            
            # ArboEL can't handle multi-labels, so we randomly choose one.
            if len(d['db_ids']) == 1:
                label_id = d['db_ids'][0]

            # ncbi_disease is a special case that requires extra care
            elif dataset == 'ncbi_disease':
                labels = []
                used_cuis = set([])
                choosable_ids = []
                for db_id in d['db_ids']:
                    if db_id in used_cuis:
                        continue
                    else:
                        used_cuis.update(set(entity_dictionary[db_id]['cuis']))
                    choosable_ids.append(db_id)

                label_id = np.random.choice(choosable_ids)
            
            else:
                label_id = np.random.choice(d['db_ids'])

            # Check if we missed something
            if label_id not in entity_dictionary:
                print(label_id)
                continue
            

            
            output = [
                {
                    "mention": mention,
                    "context_left": before_context,
                    "context_right": after_context,
                    "type": d["type"][0],
                    "label_id": label_id,
                    "label_title": entity_dictionary[label_id]["title"],
                    "label": entity_dictionary[label_id]["description"],
                }
            ]
            
            if mention_id:
                output[0]["mention_id"] = d.get("mention_id", None)

            if context_doc_id:
                if ontology == "medic":
                    output[0]["context_doc_id"] = d.get("document_id", None)
                else:
                    output[0]["context_doc_id"] = d.get("context_doc_id", None)

            ents_in_split.extend(output)

        split_name = split
        if split =='validation':
            split_name = 'valid'
        with open(os.path.join(data_path, f'{split_name}.jsonl'), 'w') as f:
            f.write('\n'.join([ujson.dumps(x) for x in ents_in_split]))
            
    return output
            

In [127]:
ontology = "medic"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

ontology_type = "medic"
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
# ontology_dir="/mitchell/entity-linking/2017AA/META/"

# data_path = f'/home2/cye73/arboEL2/data/arboel/{dataset}'

entity_dictionary = process_mention3_dataset(ontology = ontology,
                        dataset = dataset,
                        data_path = data_path,
                        ontology_type = ontology_type,
                        ontology_dir = ontology_dir, 
                        mention_id = True,
                        context_doc_id = True,
                        label = True)

/home2/cye73/data/arboel/ncbi_disease


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading stored processed entity dictionary...


100%|██████████| 13189/13189 [00:00<00:00, 2804495.59it/s]
100%|██████████| 13189/13189 [00:00<00:00, 2448704.15it/s]
100%|██████████| 13189/13189 [00:00<00:00, 2200512.17it/s]


Max labels on one doc: 5
['train' 'validation' 'test']
train


Creating correct mention format for train dataset: 100%|██████████| 5065/5065 [00:00<00:00, 318545.98it/s]


validation


Creating correct mention format for validation dataset: 100%|██████████| 780/780 [00:00<00:00, 255610.37it/s]


test


Creating correct mention format for test dataset: 100%|██████████| 960/960 [00:00<00:00, 359383.42it/s]


In [121]:
import os
import pickle 
import json

ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)

mentions = []

with open(os.path.join(data_path, "train.jsonl"), 'r')  as read_handle :
    for line in read_handle:
        mentions.append(json.loads(line))

for i in range(5) :
    print("------") 
    for key, value in mentions[i].items():
        print(f"{key}: {value}")

------
mention: adenomatous polyposis coli tumour
context_left: Identification of APC2, a homologue of the 
context_right:  suppressor.
The adenomatous polyposis coli (APC) tumour-suppressor protein controls the Wnt signalling pathway by forming a complex with glycogen synthase kinase 3beta (GSK-3beta), axin/conductin and betacatenin. Complex formation induces the rapid degradation of betacatenin. In colon carcinoma cells, loss of APC leads to the accumulation of betacatenin in the nucleus, where it binds to and activates the Tcf-4 transcription factor (reviewed in [1] [2]). Here, we report the identification and genomic structure of APC homologues. Mammalian APC2, which closely resembles APC in overall domain structure, was functionally analyzed and shown to contain two SAMP domains, both of which are required for binding to conductin. Like APC, APC2 regulates the formation of active betacatenin-Tcf complexes, as demonstrated using transient transcriptional activation assays in APC -/

# TEST 3
### Use load_medic from ontology.py

In [33]:
ontology_dir = '/mitchell/entity-linking/kbs/medic.tsv'
# ontology = "MEDIC"
model = "arboel"
dataset = "ncbi_disease"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)
print(data_path)

/home2/cye73/data/arboel/ncbi_disease


In [34]:
import os
import pickle
from tqdm import tqdm

def process_medic4_ontology(ontology, data_path, ontology_dir):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
        Ontology associated with the dataset
    - data_path : str
        Path where to load and save dictionary.pickle
    - ontology_dir : str
        Path to medic ontology
    '''
    
    # Use the class method to load the MEDIC ontology and get a new instance of BiomedicalOntology
    ontology = BiomedicalOntology.load_medic(filepath=ontology_dir, name="medic")
    
    # Check if equivalent CUIs are present for the first entity
    first_entity_cui = next(iter(ontology.entities))
    equivalent_cuis = bool(ontology.entities[first_entity_cui].equivalant_cuis)
        
    'If dictionary already processed, load it else process and load it'
    entity_dictionary_pkl_path = os.path.join(data_path, 'dictionary.pickle')
    
    if os.path.isfile(entity_dictionary_pkl_path): 
        print("Loading stored processed entity dictionary...")
        with open(entity_dictionary_pkl_path, 'rb') as read_handle:
            entities = pickle.load(read_handle)
        
        return entities, equivalent_cuis
    
    ontology_entities = []
    for entity in tqdm(ontology.entities):      
        if entity.aliases != "":
            if entity.definition != "":
                new_entity = {
                    'type': 'Disease',
                    'cui': entity.cui,
                    'title': entity.name,
                    'cuis': entity.equivalant_cuis,
                    'description': f"{entity.name} ( Disease : {entity.aliases} ) [{entity.definition}]"
                }
            else : 
                new_entity = {
                    'type': 'Disease',
                    'cui': entity.cui,
                    'title': entity.name,
                    'cuis': entity.equivalant_cuis,
                    'description': f"{entity.name} ( Disease : {entity.aliases} )"
                }
                
        else : 
            if entity.definition != "":
                new_entity = {
                        'type': 'Disease',
                        'cui': entity.cui,
                        'title': entity.name,
                        'cuis': entity.equivalant_cuis,
                        'description': f"{entity.name} ( Disease) [{entity.definition}]"
                    }
            else : 
                new_entity = {
                        'type': 'Disease',
                        'cui': entity.cui,
                        'title': entity.name,
                        'cuis': entity.equivalant_cuis,
                        'description': f"{entity.name} ( Disease)"
                    }
        ontology_entities.append(new_entity)
    
    # Check if the directory exists, and create it if it does not
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    # Save entities to pickle file
    with open(os.path.join(data_path, 'dictionary.pickle'), 'wb') as f:
        pickle.dump(ontology_entities, f)
        
    return ontology_entities, equivalent_cuis



In [35]:
entities = process_medic4_ontology(ontology = "medic", 
                            data_path = data_path, 
                            ontology_dir = ontology_dir)

entities[0]

[2024-04-06 22:58:16] [ontology.py] [INFO] Reading medic from /mitchell/entity-linking/kbs/medic.tsv


Loading stored processed entity dictionary...


[{'type': 'Disease',
  'cui': 'MESH:C538288',
  'title': '10p Deletion Syndrome (Partial)',
  'cuis': ['MESH:C538288'],
  'description': '10p Deletion Syndrome (Partial) ( Disease : Chromosome 10, 10p- Partial|Chromosome 10, monosomy 10p|Chromosome 10, Partial Deletion (short arm)|Monosomy 10p )'},
 {'type': 'Disease',
  'cui': 'MESH:C535484',
  'title': '13q deletion syndrome',
  'cuis': ['MESH:C535484'],
  'description': "13q deletion syndrome ( Disease : Chromosome 13q deletion|Chromosome 13q deletion syndrome|Chromosome 13q monosomy|Chromosome 13q syndrome|Deletion 13q|Deletion 13q syndrome|Monosomy 13q|Monosomy 13q syndrome|Orbeli's syndrome|Orbeli syndrome )"},
 {'type': 'Disease',
  'cui': 'MESH:C579849',
  'title': '15q24 Microdeletion',
  'cuis': ['MESH:C579849'],
  'description': '15q24 Microdeletion ( Disease : 15q24 Deletion|15q24 Microdeletion Syndrome|Interstitial Deletion of Chromosome 15q24 )'},
 {'type': 'Disease',
  'cui': 'MESH:C579850',
  'title': '16p11.2 Deletion 

In [26]:

def process_mention4_dataset(ontology,
                            dataset,
                            data_path,
                            ontology_dir: Optional[str] = None,
                            mention_id: Optional[bool] = True,
                            context_doc_id: Optional[bool] = True,
                            ): 
    '''
    This function prepares the mentions data :  Creates the train.jsonl, valid.jsonl, test.jsonl
    Each .jsonl contains data in the following format : 
    {'mention': mention, 
    'mention_id': ID of the mention, (optional)
    'context_left': context before mention,
    'context_right': context after mention, 
    'context_doc_id': ID of the doc, (optional)
    'type': type
    'label_id': label_id,
    'label': entity description, (optional)
    'label_title': entity title
    
    Parameters 
    ----------
    - ontology : str
    Ontology associated with the dataset
    - dataset : str
    Name of the dataset
    - data_path : str
    Path where to load and save dictionary.pickle
    - umls_dir : str
    Path to the ontology (umls, medic etc...)
    '''
    data = conhelps.for_config_name(f'{dataset}_bigbio_kb').load_dataset()
    exclude = CUIS_TO_EXCLUDE[dataset]
    remap = CUIS_TO_REMAP[dataset]

    if ontology == "obo" :
        entities, equivalant_cuis = process_obo_ontology(ontology, data_path)
    elif ontology == "medic" : 
        entities, equivalant_cuis = process_medic4_ontology(ontology, data_path, ontology_dir)
    elif ontology == "umls" : 
        entities, equivalant_cuis = process_umls_ontology(ontology, data_path, ontology_dir)
    else : 
        print("ERROR!")
        
    entity_dictionary = {d['cui']:d for d in tqdm(entities)} #CC1

    "For ontology with multiples cuis"
    if equivalant_cuis : 
        # Need to redo this since we have multiple synonymous CUIs for ncbi_disease
        entity_dictionary = {cui:d for d in tqdm(entities) for cui in d['cuis']}
        cui_synsets = {}
        for subdict in tqdm(entities): 
            for cui in subdict['cuis']:
                if cui in subdict:
                    print(cui, cui_synsets[cui], subdict['cuis'])
                cui_synsets[cui] = subdict['cuis'] 
        with open(os.path.join(data_path, 'cui_synsets.json'), 'w') as f:
            f.write(ujson.dumps(cui_synsets, indent=2))

    if dataset in VALIDATION_DOCUMENT_IDS:
        validation_pmids = VALIDATION_DOCUMENT_IDS[dataset]
    else:
        print("ERROR!!!")
        
    # Convert BigBio dataset to pandas DataFrame
    df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude, val_split_ids=validation_pmids)
    # Return dictionary of documents in BigBio dataset
    docs = dataset_to_documents(data)
    label_len = df['db_ids'].map(lambda x: len(x)).max()
    print("Max labels on one doc:", label_len)

    for split in df.split.unique():
        print(split)

        ents_in_split = []
        for d in tqdm(df.query("split == @split").to_dict(orient='records'),
                      desc=f"Creating correct mention format for {split} dataset"):
            abbrev_resolved = False
            offsets = d['offsets']
            doc_id = d['document_id']
            doc = docs[doc_id]
            mention = d['text']
            
            # Get offsets and context
            start = offsets[0][0] # start on the mention
            end = offsets[-1][-1] # end of the mention
            before_context = doc[:start] # left context
            after_context = doc[end:] # right context
            
            
            # ArboEL can't handle multi-labels, so we randomly choose one.
            if len(d['db_ids']) == 1:
                label_id = d['db_ids'][0]

            # For ontology with multiples cuis
            elif equivalant_cuis : 
                labels = []
                used_cuis = set([])
                choosable_ids = []
                for db_id in d['db_ids']:
                    if db_id in used_cuis:
                        continue
                    else:
                        used_cuis.update(set(entity_dictionary[db_id]['cuis']))
                    choosable_ids.append(db_id)

                label_id = np.random.choice(choosable_ids)
            
            else:
                label_id = np.random.choice(d['db_ids'])

            # Check if we missed something
            if label_id not in entity_dictionary:
                print(label_id)
                continue
            

            
            output = [
                {
                    "mention": mention,
                    "context_left": before_context,
                    "context_right": after_context,
                    "type": d["type"][0],
                    "label_id": label_id,
                    "label_title": entity_dictionary[label_id]["title"],
                    "label": entity_dictionary[label_id]["description"],
                }
            ]
            
            if mention_id:
                output[0]["mention_id"] = d.get("mention_id", None)

            if context_doc_id:
                output[0]["context_doc_id"] = d.get("document_id", None)

            ents_in_split.extend(output)

        split_name = split
        if split =='validation':
            split_name = 'valid'
        with open(os.path.join(data_path, f'{split_name}.jsonl'), 'w') as f:
            f.write('\n'.join([ujson.dumps(x) for x in ents_in_split]))
            
    return output
            

In [29]:
mentions = process_mention4_dataset(ontology = "medic",
                                    dataset = dataset,
                                    data_path = data_path,
                                    ontology_dir = ontology_dir,
                                    mention_id = True,
                                    context_doc_id = True,
                                    )

mentions[0]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading stored processed entity dictionary...


100%|██████████| 13189/13189 [00:00<00:00, 1521415.72it/s]
100%|██████████| 13189/13189 [00:00<00:00, 1559546.54it/s]
100%|██████████| 13189/13189 [00:00<00:00, 1249546.55it/s]


Max labels on one doc: 5
train


Creating correct mention format for train dataset: 100%|██████████| 5065/5065 [00:00<00:00, 198853.82it/s]


validation


Creating correct mention format for validation dataset: 100%|██████████| 780/780 [00:00<00:00, 167677.57it/s]


test


Creating correct mention format for test dataset: 100%|██████████| 960/960 [00:00<00:00, 253623.82it/s]


{'mention': 'sporadic breast cancers',
 'context_left': 'Localization of human BRCA1 and its loss in high-grade, non-inherited breast carcinomas.\nAlthough the link between the BRCA1 tumour-suppressor gene and hereditary breast and ovarian cancer is established, the role, if any, of BRCA1 in non-familial cancers is unclear. BRCA1 mutations are rare in sporadic cancers, but loss of BRCA1 resulting from reduced expression or incorrect subcellular localization is postulated to be important in non-familial breast and ovarian cancers. Epigenetic loss, however, has not received general acceptance due to controversy regarding the subcellular localization of BRCA1 proteins, reports of which have ranged from exclusively nuclear, to conditionally nuclear, to the ER/golgi, to cytoplasmic invaginations into the nucleus. In an attempt to resolve this issue, we have comprehensively characterized 19 anti-BRCA1 antibodies. These reagents detect a 220-kD protein localized in discrete nuclear foci in al

# TEST 4
### INCLUDE resolve_abbrevs