# MESH

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import pickle
import ujson
import sys
import os
import csv

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from typing import Optional, Union

from bigbio.dataloader import BigBioConfigHelpers

sys.path.append('../../../..')
sys.path.append('..')
from DataModule import process_mention_dataset, process_umls_ontology, process_obo_ontology
from umls_utils import UmlsMappings
from bigbio_utils import CUIS_TO_REMAP, CUIS_TO_EXCLUDE, DATASET_NAMES, VALIDATION_DOCUMENT_IDS
from bigbio_utils import dataset_to_documents, dataset_to_df, resolve_abbreviation, get_left_context, get_right_context
from bioel.ontology import BiomedicalOntology

conhelps = BigBioConfigHelpers()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `d

In [21]:
from dataclasses import dataclass, field, asdict
from typing import List, Optional, Union
from tqdm import tqdm

import obonet
import csv

from umls_utils import UmlsMappings

@dataclass
class BiomedicalEntity:
    """
    Class for keeping track of all relevant fields in an ontology
    """

    cui: str
    name: str
    types: List[str]
    aliases: List[str] = field(default_factory=list)
    definition: Optional[str] = None
    equivalant_cuis: Optional[List[str]] = None
    taxonomy: Optional[str] = None
    metadata: Optional[dict] = None
    extra_data: Optional[dict] = None

@dataclass
class BiomedicalOntology:
    name: str
    types: List[str] = field(default_factory=list)
    entities: List[BiomedicalEntity] = field(
        default_factory=list
    )  # Dict mapping CUI: BiomedicalEntity
    abbrev: Optional[str] = None  # Abbreviated name of ontology if different than name
    metadata: Optional[dict] = None
    mappings: dict = field(default_factory=dict)

    def load_mesh(self, path=None, api_key=""):
        umls = UmlsMappings(umls_dir=path, umls_api_key=api_key)

        # Get the Canonial Names
        lowercase = False
        mesh_to_name = umls.get_canonical_name(
            ontologies_to_include=["MSH"],
            use_umls_curies=False,
            mapping_cols={"MSH": "sdui"},
            prefixes={"MSH": "MESH"},
            lowercase=lowercase,
        )

        mesh_to_alias = umls.get_aliases(
            ontologies_to_include=["MSH"],
            use_umls_curies=False,
            mapping_cols={"MSH": "sdui"},
            prefixes={"MSH": "MESH"},
            lowercase=lowercase,
        )

        mesh_cui2definition = umls.get_definition(
            ontologies_to_include=["MSH"],
            use_umls_curies=False,
            mapping_cols={"MSH": "sdui"},
            prefixes={"MSH": "MESH"},
            lowercase=lowercase,
        )

        mesh_to_types, mesh_to_groups = umls.get_types_and_groups(
            ontologies_to_include=["MSH"],
            use_umls_curies=False,
            mapping_cols={"MSH": "sdui"},
            prefixes={"MSH": "MESH"},
            lowercase=lowercase,
        )

        for cui, name in tqdm(mesh_to_name.items()):
            ent_type = mesh_to_types[cui]
            other_aliases = [x for x in mesh_to_alias[cui] if x != name]
            joined_aliases = " ; ".join(other_aliases)
            entity = BiomedicalEntity(
                cui=cui,
                name=name,
                types=ent_type,
                aliases=joined_aliases,
                definition=mesh_cui2definition[cui] if cui in mesh_cui2definition else None,
                metadata={
                    "group": mesh_to_groups[cui],
                },
            )
            self.entities.append(entity)

    def load_ncbi_taxon(self):
        pass

    def load_csv(self):
        pass

    def load_json(self):
        pass




# Process mesh ontology

In [23]:
import os
import pickle
from tqdm import tqdm

def process_mesh_ontology(ontology, data_path, ontology_dir):
    '''
    This function prepares the entity data : dictionary.pickle
    
    Parameters 
    ----------
    - ontology : str (only umls for now)
        Ontology associated with the dataset
    - data_path : str
        Path where to load and save dictionary.pickle
    - ontology_dir : str
        Path to medic ontology
    '''
    
    ontology = BiomedicalOntology(name="mesh")
    ontology.load_mesh(path = ontology_dir)
    
    equivalent_cuis = False
    if ontology.entities[0].equivalant_cuis is not None : 
        equivalent_cuis = True
        
    'If dictionary already processed, load it else process and load it'
    entity_dictionary_pkl_path = os.path.join(data_path, 'dictionary.pickle')
    
    if os.path.isfile(entity_dictionary_pkl_path): 
        print("Loading stored processed entity dictionary...")
        with open(entity_dictionary_pkl_path, 'rb') as read_handle:
            entities = pickle.load(read_handle)
        
        return entities, equivalent_cuis
    
    ontology_entities = []
    for entity in tqdm(ontology.entities):      
        if entity.aliases != "":
            if entity.definition != "":
                new_entity = {
                    'type': 'Disease',
                    'cui': entity.cui,
                    'title': entity.name,
                    'description': f"{entity.name} ( Disease : {entity.aliases} ) [{entity.definition}]"
                }
            else : 
                new_entity = {
                    'type': 'Disease',
                    'cui': entity.cui,
                    'title': entity.name,
                    'description': f"{entity.name} ( Disease : {entity.aliases} )"
                }
                
        else : 
            if entity.definition != "":
                new_entity = {
                        'type': 'Disease',
                        'cui': entity.cui,
                        'title': entity.name,
                        'description': f"{entity.name} ( Disease) [{entity.definition}]"
                    }
            else : 
                new_entity = {
                        'type': 'Disease',
                        'cui': entity.cui,
                        'title': entity.name,
                        'description': f"{entity.name} ( Disease)"
                    }
        ontology_entities.append(new_entity)
    
    # Check if the directory exists, and create it if it does not
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    # Save entities to pickle file
    with open(os.path.join(data_path, 'dictionary.pickle'), 'wb') as f:
        pickle.dump(ontology_entities, f)
        
    return ontology_entities, equivalent_cuis



In [24]:
ontology_dir="/mitchell/entity-linking/2017AA/META/"
ontology = "mesh"
model = "arboel"
dataset = "bc5cdr"
abs_path = "/home2/cye73/data"
data_path = os.path.join(abs_path, model, dataset)

In [None]:
entities = process_mesh_ontology(ontology = "mesh", 
                            data_path = data_path, 
                            ontology_dir = ontology_dir)

In [26]:
entities[0][0]

{'type': 'Disease',
 'cui': 'MESH:C000002',
 'title': 'bevonium',
 'description': 'bevonium ( Disease : 2-(hydroxymethyl)-N,N-dimethylpiperidinium benzilate ; piribenzil methyl sulfate ; bevonium methylsulfate ; bevonium metilsulfate ; CG 201 ; Acabel ; bevonium sulfate (1:1) ; bevonium methyl sulfate ) [None]'}

# MENTION 

In [29]:

def process_mention_dataset(ontology,
                            dataset,
                            data_path,
                            ontology_dir: Optional[str] = None,
                            mention_id: Optional[bool] = True,
                            context_doc_id: Optional[bool] = True,
                            ): 
    '''
    This function prepares the mentions data :  Creates the train.jsonl, valid.jsonl, test.jsonl
    Each .jsonl contains data in the following format : 
    {'mention': mention, 
    'mention_id': ID of the mention, (optional)
    'context_left': context before mention,
    'context_right': context after mention, 
    'context_doc_id': ID of the doc, (optional)
    'type': type
    'label_id': label_id,
    'label': entity description, (optional)
    'label_title': entity title
    
    Parameters 
    ----------
    - ontology : str
    Ontology associated with the dataset
    - dataset : str
    Name of the dataset
    - data_path : str
    Path where to load and save dictionary.pickle
    - ontology_dir : str
    Path to the ontology (umls, medic etc...)
    '''
    data = conhelps.for_config_name(f'{dataset}_bigbio_kb').load_dataset()
    exclude = CUIS_TO_EXCLUDE[dataset]
    remap = CUIS_TO_REMAP[dataset]

    if ontology == "obo" :
        entities, equivalant_cuis = process_obo_ontology(ontology, data_path)
    elif ontology == "medic" : 
        entities, equivalant_cuis = process_medic_ontology(ontology, data_path, ontology_dir)
    elif ontology == "umls" : 
        entities, equivalant_cuis = process_umls_ontology(ontology, data_path, ontology_dir)
    elif ontology == "mesh" : 
        entities, equivalant_cuis = process_mesh_ontology(ontology, data_path, ontology_dir)
    else : 
        print("ERROR!")
        
    entity_dictionary = {d['cui']:d for d in tqdm(entities)} #CC1

    # For ontology with multiples cuis
    if equivalant_cuis : 
        # Need to redo this since we have multiple synonymous CUIs for ncbi_disease
        entity_dictionary = {cui:d for d in tqdm(entities) for cui in d['cuis']}
        cui_synsets = {}
        for subdict in tqdm(entities): 
            for cui in subdict['cuis']:
                if cui in subdict:
                    print(cui, cui_synsets[cui], subdict['cuis'])
                cui_synsets[cui] = subdict['cuis'] 
        with open(os.path.join(data_path, 'cui_synsets.json'), 'w') as f:
            f.write(ujson.dumps(cui_synsets, indent=2))

    if dataset in VALIDATION_DOCUMENT_IDS:
        validation_pmids = VALIDATION_DOCUMENT_IDS[dataset]
    else:
        print("ERROR!!!")
        
    # Convert BigBio dataset to pandas DataFrame
    df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude, val_split_ids=validation_pmids)
    # Return dictionary of documents in BigBio dataset
    docs = dataset_to_documents(data)
    label_len = df['db_ids'].map(lambda x: len(x)).max()
    print("Max labels on one doc:", label_len)

    for split in df.split.unique():
        print(split)

        ents_in_split = []
        for d in tqdm(df.query("split == @split").to_dict(orient='records'),
                      desc=f"Creating correct mention format for {split} dataset"):
            abbrev_resolved = False
            offsets = d['offsets']
            doc_id = d['document_id']
            doc = docs[doc_id]
            mention = d['text']
            
            # Get offsets and context
            start = offsets[0][0] # start on the mention
            end = offsets[-1][-1] # end of the mention
            before_context = doc[:start] # left context
            after_context = doc[end:] # right context
            
            
            # ArboEL can't handle multi-labels, so we randomly choose one.
            if len(d['db_ids']) == 1:
                label_id = d['db_ids'][0]

            # For ontology with multiples cuis
            elif equivalant_cuis : 
                labels = []
                used_cuis = set([])
                choosable_ids = []
                for db_id in d['db_ids']:
                    if db_id in used_cuis:
                        continue
                    else:
                        used_cuis.update(set(entity_dictionary[db_id]['cuis']))
                    choosable_ids.append(db_id)

                label_id = np.random.choice(choosable_ids)
            
            else:
                label_id = np.random.choice(d['db_ids'])

            # Check if we missed something
            if label_id not in entity_dictionary:
                print(label_id)
                continue
            

            
            output = [
                {
                    "mention": mention,
                    "context_left": before_context,
                    "context_right": after_context,
                    "type": d["type"][0],
                    "label_id": label_id,
                    "label_title": entity_dictionary[label_id]["title"],
                    "label": entity_dictionary[label_id]["description"],
                }
            ]
            
            if mention_id:
                output[0]["mention_id"] = d.get("mention_id", None)

            if context_doc_id:
                output[0]["context_doc_id"] = d.get("document_id", None)

            ents_in_split.extend(output)

        split_name = split
        if split =='validation':
            split_name = 'valid'
        with open(os.path.join(data_path, f'{split_name}.jsonl'), 'w') as f:
            f.write('\n'.join([ujson.dumps(x) for x in ents_in_split]))
            
    return output
            

In [30]:
mentions = process_mention_dataset(ontology = ontology,
                                    dataset = dataset,
                                    data_path = data_path,
                                    ontology_dir = ontology_dir,
                                    mention_id = True,
                                    context_doc_id = True,
                                    )

mentions[0]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading cached UMLS data from /mitchell/entity-linking/2017AA/META/.cached_df.feather
Filtering by ontologies
Removing null definitions


100%|██████████| 268162/268162 [00:01<00:00, 162406.63it/s]


Loading stored processed entity dictionary...


100%|██████████| 268162/268162 [00:00<00:00, 1990670.90it/s]


Max labels on one doc: 6
validation


Creating correct mention format for validation dataset: 100%|██████████| 9638/9638 [00:00<00:00, 46265.44it/s]


test


Creating correct mention format for test dataset: 100%|██████████| 9750/9750 [00:00<00:00, 356798.53it/s]

train



Creating correct mention format for train dataset: 100%|██████████| 9428/9428 [00:00<00:00, 355061.40it/s]


{'mention': 'NO',
 'context_left': 'Mechanisms of FK 506-induced hypertension in the rat.\n-Tacrolimus (FK 506) is a powerful, widely used immunosuppressant. The clinical utility of FK 506 is complicated by substantial hypertension and nephrotoxicity. To clarify the mechanisms of FK 506-induced hypertension, we studied the chronic effects of FK 506 on the synthesis of endothelin-1 (ET-1), the expression of mRNA of ET-1 and endothelin-converting enzyme-1 (ECE-1), the endothelial nitric oxide synthase (eNOS) activity, and the expression of mRNA of eNOS and C-type natriuretic peptide (CNP) in rat blood vessels. In addition, the effect of the specific endothelin type A receptor antagonist FR 139317 on FK 506-induced hypertension in rats was studied. FK 506, 5 mg. kg-1. d-1 given for 4 weeks, elevated blood pressure from 102+/-13 to 152+/-15 mm Hg and increased the synthesis of ET-1 and the levels of ET-1 mRNA in the mesenteric artery (240% and 230%, respectively). Little change was observe