In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import ujson
import sys
import os

import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict

from bigbio.dataloader import BigBioConfigHelpers


sys.path.append('..')
from umls_utils import UmlsMappings
from bigbio_utils import CUIS_TO_REMAP, CUIS_TO_EXCLUDE, DATASET_NAMES, VALIDATION_DOCUMENT_IDS
from bigbio_utils import dataset_to_documents, dataset_to_df, resolve_abbreviation

conhelps = BigBioConfigHelpers()


# Make dictionary of possible entity links

## Process UMLS into dictionary

For each entity:
* Get canonical name
* Get GCD of types (if possible)
* Get definition
  
Then output dictionary of entities

In [2]:
def get_type_gcd(types, type2geneology, cached_types: dict = None):
    '''
    Find the most granular single parent type for an entity with multiple types

    If an entity is assigned multiple types with disjoint type hierarchies

    
    '''
    if len(types) == 1:
        t = types[0]
        if t not in cached_types:
            cached_types[t] = type2geneology[t][-1]
        return cached_types[t]

    type_tuple = tuple(sorted(types))
    if type_tuple in cached_types:
        return cached_types[type_tuple]
    else:
        geneologies = [type2geneology[t] for t in types]
        min_len = min([len(x) for x in geneologies])
        arr = np.array([gen[:min_len] for gen in geneologies])
        unique_types = np.unique(arr[:,0])
        if len(unique_types) > 1:
            output_types = ', '.join(list(set([x[-1] for x in geneologies])))
            cached_types[type_tuple] = output_types
            return output_types
        else:
            for i in np.arange(1, arr.shape[1]):
                curr_unique = np.unique(arr[:,i])

                if len(curr_unique) > 1 or i >= 3:
                    cached_types[type_tuple] = unique_types[0]
                    return unique_types[0]
                else:
                    unique_types = curr_unique


type2geneology = ujson.load(open('../data/tui2type_hierarchy.json'))
cached_types = {}
# umls.umls.tui.map(lambda x: get_type_gcd(x, type2geneology, cached_types)).values
    

In [3]:
umls_dir="/mitchell/entity-linking/2017AA/META/"

sem_groups = pd.read_csv(
            os.path.join(umls_dir, "SemGroups.txt"),
            sep="|",
            usecols=[0, 1, 2],
            names=["group", "name", "tui"],
        )

group2name = sem_groups[['group','name']].drop_duplicates().set_index('group').name.to_dict()

with open('../data/group2name.json','w') as f:
    f.write(ujson.dumps(group2name))

umls = UmlsMappings(
    umls_dir=umls_dir, debug=False, force_reprocess=False, 
)





Loading cached UMLS data from /mitchell/entity-linking/2017AA/META/.cached_df.feather


In [4]:
umls.umls.head()

Unnamed: 0,cui,lang,ispref,aui,scui,sdui,sab,tty,alias,rank,tui,group,def
0,C0000005,ENG,Y,A26634265,M0019694,D012711,MSH,PEP,(131)I-Macroaggregated Albumin,4,"[T116, T121, T130]",[CHEM],
1,C0000074,ENG,Y,A26606894,M0016680,D010742,MSH,PEP,1-Alkyl-2-Acylphosphatidates,4,[T109],[CHEM],
2,C0000132,ENG,Y,A26665454,M0004272,D002787,MSH,PEP,15-Ketosteryl Oleate Hydrolase,4,"[T116, T126]",[CHEM],
3,C0000137,ENG,Y,A26650280,M0019173,D012335,MSH,PEP,15S RNA,4,"[T114, T123]",[CHEM],
4,C0000151,ENG,Y,A26647507,M0020412,D013196,MSH,PEP,17 beta-Hydroxy-5 beta-Androstan-3-One,4,"[T109, T121]",[CHEM],


In [6]:
lowercase=False

# Full UMLS Canonical Names
print("Getting Names")
umls_to_name = umls.get_canonical_name(ontologies_to_include="all",
    use_umls_curies=True,
    lowercase=lowercase,)

Getting Names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["identifier"] = df["cui"]


In [7]:
all_umls_df = umls.umls.query('lang == "ENG"').groupby('cui').agg({'alias': lambda x: list(set(x)), 'tui':'first', 'group': 'first', 'def':'first'}).reset_index()
all_umls_df['name'] = all_umls_df.cui.map(umls_to_name)
all_umls_df['alias'] = all_umls_df[['name','alias']].apply(lambda x: list(set(x[1]) - set([x[0]])) , axis=1)
all_umls_df['cui'] = all_umls_df['cui'].map(lambda x: 'UMLS' + x)
all_umls_df['has_definition'] = all_umls_df['def'].map(lambda x: x is not None)
all_umls_df['num_aliases'] = all_umls_df['alias'].map(lambda x: len(x))
all_umls_df.head()

Unnamed: 0,cui,alias,tui,group,def,name,has_definition,num_aliases
0,UMLSC0000005,[(131)I-MAA],"[T116, T121, T130]",[CHEM],,(131)I-Macroaggregated Albumin,False,1
1,UMLSC0000039,"[1,2 Dipalmitoyl Glycerophosphocholine, Dipalm...","[T109, T121]",[CHEM],Synthetic phospholipid used in liposomes and l...,"1,2-Dipalmitoylphosphatidylcholine",True,15
2,UMLSC0000052,"[1,4-Alpha glucan branching enzyme, Branching ...","[T116, T126]",[CHEM],"In glycogen or amylopectin synthesis, the enzy...","1,4-alpha-Glucan Branching Enzyme",True,22
3,UMLSC0000074,[1 Alkyl 2 Acylphosphatidates],[T109],[CHEM],,1-Alkyl-2-Acylphosphatidates,False,1
4,UMLSC0000084,"[gamma Carboxyglutamic Acid, 3-Amino-1,1,3-pro...","[T116, T123]",[CHEM],"Found in various tissues, particularly in four...",1-Carboxyglutamic Acid,True,7


0          UMLSC0000005
1          UMLSC0000039
2          UMLSC0000052
3          UMLSC0000074
4          UMLSC0000084
               ...     
3465002    UMLSC4316761
3465003    UMLSC4316762
3465004    UMLSC4316763
3465005    UMLSC4316764
3465006    UMLSC4316765
Name: cui, Length: 3465007, dtype: object

In [18]:
all_umls_df.head()

Unnamed: 0,cui,alias,tui,group,def,name
0,C0000005,[(131)I-MAA],"[T116, T121, T130]",[CHEM],,(131)I-Macroaggregated Albumin
1,C0000039,"[Phosphatidylcholine, Dipalmitoyl, 1,2 Dipalmi...","[T109, T121]",[CHEM],Synthetic phospholipid used in liposomes and l...,"1,2-Dipalmitoylphosphatidylcholine"
2,C0000052,"[1,4-Alpha glucan branching enzyme, 1,4-alpha-...","[T116, T126]",[CHEM],"In glycogen or amylopectin synthesis, the enzy...","1,4-alpha-Glucan Branching Enzyme"
3,C0000074,[1 Alkyl 2 Acylphosphatidates],[T109],[CHEM],,1-Alkyl-2-Acylphosphatidates
4,C0000084,"[3-Amino-1,1,3-propanetricarboxylic Acid, gamm...","[T116, T123]",[CHEM],"Found in various tissues, particularly in four...",1-Carboxyglutamic Acid


Unnamed: 0,cui,alias,tui,group,def,name,has_definition,num_aliases
0,C0000005,[(131)I-MAA],"[T116, T121, T130]",[CHEM],,(131)I-Macroaggregated Albumin,False,1
1,C0000039,"[Phosphatidylcholine, Dipalmitoyl, 1,2 Dipalmi...","[T109, T121]",[CHEM],Synthetic phospholipid used in liposomes and l...,"1,2-Dipalmitoylphosphatidylcholine",True,15
2,C0000052,"[1,4-Alpha glucan branching enzyme, 1,4-alpha-...","[T116, T126]",[CHEM],"In glycogen or amylopectin synthesis, the enzy...","1,4-alpha-Glucan Branching Enzyme",True,22
3,C0000074,[1 Alkyl 2 Acylphosphatidates],[T109],[CHEM],,1-Alkyl-2-Acylphosphatidates,False,1
4,C0000084,"[3-Amino-1,1,3-propanetricarboxylic Acid, gamm...","[T116, T123]",[CHEM],"Found in various tissues, particularly in four...",1-Carboxyglutamic Acid,True,7


In [6]:
lowercase=False

# Full UMLS Canonical Names
print("Getting Names")
umls_to_name = umls.get_canonical_name(ontologies_to_include="all",
    use_umls_curies=True,
    lowercase=lowercase,)

# Full aliases
print("Geting aliases")
umls_to_alias = umls.get_aliases(
    ontologies_to_include="all",
    use_umls_curies=True,
    lowercase=lowercase,
)


Getting Names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["identifier"] = df["cui"]


Geting aliases


In [174]:
umls_cui2types = umls.umls.groupby('cui').tui.first().to_dict()

In [175]:
print("Getting types")
type2geneology = ujson.load(open('../data/tui2type_hierarchy.json'))
cached_types = {}
cui2types = {}
for cui, types in tqdm(umls_cui2types.items()):
    cui2types[cui] = get_type_gcd(types, type2geneology, cached_types)

    

Getting types


100%|██████████| 3465486/3465486 [00:03<00:00, 1075809.41it/s]


In [176]:
umls_definitions = pd.read_csv(
            os.path.join(umls_dir, "MRDEF.RRF"),
            sep="|",
            usecols=[0,1,4,5,],
            names=["cui", "aui", "sab", 'def',],
        )
umls_definitions.aui.value_counts()

A24110393    6
A7570204     5
A7572881     5
A7573153     5
A7569947     5
            ..
A24360400    1
A1703321     1
A1703324     1
A1701895     1
A27782058    1
Name: aui, Length: 244685, dtype: int64

In [185]:
umls_definitions

Unnamed: 0,cui,aui,sab,def
0,C0000039,A0016515,MSH,Synthetic phospholipid used in liposomes and l...
1,C0000052,A0016535,MSH,"In glycogen or amylopectin synthesis, the enzy..."
2,C0000084,A0016576,MSH,"Found in various tissues, particularly in four..."
3,C0000096,A0526764,MSH,A potent cyclic nucleotide phosphodiesterase i...
4,C0000097,A0016587,MSH,A dopaminergic neurotoxic compound which produ...
...,...,...,...,...
269271,C4305359,A27765928,SNOMEDCT_US,Potential for trauma to skin or mucous membran...
269272,C4305375,A27778477,SNOMEDCT_US,A platelet granule disorder with manifestation...
269273,C4305376,A27776098,SNOMEDCT_US,A rare and isolated orofacial defect with mani...
269274,C4305378,A27782528,SNOMEDCT_US,An exceedingly rare autosomal recessive neurol...


In [189]:
len(umls_definitions.cui.unique())

195998

In [190]:
cui2definition = umls.get_definition(
    ontologies_to_include="all",
    use_umls_curies=True,
    lowercase=lowercase,
)

Filtering by ontologies
Removing null definitions


In [192]:
cui2definition

{'C0000039': 'Synthetic phospholipid used in liposomes and lipid bilayers to study biological membranes. It is also a major constituent of PULMONARY SURFACTANTS.',
 'C0000052': 'In glycogen or amylopectin synthesis, the enzyme that catalyzes the transfer of a segment of a 1,4-alpha-glucan chain to a primary hydroxy group in a similar glucan chain. EC 2.4.1.18.',
 'C0000084': 'Found in various tissues, particularly in four blood-clotting proteins including prothrombin, in kidney protein, in bone protein, and in the protein present in various ectopic calcifications.',
 'C0000096': 'A potent cyclic nucleotide phosphodiesterase inhibitor; due to this action, the compound increases cyclic AMP and cyclic GMP in tissue and thereby activates CYCLIC NUCLEOTIDE-REGULATED PROTEIN KINASES',
 'C0000097': 'A dopaminergic neurotoxic compound which produces irreversible clinical, chemical, and pathological alterations that mimic those found in Parkinson disease.',
 'C0000098': 'An active neurotoxic me

In [193]:
umls_entities = []
for cui, name in tqdm(umls_to_name.items()):
    d = {}
    ent_type = cui2types[cui]
    other_aliases = [x for x in umls_to_alias[cui] if x != name]
    joined_aliases = ' ; '.join(other_aliases)
    d['cui'] = f"UMLS:{cui}"
    d['title'] = name
    d['type'] = ent_type
    if cui in cui2definition:
        definition = cui2definition[cui]
    else:
        definition = None

    if len(other_aliases) > 0:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} )"
    else:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} )"

    umls_entities.append(d)

pickle.dump(umls_entities, open('../data/arboel/medmentions_full/dictionary.pickle', 'wb'))

100%|██████████| 3465007/3465007 [00:08<00:00, 412181.68it/s]


In [None]:
umls_entities[7]

{'cui': 'UMLS:C0000098',
 'title': '1-Methyl-4-phenylpyridinium',
 'type': 'Chemical',
 'description': '1-Methyl-4-phenylpyridinium ( Chemical : 1-Methyl-4-phenylpyridinium Ion ; Cyperquat ; N-Methyl-4-phenylpyridine ; 1-Methyl-4-phenylpyridine ; N-Methyl-4-phenylpyridinium ; 1-Methyl-4-phenylpyridinium ; 1 Methyl 4 phenylpyridinium ; 1 Methyl 4 phenylpyridinium Ion ; N Methyl 4 phenylpyridine ; 1 Methyl 4 phenylpyridine ; Pyridinium, 1-methyl-4-phenyl- ; N METHYL 4 PHENYLPYRIDINIUM ; METHYLPHENYLPYRIDINIUM 01 04 ; 1-Methyl-4-phenylpyridinium [Chemical/Ingredient] ; MPP+ ; An active neurotoxic metabolite of 1-METHYL-4-PHENYL-1,2,3,6-TETRAHYDROPYRIDINE. The compound reduces dopamine levels, inhibits the biosynthesis of catecholamines, depletes cardiac norepinephrine and inactivates tyrosine hydroxylase. These and other toxic effects lead to cessation of oxidative phosphorylation, ATP depletion, and cell death. The compound, which is related to PARAQUAT, has also been used as an herbicid

## Process dict for MedMentions ST21PV

In [1]:
st21pv_vocabs = [
        "MSH",
        "CPT",
        "FMA",
        "GO",
        "HGNC",
        "HPO",
        "ICD10",
        "ICD10CM",
        "ICD9CM",
        "MDR",
        "MTH",
        "NCBI",
        "NCI",
        "NDDF",
        # "MED-RT",
        "NDFRT",
        "OMIM",
        "RXNORM",
        "SNOMEDCT_US",
    ]
with open('../data/st21pv_vocabs.json', 'w') as f:
    f.write(ujson.dumps(st21pv_vocabs))

unique_vocabs = umls.umls.sab.unique()
for x in st21pv_vocabs:
    if x not in unique_vocabs:
        print(x)
        print(unique_vocabs)
    assert x in unique_vocabs

st21pv_types = ujson.load(open("../data/st21pv_subtypes.json", "r"))


print("Getting ST21PV Aliases")
st21pv_to_alias = umls.get_aliases(
    ontologies_to_include=st21pv_vocabs,
    types_to_include=st21pv_types,
    use_umls_curies=True,
    lowercase=lowercase,
)



NameError: name 'ujson' is not defined

In [195]:
st21pv_entities = []
for cui, aliases in tqdm(st21pv_to_alias.items()):
    d = {}
    ent_type = cui2types[cui]
    name = umls_to_name[cui]
    other_aliases = [x for x in aliases if x != name]
    joined_aliases = ' ; '.join(other_aliases)
    
    d['cui'] = f"UMLS:{cui}"
    d['title'] = umls_to_name[cui]
    d['type'] = ent_type
    if cui in cui2definition:
        definition = cui2definition[cui]
    else:
        definition = None

    if len(other_aliases) > 0:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} )"
    else:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} )"
        
    st21pv_entities.append(d)

pickle.dump(st21pv_entities, open('../data/arboel/medmentions_st21pv/dictionary.pickle', 'wb'))

100%|██████████| 2369483/2369483 [00:05<00:00, 416522.84it/s]


## Process dict for BC5CDR

In [196]:
lowercase = False

mesh_to_alias = umls.get_aliases(
        ontologies_to_include=["MSH"],
        use_umls_curies=False,
        mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
        lowercase=lowercase,
    )

mesh_to_name = umls.get_canonical_name(
        ontologies_to_include=["MSH"],
        use_umls_curies=False,
        mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
        lowercase=lowercase,)

mesh_to_types, mesh_to_groups = umls.get_types_and_groups(
        ontologies_to_include=["MSH"],
        use_umls_curies=False,
        mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
        lowercase=lowercase,
)


In [197]:
mesh2definition = umls.get_definition(
    ontologies_to_include=['MSH'],
    use_umls_curies=False,
    mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
    lowercase=lowercase,
)

Filtering by ontologies
Removing null definitions


In [198]:
bc5cdr_entities = []
for cui, aliases in tqdm(mesh_to_alias.items()):
    d = {}
    ent_type = mesh_to_groups[cui][0]
    type_string = ' , '.join([group2name[x] for x in mesh_to_groups[cui]])
    name = mesh_to_name[cui]
    other_aliases = [x for x in aliases if x != name]
    joined_aliases = ' ; '.join(other_aliases)
    
    d['cui'] = cui
    d['title'] = mesh_to_name[cui]
    d['type'] = ent_type
    if cui in mesh2definition:
        definition = mesh2definition[cui]
    else:
        definition = None

    if len(other_aliases) > 0:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} )"
    else:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} )"

    bc5cdr_entities.append(d)

pickle.dump(bc5cdr_entities, open('../data/arboel/bc5cdr/dictionary.pickle', 'wb'))

100%|██████████| 268162/268162 [00:00<00:00, 323996.80it/s]


In [199]:
bc5cdr_entities

[{'cui': 'MESH:C000002',
  'title': 'bevonium',
  'type': 'CHEM',
  'description': 'bevonium ( CHEM : 2-(hydroxymethyl)-N,N-dimethylpiperidinium benzilate ; piribenzil methyl sulfate ; bevonium methylsulfate ; bevonium metilsulfate ; CG 201 ; Acabel ; bevonium sulfate (1:1) ; bevonium methyl sulfate )'},
 {'cui': 'MESH:C000006',
  'title': 'insulin, neutral',
  'type': 'CHEM',
  'description': 'insulin, neutral ( CHEM : Insulin, 8A-L-threonine-10A-L-isoleucine- ; neutral insulin ; insulin pork ; insulin (pork) ; insulin (swine) ; insulin (ox), 8(A)-L-threonine-10(A)-L-isoleucine- ; Actrapid insulin ; Novo MC insulin )'},
 {'cui': 'MESH:C000009',
  'title': 'N-acetylglucosaminylasparagine',
  'type': 'CHEM',
  'description': "N-acetylglucosaminylasparagine ( CHEM : L-Asparagine, N-(2-acetylamino)-2-deoxy-beta-D-glucopyranosyl- ; aspartylglucosylamine ; aspartylglycosamine ; 2-acetamido-1-(beta-L-aspartamido)-1,2-dideoxy-beta-D-glucose ; 2-acetamido-1-N-(4'-L-aspartyl)-2-deoxy-beta-D-glu

## Process dict for NLMChem

In [200]:
umls_dir="/mitchell/entity-linking/2022AA/META/"

umls = UmlsMappings(
    umls_dir=umls_dir, debug=False, force_reprocess=False, 
)

lowercase = False

mesh_to_alias = umls.get_aliases(
        ontologies_to_include=["MSH"],
        use_umls_curies=False,
        mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
        lowercase=lowercase,
    )

mesh_to_name = umls.get_canonical_name(
        ontologies_to_include=["MSH"],
        use_umls_curies=False,
        mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
        lowercase=lowercase,)

mesh_to_types, mesh_to_groups = umls.get_types_and_groups(
        ontologies_to_include=["MSH"],
        use_umls_curies=False,
        mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
        lowercase=lowercase,
)


Loading cached UMLS data from /mitchell/entity-linking/2022AA/META/.cached_df.feather


In [201]:
mesh2definition = umls.get_definition(
    ontologies_to_include=['MSH'],
    use_umls_curies=False,
    mapping_cols={"MSH": "sdui"},
        prefixes={"MSH": "MESH"},
    lowercase=lowercase,
)

Filtering by ontologies
Removing null definitions


In [202]:
nlmchem_entities = []
for cui, aliases in tqdm(mesh_to_alias.items()):
    d = {}
    ent_type = mesh_to_groups[cui][0]
    type_string = ' , '.join([group2name[x] for x in mesh_to_groups[cui]])
    name = mesh_to_name[cui]
    other_aliases = [x for x in aliases if x != name]
    joined_aliases = ' ; '.join(other_aliases)
    d['cui'] = cui
    d['title'] = mesh_to_name[cui]
    d['type'] = ent_type
    if cui in mesh2definition:
        definition = mesh2definition[cui]
    else:
        definition = None

    if len(other_aliases) > 0:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} : {joined_aliases} )"
    else:
        if definition is not None:
            d['description'] = f"{name} ( {ent_type} ) [ {definition} ]"
        else:
            d['description'] = f"{name} ( {ent_type} )"

    nlmchem_entities.append(d)

pickle.dump(nlmchem_entities, open('../data/arboel/nlmchem/dictionary.pickle', 'wb'))

100%|██████████| 348733/348733 [00:01<00:00, 270043.35it/s]


In [203]:
nlmchem_entities[0]

{'cui': 'MESH:C000002',
 'title': 'bevonium',
 'type': 'CHEM',
 'description': 'bevonium ( CHEM : 2-(hydroxymethyl)-N,N-dimethylpiperidinium benzilate ; piribenzil methyl sulfate ; bevonium methylsulfate ; bevonium metilsulfate ; CG 201 ; Acabel ; bevonium sulfate (1:1) ; bevonium methyl sulfate )'}

In [204]:
nlmchem_entities[100300]

{'cui': 'MESH:C005894',
 'title': 'xylamidine',
 'type': 'CHEM',
 'description': 'xylamidine ( CHEM : N-(2-(3-methoxyphenoxy)propyl)-m-tolylacetamidine tosylate ; XYLAMIDINE TOSYLATE ANHYDROUS ; xylamidine tosylate )'}

## Process Entrez for GNormPlus and NLM-Gene

In [2]:
entrez = pd.read_csv(
    "../data/gene_info.tsv",
    delimiter="\t",
    usecols=[
        "#tax_id",
        "GeneID",
        "Symbol",
        "Synonyms",
        "Symbol_from_nomenclature_authority",
        "Full_name_from_nomenclature_authority",
        "Other_designations",
        "type_of_gene",
        "description",
        "dbXrefs"
    ],
    na_filter=False,
    low_memory=False,
).rename(
    {
        "Symbol_from_nomenclature_authority": "official_symbol",
        "Full_name_from_nomenclature_authority": "official_name",
        '#tax_id':'tax_id'
    }, axis=1,
)
entrez.columns = [x.lower() for x in entrez.columns]

In [3]:
entrez.to_feather('../data/ncbigene.feather')

In [7]:
entrez.tax_id.value_counts()

4565      155756
9606      141472
3708      123229
3818      109241
106335    106304
           ...  
28896          1
566546         1
568076         1
568703         1
7              1
Name: tax_id, Length: 41110, dtype: int64

In [4]:
pd.read_feather('../data/ncbigene.feather')

Unnamed: 0,tax_id,geneid,symbol,synonyms,dbxrefs,description,type_of_gene,official_symbol,official_name,other_designations
0,7,5692769,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
1,9,2827857,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
2,11,10823747,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
3,14,6951813,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
4,19,3758873,NEWENTRY,-,-,Record to support submission of GeneRIFs for a...,other,-,-,-
...,...,...,...,...,...,...,...,...,...,...
41456246,2998809,2935004,OrniCt014,-,-,tRNA-Ser,tRNA,-,-,-
41456247,2998809,2935005,OrniCt017,-,-,tRNA-Phe,tRNA,-,-,-
41456248,2998809,3276630,rps12,-,-,ribosomal protein S12,protein-coding,-,-,ribosomal protein S12
41456249,2998809,4036376,OrniCt037,-,-,tRNA-Ile,tRNA,-,-,-


In [12]:
tax2name = ujson.load(open('../data/tax2name.json', 'r'))
# for dataset in ['gnormplus','nlm_gene']:
for dataset in  ['gnormplus', 'nlm_gene']:
    print("filter by taxa")
    taxa = ujson.load(open(f'../data/taxonomy_subsets/{dataset}.json'))
    print(len(taxa))
    taxa_mask = (entrez.tax_id.isin(taxa)) & (~entrez.type_of_gene.isin(['unknown','tRNA','biological-region'])) & (entrez.description != 'hypothetical protein') & (~entrez.official_name.map(lambda x: x.lower().startswith("predicted"))) 
    filtered = entrez[taxa_mask]

    # Find duplicated symbols
    print("Dedup")
    symbols = filtered.symbol.value_counts()
    duplicated_symbols = symbols[symbols > 1].index.tolist()


    # Add additional canonical symbol for symbols that are repeated across different organisms
    duplicated_symbol_mask = filtered.symbol.isin(duplicated_symbols)

    filtered['canonical_symbol'] = filtered['symbol']
    # filtered.loc[duplicated_symbol_mask, 'canonical_symbol'] = filtered.loc[duplicated_symbol_mask, ['tax_id','symbol']].apply(lambda x: f"{x[1]} ({tax2name[x[0]]})", axis=1)


    # Complie list of all symbols (except primary name)
    print("Complilng symbols")
    filtered['all_symbols'] = filtered[['symbol','synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i for i in x if i.strip() != '-']))), axis=1)
    # filtered['all_symbols'] = filtered[['synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i.strip() for i in x if i.strip() != '-']))), axis=1)

    filtered['geneid'] = filtered.geneid.map(lambda x: f"NCBIGene:{x}")

    geneid2synonym = filtered.set_index('geneid')['all_symbols'].to_dict()

    all_records = []
    for d in tqdm(filtered.to_dict(orient='records')):
        entity_dict = {}
        entity_dict['cui'] = d['geneid']
        entity_dict['title'] = d['symbol']
        dedup_symbols = list(set([x for x in d['all_symbols'].split('|') if x != d['symbol']]))
        # print(dedup_symbols)
        joined_aliases = ' ; '.join(dedup_symbols)
        ent_desc = d['description']
        if len(dedup_symbols) > 0:
            if ent_desc not in ['-', ''] and ent_desc not in dedup_symbols:
                entity_dict['description'] = f"{d['symbol']} ( {tax2name[str(d['tax_id'])]}, {d['type_of_gene']} : {joined_aliases} ) [ {d['description']} ]"
            else:
                entity_dict['description'] = f"{d['symbol']} ( {tax2name[str(d['tax_id'])]}, {d['type_of_gene']} : {joined_aliases} )"
        else:
            if ent_desc not in ['-', ''] and ent_desc not in dedup_symbols:
                entity_dict['description'] = f"{d['symbol']} ( {tax2name[str(d['tax_id'])]}, {d['type_of_gene']} ) [ {d['description']} ]"
            else:
                entity_dict['description'] = f"{d['symbol']} ( {tax2name[str(d['tax_id'])]}, {d['type_of_gene']} )"
        
        entity_dict['type'] = d['type_of_gene']
        all_records.append(entity_dict)

    # with open(f'../data/arboel/{dataset}/dictionary.pickle', 'wb') as f:
    #     pickle.dump(all_records, f)
        

    with open(f"../data/entrez_to_alias_{dataset}.json",'w') as f:
        f.write(ujson.dumps(geneid2synonym))

filter by taxa
22
Dedup
Complilng symbols


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['canonical_symbol'] = filtered['symbol']
100%|██████████| 549545/549545 [00:02<00:00, 192612.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['all_symbols'] = filtered[['symbol','synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i for i in x if i.strip() != '-']))), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

filter by taxa
26
Dedup


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['canonical_symbol'] = filtered['symbol']


Complilng symbols


100%|██████████| 704575/704575 [00:03<00:00, 191155.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['all_symbols'] = filtered[['symbol','synonyms','official_symbol','official_name','other_designations', 'canonical_symbol']].progress_apply(lambda x: '|'.join(list(set([i for i in x if i.strip() != '-']))), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['geneid'] = filtered.geneid.map(lambda x: f"NCBIGene:{x}")
100%|██████████| 704575/704575 [00:02<00:00, 340062.16it/s]


In [11]:
all_records[19935]

{'cui': 'NCBIGene:817469',
 'title': 'PUM1',
 'description': 'PUM1 ( Arabidopsis thaliana, protein-coding : APUM1 ; F16P2_42 ; pumilio 1 ; F16P2.42 )',
 'type': 'protein-coding'}

# Convert dataset to examples

In [86]:
sample_data = pd.read_json('../arboel/data/medmentions/processed/sample.jsonl', lines=True)
sample_data.head()

Unnamed: 0,mention,mention_id,context_left,context_right,context_doc_id,type,label_id,label,label_title
0,dynactin 4,25763772.1,,as a modifier of chronic Pseudomonas aeruginos...,25763772,T103,C4308010,"DCTN4 protein , human ( Chemical : dynactin p6...","DCTN4 protein , human"
1,chronic Pseudomonas aeruginosa infection,25763772.2,dynactin 4 as a modifier of,in cystic fibrosis Pseudomonas aeruginosa infe...,25763772,T038,C0854135,Pseudomonas aeruginosa infection ( Biologic Fu...,Pseudomonas aeruginosa infection
2,cystic fibrosis,25763772.3,dynactin 4 as a modifier of chronic Pseudomona...,Pseudomonas aeruginosa infection in cystic fib...,25763772,T038,C0010674,"Cystic Fibrosis ( Biologic Function , CF , cys...",Cystic Fibrosis
3,Pseudomonas aeruginosa infection,25763772.4,dynactin 4 as a modifier of chronic Pseudomona...,in cystic fibrosis patients is associated with...,25763772,T038,C0854135,Pseudomonas aeruginosa infection ( Biologic Fu...,Pseudomonas aeruginosa infection
4,cystic fibrosis,25763772.5,dynactin 4 as a modifier of chronic Pseudomona...,patients is associated with worse long - term ...,25763772,T038,C0010674,"Cystic Fibrosis ( Biologic Function , CF , cys...",Cystic Fibrosis


In [48]:
np.random.choice([{1,2,3}, {'a','b','c'}, 'steven'])

{1, 2, 3}

In [4]:
resolve_abbrevs=True
abbrev_dict = ujson.load(open('../data/abbreviations.json'))

# for name in DATASET_NAMES:
for name in ['ncbi_disease']:

# for name in ['ncbi_disease']:
    print(name)
    # if name == 'ncbi_disease':
    #     continue
    x
    entities = pickle.load(open(f'../data/arboel/{name}/dictionary.pickle', 'rb'))
    entity_dictionary = {d['cui']:d for d in tqdm(entities)}

    if name == 'ncbi_disease':
        # Need to redo this since we have multiple synonymous CUIs for ncbi_disease
        entity_dictionary = {cui:d for d in tqdm(entities) for cui in d['cuis']}
        cui_synsets = {}
        for subdict in tqdm(entities): 
            for cui in subdict['cuis']:
                if cui in subdict:
                    print(cui, cui_synsets[cui], subdict['cuis'])
                cui_synsets[cui] = subdict['cuis'] 

        with open('../data/arboel/ncbi_disease/cui_synsets.json', 'w') as f:
            f.write(ujson.dumps(cui_synsets, indent=2))

    if name in VALIDATION_DOCUMENT_IDS:
        validation_pmids = VALIDATION_DOCUMENT_IDS[name]
    else:
        print("ERROR!!!")
    df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude, val_split_ids=validation_pmids)
    docs = dataset_to_documents(data)
    label_len = df['db_ids'].map(lambda x: len(x)).max()
    print("Max labels on one doc:", label_len)

    for split in df.split.unique():
        print(split)

        ents_in_split = []
        for d in tqdm(df.query("split == @split").to_dict(orient='records')):
            abbrev_resolved = False
            offsets = d['offsets']
            doc_id = d['document_id']
            doc = docs[doc_id]
            mention = d['text']
            
            # Resolve abbreviaions if desired
            if resolve_abbrevs and abbrev_dict is not None:
                deabbreviated_mention = resolve_abbreviation(doc_id, mention, abbrev_dict)
                abbrev_resolved = True

            # Get offsets and context
            start = offsets[0][0]
            end = offsets[-1][-1]
            before_context = doc[:start]
            after_context = doc[end:]
            
            
            # ArboEL can't handle multi-labels, so we randomly choose one.
            

            if len(d['db_ids']) == 1:
                label_id = d['db_ids'][0]

            # ncbi_disease is a special case that requires extra care
            elif name == 'ncbi_disease':
                labels = []
                used_cuis = set([])
                choosable_ids = []
                for db_id in d['db_ids']:
                    if db_id in used_cuis:
                        continue
                    else:
                        used_cuis.update(set(entity_dictionary[db_id]['cuis']))
                    choosable_ids.append(db_id)

                label_id = np.random.choice(choosable_ids)
            
            else:
                label_id = np.random.choice(d['db_ids'])

            # Check if we missed something
            if label_id not in entity_dictionary:
                print(label_id)
                continue
            

            
            output = [{
                'mention': mention, 
                'mention_id': d['mention_id'],
                'context_left': before_context,
                'context_right': after_context, 
                'context_doc_id': doc_id,
                'type': d['type'][0],
                'label_id': label_id,
                'label': entity_dictionary[label_id]['description'],
                'label_title': entity_dictionary[label_id]['title'],
            }]

            if abbrev_resolved:
                output.append({
                'mention': deabbreviated_mention, 
                'mention_id': d['mention_id']+'.abbr_resolved',
                'context_left': before_context,
                'context_right': after_context, 
                'context_doc_id': doc_id,
                'type': d['type'][0],
                'label_id': label_id,
                'label': entity_dictionary[label_id]['description'],
                'label_title': entity_dictionary[label_id]['title'],
            })

            # else:
            #     labels = []
            #     for db_id in d['db_ids']:
            #         labels.append({
            #         'label': entity_dictionary[db_id]['description'],
            #         'label_title':  entity_dictionary[db_id]['title'],
            #         'label_id': db_id,
            #         })
            #     output = {
            #         'mention': mention, 
            #         'mention_id': d['mention_id'],
            #         'context_left': before_context,
            #         'context_right': after_context, 
            #         'context_doc_id': doc_id,
            #         'type': d['type'][0],
            #         'labels': labels,
            #     }

            ents_in_split.extend(output)

        split_name = split
        if split =='validation':
            split_name = 'valid'
        with open(f'../data/arboel/{name}/{split_name}.jsonl', 'w') as f:
            f.write('\n'.join([ujson.dumps(x) for x in ents_in_split]))

        # print(split, len(ents_in_split))

Found cached dataset ncbi_disease (/nethome/dkartchner3/.cache/huggingface/datasets/ncbi_disease/ncbi_disease_bigbio_kb/1.0.0/e6b217666a5647d5abc614785b2caad62f1d72a94d1631b86c0f615b75dcc865)


ncbi_disease


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 13189/13189 [00:00<00:00, 3156737.93it/s]
100%|██████████| 13189/13189 [00:00<00:00, 2632342.40it/s]
100%|██████████| 13189/13189 [00:00<00:00, 1720161.56it/s]


Max labels on one doc: 5
train


100%|██████████| 5065/5065 [00:00<00:00, 188433.22it/s]


validation


100%|██████████| 780/780 [00:00<00:00, 167505.87it/s]


test


100%|██████████| 960/960 [00:00<00:00, 206668.99it/s]


In [54]:
# name = 'ncbi_disease'
# data = conhelps.for_config_name(f'{name}_bigbio_kb').load_dataset()
# exclude = CUIS_TO_EXCLUDE[name]
# remap = CUIS_TO_REMAP[name]
# entities = pickle.load(open(f'../data/arboel/{name}/dictionary.pickle', 'rb'))

# # Need to redo this since we have multiple synonymous CUIs for ncbi_disease
# entity_dictionary = {cui:d for d in tqdm(entities) for cui in d['cuis']}
# cui_synsets = {}
# for d in tqdm(entities): 
#     for cui in d['cuis']:
#         if cui in d:
#             print(cui, cui_synsets[cui], d['cuis'])
#         cui_synsets[cui] = d['cuis'] 

# with open('../data/arboel/ncbi_disease/cui_synsets.json', 'w') as f:
#     f.write(ujson.dumps(cui_synsets, indent=2))



# df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude)
# docs = dataset_to_documents(data)
# label_len = df['db_ids'].map(lambda x: len(x)).max()
# print("Max labels on one doc:", label_len)

# for split in df.split.unique():

#     ents_in_split = []
#     for d in tqdm(df.query("split == @split").to_dict(orient='records')):
#         offsets = d['offsets']
#         doc_id = d['document_id']
#         doc = docs[doc_id]
#         mention = d['text']
        
#         # Resolve abbreviaions if desired
#         if resolve_abbrevs and abbrev_dict is not None:
#             mention = resolve_abbreviation(doc_id, mention, abbrev_dict)

#         start = offsets[0][0]
#         end = offsets[-1][-1]
#         before_context = doc[:start]
#         after_context = doc[end:]
        

#         labels = []
#         used_cuis = set([])
#         for db_id in d['db_ids']:
#             if db_id in used_cuis:
#                 continue
#             else:
#                 used_cuis.update(set(entity_dictionary[db_id]['cuis']))
#             labels.append({
#             'label': entity_dictionary[db_id]['description'],
#             'label_title':  entity_dictionary[db_id]['title'],
#             'label_id': entity_dictionary[db_id]['cui'],
#             })
#         output = {
#             'mention': mention, 
#             'mention_id': d['mention_id'],
#             'context_left': before_context,
#             'context_right': after_context, 
#             'context_doc_id': doc_id,
#             'type': d['type'][0],
#             'labels': labels,
#         }

#         ents_in_split.append(output)

#     split_name = split
#     if split =='validation':
#         split_name = 'valid'
#     with open(f'../data/arboel/{name}/{split_name}.jsonl', 'w') as f:
#         f.write('\n'.join([ujson.dumps(x) for x in ents_in_split]))

Found cached dataset ncbi_disease (/home/dkartchner3/.cache/huggingface/datasets/ncbi_disease/ncbi_disease_bigbio_kb/1.0.0/cb155244edb3e586c0105202613da9f02c1baf3de0550919e638775c31469221)


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 13189/13189 [00:00<00:00, 2073957.76it/s]
100%|██████████| 13189/13189 [00:00<00:00, 1732715.51it/s]


Max labels on one doc: 5


100%|██████████| 5065/5065 [00:00<00:00, 213758.25it/s]
100%|██████████| 780/780 [00:00<00:00, 202811.80it/s]
100%|██████████| 960/960 [00:00<00:00, 250624.41it/s]


In [55]:
cui_synsets

{'MESH:C538288': ['MESH:C538288'],
 'MESH:C535484': ['MESH:C535484'],
 'MESH:C579849': ['MESH:C579849'],
 'MESH:C579850': ['MESH:C579850'],
 'MESH:C567076': ['MESH:C567076'],
 'MESH:C537805': ['MESH:C537805', 'OMIM:264300'],
 'OMIM:264300': ['MESH:C537805', 'OMIM:264300'],
 'MESH:C537806': ['MESH:C537806', 'OMIM:203400', 'OMIM:610600'],
 'OMIM:203400': ['MESH:C537806', 'OMIM:203400', 'OMIM:610600'],
 'OMIM:610600': ['MESH:C537806', 'OMIM:203400', 'OMIM:610600'],
 'MESH:D058165': ['MESH:D058165'],
 'MESH:C565624': ['MESH:C565624', 'OMIM:616034'],
 'OMIM:616034': ['MESH:C565624', 'OMIM:616034'],
 'MESH:C535305': ['MESH:C535305'],
 'MESH:C535306': ['MESH:C535306',
  'OMIM:236792',
  'OMIM:600721',
  'OMIM:613657',
  'OMIM:615182'],
 'OMIM:236792': ['MESH:C535306',
  'OMIM:236792',
  'OMIM:600721',
  'OMIM:613657',
  'OMIM:615182'],
 'OMIM:600721': ['MESH:C535306',
  'OMIM:236792',
  'OMIM:600721',
  'OMIM:613657',
  'OMIM:615182'],
 'OMIM:613657': ['MESH:C535306',
  'OMIM:236792',
  'OMIM

In [7]:
pickle_data = pickle.load(open('../arboel/models/trained/ncbi_disease/valid_processed_data.pickle', 'rb'))
len(pickle_data)

1444

In [50]:
name = 'medmentions_st21pv'
data = conhelps.for_config_name(f'{name}_bigbio_kb').load_dataset()
exclude = CUIS_TO_EXCLUDE[name]
remap = CUIS_TO_REMAP[name]



df = dataset_to_df(data, entity_remapping_dict=remap, cuis_to_exclude=exclude)
docs = dataset_to_documents(data)

df['label_len'] = df['db_ids'].map(lambda x: len(x))

df[df.label_len == df.label_len.max()]

Found cached dataset medmentions (/home/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_st21pv_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794)


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,document_id,offsets,text,type,db_ids,split,mention_id,label_len
64613,27655319,"[[462, 468]]",detect,"[T058, T033]","[UMLS:C1511790, UMLS:C0442726]",test,27655319.17,2
72785,27708774,"[[2036, 2052]]",antral follicles,[T017],"[UMLS:C0600225, UMLS:C1135971]",train,27708774.75,2
87101,27801847,"[[496, 500]]",Ldlr,"[T103, T017]","[UMLS:C1366529, UMLS:C0034821]",train,27801847.22,2
111221,27979025,"[[100, 104]]",food,"[T038, T168]","[UMLS:C0016452, UMLS:C0016470]",train,27979025.4,2
189426,28464073,"[[2110, 2112]]",HS,"[T038, T037]","[UMLS:C0282498, UMLS:C0282507]",train,28464073.47,2
199085,28526565,"[[1372, 1377]]",ICOAS,"[T058, T201]","[UMLS:C1456627, UMLS:C0087111]",train,28526565.38,2
202565,28548949,"[[1809, 1812]]",XPA,"[T017, T103]","[UMLS:C1506534, UMLS:C1337030]",train,28548949.75,2


In [4]:
df = pd.read_json('../data/arboel/ncbi_disease/train.jsonl', lines=True)

In [None]:
pd.set_option('display.max_rows', 200)

df[319:483]

# Data Dictionary Checks

In [4]:
entity_dict = pickle.load(open('../data/arboel/medmentions_st21pv/dictionary.pickle', 'rb'))

In [5]:
entity_dict[0]

{'cui': 'UMLS:C0000005',
 'title': '(131)I-Macroaggregated Albumin',
 'type': 'Chemical',
 'description': '(131)I-Macroaggregated Albumin ( Chemical : (131)I-Macroaggregated Albumin ; (131)I-MAA )'}

In [108]:
entity_dict[np.random.choice(2369483)]

{'cui': 'UMLS:C3828178',
 'title': 'SGRQ-C - Coughing Disturbs My Sleep',
 'type': 'Intellectual Product',
 'description': "SGRQ-C - Coughing Disturbs My Sleep ( Intellectual Product : SGRQ-C - Coughing Disturbs My Sleep ) [ St. George's Respiratory Questionnaire for COPD Patients (SGRQ-C) Some more questions about your cough and breathlessness: My coughing or breathing disturbs my sleep. ]"}

In [110]:
mm_dict = pickle.load(open('../arboel/data/medmentions/processed/dictionary.pickle', 'rb'))

In [111]:
len(mm_dict)

2327239

In [169]:
mm_dict[np.random.choice(2327239)]

{'cui': 'C3944496',
 'title': 'Eleutherochir opercularis',
 'description': 'Eleutherochir opercularis ( Eukaryote : Eleutherochir opercularis ( Valenciennes , 1837 ) ; Callionymus opercularis ; Callionymus opercularis Valenciennes , 1837 )',
 'type': 'T204'}