In [12]:
%load_ext autoreload
%autoreload 2

import sys
import ujson
import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import train_test_split

from bigbio.dataloader import BigBioConfigHelpers

sys.path.append('..')
from bigbio_utils import dataset_to_df, DATASET_NAMES, CUIS_TO_EXCLUDE, CUIS_TO_REMAP, VALIDATION_DOCUMENT_IDS


conhelps = BigBioConfigHelpers()

def read_examples(filepath):
    # Read file
    with open(filepath, 'r') as f:
        lines = f.read().split('\n')

    # Construct dict mapping each CURIE to a list of aliases
    umls_dict = {} 
    for line in tqdm(lines):
        if len(line.split('||')) != 2:
            print(line)
            continue
        cui, name = line.split("||")
        if cui in umls_dict:
            umls_dict[cui].append(name)
        else:
            umls_dict[cui] = [name]

    return umls_dict

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
dataset = conhelps.for_config_name('bc5cdr_bigbio_kb').load_dataset()
df = dataset_to_df(dataset)
all_cuis = set([y for x in df.db_ids for y in x])
len(all_cuis)


Found cached dataset bc5cdr (/nethome/dkartchner3/.cache/huggingface/datasets/bc5cdr/bc5cdr_bigbio_kb/1.0.0/f01f16ea9b65ead985bedadf7335195c32297c8f1b09417fc607b102a6757d6f)


  0%|          | 0/3 [00:00<?, ?it/s]

2351

In [4]:
for key in dataset.keys():
    print(len(dataset[key]))

500
500
500


In [10]:


mesh_dict = read_examples('../data/umls_2017/mesh_to_alias.txt')

100%|██████████| 1006992/1006992 [00:01<00:00, 900870.91it/s]


In [11]:
len(all_cuis - set(mesh_dict.keys()))
# for x in all_cuis - set(mesh_dict.keys()):
#     print(x.split(':')[-1])

82

In [12]:
all_cuis - set(mesh_dict.keys())

{'MESH:-1',
 'MESH:C000873',
 'MESH:C005435',
 'MESH:C008281',
 'MESH:C009250',
 'MESH:C010845',
 'MESH:C012052',
 'MESH:C014347',
 'MESH:C019248',
 'MESH:C022189',
 'MESH:C026098',
 'MESH:C030852',
 'MESH:C032171',
 'MESH:C032302',
 'MESH:C034930',
 'MESH:C035133',
 'MESH:C036006',
 'MESH:C037663',
 'MESH:C039726',
 'MESH:C040029',
 'MESH:C041359',
 'MESH:C043211',
 'MESH:C043265',
 'MESH:C044650',
 'MESH:C045645',
 'MESH:C047047',
 'MESH:C047426',
 'MESH:C047781',
 'MESH:C048833',
 'MESH:C049430',
 'MESH:C051890',
 'MESH:C052342',
 'MESH:C055162',
 'MESH:C059262',
 'MESH:C061870',
 'MESH:C063008',
 'MESH:C064276',
 'MESH:C065179',
 'MESH:C065180',
 'MESH:C065757',
 'MESH:C067311',
 'MESH:C067431',
 'MESH:C069541',
 'MESH:C071741',
 'MESH:C076029',
 'MESH:C076946',
 'MESH:C081309',
 'MESH:C081489',
 'MESH:C084178',
 'MESH:C085143',
 'MESH:C086816',
 'MESH:C090450',
 'MESH:C094645',
 'MESH:C096918',
 'MESH:C098010',
 'MESH:C099041',
 'MESH:C102006',
 'MESH:C105934',
 'MESH:C107135',
 '

In [13]:
mesh_dict['MESH:C000002']

['bevonium',
 '2-(hydroxymethyl)-n,n-dimethylpiperidinium benzilate',
 'cg 201',
 'acabel',
 'bevonium sulfate (1:1)',
 'bevonium methyl sulfate',
 'piribenzil methyl sulfate',
 'bevonium methylsulfate',
 'bevonium metilsulfate']

In [19]:
biogenel_bc5cdr_database = ujson.load(open('../../biogenel_datasets/bc5cdr/target_kb.json','r'))

In [22]:
len(set([x.split(':')[-1] for x in all_cuis]) - set(biogenel_bc5cdr_database.keys()))

22

# NLM Chem Checks

In [34]:
nlm_chem_dataset = conhelps.for_config_name('nlmchem_bigbio_kb').load_dataset()
nlm_chem_df = dataset_to_df(nlm_chem_dataset)
nlm_chem_cuis = set([y for x in nlm_chem_df.db_ids for y in x])
len(nlm_chem_cuis)

Found cached dataset nlmchem (/home/dkartchner3/.cache/huggingface/datasets/nlmchem/nlmchem_bigbio_kb/1.0.0/d91131823c66b7dd1162027991ea47c342e478209b37cf261c5f122d30409594)


  0%|          | 0/3 [00:00<?, ?it/s]

1810

In [10]:
nlm_chem_dict = read_examples('../data/mesh_to_alias.txt')
chemicals = set(nlm_chem_dict.keys())

100%|██████████| 1006992/1006992 [00:01<00:00, 895795.79it/s]


In [11]:
len(nlm_chem_cuis - chemicals)

for x in nlm_chem_cuis - chemicals:
    print(x.split(':')[-1])

C015329
C067134
C061870
C000603933
C004322
C059514
C551994
C000612088
C419708
C522924
D000073878
D000069463
C490728
C089032


In [None]:
df = dataset_to_df(dataset, name='nlmchem')

# UMLS quality checks

In [4]:
dataset = conhelps.for_config_name('medmentions_full_bigbio_kb').load_dataset()
df = dataset_to_df(dataset)
all_umls_cuis = set([y for x in df.db_ids for y in x])
len(all_umls_cuis)


Found cached dataset medmentions (/home/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_full_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794)


  0%|          | 0/3 [00:00<?, ?it/s]

34724

In [6]:
dataset = conhelps.for_config_name('medmentions_st21pv_bigbio_kb').load_dataset()
df = dataset_to_df(dataset)
all_st21pv_cuis = set([y for x in df.db_ids for y in x])
len(all_st21pv_cuis)

Found cached dataset medmentions (/home/dkartchner3/.cache/huggingface/datasets/medmentions/medmentions_st21pv_bigbio_kb/1.0.0/3fc6b8a3681d540ae6c7497c238636b543b90764247b5ff3642d243474000794)


  0%|          | 0/3 [00:00<?, ?it/s]

25419

In [21]:
umls_dict = read_examples('../data/umls_2017/umls_to_alias.txt')
print(len(umls_dict))

 32%|███▏      | 2529085/7935792 [00:02<00:04, 1194647.27it/s]

C0851160|ENG|S|L0026543|VO|S0782321|Y|A0842456||||RCD|OP|XM1W1|Morning after pill


100%|██████████| 7935792/7935792 [00:19<00:00, 415045.86it/s] 


3463453


In [22]:
st21pv_dict = read_examples('../data/umls_2017/st21pv_to_alias.txt')
print(len(st21pv_dict))

 35%|███▍      | 1670632/4796447 [00:01<00:02, 1265088.37it/s]

C0851160|ENG|S|L0026543|VO|S0782321|Y|A0842456||||RCD|OP|XM1W1|Morning after pill


100%|██████████| 4796447/4796447 [00:04<00:00, 1176969.41it/s]


2368501


In [29]:
umls_missed = list(all_umls_cuis - set(umls_dict.keys()))
print(umls_missed)
st21pv_missed = list(all_st21pv_cuis -  set(st21pv_dict.keys()))
print(st21pv_missed)


['UMLS:C4300640', 'UMLS:C4300518']
['UMLS:C0683939']


In [18]:
umls_keys = set([x.split(':')[-1] for x in umls_dict.keys()])
st21pv_keys = set([x.split(':')[-1] for x in st21pv_dict.keys()])

In [14]:
len(st21pv_missed)

166

In [15]:
umls = ujson.load(open('../data/medlinker/umls.2017AA.active.full.json'))

In [16]:
st21pv = ujson.load(open('../data/medlinker/umls.2017AA.active.st21pv.json'))

In [23]:
count = 0
for x, y in umls.items():
    if x not in umls_keys:
        count += 1
        umls_dict[f'UMLS:{x}'] = [y['Name']] + y['STR']

print(count)


count = 0
for x, y in st21pv.items():
    if x not in st21pv_keys:
        count += 1
        st21pv_dict[f'UMLS:{x}'] = [y['Name']] + y['STR']

print(count)

1554
980


In [26]:
len(umls_dict)

3465007

In [27]:
umls_dict

{'UMLS:C0000005': ['(131)I-Macroaggregated Albumin', '(131)I-MAA'],
 'UMLS:C0000039': ['1,2-Dipalmitoylphosphatidylcholine',
  '1,2 Dipalmitoylphosphatidylcholine',
  '1,2-Dihexadecyl-sn-Glycerophosphocholine',
  '1,2 Dihexadecyl sn Glycerophosphocholine',
  '1,2-Dipalmitoyl-Glycerophosphocholine',
  '1,2 Dipalmitoyl Glycerophosphocholine',
  'Dipalmitoylphosphatidylcholine',
  'Dipalmitoylglycerophosphocholine',
  'Dipalmitoyllecithin',
  '3,5,9-Trioxa-4-phosphapentacosan-1-aminium, 4-hydroxy-N,N,N-trimethyl-10-oxo-7-((1-oxohexadecyl)oxy)-, inner salt, 4-oxide',
  'Dipalmitoylphosphatidylcholine (substance)',
  'Dipalmitoyl Phosphatidylcholine',
  'Phosphatidylcholine, Dipalmitoyl',
  'DPPC',
  'DIPALMITOYLPHOSPHATIDYLCHOLINE 0102',
  '1,2-Dipalmitoylphosphatidylcholine [Chemical/Ingredient]'],
 'UMLS:C0000052': ['1,4-alpha-Glucan Branching Enzyme',
  '1,4-alpha-Glucan branching enzyme',
  '1,4-Alpha glucan branching enzyme',
  '1,4 alpha Glucan Branching Enzyme',
  'Branching Enzyme,

In [25]:
len(st21pv_dict)

2369481

In [33]:
umls_output = [f'{key}||{x}' for key, val in umls_dict.items() for x in val]
# with open('../data/umls_2017/umls_updated_to_alias.txt', 'w') as f:
with open('../biogenel/src/bigbio_/data/medmentions_full/medmentions_full_aliases.txt', 'w') as f:
    f.write('\n'.join(umls_output))

st21pv_output = [f'{key}||{x}' for key, val in st21pv_dict.items() for x in val]
# with open('../data/umls_2017/st21pv_updated_to_alias.txt', 'w') as f:
with open('../biogenel/src/bigbio_/data/medmentions_st21pv/medmentions_st21pv_aliases.txt', 'w') as f:
    f.write('\n'.join(st21pv_output))

In [None]:
umls_output = []
for key, val in tqdm(umls.items()):
    umls_output.append(f'UMLS:{key}||{val["Name"]}')
    for alias in val['STR']:
        umls_output.append(f'UMLS:{key}||{alias}')
with open('../data/umls_2017/umls_to_alias.txt', 'w') as f:
    f.write('\n'.join(umls_output))


st21pv_output = []
for key, val in tqdm(st21pv.items()):
    st21pv_output.append(f'UMLS:{key}||{val["Name"]}')
    for alias in val['STR']:
        st21pv_output.append(f'UMLS:{key}||{alias}')
with open('../data/umls_2017/st21pv_to_alias.txt', 'w') as f:
    f.write('\n'.join(st21pv_output))

# Make test set for GNormPlus and NLM-Gene

In [15]:
name = 'gnormplus'
dataset = conhelps.for_config_name(f'{name}_bigbio_kb').load_dataset()
exclude = CUIS_TO_EXCLUDE[name]
remap = CUIS_TO_REMAP[name]
valid_pmids = VALIDATION_DOCUMENT_IDS[name]
df = dataset_to_df(dataset, cuis_to_exclude=exclude, entity_remapping_dict=remap, val_split_ids=valid_pmids)
df.groupby('split').agg({'document_id':'nunique'})

Found cached dataset gnormplus (/home/dkartchner3/.cache/huggingface/datasets/bigbio___gnormplus/gnormplus_bigbio_kb/1.0.0/97a2714b58185305591c949b067cea2febfca2447016096c3d08021d84bf7b69)


  0%|          | 0/2 [00:00<?, ?it/s]

<class 'str'> <class 'str'>


Unnamed: 0_level_0,document_id
split,Unnamed: 1_level_1
test,254
train,353
validation,63


In [5]:
seed = 0
for name in ['gnormplus','nlm_gene']:
# for name in ['gnormplus']:
    print(name)
    dataset = conhelps.for_config_name(f'{name}_bigbio_kb').load_dataset()
    exclude = CUIS_TO_EXCLUDE[name]
    remap = CUIS_TO_REMAP[name]
    df = dataset_to_df(dataset, cuis_to_exclude=exclude, entity_remapping_dict=remap)

    all_train_pmids = df.query("split == 'train'").document_id.unique()
    print(type(all_train_pmids))
    train_pmids, validation_pmids = train_test_split(all_train_pmids, test_size=0.15, random_state=seed)
    with open(f'../data/validation_split_ids/{name}.txt', 'w') as f:
        f.write('\n'.join(validation_pmids))


gnormplus


Found cached dataset gnormplus (/home/dkartchner3/.cache/huggingface/datasets/bigbio___gnormplus/gnormplus_bigbio_kb/1.0.0/97a2714b58185305591c949b067cea2febfca2447016096c3d08021d84bf7b69)


  0%|          | 0/2 [00:00<?, ?it/s]

<class 'numpy.ndarray'>


# Final Quality Checks

In [2]:
for name in DATASET_NAMES:
    print(name)
    dataset = conhelps.for_config_name(f'{name}_bigbio_kb').load_dataset()
    print(dataset.keys())
    ontology = read_examples(f'../data/alias_mappings/{name}_aliases.txt')
    exclude = CUIS_TO_EXCLUDE[name]
    remap = CUIS_TO_REMAP[name]
    df = dataset_to_df(dataset, cuis_to_exclude=exclude, entity_remapping_dict=remap)
    all_cuis = set([y for x in df.db_ids for y in x])
    for x in all_cuis:
        if x not in ontology:
            print(x)

    print("**********************************")

Found cached dataset nlmchem (/nethome/dkartchner3/.cache/huggingface/datasets/nlmchem/nlmchem_bigbio_kb/1.0.0/d91131823c66b7dd1162027991ea47c342e478209b37cf261c5f122d30409594)


nlmchem


  0%|          | 0/3 [00:00<?, ?it/s]

dict_keys(['train', 'test', 'validation'])


100%|██████████| 1006992/1006992 [00:01<00:00, 811475.42it/s]
Found cached dataset nlm_gene (/nethome/dkartchner3/.cache/huggingface/datasets/nlm_gene/nlm_gene_bigbio_kb/1.0.0/eccd5e8295a7e199d672750806ac201aaa31d87ae12168a51b63a80e8cbb9d61)


**********************************
nlm_gene


  0%|          | 0/2 [00:00<?, ?it/s]

dict_keys(['train', 'test'])


100%|██████████| 2507059/2507059 [00:02<00:00, 859331.32it/s] 
Found cached dataset gnormplus (/nethome/dkartchner3/.cache/huggingface/datasets/gnormplus/gnormplus_bigbio_kb/1.0.0/c0080d95f482a3af78d8a39ad95daa2ec310687cba3c308d34c64811ee862a60)


**********************************
gnormplus


  0%|          | 0/2 [00:00<?, ?it/s]

dict_keys(['train', 'test'])


100%|██████████| 2114393/2114393 [00:02<00:00, 938288.27it/s] 


**********************************
