In [1]:
import sys
import ujson
import pickle
import pandas as pd
import numpy as np

from tqdm import tqdm
from collections import defaultdict

from bigbio.dataloader import BigBioConfigHelpers

sys.path.append('..')
from bigbio_utils import dataset_to_df


conhelps = BigBioConfigHelpers()


In [2]:
dataset = conhelps.for_config_name('ncbi_disease_bigbio_kb').load_dataset()
df = dataset_to_df(dataset)
all_cuis = set([y for x in df.db_ids for y in x])
all_cuis

Found cached dataset ncbi_disease (/home/dkartchner3/.cache/huggingface/datasets/ncbi_disease/ncbi_disease_bigbio_kb/1.0.0/cb155244edb3e586c0105202613da9f02c1baf3de0550919e638775c31469221)


  0%|          | 0/3 [00:00<?, ?it/s]

{'MESH:C531603',
 'MESH:C531604',
 'MESH:C531609',
 'MESH:C531610',
 'MESH:C531652',
 'MESH:C531732',
 'MESH:C531844',
 'MESH:C535331',
 'MESH:C535342',
 'MESH:C535395',
 'MESH:C535464',
 'MESH:C535468',
 'MESH:C535504',
 'MESH:C535506',
 'MESH:C535662',
 'MESH:C535668',
 'MESH:C535679',
 'MESH:C535700',
 'MESH:C535733',
 'MESH:C535764',
 'MESH:C535804',
 'MESH:C535884',
 'MESH:C535916',
 'MESH:C535918',
 'MESH:C535944',
 'MESH:C535964',
 'MESH:C536016',
 'MESH:C536106',
 'MESH:C536170',
 'MESH:C536183',
 'MESH:C536185',
 'MESH:C536210',
 'MESH:C536245',
 'MESH:C536252',
 'MESH:C536277',
 'MESH:C536288',
 'MESH:C536309',
 'MESH:C536353',
 'MESH:C536399',
 'MESH:C536438',
 'MESH:C536456',
 'MESH:C536528',
 'MESH:C536575',
 'MESH:C536587',
 'MESH:C536648',
 'MESH:C536852',
 'MESH:C536903',
 'MESH:C537022',
 'MESH:C537034',
 'MESH:C537181',
 'MESH:C537182',
 'MESH:C537207',
 'MESH:C537209',
 'MESH:C537243',
 'MESH:C537277',
 'MESH:C537346',
 'MESH:C537356',
 'MESH:C537369',
 'MESH:C537502

In [3]:
def read_examples(filepath):
    # Read file
    with open(filepath, 'r') as f:
        lines = f.read().split('\n')

    # Construct dict mapping each CURIE to a list of aliases
    umls_dict = {} 
    for line in tqdm(lines):
        cui, name = line.split("||")
        if cui in umls_dict:
            umls_dict[cui].append(name)
        else:
            umls_dict[cui] = [name]

    return umls_dict

mesh_omim_dict = read_examples('../data/umls_2017/mesh_and_omim_to_alias.txt')

mesh_omim_cuis = set(mesh_omim_dict.keys())

# ujson.load(open('../data/mesh_and_omim_to_alias.txt'))

100%|██████████| 1193815/1193815 [00:01<00:00, 816117.27it/s] 


In [4]:
mesh_omim_dict

{'MESH:C000002': ['bevonium',
  '2-(hydroxymethyl)-n,n-dimethylpiperidinium benzilate',
  'cg 201',
  'acabel',
  'bevonium sulfate (1:1)',
  'bevonium methyl sulfate',
  'piribenzil methyl sulfate',
  'bevonium methylsulfate',
  'bevonium metilsulfate'],
 'MESH:C000006': ['insulin, neutral',
  'neutral insulin',
  'insulin, 8a-l-threonine-10a-l-isoleucine-',
  'actrapid insulin',
  'novo mc insulin',
  'insulin pork',
  'insulin (pork)',
  'insulin (swine)',
  'insulin (ox), 8(a)-l-threonine-10(a)-l-isoleucine-'],
 'MESH:C000009': ['asparaginylglucosamine',
  'aspartylglucosamine',
  'aspartylglucosylamine',
  'aspartylglycosamine',
  'n-acetylglucosaminylasparagine',
  '2-acetamido-1-(beta-l-aspartamido)-1,2-dideoxy-beta-d-glucose',
  "2-acetamido-1-n-(4'-l-aspartyl)-2-deoxy-beta-d-glucopyranosylamine",
  'n(4)-(2-acetamido-2-deoxy-beta-d-glucopyranosyl)-l-asparagine',
  '4-n-2-acetamido-2-deoxy-beta-d-glucopyranosyl-l-asparagine',
  'aadg',
  'acetylglucosaminylasparagine',
  'l-asp

In [5]:
# for x in (all_cuis - mesh_omim_cuis):
#     # print(x.split(':')[-1])
#     print(x)
deprecated_cuis = list(all_cuis - mesh_omim_cuis)
with open('../data/ncbi_disease_deprecated_cuis.json', 'w') as f:
    f.write(ujson.dumps(deprecated_cuis))

In [3]:

col_names = "Disease_Name   Disease_ID       Alt_Disease_IDs   Definition      Parent_IDs       Tree_Numbers     Parent_Tree_Numbers       Synonyms        Slim_Mappings".lower().split()

medic = pd.read_csv("../data/CTD_diseases.tsv", comment='#', names=col_names, delimiter='\t', na_filter=False, usecols=['disease_name','disease_id','alt_disease_ids','definition','synonyms'])
medic

Unnamed: 0,disease_name,disease_id,alt_disease_ids,definition,synonyms
0,10p Deletion Syndrome (Partial),MESH:C538288,,,"Chromosome 10, 10p- Partial|Chromosome 10, mon..."
1,13q deletion syndrome,MESH:C535484,,,Chromosome 13q deletion|Chromosome 13q deletio...
2,15q24 Microdeletion,MESH:C579849,DO:DOID:0060395,,15q24 Deletion|15q24 Microdeletion Syndrome|In...
3,16p11.2 Deletion Syndrome,MESH:C579850,,,
4,"17,20-Lyase Deficiency, Isolated",MESH:C567076,,,"17-Alpha-Hydroxylase-17,20-Lyase Deficiency, C..."
...,...,...,...,...,...
13184,Zunich neuroectodermal syndrome,MESH:C536729,DO:DOID:0112152|OMIM:280000,,"CHIME|CHIME syndrome|COLOBOMA, CONGENITAL HEAR..."
13185,Zuska's Disease,MESH:C536730,,,Lactation and squamous metaplasia of lactifero...
13186,Zygodactyly 1,MESH:C565223,DO:DOID:0111820|OMIM:609815,,ZD1
13187,Zygomatic Fractures,MESH:D015051,,Fractures of the zygoma.,"Fractures, Zygomatic|Fracture, Zygomatic|Zygom..."


In [5]:
col_names = "Disease_Name   Disease_ID       Alt_Disease_IDs   Definition      Parent_IDs       Tree_Numbers     Parent_Tree_Numbers       Synonyms        Slim_Mappings".lower().split()

medic = pd.read_csv("../data/CTD_diseases.tsv", comment='#', names=col_names, delimiter='\t', na_filter=False, usecols=['disease_name','disease_id','alt_disease_ids','definition','synonyms'])
for col in medic.columns:
    medic[col] = medic[col].map(lambda x: x.strip())
medic.alt_disease_ids = medic.alt_disease_ids.map(lambda x: x.split('|'))
medic.synonyms = medic.synonyms.map(lambda x: x.split('|'))
medic['all_names'] = medic[['disease_name','synonyms']].apply(lambda x: [x[0]] + x[1], axis=1)
medic['all_ids'] = medic[['disease_id','alt_disease_ids']].apply(lambda x: [x[0]] + x[1], axis=1)

medic_arboel_entities = []
for x in medic.to_dict(orient='records'):
    joined_aliases = ' ; '.join([y for y in x['synonyms'] if y != ''])
    definition = x['definition'].strip()
    if len(definition) > 0:
        description = f"{x['disease_name']} ( Disease : {joined_aliases} ) [ {definition} ]"
    else:
        description = f"{x['disease_name']} ( Disease : {joined_aliases} )"

    d = {'type':'Disease', 
        'cui':x['disease_id'], 
        'title':x['disease_name'], 
        'cuis': [y for y in x['all_ids'] if y != '' and not y.startswith('DO:')], 
        'description': description,
        }
    medic_arboel_entities.append(d)

pickle.dump(medic_arboel_entities, open('../data/arboel/ncbi_disease/dictionary.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
    


# Make synonym list from medic dictionary
medic_cui2synonyms = defaultdict(set)
for x in medic[['all_names','all_ids']].values:
    for y in x[1]:
        if y == '' or y.startswith('DO:'):
            continue
        medic_cui2synonyms[y].update([z for z in x[0] if z != ''])


# # Add as many missing identifiers as we can from current UMLS
# for x in all_cuis - set(medic_cui2synonyms.keys()):
#     if x in mesh_omim_dict:
#         medic_cui2synonyms[x] = mesh_omim_dict[x]


# Write to file
medic_dict = {key: list(val) for key, val in medic_cui2synonyms.items()}
with open("../data/ncbi_disease_dict.json",'w') as f:
    f.write(ujson.dumps(medic_dict))

# Write in standard pipe-delimited format
with open('../data/alias_mappings/ncbi_disease_aliases.txt', 'w') as f:
    f.write('\n'.join([f"{key}||{v}" for key, val in medic_dict.items() for v in val]))

# len(all_cuis - set(medic_cui2synonyms.keys()))

In [20]:
medic_cui2synonyms

defaultdict(set,
            {'MESH:D052159': ['frasier syndrome', 'syndrome, frasier'],
             'MESH:D016864': ['li-fraumeni syndrome', 'li fraumeni syndrome'],
             'MESH:D012175': ['retinoblastoma',
              'retinoblastomas',
              'glioblastoma, retinal',
              'glioblastomas, retinal',
              'retinal glioblastomas',
              'retinal glioblastoma',
              'glioma, retinal',
              'gliomas, retinal',
              'retinal gliomas',
              'retinal glioma',
              'neuroblastoma, retinal',
              'neuroblastomas, retinal',
              'retinal neuroblastomas',
              'retinal neuroblastoma',
              'eye cancer, retinoblastoma',
              'cancers, retinoblastoma eye',
              'eye cancers, retinoblastoma',
              'retinoblastoma eye cancers',
              'cancer, retinoblastoma eye',
              'retinoblastoma eye cancer',
              'familial retinoblastoma

In [8]:
all_medic_ids = [y for x in medic.all_ids for y in x]
print(len(all_medic_ids))
print(len(set(all_medic_ids)))

28857
22182


In [9]:
set([z for z in all_medic_ids if 'DO' not in z]) - mesh_omim_cuis

{'',
 'MESH:C',
 'MESH:C000721267',
 'MESH:C000721268',
 'MESH:C000721288',
 'MESH:C000721289',
 'MESH:C000721322',
 'MESH:C000721324',
 'MESH:C000721327',
 'MESH:C000721349',
 'MESH:C000721350',
 'MESH:C000721352',
 'MESH:C000721353',
 'MESH:C000721355',
 'MESH:C000721367',
 'MESH:C000721389',
 'MESH:C000721391',
 'MESH:C000721392',
 'MESH:C000721407',
 'MESH:C000721427',
 'MESH:C000721429',
 'MESH:C000721448',
 'MESH:C000721467',
 'MESH:C000721827',
 'MESH:C000721847',
 'MESH:C000721848',
 'MESH:C000722495',
 'MESH:C000722498',
 'MESH:C000722847',
 'MESH:C000722848',
 'MESH:C000722978',
 'MESH:C000722979',
 'MESH:C000723354',
 'MESH:C000723468',
 'MESH:C000723471',
 'MESH:C000723487',
 'MESH:C000723647'}

In [10]:
medic[medic.disease_id.isin(['MESH:C',
 'MESH:C000721267',
 'MESH:C000721268',
 'MESH:C000721288',
 'MESH:C000721289',
 'MESH:C000721322',
 'MESH:C000721324',
 'MESH:C000721327',
 'MESH:C000721349',
 'MESH:C000721350',
 'MESH:C000721352',
 'MESH:C000721353',
 'MESH:C000721355',
 'MESH:C000721367',
 'MESH:C000721389',
 'MESH:C000721391',
 'MESH:C000721392',
 'MESH:C000721407',
 'MESH:C000721427',
 'MESH:C000721429',
 'MESH:C000721448',
 'MESH:C000721467',
 'MESH:C000721827',
 'MESH:C000721847',
 'MESH:C000721848',
 'MESH:C000722495',
 'MESH:C000722498',
 'MESH:C000722847',
 'MESH:C000722848',
 'MESH:C000722978',
 'MESH:C000722979',
 'MESH:C000723354',
 'MESH:C000723468',
 'MESH:C000723471',
 'MESH:C000723487',
 'MESH:C000723647'])]

Unnamed: 0,disease_name,disease_id,alt_disease_ids,definition,synonyms,all_names,all_ids
395,Air crescent sign,MESH:C000721349,[],,"[Air-crescent sign, Pulmonary air crescent sig...","[Air crescent sign, Air-crescent sign, Pulmona...","[MESH:C000721349, ]"
2074,cell-associated neurotoxicity,MESH:C000722498,[],,[immune effector cell-associated neurotoxicity...,"[cell-associated neurotoxicity, immune effecto...","[MESH:C000722498, ]"
2194,Chapare hemorrhagic fever,MESH:C000723468,[],,[Chapare virus infection],"[Chapare hemorrhagic fever, Chapare virus infe...","[MESH:C000723468, ]"
2685,Cleft eyelid,MESH:C000721288,[],,"[Coloboma of eyelid, Eyelid coloboma]","[Cleft eyelid, Coloboma of eyelid, Eyelid colo...","[MESH:C000721288, ]"
2714,Coarse facial features,MESH:C000721322,[],,"[Coarce facies, Coarse face, Pugilistic facies]","[Coarse facial features, Coarce facies, Coarse...","[MESH:C000721322, ]"
3020,Continuous diaphragm sign,MESH:C000721367,[],,[],"[Continuous diaphragm sign, ]","[MESH:C000721367, ]"
3144,COVID-19 rebound,MESH:C000722978,[],,"[COVID-19 recrudescence, COVID-19 relapse]","[COVID-19 rebound, COVID-19 recrudescence, COV...","[MESH:C000722978, ]"
3145,COVID-19 reinfection,MESH:C000722979,[],,[COVID-19 re-infection],"[COVID-19 reinfection, COVID-19 re-infection]","[MESH:C000722979, ]"
3210,Crazy paving pattern,MESH:C000721427,[],,"[Crazy paving appearance, Crazy-paving pattern...","[Crazy paving pattern, Crazy paving appearance...","[MESH:C000721427, ]"
3547,deficiency of adenosine deaminase 2,MESH:C000723487,[],,"[ADA2 deficiency, DADA2 deficiency of adenosin...","[deficiency of adenosine deaminase 2, ADA2 def...","[MESH:C000723487, ]"


In [11]:
deprecated_cuis = all_cuis - set(all_medic_ids)
deprecated_cuis

{'MESH:C531603',
 'MESH:C531610',
 'MESH:C531652',
 'MESH:C535804',
 'MESH:C537666',
 'MESH:C538138',
 'MESH:C538219',
 'MESH:C538392',
 'MESH:C538554',
 'MESH:D001405',
 'OMIM:102600',
 'OMIM:103600',
 'OMIM:164160',
 'OMIM:600698'}

In [15]:
medic[medic.all_ids.map(lambda x: '' in x)]

Unnamed: 0,disease_name,disease_id,alt_disease_ids,definition,synonyms,all_names,all_ids
0,10p Deletion Syndrome (Partial),MESH:C538288,[],,"[Chromosome 10, 10p- Partial, Chromosome 10, m...","[10p Deletion Syndrome (Partial), Chromosome 1...","[MESH:C538288, ]"
1,13q deletion syndrome,MESH:C535484,[],,"[Chromosome 13q deletion, Chromosome 13q delet...","[13q deletion syndrome, Chromosome 13q deletio...","[MESH:C535484, ]"
3,16p11.2 Deletion Syndrome,MESH:C579850,[],,[],"[16p11.2 Deletion Syndrome, ]","[MESH:C579850, ]"
4,"17,20-Lyase Deficiency, Isolated",MESH:C567076,[],,"[17-Alpha-Hydroxylase-17,20-Lyase Deficiency, ...","[17,20-Lyase Deficiency, Isolated, 17-Alpha-Hy...","[MESH:C567076, ]"
7,22q11 Deletion Syndrome,MESH:D058165,[],Condition with a variable constellation of phe...,"[22q11 Deletion Syndromes, Deletion Syndrome, ...","[22q11 Deletion Syndrome, 22q11 Deletion Syndr...","[MESH:D058165, ]"
...,...,...,...,...,...,...,...
13182,Zoster Sine Herpete,MESH:D031368,[],HERPES ZOSTER but without eruption of vesicles...,[Zoster Sine Eruptione],"[Zoster Sine Herpete, Zoster Sine Eruptione]","[MESH:D031368, ]"
13183,ZTTK SYNDROME,OMIM:617140,[],,"[ZHU-TOKITA-TAKENOUCHI-KIM SYNDROME, ZTTK MULT...","[ZTTK SYNDROME, ZHU-TOKITA-TAKENOUCHI-KIM SYND...","[OMIM:617140, ]"
13185,Zuska's Disease,MESH:C536730,[],,[Lactation and squamous metaplasia of lactifer...,"[Zuska's Disease, Lactation and squamous metap...","[MESH:C536730, ]"
13187,Zygomatic Fractures,MESH:D015051,[],Fractures of the zygoma.,"[Fractures, Zygomatic, Fracture, Zygomatic, Zy...","[Zygomatic Fractures, Fractures, Zygomatic, Fr...","[MESH:D015051, ]"


In [14]:
medic_cui2synonyms

defaultdict(set,
            {'MESH:C538288': {'10p Deletion Syndrome (Partial)',
              'Chromosome 10, 10p- Partial',
              'Chromosome 10, Partial Deletion (short arm)',
              'Chromosome 10, monosomy 10p',
              'Monosomy 10p'},
             '': {'Necrosis, Submassive Hepatic',
              'Short Stature, Mental Retardation, Callosal Agenesis, Heminasal Hypoplasia, Microphthalmia, And Atypical Clefting',
              'CAFE-AU-LAIT SPOTS WITH PULMONIC STENOSIS',
              'Cells, Neoplasm Circulating',
              'Arnold Chiari Malformation, Type I',
              'T LGL Leukemia',
              'Spiral Fracture',
              'Clinical Deterioration',
              'Lowry Wood syndrome',
              'Arterial Inflammation',
              'Fungal Lung Diseases',
              'Limb Deformity, Congenital',
              'Pyogenic Brain Abscesses',
              'Keloid, Acne',
              'Syndrome, Eisenmenger',
              'Choristoma

In [None]:
gnormplus_dict = ujson.load(open('../data/entrez_to_alias_gnormplus.json'))
# Write in standard pipe-delimited format
with open('../data/alias_mappings/gnormplus_aliases.txt', 'w') as f:
    f.write('\n'.join([f"{key}||{v}" for key, val in gnormplus_dict.items() for v in val.split('|')]))

nlm_gene_dict = ujson.load(open('../data/entrez_to_alias_nlm_gene.json'))
with open('../data/alias_mappings/nlm_gene_aliases.txt', 'w') as f:
    f.write('\n'.join([f"{key}||{v}" for key, val in nlm_gene_dict.items() for v in val.split('|')]))



    

In [28]:
gnormplus_dict

{'NCBIGene:814629': 'AT2G01050|zinc ion binding / nucleic acid binding protein|F23H14.2|F23H14_2',
 'NCBIGene:814630': 'AT2G01060|F23H14.3|F23H14_3|myb-like HTH transcriptional regulator family protein',
 'NCBIGene:814631': 'AT2G01031',
 'NCBIGene:814636': 'F23H14.5|F23H14_5|Late embryogenesis abundant (LEA) hydroxyproline-rich glycoprotein family|AT2G01080',
 'NCBIGene:814637': 'F23H14.4|F23H14_4|Lung seven transmembrane receptor family protein|AT2G01070',
 'NCBIGene:814638': 'Ubiquinol-cytochrome C reductase hinge protein|F23H14.6|F23H14_6|AT2G01090',
 'NCBIGene:814639': 'F23H14.7|F23H14_7|FAM133-like protein|AT2G01100',
 'NCBIGene:814640': 'Sec-independent periplasmic protein translocase|APG2|ALBINO AND PALE GREEN 2|F23H14.8|F23H14_8|PGA2|TATC|TWIN-ARGININE TRANSLOCATION C|UNE3|unfertilized embryo sac 3',
 'NCBIGene:814641': 'ATORC4|F23H14.9|F23H14_9|ORIGIN RECOGNITION COMPLEX SUBUNIT 4|origin recognition complex subunit 4|ORC4|origin recognition complex subunit 4|ORC4 (Arabidopsis 

In [32]:
'abc def '.split(' ')

['abc', 'def', '']