In [1]:
import csv
import os

import networkx
import pandas

import do_tools

In [2]:
# Check out the Disease Ontology trunk from its SourceForge SVN repository into ./download.
# NOTE(review): no revision is pinned, so a re-run may fetch a newer revision than the
# one recorded in this cell's output -- consider `-r <rev>` if reproducibility matters.
! svn checkout svn://svn.code.sf.net/p/diseaseontology/code/trunk/ download

Checked out revision 2810.


In [3]:
# Location of the OBO file inside the SVN checkout directory.
path = os.path.join('download', 'HumanDO.obo')
# Load the ontology, then derive `dox` from it. `dox` is later passed to networkx
# functions and iterated for objects exposing .id/.name/.xrefs, so it is presumably
# a networkx graph whose nodes are term objects -- confirm against do_tools.
do = do_tools.load_do(path)
dox = do_tools.do_to_networkx(do)

In [4]:
# Quick sanity check of the loaded ontology. Only the last expression of a cell is
# displayed, so the result of get_term_ids() is evaluated and then discarded.
do.get_term_ids()
do.get_term('DOID:0050234')

<IGS_scripts.term.Term at 0x7f3196f29eb8>

In [5]:
# Show every attribute stored on one example term (DOID:2377, multiple sclerosis).
term = do.get_term('DOID:2377')
vars(term)

{'alternateIds': [],
 'definition': '"A demyelinating disease that involves damage to the fatty myelin sheaths around the axons of the brain and spinal cord resulting in demyelination and scarring." [url:http\\://en.wikipedia.org/wiki/Multiple_sclerosis]',
 'id': 'DOID:2377',
 'name': 'multiple sclerosis',
 'obsolete': False,
 'relationships': [('is_a', 'DOID:3213', 'demyelinating disease')],
 'subsets': [],
 'synonyms': [('Generalized multiple sclerosis (disorder)',
   'EXACT',
   'SNOMEDCT_2005_07_31:192928003'),
  ('insular sclerosis', 'EXACT', 'CSP2005:2042-2324')],
 'xrefs': ['EFO:0003885',
  'ICD9CM:340',
  'MSH:D009103',
  'NCI:C3243',
  'OMIM:126200',
  'OMIM:612594',
  'OMIM:612595',
  'OMIM:612596',
  'OMIM:614810',
  'SNOMEDCT_2010_1_31:155023009',
  'SNOMEDCT_2010_1_31:192928003',
  'SNOMEDCT_2010_1_31:192930001',
  'SNOMEDCT_2010_1_31:24700007',
  'UMLS_CUI:C0026769']}

In [6]:
# Canonical resource name -> every xref prefix that should be collapsed into it.
# Some prefixes (e.g. 'IDC10CM', 'UML_CUI') look like misspellings occurring in the
# ontology's xrefs -- presumably present in the source data; verify before pruning.
_aliases_by_resource = {
    'ICD10': ('ICD10CM', 'IDC10CM'),
    'ICD9': ('ICD9CM', 'IDC9CM'),
    'MEDDRA': ('MedDRA',),
    'NCI': ('NCI2009_04D',),
    'SNOMEDCT': ('SNOMEDCT_2010_1_31', 'SNOMEDCT_2013_01_31'),
    'UMLS': ('UMLS_CUI', 'UML_CUI'),
}

# Flat lookup used when writing xrefs: raw prefix -> canonical resource name.
# Sorting by prefix keeps the keys in alphabetical insertion order.
xref_rename = dict(sorted(
    (alias, resource)
    for resource, aliases in _aliases_by_resource.items()
    for alias in aliases
))

In [7]:
def write_xref_row(writer, doid_code, doid_name, xrefs, rename_dict):
    """Write one row per cross-reference of a single term, in sorted order.

    Parameters
    ----------
    writer : object with a ``writerows`` method (e.g. ``csv.writer``)
    doid_code, doid_name : values repeated in the first two columns of every row
    xrefs : iterable of ``'RESOURCE:ID'`` strings
    rename_dict : mapping of raw resource prefix -> replacement resource name;
        prefixes not present in the mapping are written unchanged

    Raises
    ------
    ValueError
        If an xref contains no ``':'`` separator. Nothing is written in that case.
    """
    def to_record(xref):
        # Split on the first colon only, so identifiers may themselves contain ':'.
        source, identifier = xref.split(':', 1)
        return [doid_code, doid_name, rename_dict.get(source, source), identifier]

    writer.writerows(sorted(map(to_record, xrefs)))

# Write two tab-separated cross-reference tables:
#   data/xrefs.tsv       -- each term with only its own xrefs
#   data/xrefs-prop.tsv  -- each term with its own xrefs plus the xrefs of every node
#                           that networkx.ancestors() returns for it in `dox`
# Which ontology direction "ancestors" corresponds to depends on the edge direction
# chosen by do_tools.do_to_networkx -- verify there.
#
# The files are opened in a `with` block so both handles are closed even if the loop
# raises, and with newline='' as the csv module requires for writer targets
# (otherwise rows end in '\r\r\n' on Windows).
with open(os.path.join('data', 'xrefs.tsv'), 'w', newline='') as file_unprop, \
        open(os.path.join('data', 'xrefs-prop.tsv'), 'w', newline='') as file_prop:

    writer_unprop = csv.writer(file_unprop, delimiter='\t')
    writer_prop = csv.writer(file_prop, delimiter='\t')

    # Both tables share the same header.
    for writer in writer_unprop, writer_prop:
        writer.writerow(['doid_code', 'doid_name', 'resource', 'resource_id'])

    # NOTE(review): topological_sort_recursive and its reverse= argument appear to be
    # networkx 1.x API -- confirm the pinned networkx version before upgrading.
    for term in networkx.topological_sort_recursive(dox, reverse=True):
        xrefs = set(term.xrefs)
        # Start from a copy so the unpropagated set is not modified.
        xrefs_prop = set(xrefs)
        for ancestor in networkx.ancestors(dox, term):
            xrefs_prop |= set(ancestor.xrefs)

        write_xref_row(writer_unprop, term.id, term.name, xrefs, xref_rename)
        write_xref_row(writer_prop, term.id, term.name, xrefs_prop, xref_rename)


In [8]:
# List the distinct resource names present in the unpropagated xref table
# (i.e. after the renaming applied when the table was written).
# pandas is imported with the other dependencies at the top of the notebook.
path = os.path.join('data', 'xrefs.tsv')
# read_csv with an explicit tab separator is equivalent to read_table.
xref_df = pandas.read_csv(path, sep='\t')
set(xref_df.resource)

{'CSP',
 'CTV3',
 'EFO',
 'EFOpat_id',
 'HP',
 'ICD10',
 'ICD9',
 'KEGG',
 'MEDDRA',
 'MSH',
 'MTH',
 'NCI',
 'NDFRT',
 'OMIM',
 'ORDO',
 'Orphanet',
 'SNOMEDCT',
 'UMLS',
 'WHO'}

In [9]:
# create a name to term mapping
rows = list()
for term in dox:
    rows.append({'doid': term.id, 'name': term.name, 'type': 'name'})
    for synonym in term.synonyms:
        rows.append({'doid': term.id, 'name': synonym[0], 'type': '{}-synonym'.format(synonym[1].lower())})
path = os.path.join('data', 'term-names.tsv')
with open(path, 'w') as write_file:
    writer = csv.DictWriter(write_file, delimiter='\t', fieldnames=['doid', 'name', 'type'])
    writer.writeheader()
    writer.writerows(rows)