In [1]:
import os

import pandas
import networkx

import do_tools

In [2]:
# Read the Disease Ontology (DO) from the OBO file under download/ and build a
# graph view of it. `do` is the ontology object returned by do_tools.load_do;
# `dox` is what do_tools.do_to_networkx returns for it and is the graph later
# handed to networkx.descendants / networkx.ancestors.
path = os.path.join('download', 'HumanDO.obo')
do = do_tools.load_do(path)
dox = do_tools.do_to_networkx(do)

In [3]:
# Load the tab-separated table of slim disease ontology terms and preview
# its first three rows.
path = os.path.join('data', 'slim-terms.tsv')
slim_df = pandas.read_csv(path, sep='\t')
slim_df.head(3)

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,Hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,Brain cancer,DOcancerslim,neoplastic
2,DOID:1324,Lung cancer,DOcancerslim,neoplastic


In [4]:
# Identify slim terms that are absent from the DO, then keep only the slim
# DOIDs that the DO actually contains.
all_doids = set(do.get_term_ids())
slim_doids = set(slim_df.doid)
unmatched = slim_doids.difference(all_doids)
slim_doids = slim_doids.intersection(all_doids)
# display the slim rows whose DOID could not be matched in the DO
slim_df[slim_df.doid.isin(unmatched)]

Unnamed: 0,doid,name,source,pathophysiology
55,DOID:9917,Pleural cancer,DOcancerslim,neoplastic


In [5]:
# Redundancy check: for every slim term, print it together with any other slim
# terms found among the nodes networkx.descendants reaches from it in the DO
# graph. Nothing is printed when the slim terms do not overlap.
slim_terms = {do.get_term(doid) for doid in slim_doids}
for term in slim_terms:
    nodes_to_root = networkx.descendants(dox, term)
    conflicts = {node.name for node in slim_terms.intersection(nodes_to_root)}
    if not conflicts:
        continue
    print(term.name, conflicts)

In [6]:
# Load the two tab-separated cross-reference tables: xrefs.tsv and its
# '-prop' counterpart (presumably the xrefs propagated through the ontology;
# confirm against whatever produces these files).
path = os.path.join('data', 'xrefs.tsv')
map_unprop_df = pandas.read_csv(path, sep='\t')

path = os.path.join('data', 'xrefs-prop.tsv')
map_prop_df = pandas.read_csv(path, sep='\t')

In [7]:
# Restrict both cross-reference tables to the slim terms with an inner join on
# the DOID. The slim key column is renamed to 'doid_code' so that it matches
# the key column of the xref tables.
slim_df = slim_df.rename(columns={'doid': 'doid_code'})
slim_keys = slim_df[['doid_code']]
# Name the join key explicitly rather than relying on pandas inferring it from
# whichever column names the two frames happen to share.
slim_map_unprop_df = slim_keys.merge(map_unprop_df, on='doid_code')
slim_map_prop_df = slim_keys.merge(map_prop_df, on='doid_code')
slim_map_prop_df.head(3)

Unnamed: 0,doid_code,doid_name,resource,resource_id
0,DOID:2531,hematologic cancer,CSP,2004-1600
1,DOID:2531,hematologic cancer,CSP,2004-1803
2,DOID:2531,hematologic cancer,CSP,2004-2820


In [8]:
# Write the slim-restricted cross-reference tables to data/ as tab-separated
# files without the index column.
for filename, frame in [
    ('xrefs-slim.tsv', slim_map_unprop_df),
    ('xrefs-prop-slim.tsv', slim_map_prop_df),
]:
    path = os.path.join('data', filename)
    frame.to_csv(path, sep='\t', index=False)

In [17]:
# Build a table that pairs every slim term with itself and with each term that
# networkx.ancestors returns for it in the DO graph (the terms it subsumes).
rows = []
for term in slim_terms:
    subsumed = networkx.ancestors(dox, term)
    row_part = [term.id, term.name]
    # a slim term always subsumes itself
    rows.append(row_part + row_part)
    rows.extend(row_part + [subterm.id, subterm.name] for subterm in subsumed)
# sort for a deterministic row order (set iteration order is arbitrary)
rows.sort()

slim_prop_df = pandas.DataFrame(rows, columns=['slim_id', 'slim_name', 'subsumed_id', 'subsumed_name'])
# Write to a dedicated file. 'xrefs-prop-slim.tsv' already holds the
# slim-restricted cross-references (slim_map_prop_df) written earlier in this
# notebook; reusing that path here would overwrite them with this table, which
# has a different schema.
path = os.path.join('data', 'slim-terms-prop.tsv')
slim_prop_df.to_csv(path, sep='\t', index=False)

Unnamed: 0,slim_id,slim_name,subsumed_id,subsumed_name
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156,idiopathic pulmonary fibrosis
1,DOID:0050425,restless legs syndrome,DOID:0050425,restless legs syndrome
2,DOID:0050741,alcohol dependence,DOID:0050741,alcohol dependence
3,DOID:0050742,nicotine dependence,DOID:0050742,nicotine dependence
4,DOID:0060073,lymphatic system cancer,DOID:0060073,lymphatic system cancer
5,DOID:0060073,lymphatic system cancer,DOID:0060219,lymph node adenoid cystic carcinoma
6,DOID:0060073,lymphatic system cancer,DOID:10619,lymph node cancer
7,DOID:0060073,lymphatic system cancer,DOID:12972,intrapelvic lymph node leukemic reticuloendoth...
8,DOID:0060073,lymphatic system cancer,DOID:13005,intra-abdominal lymph node mast cell malignancy
9,DOID:0060073,lymphatic system cancer,DOID:265,spleen angiosarcoma


[['DOID:0050156',
  'idiopathic pulmonary fibrosis',
  'DOID:0050156',
  'idiopathic pulmonary fibrosis'],
 ['DOID:0050425',
  'restless legs syndrome',
  'DOID:0050425',
  'restless legs syndrome'],
 ['DOID:0050741', 'alcohol dependence', 'DOID:0050741', 'alcohol dependence'],
 ['DOID:0050742',
  'nicotine dependence',
  'DOID:0050742',
  'nicotine dependence'],
 ['DOID:0060073',
  'lymphatic system cancer',
  'DOID:0060073',
  'lymphatic system cancer'],
 ['DOID:0060073',
  'lymphatic system cancer',
  'DOID:0060219',
  'lymph node adenoid cystic carcinoma'],
 ['DOID:0060073',
  'lymphatic system cancer',
  'DOID:10619',
  'lymph node cancer'],
 ['DOID:0060073',
  'lymphatic system cancer',
  'DOID:12972',
  'intrapelvic lymph node leukemic reticuloendotheliosis'],
 ['DOID:0060073',
  'lymphatic system cancer',
  'DOID:13005',
  'intra-abdominal lymph node mast cell malignancy'],
 ['DOID:0060073',
  'lymphatic system cancer',
  'DOID:265',
  'spleen angiosarcoma'],
 ['DOID:0060073', 