## Mapping ehrlink problems to the Disease Ontology using case-insensitive exact string matching

In [2]:
import os
import pandas

In [3]:
# Load the ehrlink problem table (tab-separated) from the local data directory.
path = os.path.join('data', 'problems.tsv')
problem_df = pandas.read_table(path)
# Lowercased problem name, used as the key for case-insensitive exact matching.
# The vectorized .str accessor propagates a missing problem name as NaN,
# whereas mapping str.lower over the column raises TypeError on the first NaN.
problem_df['name_lower'] = problem_df['problem'].str.lower()
# Preview the first three rows.
problem_df.head(3)

Unnamed: 0,problem_definition_id,problem,indications,name_lower
0,63467,Hypertension,242,hypertension
1,63468,Essential Hypertension,204,essential hypertension
2,63470,Benign Essential Hypertension,190,benign essential hypertension


In [4]:
# Total number of problem rows loaded
problem_df.shape[0]

1596

In [5]:
# Disease Ontology term names, read from a URL pinned to a specific commit.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/term-names.tsv'
doterm_df = pandas.read_csv(url, sep='\t')
# Lowercase every term name so it can be compared case-insensitively.
doterm_df['name_lower'] = doterm_df['name'].map(str.lower)
# Show the distinct term-name types present in the table.
set(doterm_df['type'])

{'exact-synonym', 'name', 'narrow-synonym', 'related-synonym'}

In [6]:
# Keep only primary names and exact synonyms; narrow and related synonyms
# are excluded from matching.
doterm_df = doterm_df.query("type in ['exact-synonym', 'name']")
# Preview the first three remaining rows.
doterm_df.head(3)

Unnamed: 0,doid,name,type,name_lower
0,DOID:9888,alternating esotropia,name,alternating esotropia
1,DOID:3389,Papillon-Lefevre disease,name,papillon-lefevre disease
2,DOID:3389,Papillon Lefevre syndrome,exact-synonym,papillon lefevre syndrome


In [7]:
# Left join on the lowercased name: every problem row is kept, and problems
# without a matching DO term name get missing doid/name/type values.
domap_df = pandas.merge(problem_df, doterm_df, on='name_lower', how='left')
# Save the full mapping table (unmatched rows included) as a tab-separated file.
path = os.path.join('data', 'problem-to-doid.tsv')
domap_df.to_csv(path, sep='\t', index=False)

In [8]:
# Distinct (problem, DO term) pairs: rows without a match are dropped, and a
# pair reached through several term names is counted once.
pair_df = (
    domap_df
    .loc[:, ['problem_definition_id', 'doid']]
    .dropna()
    .drop_duplicates()
)
pair_df.shape[0]

368

In [9]:
# Preview the first 50 rows of the mapping table
domap_df.head(50)

Unnamed: 0,problem_definition_id,problem,indications,name_lower,doid,name,type
0,63467,Hypertension,242,hypertension,DOID:10763,hypertension,name
1,63468,Essential Hypertension,204,essential hypertension,DOID:10825,essential hypertension,name
2,63470,Benign Essential Hypertension,190,benign essential hypertension,DOID:10913,benign essential hypertension,name
3,63470,Benign Essential Hypertension,190,benign essential hypertension,DOID:10913,benign Essential hypertension,exact-synonym
4,69380,Diabetes Mellitus,151,diabetes mellitus,DOID:9351,diabetes mellitus,name
5,69402,Diabetes Mellitus Poorly Controlled,146,diabetes mellitus poorly controlled,,,
6,69381,Type II Diabetes Mellitus,112,type ii diabetes mellitus,DOID:9352,type II diabetes mellitus,exact-synonym
7,76351,Depression,105,depression,,,
8,62598,Allergic Rhinitis,103,allergic rhinitis,DOID:4481,allergic rhinitis,name
9,74988,Seizure Disorder,98,seizure disorder,,,


In [10]:
# Identifiers of the problems that matched at least one DO term
mapped_problems = {problem_id for problem_id in pair_df['problem_definition_id']}

In [11]:
# Mapped problem identifiers as a fraction of all problem rows
len(mapped_problems) / problem_df.shape[0]

0.22932330827067668

In [12]:
# Number of distinct problem identifiers that matched at least one DO term
len(mapped_problems)

366

In [13]:
# Problems that matched more than one distinct DO term: collect the ids that
# occur again after their first appearance in the unique pair table, then show
# every mapping row for those problems.
duplicates = set(
    pair_df.loc[pair_df.duplicated('problem_definition_id'), 'problem_definition_id']
)
domap_df.loc[domap_df['problem_definition_id'].isin(duplicates)]

Unnamed: 0,problem_definition_id,problem,indications,name_lower,doid,name,type
21,70475,Scleroderma,63,scleroderma,DOID:418,Scleroderma,exact-synonym
22,70475,Scleroderma,63,scleroderma,DOID:419,scleroderma,name
932,167200,Premature Menopause,2,premature menopause,DOID:5426,premature menopause,exact-synonym
933,167200,Premature Menopause,2,premature menopause,DOID:10787,premature menopause,name
934,167200,Premature Menopause,2,premature menopause,DOID:10787,Premature menopause,exact-synonym


In [14]:
# Slim-term table, read from a URL pinned to a specific commit.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/slim-terms.tsv'
doslim_df = pandas.read_csv(url, sep='\t')
# Number of rows in the slim-term table
doslim_df.shape[0]

137

In [15]:
# Count the slim-term rows whose doid was matched directly by some problem.
int(doslim_df['doid'].isin(set(pair_df['doid'])).sum())

51

In [16]:
# With propagation: count the distinct slim_id values that have at least one
# subsumed_id among the matched DO terms.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/slim-terms-prop.tsv'
# NOTE: this rebinds doslim_df, replacing the slim-terms.tsv table loaded earlier.
doslim_df = pandas.read_csv(url, sep='\t')
len(set(
    doslim_df.loc[doslim_df['subsumed_id'].isin(set(pair_df['doid'])), 'slim_id']
))

56