# Map MeSH conditions to the Disease Ontology and MeSH interventions to DrugBank

In [1]:
import pandas

## Map MeSH to the Disease Ontology

In [2]:
# Disease Ontology cross-reference table, pinned to a specific commit.
url = 'https://github.com/dhimmel/disease-ontology/blob/75050ea2d4f60e745d3f3578ae03560a2cc0e444/data/xrefs.tsv?raw=true'
# Keep only the rows whose resource is 'MSH' and expose the resource
# identifier as 'condition'; the now-constant 'resource' column is dropped.
disease_map_df = (
    pandas.read_table(url)
    .loc[lambda xref_df: xref_df['resource'] == 'MSH']
    .drop(columns='resource')
    .rename(columns={'resource_id': 'condition'})
)
disease_map_df.head(n=2)

Unnamed: 0,doid_code,doid_name,condition
0,DOID:4,disease,D004194
5,DOID:863,nervous system disease,D009422


## Map MeSH to DrugBank

In [3]:
# Map from DrugBank to MeSH using DrugCentral.
# The identifiers table holds one row per (DRUG_ID, ID_TYPE, IDENTIFIER);
# rows whose ID_TYPE contains 'MESH' are paired with the DRUGBANK_ID rows
# that share the same DRUG_ID.
url = 'https://github.com/olegursu/drugtarget/blob/9a6d84bed8650c6c507a2d3d786814c774568610/identifiers.tsv?raw=true'
drug_map_df = pandas.read_table(url)
drug_map_df = (
    drug_map_df
    .loc[drug_map_df.ID_TYPE.str.contains('MESH'), ['DRUG_ID', 'IDENTIFIER']]
    .rename(columns={'IDENTIFIER': 'intervention'})
    .merge(
        # The only shared column is DRUG_ID, so that is the join key.
        drug_map_df
        .loc[drug_map_df.ID_TYPE == 'DRUGBANK_ID', ['DRUG_ID', 'IDENTIFIER']]
        .rename(columns={'IDENTIFIER': 'drugbank_id'})
    )
    .drop(columns='DRUG_ID')
)
drug_map_df.head(n=2)

Unnamed: 0,intervention,drugbank_id
0,C016671,DB00067
1,C031183,DB03193


## Read DrugBank

In [4]:
# Full DrugBank table (pinned commit): lookup from DrugBank ID to drug name.
url = 'https://github.com/dhimmel/drugbank/blob/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank.tsv?raw=true'
drugbank_df = pandas.read_table(url)
drugbank_id_to_name = drugbank_df.set_index('drugbank_id')['name'].to_dict()

# Slim DrugBank table (same commit): the set of DrugBank IDs it contains.
url = 'https://github.com/dhimmel/drugbank/blob/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv?raw=true'
drugbank_slim_ids = set(pandas.read_table(url)['drugbank_id'])

## Map ClinicalTrials.gov intervention-condition pairs

In [5]:
# Load the tab-separated intervention-to-condition table from the local data directory.
mesh_df = pandas.read_csv('data/mesh-intervention-to-condition.tsv', sep='\t')
mesh_df.head(n=2)

Unnamed: 0,nct_id,intervention,condition
0,NCT00000114,D014801,D012173
1,NCT00000114,D014801,D012174


In [6]:
# Join the trial table to both mappings (each merge uses the columns the
# two frames have in common), then discard the MeSH columns and collapse
# rows that became identical once those columns were removed.
mapped_df = (
    mesh_df
    .merge(drug_map_df)
    .merge(disease_map_df)
    .drop(columns=['condition', 'intervention'])
    .drop_duplicates()
)
# Place the drug name in the third column position (index 2).
mapped_df.insert(2, 'drugbank_name', mapped_df['drugbank_id'].map(drugbank_id_to_name))
mapped_df = mapped_df.sort_values(by=['doid_code', 'drugbank_id', 'nct_id'])

In [7]:
# Preview the first two mapped rows.
mapped_df.head(n=2)

Unnamed: 0,nct_id,drugbank_id,drugbank_name,doid_code,doid_name
160443,NCT02584309,DB00380,Dexrazoxane,DOID:0001816,angiosarcoma
160445,NCT00245102,DB00398,Sorafenib,DOID:0001816,angiosarcoma


In [8]:
# Row count, then the number of distinct trials, drugs and diseases.
(
    len(mapped_df),
    mapped_df['nct_id'].nunique(),
    mapped_df['drugbank_id'].nunique(),
    mapped_df['doid_code'].nunique(),
)

(158767, 42826, 1181, 1617)

In [9]:
# Number of distinct (drug, disease) pairs.
mapped_df.drop_duplicates(subset=['drugbank_id', 'doid_code']).shape[0]

33095

In [10]:
#mapped_df.query("doid_name == 'multiple sclerosis'").drugbank_name.value_counts()

In [11]:
# Write the full mapping as a tab-separated file, without the index column.
mapped_df.to_csv(path_or_buf='data/DrugBank-DO.tsv', sep='\t', index=False)

## Create a slim subset

In [12]:
# Read Disease Ontology transitive closures for slim terms.
# Each remaining row pairs a slim term (disease_id, disease_name) with one
# subsumed term, whose identifier is exposed as 'doid_code'.
url = 'https://github.com/dhimmel/disease-ontology/blob/75050ea2d4f60e745d3f3578ae03560a2cc0e444/data/slim-terms-prop.tsv?raw=true'
do_slim_map_df = (
    pandas.read_table(url)
    .drop(columns=['subsumed_name', 'min_distance'])
    .rename(columns={
        'slim_id': 'disease_id',
        'slim_name': 'disease_name',
        'subsumed_id': 'doid_code',
    })
)
do_slim_map_df.head(n=2)

Unnamed: 0,disease_id,disease_name,doid_code
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156
1,DOID:0050425,restless legs syndrome,DOID:0050425


In [13]:
# Restrict to slim DrugBank compounds, roll each disease term up to its slim
# term via the shared 'doid_code' column, then drop the fine-grained disease
# columns and de-duplicate the rows that collapse together as a result.
slim_df = (
    mapped_df[mapped_df['drugbank_id'].isin(drugbank_slim_ids)]
    .merge(do_slim_map_df)
    .drop(columns=['doid_code', 'doid_name'])
    .rename(columns={
        'drugbank_id': 'compound_id',
        'drugbank_name': 'compound_name',
    })
    .drop_duplicates()
    .sort_values(by=['disease_name', 'compound_name', 'nct_id'])
)
slim_df.head(n=3)

Unnamed: 0,nct_id,compound_id,compound_name,disease_id,disease_name
11168,NCT00012857,DB00316,Acetaminophen,DOID:10652,Alzheimer's disease
11169,NCT00385684,DB00316,Acetaminophen,DOID:10652,Alzheimer's disease
11170,NCT01608217,DB00316,Acetaminophen,DOID:10652,Alzheimer's disease


In [14]:
# Write the slim mapping as a tab-separated file, without the index column.
slim_df.to_csv(path_or_buf='data/DrugBank-DO-slim.tsv', sep='\t', index=False)

In [15]:
# Row count, then the number of distinct trials, compounds and diseases.
(
    len(slim_df),
    slim_df['nct_id'].nunique(),
    slim_df['compound_id'].nunique(),
    slim_df['disease_id'].nunique(),
)

(52013, 27240, 794, 130)

In [16]:
# Number of distinct (compound, disease) pairs.
slim_df.drop_duplicates(subset=['compound_id', 'disease_id']).shape[0]

6382

In [17]:
# Count number of trials per compound-disease pair.
# n_trials is the number of slim_df rows in each group. GroupBy.size()
# computes that directly, instead of building a one-element Series per group
# through apply(); the result has the same columns and row order, and the
# 'n_trials' column is still present when slim_df is empty.
# The name columns are part of the grouping key only so that they are carried
# through to the output next to their identifiers.
slim_count_df = (
    slim_df
    .groupby(['compound_id', 'disease_id', 'compound_name', 'disease_name'])
    .size()
    .reset_index(name='n_trials')
)
slim_count_df.to_csv('data/DrugBank-DO-slim-counts.tsv', sep='\t', index=False)
slim_count_df.head(2)

Unnamed: 0,compound_id,disease_id,compound_name,disease_name,n_trials
0,DB00014,DOID:10283,Goserelin,prostate cancer,75
1,DB00014,DOID:11476,Goserelin,osteoporosis,2
