# Map MeSH conditions to the Disease Ontology and MeSH interventions to DrugBank

In [1]:
import pandas

## Map MeSH to the Disease Ontology

In [2]:
# Load the ClinicalTrials.gov intervention-condition pairs; the preview below
# shows one row per (nct_id, intervention, condition) with MeSH identifiers.
mesh_df = pandas.read_csv('data/mesh-intervention-to-condition.tsv', sep='\t')
mesh_df.iloc[:2]

Unnamed: 0,nct_id,intervention,condition
0,NCT00000114,D014801,D012173
1,NCT00000114,D014801,D012174


In [3]:
# Disease Ontology cross-references, pinned to a specific commit so the
# download is reproducible.
url = 'https://github.com/dhimmel/disease-ontology/blob/75050ea2d4f60e745d3f3578ae03560a2cc0e444/data/xrefs.tsv?raw=true'
disease_map_df = pandas.read_csv(url, sep='\t')

In [4]:
# Keep only the cross-references whose resource is MeSH ('MSH'), then expose
# the MeSH identifier under the name `condition` so it lines up with the
# `condition` column of the trial table.
is_mesh_xref = disease_map_df['resource'] == 'MSH'
disease_map_df = (
    disease_map_df.loc[is_mesh_xref]
    .drop(columns='resource')
    .rename(columns={'resource_id': 'condition'})
)
disease_map_df.iloc[:2]

Unnamed: 0,doid_code,doid_name,condition
0,DOID:4,disease,D004194
5,DOID:863,nervous system disease,D009422


## Map MeSH to DrugBank

In [5]:
# Map from DrugBank to MeSH using DrugCentral
url = 'https://github.com/olegursu/drugtarget/blob/9a6d84bed8650c6c507a2d3d786814c774568610/identifiers.tsv?raw=true'
drug_map_df = pandas.read_table(url)

# MeSH identifiers: the substring match keeps every ID_TYPE that mentions MESH.
# na=False treats a missing ID_TYPE as "not MeSH" instead of putting NaN in the
# mask, which boolean indexing would reject.
mesh_id_df = (
    drug_map_df.loc[
        drug_map_df['ID_TYPE'].str.contains('MESH', na=False),
        ['DRUG_ID', 'IDENTIFIER'],
    ]
    .rename(columns={'IDENTIFIER': 'intervention'})
)

# DrugBank identifiers taken from the same identifier table.
drugbank_id_df = (
    drug_map_df.loc[
        drug_map_df['ID_TYPE'] == 'DRUGBANK_ID',
        ['DRUG_ID', 'IDENTIFIER'],
    ]
    .rename(columns={'IDENTIFIER': 'drugbank_id'})
)

# Pair the two identifier sets through the shared DRUG_ID (named explicitly
# rather than inferred from common columns), then drop the join key, leaving
# one row per (intervention, drugbank_id) pairing.
drug_map_df = mesh_id_df.merge(drugbank_id_df, on='DRUG_ID').drop(columns='DRUG_ID')
drug_map_df.head(2)

Unnamed: 0,intervention,drugbank_id
0,C016671,DB00067
1,C031183,DB03193


## Read DrugBank

In [6]:
# DrugBank table, pinned to a specific commit; only used here to look up a
# drug's name from its DrugBank ID.
url = 'https://github.com/dhimmel/drugbank/blob/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank.tsv?raw=true'
drugbank_df = pandas.read_csv(url, sep='\t')
# Bracket access avoids any ambiguity between the `name` column and attributes.
drugbank_id_to_name = drugbank_df.set_index('drugbank_id')['name'].to_dict()

## Map ClinicalTrials.gov intervention-condition pairs

In [7]:
# Translate each trial's MeSH intervention to DrugBank and its MeSH condition
# to the Disease Ontology. Both joins are inner joins, so pairs lacking either
# mapping are dropped. Join keys are spelled out so that an unexpected shared
# column in an upstream table cannot silently change what is joined on.
mapped_df = (
    mesh_df
    .merge(drug_map_df, on='intervention')
    .merge(disease_map_df, on='condition')
    # The MeSH identifiers have served their purpose; several MeSH terms can
    # map to the same DrugBank/DO pair, hence the de-duplication.
    .drop(columns=['condition', 'intervention'])
    .drop_duplicates()
)
# Place the human-readable drug name right after its identifier.
mapped_df.insert(2, 'drugbank_name', mapped_df['drugbank_id'].map(drugbank_id_to_name))
mapped_df = mapped_df.sort_values(['doid_code', 'drugbank_id', 'nct_id'])

In [8]:
# Preview the first two mapped rows
mapped_df.iloc[:2]

Unnamed: 0,nct_id,drugbank_id,drugbank_name,doid_code,doid_name
160443,NCT02584309,DB00380,Dexrazoxane,DOID:0001816,angiosarcoma
160445,NCT00245102,DB00398,Sorafenib,DOID:0001816,angiosarcoma


In [9]:
# Number of distinct trial / drug / disease rows
mapped_df.shape[0]

158767

In [10]:
# Distinct trials, DrugBank drugs and Disease Ontology terms, in that order
tuple(mapped_df[column].nunique() for column in ('nct_id', 'drugbank_id', 'doid_code'))

(42826, 1181, 1617)

In [11]:
#mapped_df.query("doid_name == 'multiple sclerosis'").drugbank_name.value_counts()

In [12]:
# Save the mapped table as tab-separated text, without the row index
mapped_df.to_csv(
    'data/DrugBank-DO.tsv',
    index=False,
    sep='\t',
)