In [1]:
import pandas

## Read DO Slim - the disease subset used for rephetio

In [2]:
# DO Slim term table, fetched from a commit-pinned URL so the input does not drift.
url = 'https://github.com/dhimmel/disease-ontology/raw/5cb93c38568536222b0a14fbcb7fb644a348931d/data/slim-terms-prop.tsv'
# Keep only the three columns the later indication mapping works with.
do_slim = pandas.read_table(url).loc[:, ['slim_id', 'slim_name', 'subsumed_id']]
do_slim.head(2)

Unnamed: 0,slim_id,slim_name,subsumed_id
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156
1,DOID:0050425,restless legs syndrome,DOID:0050425


## Read drug targets and use ChEMBL IDs to map to DrugBank

In [3]:
# Drug-target table read from the local `drugtarget` directory.
path = 'drugtarget/drug_target.tsv'
# CHEMBL_ID is lower-cased to `chembl_id`
# (presumably to match the DrugBank mapping file's column name; verify).
target_df = pandas.read_table(path).rename(columns={'CHEMBL_ID': 'chembl_id'})
target_df.head(2)

Unnamed: 0,DRUG_ID,chembl_id,DRUG_NAME,TARGET_NAME,TARGET_FAMILY,UNIPROT,GENE,SWISSPROT,ACTION_TYPE,SOURCE,REFERENCE
0,5080,CHEMBL3039514,elbasvir,Genome polyprotein,Polyprotein,P27958,,POLG_HCVH,INHIBITOR,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...
1,5080,CHEMBL3039514,elbasvir,Genome polyprotein,Polyprotein,P26663,,POLG_HCVBK,INHIBITOR,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...


In [4]:
# Unique (DRUG_ID, chembl_id) pairs; rows missing either value are dropped first.
# Rows keep their index labels from target_df: a bare `.reindex()` with no
# labels returns the frame unchanged, so it is not called here. Use
# `.reset_index(drop=True)` if a fresh 0..n-1 index is ever required.
chembl_map_df = target_df[['DRUG_ID', 'chembl_id']].dropna().drop_duplicates()
chembl_map_df.tail(2)

Unnamed: 0,DRUG_ID,chembl_id
2354,4982,CHEMBL1743087
2355,736,CHEMBL2105842


In [5]:
# DrugBank <-> ChEMBL mapping, fetched from a commit-pinned URL.
path = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/mapping/chembl.tsv'
# merge() without `on` joins on the columns the two frames share
# (presumably just `chembl_id`; verify against the remote file's header).
# The result is reduced to unique (drugbank_id, DRUG_ID) pairs.
drugbank_map_df = (
    pandas.read_table(path)
    .merge(chembl_map_df)
    .loc[:, ['drugbank_id', 'DRUG_ID']]
    .drop_duplicates()
)
drugbank_map_df.head(2)

Unnamed: 0,drugbank_id,DRUG_ID
0,DB00014,1327
1,DB00035,817


In [6]:
# Attach drugbank_id to the target rows. The merge uses pandas' defaults
# (inner join on the shared columns), so target rows without a DrugBank
# mapping are dropped. Note that this rebinds `target_df`.
target_df = pandas.merge(drugbank_map_df, target_df)
# Persist the mapped targets as a tab-separated file without the index column.
target_df.to_csv(path_or_buf='rephetio/targets.tsv', index=False, sep='\t')
target_df.head(2)

Unnamed: 0,drugbank_id,DRUG_ID,chembl_id,DRUG_NAME,TARGET_NAME,TARGET_FAMILY,UNIPROT,GENE,SWISSPROT,ACTION_TYPE,SOURCE,REFERENCE
0,DB00014,1327,CHEMBL1200501,goserelin,Gonadotropin-releasing hormone receptor,GPCR,P30968,GNRHR,GNRHR_HUMAN,AGONIST,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...
1,DB00035,817,CHEMBL376685,desmopressin,Vasopressin V2 receptor,GPCR,P30518,AVPR2,V2R_HUMAN,AGONIST,SCIENTIFIC LITERATURE,http://www.ncbi.nlm.nih.gov/pubmed/9792651


## Read and process DrugCentral Indications

In [7]:
# Indication table read from the local `drugtarget` directory.
path = 'drugtarget/drug_indication.tsv'
# SNOMEDCT_CUI is forced to str so the codes are not parsed as numbers.
# DOID is renamed to `subsumed_id`, the column name used in do_slim.
indication_df = (
    pandas.read_table(path, dtype={'SNOMEDCT_CUI': str})
    .rename(columns={'DOID': 'subsumed_id'})
)
indication_df.head(2)

Unnamed: 0,DRUG_ID,DRUG_NAME,INDICATION_FDB,UMLS_CUI,SNOMEDCT_CUI,subsumed_id
0,1593,lofexidine,Opioid Withdrawal Symptoms,C0029104,87132004,
1,4171,eribulin,Liposarcoma,C0023827,254829001,DOID:3382


In [8]:
# Map indications onto DO Slim terms, then onto DrugBank IDs.
# Both merges use pandas' defaults (inner join on the shared columns), so
# indications without a slim term or without a DrugBank mapping are dropped.
mapped_indication_df = (
    do_slim
    .merge(indication_df)
    .loc[:, ['slim_id', 'slim_name', 'INDICATION_FDB', 'DRUG_ID', 'DRUG_NAME']]
    .merge(drugbank_map_df)
    .sort_values(['slim_name', 'INDICATION_FDB', 'DRUG_NAME'])
)
mapped_indication_df.to_csv(path_or_buf='rephetio/indications.tsv', index=False, sep='\t')
# Count rows that repeat an earlier (slim_id, drugbank_id) pair. The
# duplicates are only counted, not removed, so the file written above
# still contains them.
mapped_indication_df.duplicated(['slim_id', 'drugbank_id']).value_counts()

False    597
True     152
dtype: int64

## Pharmacologic class

In [9]:
# Pharmacologic class table read from the local `drugtarget` directory.
path = 'drugtarget/pharm_class.tsv'
# Attach drugbank_id (default inner join on the shared columns), then keep
# the first row for each (drugbank_id, TYPE, CLASS_SOURCE_ID) combination.
class_df = (
    drugbank_map_df
    .merge(pandas.read_table(path))
    .drop_duplicates(subset=['drugbank_id', 'TYPE', 'CLASS_SOURCE_ID'])
)
class_df.head(2)

Unnamed: 0,drugbank_id,DRUG_ID,DRUG_NAME,TYPE,CLASS_SOURCE_ID,CLASS,SOURCE
0,DB00014,1327,goserelin,EPC,N0000175655,Gonadotropin Releasing Hormone Receptor Agonist,FDA
1,DB00014,1327,goserelin,MoA,N0000175654,Gonadotropin Releasing Hormone Receptor Agonists,FDA


In [10]:
# Number of class rows per TYPE value.
class_df['TYPE'].value_counts()

PA                     5768
has role               2066
EPC                     813
MoA                     483
Chemical/Ingredient     269
PE                      158
Name: TYPE, dtype: int64

In [11]:
# Number of class rows per SOURCE value.
class_df['SOURCE'].value_counts()

MeSH     5768
CHEBI    2066
FDA      1723
Name: SOURCE, dtype: int64

In [12]:
# Persist the de-duplicated class table as a tab-separated file without the index column.
class_df.to_csv(path_or_buf='rephetio/classes.tsv', index=False, sep='\t')