# Convert DrugCentral relationships to Rephetio identifiers

In [1]:
import pandas

## Read DO Slim - the disease subset used for Rephetio

In [2]:
# DO Slim term propagation table, pinned to a specific commit of the
# disease-ontology repository so the input is reproducible.
url = 'https://github.com/dhimmel/disease-ontology/raw/5cb93c38568536222b0a14fbcb7fb644a348931d/data/slim-terms-prop.tsv'
# Keep only the slim term (id and name) and the term id it subsumes.
do_slim = pandas.read_table(url)[['slim_id', 'slim_name', 'subsumed_id']]
# Preview the first two rows.
do_slim.head(2)

Unnamed: 0,slim_id,slim_name,subsumed_id
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156
1,DOID:0050425,restless legs syndrome,DOID:0050425


## Read UniProt to Entrez Gene mapping

In [3]:
# UniProt accession -> Entrez GeneID mapping, pinned to a specific commit.
url = 'https://github.com/dhimmel/uniprot/raw/5fc60158364d2caf6d4087dad5abba0e8b2ea7db/data/map/GeneID.tsv.gz'
# The file is gzip-compressed; state that explicitly rather than relying on
# inference from the URL.
entrez_map_df = pandas.read_table(
    url,
    compression='gzip',
)
# Preview the first two rows.
entrez_map_df.head(2)

Unnamed: 0,uniprot,GeneID
0,A0A010PZJ8,19039206
1,A0A010PZK3,19039211


## Read DrugBank Slim

In [4]:
# DrugBank Slim compound table, pinned to a specific commit.
url = 'https://github.com/dhimmel/drugbank/raw/55587651ee9417e4621707dac559d84c984cf5fa/data/drugbank-slim.tsv'
# Keep the identifier and name only, and rename 'name' to 'drugbank_name'
# so the column stays unambiguous after later merges.
drugbank_df = (
    pandas.read_table(url)[['drugbank_id', 'name']]
    .rename(columns={'name': 'drugbank_name'})
)
# Preview the first two rows.
drugbank_df.head(2)

Unnamed: 0,drugbank_id,drugbank_name
0,DB00014,Goserelin
1,DB00035,Desmopressin


In [5]:
# Number of rows currently in drugbank_df.
drugbank_df.shape[0]

1552

## Read identifiers

In [6]:
# Identifier table: keep only the rows whose ID_TYPE is DRUGBANK_ID, and
# expose the identifier under the column name 'drugbank_id'.
path = 'drugtarget/identifiers.tsv'
id_df = (
    pandas.read_table(path)
    .query("ID_TYPE == 'DRUGBANK_ID'")[['DRUG_ID', 'IDENTIFIER']]
    .rename(columns={'IDENTIFIER': 'drugbank_id'})
)
# Attach DRUG_ID to the DrugBank table. The merge uses the columns the two
# frames have in common (default inner join), so only compounds present in
# both tables are kept.
drugbank_df = id_df.merge(drugbank_df)
# Preview the first two rows.
drugbank_df.head(2)

Unnamed: 0,DRUG_ID,drugbank_id,drugbank_name
0,4976,DB00620,Triamcinolone
1,2725,DB00620,Triamcinolone


In [7]:
# Number of rows currently in drugbank_df.
drugbank_df.shape[0]

1634

## Convert drug targets

In [8]:
# Load the drug-target table and attach DrugBank identifiers. The merge uses
# the columns shared by drugbank_df and the file (default inner join).
path = 'drugtarget/drug_target.tsv'
target_df = pandas.read_table(path)
target_df = drugbank_df.merge(target_df)
target_df = target_df[['drugbank_id', 'drugbank_name', 'TARGET_NAME', 'TARGET_FAMILY', 'UNIPROT', 'ACTION_TYPE', 'SOURCE', 'REFERENCE']]

# Split multi-protein targets into many rows. UNIPROT holds '|'-delimited
# accessions; expand=True produces one column per accession in a single
# vectorized call, instead of building a pandas.Series per row through
# Series.apply (which was also handed a stray positional argument).
# stack() turns the columns into one accession per entry, and dropna()
# removes the padding of rows that have fewer accessions than the widest
# row, independent of stack()'s own NA-dropping default.
s = target_df.UNIPROT.str.split('|', expand=True).stack().dropna()
# Drop the inner (split position) index level so s lines up with the row
# index of target_df.
s.index = s.index.droplevel(-1)
s.name = 'uniprot'
del target_df['UNIPROT']
# Joining on the index repeats each target row once per accession.
target_df = target_df.join(s)

# Convert UniProt accessions to Entrez GeneIDs (inner join on the shared
# columns), then drop the accession column, which is no longer needed.
target_df = entrez_map_df.merge(target_df)
del target_df['uniprot']

# Preview the first two rows.
target_df.head(2)

Unnamed: 0,GeneID,drugbank_id,drugbank_name,TARGET_NAME,TARGET_FAMILY,ACTION_TYPE,SOURCE,REFERENCE
0,8233868,DB00431,Lindane,GABA-A receptor,Ion channel,NEGATIVE ALLOSTERIC MODULATOR,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...
1,8232849,DB08823,Spinosad,Nicotinic acetylcholine receptor,Ion channel,AGONIST,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...


In [9]:
# Count rows that repeat an earlier (drugbank_id, GeneID) combination.
target_df.duplicated(subset=['drugbank_id', 'GeneID']).sum()

288

In [10]:
# Write the converted targets as a tab-separated file without the row index.
target_df.to_csv(
    'rephetio/targets.tsv',
    sep='\t',
    index=False,
)

## Read and process DrugCentral Indications

In [11]:
# Load the indication table. SNOMEDCT_CUI is read as a string, and DOID is
# renamed to 'subsumed_id' so the frame has that column in common with
# do_slim.
path = 'drugtarget/drug_indication.tsv'
indication_df = (
    pandas.read_table(path, dtype={'SNOMEDCT_CUI': str})
    .rename(columns={'DOID': 'subsumed_id'})
)
# First attach DrugBank identifiers, then map diseases onto DO Slim terms.
# Both merges are inner joins on the columns the frames share.
indication_df = do_slim.merge(drugbank_df.merge(indication_df))
# Keep only the slim disease and DrugBank compound columns (DRUG_ID and all
# other columns are discarded by this selection), rename them to the output
# schema, then sort and remove repeated disease-drug rows.
indication_df = (
    indication_df[['slim_id', 'drugbank_id', 'slim_name', 'drugbank_name']]
    .rename(columns={'slim_id': 'doid_id', 'slim_name': 'disease', 'drugbank_name': 'drug'})
    .sort_values(['disease', 'drug'])
    .drop_duplicates()
)

#### Compare to PharmacotherapyDB

In [12]:
# Indication catalog to compare against, pinned to a specific commit.
url = 'https://github.com/dhimmel/indications/raw/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv'
# Only the disease/compound key columns and the category are needed.
phcoth_df = pandas.read_table(url)[['doid_id', 'drugbank_id', 'category']]
# Left join on the shared columns: every indication row is kept, and
# 'category' is NaN where the catalog has no matching row.
indication_df = indication_df.merge(
    phcoth_df,
    how='left',
)
# Preview the first two rows.
indication_df.head(2)

Unnamed: 0,doid_id,drugbank_id,disease,drug,category
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM


In [13]:
# Number of rows currently in indication_df.
indication_df.shape[0]

672

In [14]:
# Tally indications per category; dropna=False also counts the rows that
# have no category.
indication_df['category'].value_counts(dropna=False)

DM     360
NaN    210
SYM     77
NOT     25
Name: category, dtype: int64

In [15]:
# Write the indications as a tab-separated file without the row index.
indication_df.to_csv(
    'rephetio/indications.tsv',
    sep='\t',
    index=False,
)

## Pharmacologic class

In [16]:
# Load the pharmacologic class table and attach DrugBank identifiers via an
# inner join on the columns shared with drugbank_df.
path = 'drugtarget/pharm_class.tsv'
class_df = drugbank_df.merge(pandas.read_table(path))
# All merged columns are retained. A narrower selection (drugbank_id,
# drugbank_name, TYPE, CLASS_SOURCE_ID) exists in the notebook history but
# is intentionally left disabled.
# Preview the first two rows.
class_df.head(2)

Unnamed: 0,DRUG_ID,drugbank_id,drugbank_name,DRUG_NAME,TYPE,CLASS_SOURCE_ID,CLASS,SOURCE
0,4976,DB00620,Triamcinolone,triamcinolone hexacetonide,PA,D000893,Anti-Inflammatory Agents,MeSH
1,2725,DB00620,Triamcinolone,triamcinolone,EPC,N0000175576,Corticosteroid,FDA


In [17]:
# Number of class assignments per TYPE value.
class_df['TYPE'].value_counts()

PA                     6865
has role               2360
EPC                     914
MoA                     524
Chemical/Ingredient     325
PE                      188
Name: TYPE, dtype: int64

In [18]:
# Number of class assignments per SOURCE value.
class_df['SOURCE'].value_counts()

MeSH     6865
CHEBI    2360
FDA      1951
Name: SOURCE, dtype: int64

In [19]:
# Write the pharmacologic classes as a tab-separated file without the row
# index.
class_df.to_csv(
    'rephetio/classes.tsv',
    sep='\t',
    index=False,
)