# 1. Import libraries

In [1]:
import os
import numpy as np
import copy
import collections
import re
import xml.etree.ElementTree as ET
import pandas as pd

# 2. Construct a basic drug properties table

## 2.1 Extract all drugs 

In [2]:
xml_path = os.path.join(os.path.curdir, 'full database.xml')
tree = ET.parse(xml_path)

In [3]:
root = tree.getroot()

In [4]:
ns = '{http://www.drugbank.ca}'

rows = list()
for i, drug in enumerate(root): 
    row = collections.OrderedDict()
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
#     row['synonyms'] = drug.findtext(ns + "synonyms")
#     row['synonyms'] = drug.get("synonyms")
    row['synonyms'] = [synonym.text for synonym in drug.findall("{ns}synonyms/{ns}synonym".format(ns = ns))]
    row['type'] = drug.get('type')
    row['groups'] = [group.text for group in drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['mesh_ids'] = [mesh_id.text for mesh_id in
        drug.findall("{ns}categories/{ns}category/{ns}mesh-id".format(ns = ns))]
    row['categories'] = [cat.findtext(ns + 'category') for cat in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]
    
    row['logP'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='logP']/{ns}value".format(ns = ns))
    row['logS'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='logS']/{ns}value".format(ns = ns))
    row['mw'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='Molecular Weight']/{ns}value".format(ns = ns))
    row['psa'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='Polar Surface Area (PSA)']/{ns}value".format(ns = ns))
    row['hbondacceptor'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='H Bond Acceptor Count']/{ns}value".format(ns = ns))
    row['hbonddonor'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='H Bond Donor Count']/{ns}value".format(ns = ns)) 
    row['inchi'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value".format(ns = ns))
    row['inchikey'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value".format(ns = ns))
    row['smiles'] = drug.findtext("{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{ns}value".format(ns = ns))
    row['description'] = drug.findtext(ns + "description")
    
    row['targets'] = [target.findtext(ns + "name") \
       for target in drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat='target'))]
    row['targets_genes'] = [target.findtext(ns + "gene-name") \
       for target in drug.findall('{ns}{cat}s/{ns}{cat}/{ns}polypeptide'.format(ns=ns, cat='target'))]
    row['enzymes'] = [enzyme.findtext(ns + "name") \
       for enzyme in drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat='enzyme'))]
    row['enzymes_genes'] = [enzyme.findtext(ns + "gene-name") \
       for enzyme in drug.findall('{ns}{cat}s/{ns}{cat}/{ns}polypeptide'.format(ns=ns, cat='enzyme'))]
    row['carriers'] = [carrier.findtext(ns + "name") \
       for carrier in drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat='carrier'))]
    row['carriers_genes'] = [carrier.findtext(ns + "gene-name") \
       for carrier in drug.findall('{ns}{cat}s/{ns}{cat}/{ns}polypeptide'.format(ns=ns, cat='carrier'))]
    row['transporters'] = [transporter.findtext(ns + "name") \
       for transporter in drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat='transporter'))]
    row['transporters_genes'] = [carrier.findtext(ns + "gene-name") \
       for carrier in drug.findall('{ns}{cat}s/{ns}{cat}/{ns}polypeptide'.format(ns=ns, cat='transporter'))]
    
    row['routes'] = [route.findtext(ns + "route") for route in
        drug.findall("{ns}dosages/{ns}dosage".format(ns = ns))]
    row['forms'] = [form.findtext(ns + "form") for form in
        drug.findall("{ns}dosages/{ns}dosage".format(ns = ns))]
    row['strengths'] = [strength.findtext(ns + "strength") for strength in
        drug.findall("{ns}dosages/{ns}dosage".format(ns = ns))]
 

    rows.append(copy.copy(row))

In [5]:
columns = ['drugbank_id', 'name', 'synonyms','type', 'groups', 'atc_codes', 'mesh_ids', 'categories', 'logP','logS',\
           'mw','psa','hbondacceptor','hbonddonor','inchi', 'inchikey', 'smiles','description',\
           'targets','targets_genes','enzymes','enzymes_genes','carriers','carriers_genes','transporters','transporters_genes',\
          'routes', 'forms', 'strengths']
drugbank_df = pd.DataFrame.from_dict(rows)[columns]

In [6]:
drugbank_df.head()

Unnamed: 0,drugbank_id,name,synonyms,type,groups,atc_codes,mesh_ids,categories,logP,logS,...,targets_genes,enzymes,enzymes_genes,carriers,carriers_genes,transporters,transporters_genes,routes,forms,strengths
0,DB00001,Lepirudin,"[Hirudin variant-1, Lepirudin recombinant]",biotech,[approved],[B01AE02],"[D000602, D000925, D058833, D000991, None, D00...","[Amino Acids, Peptides, and Proteins, Anticoag...",,,...,[F2],[],[],[],[],[],[],"[Intravenous, Intravenous, Intravenous, Intrav...","[Injection, solution, concentrate, Injection, ...","[20 mg, 50 mg, 50 mg/1mL, 50 mg]"
1,DB00002,Cetuximab,"[Cetuximab, Cétuximab, Cetuximabum, Immunoglob...",biotech,[approved],[L01XC06],"[D000602, D000906, D000911, D061067, D000970, ...","[Amino Acids, Peptides, and Proteins, Antibodi...",,,...,"[EGFR, FCGR3B, C1R, C1QA, C1QB, C1QC, FCGR3A, ...",[],[],[],[],[],[],"[Intravenous, Intravenous, Intravenous]","[Injection, solution, Solution, Solution]","[5 mg/ml, 2 mg, 2 mg/1mL]"
2,DB00003,Dornase alfa,[Deoxyribonuclease (human clone 18-1 protein m...,biotech,[approved],[R05CB13],"[D000602, None, None, D003851, D004706, D00472...","[Amino Acids, Peptides, and Proteins, Cough an...",,,...,[],[],[],[],[],[],[],"[Respiratory (inhalation), Respiratory (inhala...","[Solution, Solution]","[1 mg/1mL, ]"
3,DB00004,Denileukin diftitox,"[Denileukin, Interleukin-2/diptheria toxin fus...",biotech,"[approved, investigational]",[L01XX29],"[D036002, D000602, D000970, None, D001427, D00...","[ADP Ribose Transferases, Amino Acids, Peptide...",,,...,"[IL2RA, IL2RB, IL2RG]",[],[],[],[],[],[],[Intravenous],"[Injection, solution]",[150 ug/1mL]
4,DB00005,Etanercept,"[Etanercept, etanercept-szzs, etanercept-ykro,...",biotech,"[approved, investigational]",[L04AB01],"[None, D000602, D000893, D000906, D018501, D00...","[Agents reducing cytokine levels, Amino Acids,...",,,...,"[TNF, TNFRSF1B, FCGR1A, FCGR3A, FCGR2A, FCGR2B...",[],[],[],[],[],[],"[Subcutaneous, Subcutaneous, Subcutaneous, Sub...","[Solution, , Injection, powder, for solution, ...","[, 25 mg/1mL, 10 mg, 25 mg, 50 mg, 25 mg, 50 m..."


In [7]:
print('Number of all drugs in DrugBank: ', drugbank_df.shape[0])

Number of all drugs in DrugBank:  13475


In [17]:
print('Number of all drugs in DrugBank: ', drugbank_df.shape[0])

Number of all drugs in DrugBank:  13339


## 2.2 Construct a separate "slim" table

-  for approved, small molecule drugs

In [8]:
# keep entries for approved, small molecule drugs 
drugbank_slim_df = drugbank_df[
    drugbank_df.groups.map(lambda x: 'approved' in x) &
    drugbank_df.type.map(lambda x: x == 'small molecule')
]

print('Number of approved, small-molecule drugs in DrugBank: ', drugbank_slim_df.shape[0])

Number of approved, small-molecule drugs in DrugBank:  2635


## 2.3 Construct a DDI drug candidate table

- for drugs with all full data entry based on the "slim" dataframe 

In [9]:
# remove rows with any nan entry
DDI_candicate_df = drugbank_slim_df.dropna(how='any')

print('Number of approved, small-molecule drugs with full information in DrugBank: ', DDI_candicate_df.shape[0])

Number of approved, small-molecule drugs with full information in DrugBank:  2301


## 2.4 Write & read for future processing

In [10]:
# write drugbank_df 
drugbank_df.to_pickle(r'drugbank.pkl')

# write drugbank_slim_df 
drugbank_slim_df.to_pickle(r'drugbank_slim_df.pkl')

# write DDI_candicate_df 
DDI_candicate_df.to_pickle(r'DDI_candicate_df.pkl')

In [None]:
# # read drugbank_df 
# drugbank_df = pd.read_pickle('drugbank.pkl')

# # read drugbank_slim_df 
# drugbank_slim_df = pd.read_pickle('drugbank_slim_df.pkl')

# # read DDI_candicate_df 
# DDI_candicate_df = pd.read_pickle('DDI_candicate_df.pkl')

# 3. Construct a DDI table

## 3.1 Extract all DDI

In [38]:
ns = '{http://www.drugbank.ca}'

drugInteraction_rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    drugName = drug.findtext(ns + "name")
    drugType = drug.get('type')
    drugGroups = [group.text for group in drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    drugInteractions = drug.findall('{ns}{item}s/{ns}{item}'.format(ns=ns, item="drug-interaction"))
    for drugInteraction in drugInteractions:
        DDI_Description = drugInteraction.findtext(ns + "description")
        if DDI_Description is not None:
            drugI_drugbank_id = drugInteraction.findtext(ns + "drugbank-id")
            row['drugI_drugbank_id'] = drugI_drugbank_id # notice it is drugI_drugbank-id here!!
            row['drugI_Name'] = drugInteraction.findtext(ns + "name")
            row['DDI_Description'] = DDI_Description

            row['drugbank_id'] = drugbank_id
            row['name'] = drugName
            row['groups'] = drugGroups
            row['type'] = drugType

            drugInteraction_rows.append(copy.copy(row))

In [39]:
columns = ['drugbank_id','name','groups', 'type', \
           'drugI_drugbank_id','drugI_Name', "DDI_Description"] 
drugInteraction_df = pd.DataFrame.from_dict(drugInteraction_rows)
drugInteraction_df = drugInteraction_df[columns]

In [40]:
drugInteraction_df.head()

Unnamed: 0,drugbank_id,name,groups,type,drugI_drugbank_id,drugI_Name,DDI_Description
0,DB00001,Lepirudin,[approved],biotech,DB06605,Apixaban,Apixaban may increase the anticoagulant activi...
1,DB00001,Lepirudin,[approved],biotech,DB06695,Dabigatran etexilate,Dabigatran etexilate may increase the anticoag...
2,DB00001,Lepirudin,[approved],biotech,DB01254,Dasatinib,The risk or severity of bleeding and hemorrhag...
3,DB00001,Lepirudin,[approved],biotech,DB01609,Deferasirox,The risk or severity of gastrointestinal bleed...
4,DB00001,Lepirudin,[approved],biotech,DB01586,Ursodeoxycholic acid,The risk or severity of bleeding and bruising ...


-  Add groups and type to the drugI based on "drugbank_df" table

In [41]:
idGroups_Dict = dict(zip(drugbank_df['drugbank_id'],drugbank_df['groups']))
idType_Dict = dict(zip(drugbank_df['drugbank_id'],drugbank_df['type']))

In [42]:
drugInteraction_df["drugI_groups"] = \
drugInteraction_df["drugI_drugbank_id"].map(idGroups_Dict)

In [43]:
drugInteraction_df["drugI_type"] = \
drugInteraction_df["drugI_drugbank_id"].map(idType_Dict)

In [44]:
print("The total number for DDI record is: ", drugInteraction_df.shape[0])

The total number for DDI record is:  2723944


In [45]:
# information in the DDI_Description
list(drugInteraction_df.head(1)['DDI_Description'])

['Apixaban may increase the anticoagulant activities of Lepirudin.']

## 3.3 Write & Read for future processing

In [52]:
# write drugInteraction_df 
drugInteraction_df.to_pickle(r'drugInteraction_df.pkl')

In [53]:
# # read drugInteraction_df 
# drugInteraction_df = pd.read_pickle('drugInteraction_df.pkl')

## 3.4 Examine number of drugs

In [54]:
print("Number of drugs in the drugbank_df: ", len(set(list(drugbank_df['drugbank_id']))))

Number of drugs in the drugbank_df:  13339


In [55]:
print("Number of drugs in the drugbank_slim_df: ", len(set(list(drugbank_slim_df['drugbank_id']))))

Number of drugs in the drugbank_slim_df:  2594


In [56]:
print("Number of primary drugs in the drugInteraction_df",\
      len(set(list(drugInteraction_df['drugbank_id']))))

Number of primary drugs in the drugInteraction_df 3949


In [57]:
print("Number of interacting drugs in the drugInteraction_df",\
len(set(list(drugInteraction_df['drugI_drugbank_id']))))

Number of interacting drugs in the drugInteraction_df 3950


In [58]:
print("Number of overlapping drugs between primary and interacting drugs in the drugInteraction_df: ",
len(set(list(drugInteraction_df['drugbank_id'])) & set(list(drugInteraction_df['drugI_drugbank_id']))))

Number of overlapping drugs between primary and interacting drugs in the drugInteraction_df:  3949


# 4. Construct a DDI dataset

In [59]:
# DDI_drugCandidates based on the drugbank_slim_df drug entries
DDI_drugCandidates = list(drugbank_slim_df['drugbank_id']) # add set?
print('Number of DDI drug candidates', len(DDI_drugCandidates))

Number of DDI drug candidates 2594


## 4.1 Iterate all drug-drug combinations

-  DDI labeling

In [60]:
# construct the DDI dataset, primarily labels column, by referring to the drugInteraction_df entry
numDrugs = len(DDI_drugCandidates)
DDI_matrix = np.empty(shape=(int(numDrugs * (numDrugs-1) / 2), 3), dtype=object)

In [61]:
drugInteractionTable = drugInteraction_df[['drugbank_id', 'drugI_drugbank_id']]

In [62]:
%timeit drugInteractionTable.loc[(drugInteractionTable['drugbank_id']=='DB00001')\
                               &(drugInteractionTable['drugI_drugbank_id']=='DB08860')]

442 ms ± 347 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [63]:
drugInteractionTable = drugInteractionTable.set_index(['drugbank_id','drugI_drugbank_id'])

In [64]:
%timeit drugInteractionTable.loc[('DB00001', 'DB08860')]

8 ms ± 22.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


  """Entry point for launching an IPython kernel.


In [65]:
num = 0
for i in range(numDrugs):
#     print('i: ',i)
#     print('num: ', num)
    for j in range(i+1, numDrugs):
#         print('j: ',j)
        index = num + j - (i+1)
#         print('index: ', index)
    
        drug1_id = DDI_drugCandidates[i]
        DDI_matrix[index, 0] = drug1_id
        drug2_id = DDI_drugCandidates[j]
        DDI_matrix[index, 1] = drug2_id
        
        try:
            drugInteractionTable.loc[(drug1_id, drug2_id)]
            DDI_matrix[index, 2] = 1
        except KeyError:
            try:
                drugInteractionTable.loc[(drug2_id, drug1_id)]
                DDI_matrix[index, 2] = 1
            except KeyError:
                DDI_matrix[index, 2] = 0
    num += (numDrugs - (i+1))


  app.launch_new_instance()


In [66]:
print('i: ',i)
print('j: ',j)

i:  2593
j:  2593


In [67]:
# make a dataframe for the DDI dataset
DDI_df = pd.DataFrame({'drug1_id': DDI_matrix[:,0], 'drug2_id': DDI_matrix[:,1], 'DDI': DDI_matrix[:,2]})

# print the size of the DDI dataset
print('Number of drug-drug pairs in the DDI dataset: ', DDI_df.shape[0])

Number of drug-drug pairs in the DDI dataset:  3363121


## 4.2 Add drug properties

In [68]:
info_List = ['name', 'atc_codes',  'targets', 'enzymes',  'carriers', 'transporters', ] 

# 'mesh_ids', 'logP', 'logS', 'mw', 'psa', 'hbondacceptor','hbonddonor', 'smiles',
# 'targets_genes','enzymes_genes','carriers_genes', 'transporters_genes', 'routes'

In [69]:
# define a function to assign information to each drug
def assignDrugInfo(info_List):
    for item in info_List:
        item_Dict = dict(zip(drugbank_df['drugbank_id'],drugbank_df[item]))
        drug1_item = 'drug1_' + item
        DDI_df[drug1_item] = DDI_df["drug1_id"].map(item_Dict)
        drug2_item = 'drug2_' + item
        DDI_df[drug2_item] = DDI_df["drug2_id"].map(item_Dict)

In [70]:
# assign info to each drug
assignDrugInfo(info_List)

## 4.3 Write & Read for future processing

In [72]:
# write DDI_df 
DDI_df.to_csv(r'new_DDI_df.csv', index=False)

In [None]:
# # note some lists may be converted to string variables
# # may need to restore the list type for certain columns 
# DDI_df['drug1_atc_codes'] = \
# [re.findall(r"'(.*?)'", item, re.DOTALL) for item in list(DDI_df['drug1_atc_codes'])]

# DDI_df['drug2_atc_codes'] = \
# [re.findall(r"'(.*?)'", item, re.DOTALL) for item in list(DDI_df['drug2_atc_codes'])]

In [None]:
# list(DDI_df['drug1_atc_codes'])

## 4.4 Look into DDI_df

In [73]:
DDI_df.columns

Index(['drug1_id', 'drug2_id', 'DDI', 'drug1_name', 'drug2_name',
       'drug1_atc_codes', 'drug2_atc_codes', 'drug1_targets', 'drug2_targets',
       'drug1_enzymes', 'drug2_enzymes', 'drug1_carriers', 'drug2_carriers',
       'drug1_transporters', 'drug2_transporters'],
      dtype='object')

In [74]:
# print positive DDI rate in the DDI dataset
print('Positive DDI rate: ', np.sum(DDI_df['DDI']) / DDI_df.shape[0])

Positive DDI rate:  0.18302552896550556


In [75]:
len(set(list(DDI_df['drug1_name'])).union(set(list(DDI_df['drug1_name']))))

2593

In [76]:
len(set(list(DDI_df['drug2_name'])))

2593

In [77]:
np.sum(DDI_df[DDI_df['drug1_name'] == 'Valsartan']['DDI'])

767

In [78]:
DDI_df.isnull().values.any()

False

In [79]:
DDI_df.shape

(3363121, 15)