In [1]:
import untangle
import pandas as pd
import numpy as np
import os

In [2]:
### Parsing Drugbank XML

In [3]:
#takes 5 minutes
filename="drugbank_20160420.xml" # DrugBank Version 4.5.0 (release date: 2016.04.20) 
obj=untangle.parse(filename)

### Building dataframe of chemical descriptors

In [4]:
#Data Frame of DrugBank Small Molecule Type Drugs
df_drugbank_sm=pd.DataFrame(columns=["drugbank_id","name","cas","smiles","logP ALOGPS", "logP ChemAxon", "solubility ALOGPS", "pKa (strongest acidic)", "pKa (strongest basic)"])
df_drugbank_sm

Unnamed: 0,drugbank_id,name,cas,smiles,logP ALOGPS,logP ChemAxon,solubility ALOGPS,pKa (strongest acidic),pKa (strongest basic)


In [5]:
# Takes around 10 minutes to run.
i=-1
#iterate over drug entries to extract information
for drug in obj.drugbank.drug:
    drug_type= str(drug["type"])
    
    # select for small molecule drugs
    if drug_type in ["small molecule", "Small Molecule", "Small molecule"]:
        i=i+1    
        
        #Get drugbank_id
        for id in drug.drugbank_id:
            if str(id["primary"])=="true":
                df_drugbank_sm.loc[i, "drugbank_id"]=id.cdata
        #Drug name
        df_drugbank_sm.loc[i,"name"]=drug.name.cdata
        
        #Drug CAS
        df_drugbank_sm.loc[i, "cas"]=drug.cas_number.cdata
        
        #Get SMILES, logP, Solubility
        #Skip drugs with no structure. ("DB00386","DB00407","DB00702","DB00785","DB00840",
        #                                            "DB00893","DB00930","DB00965", "DB01109","DB01266",
        #                                           "DB01323", "DB01341"...)
        if len(drug.calculated_properties.cdata)==0: #If there is no calculated properties
            continue
        else:
            for property in drug.calculated_properties.property:
                if property.kind.cdata == "SMILES":
                    df_drugbank_sm.loc[i, "smiles"]=property.value.cdata
                    
                if property.kind.cdata == "logP":
                    if property.source.cdata == "ALOGPS":
                        df_drugbank_sm.loc[i, "logP ALOGPS"]=property.value.cdata
                    if property.source.cdata == "ChemAxon":
                        df_drugbank_sm.loc[i, "logP ChemAxon"]=property.value.cdata
                
                if property.kind.cdata == "Water Solubility":
                    df_drugbank_sm.loc[i, "solubility ALOGPS"]=property.value.cdata
                
                if property.kind.cdata == "pKa (strongest acidic)":
                    df_drugbank_sm.loc[i, "pKa (strongest acidic)"]=property.value.cdata
                
                if property.kind.cdata == "pKa (strongest basic)":
                    df_drugbank_sm.loc[i, "pKa (strongest basic)"]=property.value.cdata
            

In [6]:
df_drugbank_sm.head(10)

Unnamed: 0,drugbank_id,name,cas,smiles,logP ALOGPS,logP ChemAxon,solubility ALOGPS,pKa (strongest acidic),pKa (strongest basic)
0,DB00006,Bivalirudin,128270-60-0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,-0.76,-14.0,4.64e-02 g/l,2.79,11.88
1,DB00014,Goserelin,65807-02-5,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,0.3,-5.2,2.83e-02 g/l,9.27,10.82
2,DB00035,Desmopressin,16679-58-6,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-1.0,-6.1,1.10e-01 g/l,9.5,11.77
3,DB00050,Cetrorelix,120287-85-6,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,1.33,-1.7,6.94e-03 g/l,9.49,11.11
4,DB00091,Cyclosporine,59865-13-3,CCC1NC(=O)C(C(O)C(C)C\C=C\C)N(C)C(=O)C(C(C)C)N...,4.37,6.92,5.81e-03 g/l,3.69,1.94
5,DB00093,Felypressin,56-59-7,NCCCC[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CSSC[...,-1.1,-5.8,4.53e-02 g/l,11.39,10.18
6,DB00104,Octreotide,83150-76-9,C[C@@H](O)[C@@H](CO)NC(=O)[C@@H]1CSSC[C@H](NC(...,0.42,-1.4,1.22e-02 g/l,11.4,10.17
7,DB00114,Pyridoxal Phosphate,54-47-7,CC1=NC=C(COP(O)(O)=O)C(C=O)=C1O,-0.55,-2.1,5.70e+00 g/l,1.68,4.11
8,DB00115,Cyanocobalamin,68-19-9,OC[C@H]1O[C@@H]([C@H](O)[C@@H]1OP(O)(=O)O[C@](...,1.87,,3.84e-02 g/l,1.84,8.77
9,DB00116,Tetrahydrofolic acid,135-16-0,NC1=NC(=O)C2=C(NCC(CNC3=CC=C(C=C3)C(=O)N[C@@H]...,-0.96,-4.2,2.69e-01 g/l,3.51,3.58


In [7]:
print df_drugbank_sm.shape

(7863, 9)


In [8]:
#Drop drugs without SMILES from the dataframe
df_drugbank_smiles = df_drugbank_sm.dropna()
df_drugbank_smiles= df_drugbank_smiles.reset_index(drop=True)
print(df_drugbank_smiles.shape)

(5653, 9)


### Counting Substructures for NHISS descriptor

In [9]:
#This section requires using OpenEye OEChem library, version 2.0.5. 

In [10]:
df_drugbank_smiles.loc[:,"F"] = None
df_drugbank_smiles.loc[:,"carbonyl"] = None
df_drugbank_smiles.loc[:,"sulfinyl"] = None
df_drugbank_smiles.loc[:,"sulfonyl"] = None
df_drugbank_smiles.loc[:,"nitroso"] = None
df_drugbank_smiles.loc[:,"nitro"] = None
df_drugbank_smiles.head()

Unnamed: 0,drugbank_id,name,cas,smiles,logP ALOGPS,logP ChemAxon,solubility ALOGPS,pKa (strongest acidic),pKa (strongest basic),F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro
0,DB00006,Bivalirudin,128270-60-0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,-0.76,-14.0,4.64e-02 g/l,2.79,11.88,,,,,,
1,DB00014,Goserelin,65807-02-5,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,0.3,-5.2,2.83e-02 g/l,9.27,10.82,,,,,,
2,DB00035,Desmopressin,16679-58-6,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-1.0,-6.1,1.10e-01 g/l,9.5,11.77,,,,,,
3,DB00050,Cetrorelix,120287-85-6,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,1.33,-1.7,6.94e-03 g/l,9.49,11.11,,,,,,
4,DB00091,Cyclosporine,59865-13-3,CCC1NC(=O)C(C(O)C(C)C\C=C\C)N(C)C(=O)C(C(C)C)N...,4.37,6.92,5.81e-03 g/l,3.69,1.94,,,,,,


In [11]:
#write to csv
df_drugbank_smiles.to_csv("df_drugbank_smiles.csv", encoding='utf-8')

In [12]:
# Run the following to populate the dataframe from terminal (runs faster):
import os
%run count_carbonyls.py
%run count_fluorines.py
%run count_sulfinyls.py
%run count_sulfonyls.py
%run count_nitroso.py
%run count_nitro.py

Done.
Done.
Done.
Done
Done.
Done.
