In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the DrugBank table, using the first CSV column as the row index.
# pd.read_csv(..., index_col=0) is the supported replacement for
# DataFrame.from_csv, which was deprecated in pandas 0.21 and removed in 1.0.
# NOTE(review): from_csv also defaulted to parse_dates=True; it is omitted here
# on the assumption that the index holds plain integer row labels -- confirm.
df_drugbank_smiles = pd.read_csv('df_drugbank_smiles.csv', index_col=0, encoding='utf-8')

### Calculating NHISS (Number of High Intrinsic State Substructures)

The NHISS descriptor is the total number of fluorine atoms and double-bonded oxygens in the structure.

$\text{NHISS} = \text{fluorine} + \text{carbonyl} + \text{sulfinyl} + 2 \times \text{sulfonyl} + \text{nitroso} + 2 \times \text{nitro}$

In [3]:
# Add empty placeholder columns for the descriptor and the predicted group,
# then preview the first rows of the table.
for new_column in ("NHISS", "group"):
    df_drugbank_smiles.loc[:, new_column] = None
df_drugbank_smiles.head()

Unnamed: 0,drugbank_id,name,cas,smiles,logP ALOGPS,logP ChemAxon,solubility ALOGPS,pKa (strongest acidic),pKa (strongest basic),F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro,NHISS,group
0,DB00006,Bivalirudin,128270-60-0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,-0.76,-14.0,4.64e-02 g/l,2.79,11.88,0,26,0,0,0,0,,
1,DB00014,Goserelin,65807-02-5,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,0.3,-5.2,2.83e-02 g/l,9.27,10.82,0,11,0,0,0,0,,
2,DB00035,Desmopressin,16679-58-6,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-1.0,-6.1,1.10e-01 g/l,9.5,11.77,0,11,0,0,0,0,,
3,DB00050,Cetrorelix,120287-85-6,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,1.33,-1.7,6.94e-03 g/l,9.49,11.11,0,12,0,0,0,0,,
4,DB00091,Cyclosporine,59865-13-3,CCC1NC(=O)C(C(O)C(C)C\C=C\C)N(C)C(=O)C(C(C)C)N...,4.37,6.92,5.81e-03 g/l,3.69,1.94,0,11,0,0,0,0,,


In [4]:
# NHISS = F + carbonyl + sulfinyl + 2*sulfonyl + nitroso + 2*nitro.
# Computed column-wise so the result does not depend on the index labels being
# exactly 0..n-1 (the earlier enumerate()/.loc[i] loop used a positional
# counter as a label) and to avoid per-row scalar lookups.
df_drugbank_smiles["NHISS"] = (
    df_drugbank_smiles["F"]
    + df_drugbank_smiles["carbonyl"]
    + df_drugbank_smiles["sulfinyl"]
    + 2 * df_drugbank_smiles["sulfonyl"]
    + df_drugbank_smiles["nitroso"]
    + 2 * df_drugbank_smiles["nitro"]
)

# Decision-tree tests. Comparisons against NaN evaluate to False, so rows with
# missing values fall through the branches exactly as in a scalar if/elif chain.
is_low_logp = df_drugbank_smiles["logP ChemAxon"] < 2.2
is_high_nhiss = df_drugbank_smiles["NHISS"] >= 4
is_acidic = df_drugbank_smiles["pKa (strongest acidic)"] < 7.4
is_basic = df_drugbank_smiles["pKa (strongest basic)"] > 8

# np.select takes the first condition that holds for each row, which gives
# the same priority as the nested branches:
#   logP < 2.2                     -> group 1
#   otherwise NHISS < 4 (or NaN)   -> group 2
#   otherwise acidic pKa < 7.4     -> group 3
#   otherwise basic pKa > 8        -> group 4
#   otherwise                      -> group 5
df_drugbank_smiles["group"] = np.select(
    [is_low_logp, ~is_high_nhiss, is_acidic, is_basic],
    [1, 2, 3, 4],
    default=5,
)

In [5]:
# Preview the first five rows now that NHISS and group are filled in.
df_drugbank_smiles.head(n=5)

Unnamed: 0,drugbank_id,name,cas,smiles,logP ALOGPS,logP ChemAxon,solubility ALOGPS,pKa (strongest acidic),pKa (strongest basic),F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro,NHISS,group
0,DB00006,Bivalirudin,128270-60-0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,-0.76,-14.0,4.64e-02 g/l,2.79,11.88,0,26,0,0,0,0,26,1
1,DB00014,Goserelin,65807-02-5,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,0.3,-5.2,2.83e-02 g/l,9.27,10.82,0,11,0,0,0,0,11,1
2,DB00035,Desmopressin,16679-58-6,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-1.0,-6.1,1.10e-01 g/l,9.5,11.77,0,11,0,0,0,0,11,1
3,DB00050,Cetrorelix,120287-85-6,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,1.33,-1.7,6.94e-03 g/l,9.49,11.11,0,12,0,0,0,0,12,1
4,DB00091,Cyclosporine,59865-13-3,CCC1NC(=O)C(C(O)C(C)C\C=C\C)N(C)C(=O)C(C(C)C)N...,4.37,6.92,5.81e-03 g/l,3.69,1.94,0,11,0,0,0,0,11,3


In [6]:
# Save the table, including the NHISS and group columns, as a UTF-8 CSV file.
df_drugbank_smiles.to_csv(
    path_or_buf="df_drugbank_decision_tree.csv",
    encoding="utf-8",
)

In [7]:
# Number of all molecules in dataframe.
# Single-argument print(...) with str.format runs on both Python 2 and
# Python 3; the Python 2 print statement is a SyntaxError on Python 3.
# The double space after the colon reproduces the original output text.
print("Number of all molecules in dataframe:  {}".format(df_drugbank_smiles.shape[0]))
# Number of molecules categorized in each group (groups are numbered 1-5).
for group_id in range(1, 6):
    n_in_group = (df_drugbank_smiles["group"] == group_id).sum()
    print("Number of molecules categorized as Group {}:  {}".format(group_id, n_in_group))

Number of all molecules in dataframe:  5653
Number of molecules categorized as Group 1:  3418
Number of molecules categorized as Group 2:  1918
Number of molecules categorized as Group 3:  135
Number of molecules categorized as Group 4:  22
Number of molecules categorized as Group 5:  160


- Drugs in Groups 1 and 2 are predicted not to form nanoparticles.
- Drugs in Groups 3 and 5 are predicted to form nanoparticles if they are prepared in water and in buffer (pH 8-9), respectively, and these nanoparticles are predicted to be stable in PBS buffer (pH 7.4).
- Drugs in Group 4 are predicted to form nanoparticles if they are prepared in buffer (pH 10-11), and these nanoparticles are stable only in a basic environment.