In [1]:
import pandas as pd
import numpy as np
import os
import re
from __future__ import print_function

In [2]:
### Creating dataframe for the list of molecules

# Location of the text file that lists one molecule file name per line.
path = "./"
filename = "430_molecules_list.txt"

# Read the names inside a context manager so the file handle is always
# closed. Only the trailing newline is stripped: names may contain spaces
# (e.g. "A 939572.mol"), so no other whitespace is touched.
with open(os.path.join(path, filename), "r") as molecule_list_file:
    filename_list = [line.rstrip("\n") for line in molecule_list_file]

print(len(filename_list))
print(filename_list[:5])

433
['A 939572.mol', 'ABT-751.mol', 'ABT737.mol', 'AGI-6780.mol', 'AMI-1.mol']


In [3]:
# Wrap the list of file names in a one-column frame and call that column "NAME".
df_molecules = pd.DataFrame(data=filename_list)
df_molecules.columns = ["NAME"]

# .size is the total number of cells; with a single column it equals the row count.
print(df_molecules.size)

# Empty placeholder column for the human-readable molecule label.
df_molecules["molecule label"] = None
df_molecules.head(5)

433


Unnamed: 0,NAME,molecule label
0,A 939572.mol,
1,ABT-751.mol,
2,ABT737.mol,
3,AGI-6780.mol,
4,AMI-1.mol,


In [4]:
# Peek at the first cell of the table (row 0, column 0) as a scalar.
df_molecules.iat[0, 0]

'A 939572.mol'

### Create SMILES strings for all molecules

In [5]:
from openeye import oechem, oedepict
from __future__ import print_function

In [6]:
# Add an empty "smiles" column (every row set to None) as a placeholder for
# the SMILES strings, then preview the first rows of the table.
df_molecules["smiles"] = None
df_molecules.head()

Unnamed: 0,NAME,molecule label,smiles
0,A 939572.mol,,
1,ABT-751.mol,,
2,ABT737.mol,,
3,AGI-6780.mol,,
4,AMI-1.mol,,


In [7]:
# A single OpenEye input stream is reused for every molecule file.
ifs = oechem.oemolistream()

# NOTE(review): the inputs are ".mol" files while MOL2 is requested here;
# open() presumably re-derives the format from the file extension -- confirm.
ifs.SetFormat(oechem.OEFormat_MOL2)

# Directory that holds the individual molecule files (same for every row).
mol_file_path = "./430molecules"

# Label = file name up to its first ".", e.g. "ABT-751.mol" -> "ABT-751".
df_molecules["molecule label"] = [
    name.split(".")[0] for name in df_molecules["NAME"]
]

# Iterate over the index labels and use .loc throughout; the former .ix
# indexer is deprecated and no longer exists in pandas >= 1.0.
for i in df_molecules.index:
    file_name = df_molecules.loc[i, "NAME"]
    this_path = os.path.join(mol_file_path, file_name)

    # Report unreadable files instead of silently leaving "smiles" empty.
    if not ifs.open(this_path):
        print("Could not open molecule file:", this_path)
        continue

    # If a file holds several molecules, the last one read is kept.
    for mol in ifs.GetOEGraphMols():
        df_molecules.loc[i, "smiles"] = oechem.OEMolToSmiles(mol)

# Release the handle of the last file that was opened.
ifs.close()

df_molecules.head()

Unnamed: 0,NAME,molecule label,smiles
0,A 939572.mol,A 939572,CNC(=O)c1cccc(c1)NC(=O)N2CCC(CC2)Oc3ccccc3Cl
1,ABT-751.mol,ABT-751,COc1ccc(cc1)S(=O)(=O)Nc2cccnc2Nc3ccc(cc3)O
2,ABT737.mol,ABT737,CN(C)CC[C@@H](CSc1ccccc1)Nc2ccc(cc2[N+](=O)[O-...
3,AGI-6780.mol,AGI-6780,c1cc(cc(c1)NC(=O)Nc2cc(ccc2c3ccsc3)S(=O)(=O)NC...
4,AMI-1.mol,AMI-1,c1cc2c(cc1NC(=O)Nc3ccc4c(c3)cc(cc4O)S(=O)(=O)O...


### Counting Substructures for NHISS descriptor
This section requires the OpenEye OEChem library, version 2.0.5.

In [8]:
# Add one empty placeholder column per substructure type, in this order.
substructure_columns = ["F", "carbonyl", "sulfinyl", "sulfonyl", "nitroso", "nitro"]
for column_name in substructure_columns:
    df_molecules.loc[:, column_name] = None

df_molecules.head()

Unnamed: 0,NAME,molecule label,smiles,F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro
0,A 939572.mol,A 939572,CNC(=O)c1cccc(c1)NC(=O)N2CCC(CC2)Oc3ccccc3Cl,,,,,,
1,ABT-751.mol,ABT-751,COc1ccc(cc1)S(=O)(=O)Nc2cccnc2Nc3ccc(cc3)O,,,,,,
2,ABT737.mol,ABT737,CN(C)CC[C@@H](CSc1ccccc1)Nc2ccc(cc2[N+](=O)[O-...,,,,,,
3,AGI-6780.mol,AGI-6780,c1cc(cc(c1)NC(=O)Nc2cc(ccc2c3ccsc3)S(=O)(=O)NC...,,,,,,
4,AMI-1.mol,AMI-1,c1cc2c(cc1NC(=O)Nc3ccc4c(c3)cc(cc4O)S(=O)(=O)O...,,,,,,


In [9]:
# Show every row from position 420 to the end of the table (all columns).
df_molecules.iloc[420:]

Unnamed: 0,NAME,molecule label,smiles,F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro
420,nefazodone.mol,nefazodone,CCc1nn(c(=O)n1CCOc2ccccc2)CCCN3CCN(CC3)c4cccc(...,,,,,,
421,ondansetron.mol,ondansetron,Cc1nccn1C[C@@H]2CCc3c(c4ccccc4n3C)C2=O,,,,,,
422,pantoprazole.mol,pantoprazole,COc1ccnc(c1OC)CS(=O)c2[nH]c3ccc(cc3n2)OC(F)F,,,,,,
423,ramipril.mol,ramipril,CCOC(=O)[C@H](CCc1ccccc1)N[C@@H](C)C(=O)N2[C@H...,,,,,,
424,rofecoxib.mol,rofecoxib,CS(=O)(=O)c1ccc(cc1)C2=C(C(=O)OC2)c3ccccc3,,,,,,
425,silvesterol.mol,silvesterol,COc1ccc(cc1)[C@]23[C@@H]([C@H]([C@H]([C@]2(c4c...,,,,,,
426,simvastatin.mol,simvastatin,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,,,,,,
427,spironolactone.mol,spironolactone,CC(=O)S[C@@H]1CC2=CC(=O)CC[C@@]2([C@@H]3[C@@H]...,,,,,,
428,sunitinib.mol,sunitinib,CCN(CC)CCNC(=O)c1c(c([nH]c1C)/C=C\2/c3cc(ccc3N...,,,,,,
429,tetracycline.mol,tetracycline,C[C@]1(c2cccc(c2C(=O)C3=C([C@]4([C@@H](C[C@@H]...,,,,,,


In [10]:
# Save the molecule table as a UTF-8 encoded CSV file in the working directory.
df_molecules.to_csv(path_or_buf="df_molecules.csv", encoding="utf-8")

In [11]:
# Populate the substructure count columns by running one counting script per
# substructure type. The same scripts can also be run from a terminal, which
# is faster.
# NOTE(review): the scripts are not part of this notebook; presumably they
# read "df_molecules.csv" and leave an updated `df_molecules` behind (%run
# copies a script's globals into the notebook namespace) -- confirm.
import os
%run count_carbonyls.py
%run count_fluorines.py
%run count_sulfinyls.py
%run count_sulfonyls.py
%run count_nitroso.py
%run count_nitro.py

Done.
Done.
Done.
Done
Done.
Done.


### Calculating NHISS (Number of High Intrinsic State Substructures)
The NHISS descriptor is the total number of fluorines and double-bonded oxygens in the structure.
$\mathrm{NHISS} = \mathrm{fluorine} + \mathrm{carbonyl} + \mathrm{sulfinyl} + 2 \times \mathrm{sulfonyl} + \mathrm{nitroso} + 2 \times \mathrm{nitro}$

In [12]:
# NHISS = F + carbonyl + sulfinyl + 2*sulfonyl + nitroso + 2*nitro.
# Sulfonyl and nitro counts are weighted by 2 (presumably because each of
# those groups contributes two double-bonded oxygens).
count_columns = ["F", "carbonyl", "sulfinyl", "sulfonyl", "nitroso", "nitro"]

# Fail loudly on unpopulated counts rather than producing a partial NHISS.
missing_counts = df_molecules[count_columns].isnull().any(axis=1)
if missing_counts.any():
    raise ValueError(
        "Substructure counts are missing for %d molecule(s); "
        "populate the count columns before computing NHISS."
        % missing_counts.sum()
    )

# Column-wise arithmetic computes every row at once and does not depend on
# the index labels being 0..n-1, unlike a per-row .loc loop.
df_molecules["NHISS"] = (
    df_molecules["F"]
    + df_molecules["carbonyl"]
    + df_molecules["sulfinyl"]
    + 2 * df_molecules["sulfonyl"]
    + df_molecules["nitroso"]
    + 2 * df_molecules["nitro"]
)

df_molecules.head()

Unnamed: 0,NAME,molecule label,smiles,F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro,NHISS
0,A 939572.mol,A 939572,CNC(=O)c1cccc(c1)NC(=O)N2CCC(CC2)Oc3ccccc3Cl,0,2,0,0,0,0,2
1,ABT-751.mol,ABT-751,COc1ccc(cc1)S(=O)(=O)Nc2cccnc2Nc3ccc(cc3)O,0,0,0,1,0,0,2
2,ABT737.mol,ABT737,CN(C)CC[C@@H](CSc1ccccc1)Nc2ccc(cc2[N+](=O)[O-...,0,1,0,1,0,1,5
3,AGI-6780.mol,AGI-6780,c1cc(cc(c1)NC(=O)Nc2cc(ccc2c3ccsc3)S(=O)(=O)NC...,3,1,0,1,0,0,6
4,AMI-1.mol,AMI-1,c1cc2c(cc1NC(=O)Nc3ccc4c(c3)cc(cc4O)S(=O)(=O)O...,0,1,0,2,0,0,5


### Import SpMax descriptor data

In [13]:
# Number of SpMAX values calculated by Dragon 6.
n = 433

# Build the path of the CSV file that holds the SpMAX values.
exp_data_path = "./"
filename = "433-spmax1-8.csv"
exp_data_file = os.path.join(exp_data_path, filename)

# Load the CSV (its first row is the header) and keep only the first eight
# columns of the resulting frame.
df_spmax_data = pd.read_csv(exp_data_file, sep=",", header=0).iloc[:, :8]

df_spmax_data.tail(5)

Unnamed: 0,NAME,SpMax1_Bh(s),SpMax2_Bh(s),SpMax3_Bh(s),SpMax4_Bh(s),SpMax5_Bh(s),SpMax6_Bh(s),SpMax7_Bh(s)
426,Tripelennamine.mol,4.902,4.743,4.331,4.016,3.823,3.674,3.623
427,Tizanidine.mol,5.106,4.765,4.692,3.983,3.872,3.531,3.433
428,mirtazapine.mol,4.902,4.693,4.328,3.896,3.891,3.679,3.587
429,Thiabendazole.mol,5.1,4.773,4.289,3.852,3.35,3.172,3.148
430,Thymol.mol,6.469,4.634,4.158,3.736,3.467,3.416,2.982


In [14]:
# Select the SpMax row(s) whose NAME is exactly "Buparlisib.mol".
df_spmax_data[df_spmax_data["NAME"].eq("Buparlisib.mol")]

Unnamed: 0,NAME,SpMax1_Bh(s),SpMax2_Bh(s),SpMax3_Bh(s),SpMax4_Bh(s),SpMax5_Bh(s),SpMax6_Bh(s),SpMax7_Bh(s)
214,Buparlisib.mol,,,,5.17,,,


In [15]:
# Select the SpMax row(s) whose NAME is exactly "cisplatin.mol".
df_spmax_data[df_spmax_data["NAME"].eq("cisplatin.mol")]

Unnamed: 0,NAME,SpMax1_Bh(s),SpMax2_Bh(s),SpMax3_Bh(s),SpMax4_Bh(s),SpMax5_Bh(s),SpMax6_Bh(s),SpMax7_Bh(s)


In [16]:
# Join the molecule table with the SpMax table on the shared "NAME" column.
# This is an inner join, so only names present in both tables are kept.
df_molecules_spmax = df_molecules.merge(df_spmax_data, how="inner", on="NAME")

# .size is the total number of cells (rows x columns), not the row count.
print(df_molecules_spmax.size)
df_molecules_spmax.tail()

7310


Unnamed: 0,NAME,molecule label,smiles,F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro,NHISS,SpMax1_Bh(s),SpMax2_Bh(s),SpMax3_Bh(s),SpMax4_Bh(s),SpMax5_Bh(s),SpMax6_Bh(s),SpMax7_Bh(s)
425,sunitinib.mol,sunitinib,CCN(CC)CCNC(=O)c1c(c([nH]c1C)/C=C\2/c3cc(ccc3N...,1,2,0,0,0,0,3,8.201,7.431,7.427,4.819,4.646,4.338,4.045
426,tetracycline.mol,tetracycline,C[C@]1(c2cccc(c2C(=O)C3=C([C@]4([C@@H](C[C@@H]...,0,3,0,0,0,0,3,7.45,7.427,7.416,6.502,6.47,6.462,6.445
427,turofexorate isopropyl.mol,turofexorate isopropyl,CC(C)OC(=O)C1=CN(CC(c2c1[nH]c3c2cccc3)(C)C)C(=...,2,2,0,0,0,0,4,8.24,8.161,7.434,7.426,4.963,4.535,4.49
428,vismodegib.mol,vismodegib,CS(=O)(=O)c1ccc(c(c1)Cl)C(=O)Nc2ccc(c(c2)c3ccc...,0,1,0,1,0,0,3,7.716,7.429,6.999,4.998,4.884,4.789,4.37
429,voxtalisib.mol,voxtalisib,CCn1c2c(cc(c1=O)c3ccn[nH]3)c(nc(n2)N)C,0,1,0,0,0,0,1,7.428,5.22,5.018,4.621,4.166,3.995,3.925


In [17]:
# Select the merged row(s) whose NAME is exactly "Buparlisib.mol".
df_molecules_spmax[df_molecules_spmax["NAME"].eq("Buparlisib.mol")]

Unnamed: 0,NAME,molecule label,smiles,F,carbonyl,sulfinyl,sulfonyl,nitroso,nitro,NHISS,SpMax1_Bh(s),SpMax2_Bh(s),SpMax3_Bh(s),SpMax4_Bh(s),SpMax5_Bh(s),SpMax6_Bh(s),SpMax7_Bh(s)
47,Buparlisib.mol,Buparlisib,c1c(c(cnc1N)c2cc(nc(n2)N3CCOCC3)N4CCOCC4)C(F)(F)F,3,0,0,0,0,0,3,,,,5.17,,,


In [18]:
# Print the merged table's dimensions as a (rows, columns) tuple.
print((len(df_molecules_spmax.index), len(df_molecules_spmax.columns)))

(430, 17)


In [19]:
# Write the merged table straight to disk. `to_csv` has no `file_name`
# parameter; the destination is its first argument. Passing the path lets
# pandas open, write, and close the file itself, so no handle is left open.
# UTF-8 encoding matches the earlier export of "df_molecules.csv".
df_molecules_spmax.to_csv("df_molecules_spmax.csv", sep=",", encoding="utf-8")