In [1]:
# Packages RDKit
from rdkit import Chem, RDConfig
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem, rdMolAlign
from rdkit.Chem import Descriptors

# Packages
import pandas as pd 
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the dataset from the Excel workbook and display the resulting frame.
df_data = pd.read_excel('Data_Train_Test.xlsx')
df_data

Unnamed: 0,tPSA,WLOGP,Output,Name,SMILES
0,181.57,-0.01,0,Ertapenem,CC(O)C1C2C(C)C(SC3CNC(C3)C(=O)Nc3cccc(c3)C(O)=...
1,72.72,2.03,1,Salbutamol,CC(C)(C)NCC(O)c1ccc(O)c(CO)c1
2,167.58,3.01,0,Chlorhexidine,Clc1ccc(NC(=N)NC(=N)NCCCCNC(=N)NC(=N)Nc2ccc(Cl...
3,163.27,2.58,0,Azosemide,NS(=O)(=O)c1cc(-c2nnn[nH]2)c(NCc2cccs2)cc1Cl
4,173.45,-1.37,0,Azlocillin,CC1(C)SC2C(NC(=O)C(NC(=O)N3CCNC3=O)c3ccccc3)C(...
...,...,...,...,...,...
177,59.95,2.21,1,Cicloprolol,CC(C)NCC(O)COc1ccc(OCCOCC2CC2)cc1
178,358.19,-9.87,0,Capreomycin,CC1NC(=O)C(N)CNC(=O)C(NC(=O)C(NC(=O)C(CNC(=O)C...
179,41.57,3.79,1,Fenspiride,O=C1NCC2(CCN(CCc3ccccc3)CC2)O1
180,199.89,-1.63,0,Iohexol,CC(=O)N(CC(O)CO)c1c(I)c(C(=O)NCC(O)CO)c(I)c(C(...


In [3]:
# Positional hold-out split: rows 0-139 form the training/validation pool,
# rows 140 onward form the test set. Double brackets keep each result 2-D
# (one column), so every row is a length-1 array.
X_TrainVal = df_data.iloc[:140][['SMILES']].values
X_test = df_data.iloc[140:][['SMILES']].values

y_TrainVal = df_data.iloc[:140][['Output']].values
y_test = df_data.iloc[140:][['Output']].values

In [4]:
# Parse the SMILES string held in the first (only) column of each row into
# an RDKit molecule, preserving row order.
mol_TrainVal = [Chem.MolFromSmiles(row[0]) for row in X_TrainVal]
mol_Test = [Chem.MolFromSmiles(row[0]) for row in X_test]

In [5]:
# Property extraction

def extrac_prop(molecula):
    """Compute eight RDKit descriptors for every molecule in ``molecula``.

    Parameters
    ----------
    molecula : sequence of rdkit.Chem.Mol
        Parsed molecules, in the order the results should be reported.

    Returns
    -------
    tuple of 8 numpy.ndarray
        Each array has shape ``(1, len(molecula))``. In order:
        molecular weight, topological polar surface area, logP,
        number of H-bond acceptors, number of H-bond donors,
        number of aromatic rings, number of rotatable bonds,
        fraction of sp3 carbons.

    Raises
    ------
    ValueError
        If any entry is ``None`` (which is what ``Chem.MolFromSmiles``
        yields for a SMILES string it cannot parse). The message lists
        the offending positions.
    """
    moleculas = list(molecula)

    # Fail early with a readable message instead of letting a None reach the
    # descriptor functions, where the resulting error hides which row is bad.
    invalid = [idx for idx, mol in enumerate(moleculas) if mol is None]
    if invalid:
        raise ValueError(
            "extrac_prop received None instead of a molecule at position(s): "
            + ", ".join(str(idx) for idx in invalid)
        )

    prop1 = [Descriptors.MolWt(mol) for mol in moleculas]              # molecular weight
    prop2 = [Descriptors.TPSA(mol) for mol in moleculas]               # topological polar surface area
    prop3 = [Descriptors.MolLogP(mol) for mol in moleculas]            # logP
    prop4 = [Descriptors.NumHAcceptors(mol) for mol in moleculas]      # number of H-bond acceptors
    prop5 = [Descriptors.NumHDonors(mol) for mol in moleculas]         # number of H-bond donors
    prop6 = [Descriptors.NumAromaticRings(mol) for mol in moleculas]   # number of aromatic rings
    prop7 = [Descriptors.NumRotatableBonds(mol) for mol in moleculas]  # number of rotatable bonds
    prop8 = [Descriptors.FractionCSP3(mol) for mol in moleculas]       # fraction of sp3 carbons

    # Wrapping each list in an outer list keeps the historical (1, n) shape,
    # which callers transpose into column vectors.
    aprop1 = np.array([prop1])
    aprop2 = np.array([prop2])
    aprop3 = np.array([prop3])
    aprop4 = np.array([prop4])
    aprop5 = np.array([prop5])
    aprop6 = np.array([prop6])
    aprop7 = np.array([prop7])
    aprop8 = np.array([prop8])

    return (aprop1, aprop2, aprop3, aprop4, aprop5, aprop6, aprop7, aprop8)

In [6]:
# Descriptor dataframe for the training/validation molecules

(aprop1, aprop2, aprop3, aprop4,
 aprop5, aprop6, aprop7, aprop8) = extrac_prop(mol_TrainVal)

# Transpose every descriptor array and stack the results side by side,
# giving one row per molecule and one column per descriptor.
dados = np.hstack([
    prop.T
    for prop in (aprop1, aprop2, aprop3, aprop4, aprop5, aprop6, aprop7, aprop8)
])
df_TrainVal = pd.DataFrame(
    data=dados,
    columns=['MolWt', 'tPSA', 'LogP', 'NHA', 'NHD', 'NAR', 'NRB', 'fcSP3'],
)

In [7]:
# Descriptor dataframe for the test molecules

(aprop1, aprop2, aprop3, aprop4,
 aprop5, aprop6, aprop7, aprop8) = extrac_prop(mol_Test)

# Transpose every descriptor array and stack the results side by side,
# giving one row per molecule and one column per descriptor.
dados = np.hstack([
    prop.T
    for prop in (aprop1, aprop2, aprop3, aprop4, aprop5, aprop6, aprop7, aprop8)
])
df_Test = pd.DataFrame(
    data=dados,
    columns=['MolWt', 'tPSA', 'LogP', 'NHA', 'NHD', 'NAR', 'NRB', 'fcSP3'],
)

In [8]:
# Stack the train/validation and test descriptor tables back into one frame
# with a fresh 0..n-1 index, then discard the computed tPSA and LogP columns
# (presumably because df_data already carries tPSA / WLOGP - confirm).
df_prop = (
    pd.concat([df_TrainVal, df_Test], axis=0, ignore_index=True)
    .drop(columns=['tPSA', 'LogP'])
)

In [9]:
# Place the computed descriptors next to the original table; rows are matched on index.
df_general = pd.concat([df_prop, df_data], axis='columns')

In [10]:
# Combined dataframe (computed descriptors alongside the original data)

df_general

Unnamed: 0,MolWt,NHA,NHD,NAR,NRB,fcSP3,tPSA,WLOGP,Output,Name,SMILES
0,475.523,7.0,5.0,1.0,7.0,0.454545,181.57,-0.01,0,Ertapenem,CC(O)C1C2C(C)C(SC3CNC(C3)C(=O)Nc3cccc(c3)C(O)=...
1,239.315,4.0,4.0,1.0,4.0,0.538462,72.72,2.03,1,Salbutamol,CC(C)(C)NCC(O)c1ccc(O)c(CO)c1
2,477.404,4.0,10.0,2.0,7.0,0.200000,167.58,3.01,0,Chlorhexidine,Clc1ccc(NC(=N)NC(=N)NCCCCNC(=N)NC(=N)Nc2ccc(Cl...
3,370.847,7.0,3.0,3.0,5.0,0.083333,163.27,2.58,0,Azosemide,NS(=O)(=O)c1cc(-c2nnn[nH]2)c(NCc2cccs2)cc1Cl
4,461.500,6.0,4.0,1.0,5.0,0.450000,173.45,-1.37,0,Azlocillin,CC1(C)SC2C(NC(=O)C(NC(=O)N3CCNC3=O)c3ccccc3)C(...
...,...,...,...,...,...,...,...,...,...,...,...
177,323.433,5.0,2.0,1.0,12.0,0.666667,59.95,2.21,1,Cicloprolol,CC(C)NCC(O)COc1ccc(OCCOCC2CC2)cc1
178,652.718,13.0,13.0,0.0,9.0,0.600000,358.19,-9.87,0,Capreomycin,CC1NC(=O)C(N)CNC(=O)C(NC(=O)C(NC(=O)C(CNC(=O)C...
179,260.337,3.0,1.0,1.0,3.0,0.533333,41.57,3.79,1,Fenspiride,O=C1NCC2(CCN(CCc3ccccc3)CC2)O1
180,821.141,9.0,8.0,1.0,12.0,0.526316,199.89,-1.63,0,Iohexol,CC(=O)N(CC(O)CO)c1c(I)c(C(=O)NCC(O)CO)c(I)c(C(...


In [11]:
# Descriptor dataframe: the combined table without the Name and SMILES columns

df_descript = df_general.drop(columns=['Name', 'SMILES'])
df_descript

Unnamed: 0,MolWt,NHA,NHD,NAR,NRB,fcSP3,tPSA,WLOGP,Output
0,475.523,7.0,5.0,1.0,7.0,0.454545,181.57,-0.01,0
1,239.315,4.0,4.0,1.0,4.0,0.538462,72.72,2.03,1
2,477.404,4.0,10.0,2.0,7.0,0.200000,167.58,3.01,0
3,370.847,7.0,3.0,3.0,5.0,0.083333,163.27,2.58,0
4,461.500,6.0,4.0,1.0,5.0,0.450000,173.45,-1.37,0
...,...,...,...,...,...,...,...,...,...
177,323.433,5.0,2.0,1.0,12.0,0.666667,59.95,2.21,1
178,652.718,13.0,13.0,0.0,9.0,0.600000,358.19,-9.87,0
179,260.337,3.0,1.0,1.0,3.0,0.533333,41.57,3.79,1
180,821.141,9.0,8.0,1.0,12.0,0.526316,199.89,-1.63,0


### Save Data

In [13]:
#df_descript.to_excel('dataset_HIA_.xlsx', index=False)
#df_general.to_excel('dataset_HIA_geral.xlsx', index=False)