In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import rdkit
import numpy as np
import pandas as pd
rdkit.__version__

'2023.03.3'

Read in the acid SMILES strings and generate descriptors for them

In [2]:
# Load the acid SMILES; they are stored as the index (first column) of the CSV
acids_table = pd.read_csv("amide_smiles_substrates_acids.csv", header=0, index_col=0)
smiles_acids = acids_table.index.to_list()
print(f"{len(smiles_acids)} acid SMILES read in.")

66 acid SMILES read in.


In [3]:
# define a function for descriptor calculation
def getMolDescriptors(mol, missingVal=np.nan, descList=None):
    """Evaluate a list of descriptor functions on a single molecule.

    Parameters
    ----------
    mol
        Object handed unchanged to every descriptor function (in this
        notebook, the result of ``Chem.MolFromSmiles``).
    missingVal
        Value stored for a descriptor whose function raised an exception.
        Defaults to ``np.nan``.
    descList
        Optional iterable of ``(name, function)`` pairs to evaluate. When
        omitted, ``Descriptors._descList`` is used.

    Returns
    -------
    dict
        Maps each descriptor name to its computed value, or to ``missingVal``
        if that descriptor's calculation failed.
    """
    if descList is None:
        # Resolved lazily so callers supplying their own list never touch it.
        descList = Descriptors._descList
    descriptors = {}
    # calculate the descriptors
    for name, function in descList:
        try:
            value = function(mol)
        except Exception:
            # Only ordinary errors are treated as "descriptor failed";
            # KeyboardInterrupt / SystemExit still propagate so a long run
            # can be stopped.
            value = missingVal
        descriptors[name] = value
    return descriptors

In [4]:
# calculate the descriptors
# Chem.MolFromSmiles returns None for a SMILES it cannot parse. Stop here and name the
# offending strings instead of passing None on to the descriptor functions.
mols_acids = [Chem.MolFromSmiles(smiles) for smiles in smiles_acids]
unparsed_acids = [smiles for smiles, mol in zip(smiles_acids, mols_acids) if mol is None]
if unparsed_acids:
    raise ValueError(f"Could not parse {len(unparsed_acids)} acid SMILES: {unparsed_acids}")
descriptors_acids = [getMolDescriptors(mol) for mol in mols_acids]

In [5]:
# Assemble the per-molecule descriptor dicts into one table, indexed by the acid SMILES
df_acids = pd.DataFrame(data=descriptors_acids, index=smiles_acids)
df_acids

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
O=C(O)c1cc2ccccc2s1,10.580980,10.580980,0.399259,-0.850741,0.728317,178.212,172.164,178.008850,60,0,...,0,0,0,0,0,0,0,1,0,0
O=C(O)c1ccco1,9.968287,9.968287,0.023148,-1.032407,0.588598,112.084,108.052,112.016044,42,0,...,0,0,0,0,0,0,0,0,0,0
O=C(O)c1cccc(-c2ccccc2)c1,10.784467,10.784467,0.316068,-0.895573,0.804747,198.221,188.141,198.068080,74,0,...,0,0,0,0,0,0,0,0,0,0
CC(C(=O)O)c1ccc(-c2ccccc2)c(F)c1,13.966636,13.966636,0.396481,-0.956610,0.893750,244.265,231.161,244.089958,92,0,...,0,0,0,0,0,0,0,0,0,0
O=C(O)C(c1ccccc1)c1ccccc1,11.300378,11.300378,0.581111,-0.821852,0.848907,212.248,200.152,212.083730,80,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCCCOc1ccc(C(=O)O)cc1,10.531775,10.531775,0.285038,-0.911937,0.732558,194.230,180.118,194.094294,76,0,...,0,0,0,0,0,0,0,0,1,0
O=C(O)c1ccc(-c2ccccc2)cc1,10.648736,10.648736,0.314520,-0.893878,0.804747,198.221,188.141,198.068080,74,0,...,0,0,0,0,0,0,0,0,0,0
O=C(O)C1CCC(F)(F)CC1,12.448262,12.448262,0.113426,-2.619097,0.642153,164.151,154.071,164.064886,64,0,...,0,0,0,0,0,0,0,0,0,0
O=C(O)c1ccncc1Cl,10.309537,10.309537,0.080247,-1.034815,0.671147,157.556,153.524,156.993056,52,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# remove columns with nan values (meaning the descriptor calculation failed)
nan_columns = df_acids.columns[df_acids.isna().any()]
# Rebind the name instead of using inplace=True, so the frame object displayed
# by the earlier cell is not mutated behind the reader's back.
df_acids = df_acids.dropna(axis="columns")
print(f"Removed {len(nan_columns)} features with missing values: {list(nan_columns)}")

Removed 0 features with missing values: []


In [7]:
# Write the acid descriptor table to CSV, keeping the SMILES index and the header row
df_acids.to_csv(
    path_or_buf="./../1_Dataset_Generation/Data_For_Individual_Substrates/amide_rdkit_descr_acids.csv",
    header=True,
    index=True,
)

Read in the amine SMILES strings and generate descriptors for them

In [8]:
# Load the amine SMILES; they are stored as the index (first column) of the CSV
amines_table = pd.read_csv("amide_smiles_substrates_amines.csv", header=0, index_col=0)
smiles_amines = amines_table.index.to_list()
print(f"{len(smiles_amines)} amine SMILES read in.")

70 amine SMILES read in.


In [9]:
# calculate the descriptors
# Chem.MolFromSmiles returns None for a SMILES it cannot parse. Stop here and name the
# offending strings instead of passing None on to the descriptor functions.
mols_amines = [Chem.MolFromSmiles(smiles) for smiles in smiles_amines]
unparsed_amines = [smiles for smiles, mol in zip(smiles_amines, mols_amines) if mol is None]
if unparsed_amines:
    raise ValueError(f"Could not parse {len(unparsed_amines)} amine SMILES: {unparsed_amines}")
descriptors_amines = [getMolDescriptors(mol) for mol in mols_amines]

In [10]:
# Assemble the per-molecule descriptor dicts into one table, indexed by the amine SMILES
df_amines = pd.DataFrame(data=descriptors_amines, index=smiles_amines)
df_amines

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
Cc1ccc(N)nc1,5.315556,5.315556,0.579259,0.579259,0.537512,108.144,100.080,108.068748,42,0,...,0,0,0,0,0,0,0,0,0,0
NCc1ccc(F)cc1F,12.530787,12.530787,0.103287,-0.581019,0.631760,143.136,136.080,143.054656,54,0,...,0,0,0,0,0,0,0,0,0,0
NCc1ccc(Cl)cc1,5.627823,5.627823,0.580525,0.580525,0.634348,141.601,133.537,141.034527,48,0,...,0,0,0,0,0,0,0,0,0,0
COc1ccc(CN)cc1,5.400528,5.400528,0.587296,0.587296,0.663324,137.182,126.094,137.084064,54,0,...,0,0,0,0,0,0,0,0,0,0
COc1ccc(N)cn1,5.363316,5.363316,0.587222,0.587222,0.598238,124.143,116.079,124.063663,48,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CC(C)NC(C)C,3.305556,3.305556,0.625000,0.625000,0.553987,101.193,86.073,101.120449,44,0,...,0,0,0,0,0,0,0,0,0,0
Cc1ccc(S(N)(=O)=O)cc1,10.721948,10.721948,0.155648,-3.519583,0.673740,171.221,162.149,171.035400,60,0,...,0,1,0,0,0,0,0,0,0,0
Nc1ccc(Cl)cn1,5.493611,5.493611,0.494815,0.494815,0.572458,128.562,123.522,128.014126,42,0,...,0,0,0,0,0,0,0,0,0,0
NCC1CC1,5.233796,5.233796,0.912037,0.912037,0.474129,71.123,62.051,71.073499,30,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# remove columns with nan values (meaning the descriptor calculation failed)
nan_columns = df_amines.columns[df_amines.isna().any()]
# Rebind the name instead of using inplace=True, so the frame object displayed
# by the earlier cell is not mutated behind the reader's back.
df_amines = df_amines.dropna(axis="columns")
print(f"Removed {len(nan_columns)} features with missing values: {list(nan_columns)}")

Removed 0 features with missing values: []


In [12]:
# Write the amine descriptor table to CSV, keeping the SMILES index and the header row
df_amines.to_csv(
    path_or_buf="./../1_Dataset_Generation/Data_For_Individual_Substrates/amide_rdkit_descr_amines.csv",
    header=True,
    index=True,
)

Read in the amide product SMILES strings and generate descriptors for them

In [13]:
# Read in the amide product smiles (stored as the index column of the CSV)
smiles_amides = pd.read_csv("amide_smiles_products.csv",index_col=0,header=0).index.to_list()
# The message previously said "amine" (copy-paste from the amine cell); these are the amide products.
print(f"{len(smiles_amides)} amide product SMILES read in.")

632 amine SMILES read in.


In [14]:
# calculate the descriptors
# Chem.MolFromSmiles returns None for a SMILES it cannot parse. Stop here and name the
# offending strings instead of passing None on to the descriptor functions.
mols_amides = [Chem.MolFromSmiles(smiles) for smiles in smiles_amides]
unparsed_amides = [smiles for smiles, mol in zip(smiles_amides, mols_amides) if mol is None]
if unparsed_amides:
    raise ValueError(f"Could not parse {len(unparsed_amides)} amide SMILES: {unparsed_amides}")
descriptors_amides = [getMolDescriptors(mol) for mol in mols_amides]

In [15]:
# Assemble the per-molecule descriptor dicts into one table, indexed by the amide SMILES
df_amides = pd.DataFrame(data=descriptors_amides, index=smiles_amides)
df_amides

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
Cc1ccc(NC(=O)c2cc3ccccc3s2)nc1,12.127948,12.127948,0.113377,-0.113377,0.766533,268.341,256.245,268.067034,94,0,...,0,0,0,0,0,0,0,1,0,0
O=C(NCc1ccc(F)cc1F)c1ccco1,13.228583,13.228583,0.022176,-0.690051,0.890606,237.205,228.133,237.060135,88,0,...,0,0,0,0,0,0,0,0,0,0
O=C(NCc1ccc(Cl)cc1)c1cccc(-c2ccccc2)c1,12.331298,12.331298,0.085434,-0.085434,0.724423,321.807,305.679,321.092042,114,0,...,0,0,0,0,0,0,0,0,0,0
COc1ccc(CNC(=O)C(C)c2ccc(-c3ccccc3)c(F)c2)cc1,14.538211,14.538211,0.141038,-0.442063,0.674914,363.432,341.256,363.163457,138,0,...,0,0,0,0,0,0,0,0,0,0
COc1ccc(NC(=O)c2cccc(-c3ccccc3)c2)cn1,12.381949,12.381949,0.171530,-0.171530,0.792046,304.349,288.221,304.121178,114,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
COc1ccc(NC(=O)C2CCN(C(=O)OCc3ccccc3)CC2)cn1,12.404385,12.404385,0.056232,-0.336050,0.875972,369.421,346.237,369.168856,142,0,...,0,0,0,0,0,0,0,0,0,0
CON(C)C(=O)c1c(C)cc(C)cc1C,11.929587,11.929587,0.104167,-0.104167,0.695986,207.273,190.137,207.125929,82,0,...,0,0,0,0,0,0,0,0,0,0
Cc1ccc(S(=O)(=O)NC(=O)c2ccc3nc(C)ccc3c2)cc1,12.300494,12.300494,0.055121,-3.899579,0.795138,340.404,324.276,340.088163,122,0,...,0,1,0,0,0,0,0,0,0,0
Cc1ccc(Cl)c(NC(=O)c2ccc3nccnc3c2)c1,12.303349,12.303349,0.226803,-0.226803,0.781557,297.745,285.649,297.066890,104,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# remove columns with nan values (meaning the descriptor calculation failed)
nan_columns = df_amides.columns[df_amides.isna().any()]
# Rebind the name instead of using inplace=True, so the frame object displayed
# by the earlier cell is not mutated behind the reader's back.
df_amides = df_amides.dropna(axis="columns")
print(f"Removed {len(nan_columns)} features with missing values: {list(nan_columns)}")

Removed 0 features with missing values: []


In [17]:
# Write the amide product descriptor table to CSV, keeping the SMILES index and the header row
df_amides.to_csv(
    path_or_buf="./../1_Dataset_Generation/Data_For_Individual_Substrates/amide_rdkit_descr_prods.csv",
    header=True,
    index=True,
)