# Libraries


In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
import mordred
from mordred import Calculator, descriptors

# Dataset

In [104]:
# Load the HSP/SMILES table. The CSV's first column is read as the index and then
# replaced by a fresh 0..n-1 index: 8 molecules (e.g. water, N, [Br-]) that can't be
# turned into graph molecules were removed from the dataset, so the indexes are reset.
df = pd.read_csv('data/HSP_SMILES.csv', index_col=0).reset_index(drop=True)
df

Unnamed: 0,al,CAS,smiles,δD,δP,δH
0,"1,1,1,2-Tetrachloroethane",b'630-20-6',ClCC(Cl)(Cl)Cl,18.0,4.4,4.2
1,"1,1,1-Trichloroethane",b'71-55-6',CC(Cl)(Cl)Cl,16.8,4.3,2.0
2,"1,1,1-Trifluoroethane",b'420-46-2',CC(F)(F)F,14.6,10.0,0.0
3,"1,1,2,2-Tetrabromoethane",b'79-27-6',BrC(Br)C(Br)Br,21.0,7.0,8.2
4,"1,1,2,2-Tetrachloroethane",b'79-34-5',ClC(Cl)C(Cl)Cl,18.8,5.1,5.3
...,...,...,...,...,...,...
1187,Quinine,b'130-95-0',[H][C@@]1([C@@H](C2=CC=NC3=CC=C(C=C23)OC)O)C[C...,19.0,6.6,11.0
1188,Sulfur Dioxide,b'9/5/7446',O=S=O,15.8,8.4,10.0
1189,Thionyl Chloride,b'9/7/7719',O=S(Cl)Cl,16.9,6.4,6.1
1190,Triethylene Glycol Monooleyl Ether,b'5274-66-8',COCCOCCOCCO,16.0,3.1,8.4


In [105]:
# Make sure every SMILES entry is a plain string before it is handed to RDKit.
df['smiles'] = df['smiles'].astype(str)

# SMILES as a Python list (the column was just cast to str, so no second cast is needed).
smiles = df['smiles'].tolist()

# Generate one RDKit molecule object per SMILES string.
mols = [Chem.MolFromSmiles(smile) for smile in smiles]

# Chem.MolFromSmiles returns None when a string cannot be parsed. Fail here with the
# offending SMILES rather than later, inside the descriptor calculation.
unparsed = [smile for smile, mol in zip(smiles, mols) if mol is None]
if unparsed:
    raise ValueError(
        "RDKit could not parse {} SMILES string(s), e.g.: {}".format(len(unparsed), unparsed[:10])
    )



# Descriptors

Initialize the descriptor calculator:

In [106]:
# Build a mordred Calculator from the `descriptors` module, with ignore_3D=True.
calc = Calculator(descriptors,ignore_3D=True) 
# Number of descriptors registered on the calculator.
len(calc.descriptors)

1613

Calculate descriptors for the molecules in `mols`:

In [107]:
# Run the calculator over all molecules in `mols` and collect the results as a table
# with one row per molecule and one column per descriptor.
descs = calc.pandas(mols) 
# Summarise the table's size and column dtypes.
descs.info()

 54%|█████▍    | 648/1192 [00:09<00:05, 96.64it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 62%|██████▏   | 742/1192 [00:11<00:06, 70.86it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 1192/1192 [00:16<00:00, 73.28it/s] 


<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 1192 entries, 0 to 1191
Columns: 1613 entries, ABC to mZagreb2
dtypes: bool(2), float64(500), int64(309), object(802)
memory usage: 14.7+ MB


Error tracking:

In [108]:
# mordred error types that are treated as missing values in the descriptor table.
ERRS = [mordred.error.MissingValueBase,
        mordred.error.Missing,
        mordred.error.Error,
        mordred.error.MultipleFragments,
        mordred.error.Missing3DCoordinate,
        mordred.error.Timeout]

# Replace every cell holding one of the error types with np.nan.
# isinstance accepts a tuple of classes, so one pass over the frame covers all of ERRS
# (previously the whole frame was traversed once per error type).
# NOTE(review): DataFrame.applymap is deprecated in newer pandas in favour of
# DataFrame.map; kept here because the installed pandas version is not visible.
err_types = tuple(ERRS)
descs = descs.applymap(lambda x: np.nan if isinstance(x, err_types) else x)

The descriptor dataframe:

In [109]:
# Display the descriptor table (pandas abbreviates the rendering of large frames).
descs

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,4.012290,4.284816,0,0,5.818626,2.074313,4.148627,5.818626,0.969771,2.626363,...,7.989899,31.665095,165.891061,20.736383,28,3,24.0,22.0,4.312500,1.375000
1,3.464102,3.464102,0,0,4.000000,2.000000,4.000000,4.000000,0.800000,2.444466,...,7.625107,29.418928,131.930033,16.491254,16,0,20.0,16.0,4.062500,1.000000
2,3.464102,3.464102,0,0,4.000000,2.000000,4.000000,4.000000,0.800000,2.444466,...,7.625107,29.418928,84.018685,10.502336,16,0,20.0,16.0,4.062500,1.000000
3,3.932653,4.244375,0,0,6.000000,2.000000,4.000000,6.000000,1.000000,2.610845,...,7.626083,30.698690,341.688998,42.711125,29,4,22.0,21.0,4.222222,1.444444
4,3.932653,4.244375,0,0,6.000000,2.000000,4.000000,6.000000,1.000000,2.610845,...,7.626083,30.698690,165.891061,20.736383,29,4,22.0,21.0,4.222222,1.444444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187,18.958632,14.851657,0,1,32.595631,2.494948,4.989896,32.595631,1.358151,4.132305,...,10.353448,59.266606,324.183778,6.753829,1286,42,132.0,161.0,7.000000,5.333333
1188,1.414214,1.414214,0,0,2.828427,1.414214,2.828427,2.828427,0.942809,1.849457,...,4.174387,17.310771,63.961900,21.320633,4,0,6.0,4.0,2.250000,1.000000
1189,2.449490,2.449490,0,0,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,6.188264,24.179697,117.904691,29.476173,9,0,12.0,9.0,3.111111,1.000000
1190,7.071068,6.765664,0,0,13.191508,1.931852,3.863703,13.191508,1.199228,3.202455,...,7.601402,37.236738,164.104859,6.077958,220,8,38.0,36.0,4.250000,3.000000


Count the NaN values (errors) in columns:

In [110]:
# Number of NaN cells per descriptor, keyed by column name.
# isna().sum() counts every column in one vectorized pass; the result stays a dict
# because the following cell builds a DataFrame from it with DataFrame.from_dict.
nans = descs.isna().sum().to_dict()

In [111]:
# Turn the {descriptor: NaN count} dict into a one-column DataFrame ('nans') indexed by
# descriptor name, to see the amount of NaN values for each descriptor.
# The name `nans` is rebound from dict to DataFrame here, so guard the conversion:
# re-running this cell on its own then just re-displays the table instead of failing.
if isinstance(nans, dict):
    nans = pd.DataFrame.from_dict(nans, orient='index', columns=['nans'])
nans

Unnamed: 0,nans
ABC,0
ABCGG,0
nAcid,0
nBase,0
SpAbs_A,4
...,...
WPol,0
Zagreb1,0
Zagreb2,0
mZagreb1,3


Select the descriptors for which at least 5% of the values in our dataset are NaN:

(The 5% cutoff is arbitrary, but since the dataset is small, it is better to clear out as many errors as possible.)

In [112]:
# Maximum tolerated share of NaN values per descriptor (5%).
threshold = 0.05
# Number of molecules (rows) in the descriptor table.
n = len(descs)
# The 5% share expressed as an absolute row count.
t = threshold * n

# Descriptors whose NaN count reaches the cutoff.
nans.loc[nans['nans'] >= t]

Unnamed: 0,nans
AATS4dv,127
AATS5dv,288
AATS6dv,475
AATS7dv,697
AATS8dv,835
...,...
MDEN-12,1180
MDEN-13,1189
MDEN-22,1176
MDEN-23,1187


Remove the descriptors that satisfy this condition:

In [113]:
# Names of the descriptors whose NaN count reaches the cutoff `t`.
nans_to_drop = nans.loc[nans['nans'] >= t].index
print(len(nans_to_drop))

# New descriptor dataframe without the NaN-heavy columns.
descs_nonan = descs.drop(columns=nans_to_drop)

437


Now, count the number of zeros for each descriptor. We remove a descriptor if 90% or more of its values are zero:

In [114]:
# Number of cells equal to 0 per remaining descriptor, as a one-column DataFrame
# ('zeros') indexed by descriptor name. eq(0).sum() counts all columns in one pass.
zeros = descs_nonan.eq(0).sum().to_frame(name='zeros')

# Cutoff: a descriptor is dropped when at least 90% of its rows are zero.
# Dedicated names are used so the NaN filter's `threshold`, `n` and `t` are not
# overwritten; otherwise re-running the NaN-drop cell after this one would silently
# apply the 90% cutoff instead of the 5% one.
zero_threshold = 0.9
zero_cutoff = zero_threshold * len(descs_nonan)

zeros_to_drop = zeros[zeros['zeros'] >= zero_cutoff].index
print(len(zeros_to_drop))

305


Drop these columns (descriptors):

In [115]:
# Descriptor table without the columns listed in `zeros_to_drop`.
descs_filtered = descs_nonan.drop(columns=zeros_to_drop)

In [117]:
# Display the filtered descriptor table (pandas abbreviates the rendering of large frames).
descs_filtered

Unnamed: 0,ABC,ABCGG,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,4.012290,4.284816,5.818626,2.074313,4.148627,5.818626,0.969771,2.626363,2.285500,0.380917,...,7.989899,31.665095,165.891061,20.736383,28,3,24.0,22.0,4.312500,1.375000
1,3.464102,3.464102,4.000000,2.000000,4.000000,4.000000,0.800000,2.444466,2.121320,0.424264,...,7.625107,29.418928,131.930033,16.491254,16,0,20.0,16.0,4.062500,1.000000
2,3.464102,3.464102,4.000000,2.000000,4.000000,4.000000,0.800000,2.444466,2.121320,0.424264,...,7.625107,29.418928,84.018685,10.502336,16,0,20.0,16.0,4.062500,1.000000
3,3.932653,4.244375,6.000000,2.000000,4.000000,6.000000,1.000000,2.610845,2.309401,0.384900,...,7.626083,30.698690,341.688998,42.711125,29,4,22.0,21.0,4.222222,1.444444
4,3.932653,4.244375,6.000000,2.000000,4.000000,6.000000,1.000000,2.610845,2.309401,0.384900,...,7.626083,30.698690,165.891061,20.736383,29,4,22.0,21.0,4.222222,1.444444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187,18.958632,14.851657,32.595631,2.494948,4.989896,32.595631,1.358151,4.132305,4.427587,0.184483,...,10.353448,59.266606,324.183778,6.753829,1286,42,132.0,161.0,7.000000,5.333333
1188,1.414214,1.414214,2.828427,1.414214,2.828427,2.828427,0.942809,1.849457,1.707107,0.569036,...,4.174387,17.310771,63.961900,21.320633,4,0,6.0,4.0,2.250000,1.000000
1189,2.449490,2.449490,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,1.931852,0.482963,...,6.188264,24.179697,117.904691,29.476173,9,0,12.0,9.0,3.111111,1.000000
1190,7.071068,6.765664,13.191508,1.931852,3.863703,13.191508,1.199228,3.202455,3.100954,0.281905,...,7.601402,37.236738,164.104859,6.077958,220,8,38.0,36.0,4.250000,3.000000


In [118]:
# Target columns that should sit at the far right of the combined table.
trm = ['δD', 'δP', 'δH']

# Put the dataset columns and the filtered descriptors side by side (aligned on the index).
DF = pd.concat([df, descs_filtered], axis=1)

# Keep all other columns in their current order and append the targets at the end.
new_cols = [col for col in DF.columns if col not in trm] + trm
DF = DF[new_cols]

DF.head()

Unnamed: 0,al,CAS,smiles,ABC,ABCGG,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,δD,δP,δH
0,"1,1,1,2-Tetrachloroethane",b'630-20-6',ClCC(Cl)(Cl)Cl,4.01229,4.284816,5.818626,2.074313,4.148627,5.818626,0.969771,...,20.736383,28,3,24.0,22.0,4.3125,1.375,18.0,4.4,4.2
1,"1,1,1-Trichloroethane",b'71-55-6',CC(Cl)(Cl)Cl,3.464102,3.464102,4.0,2.0,4.0,4.0,0.8,...,16.491254,16,0,20.0,16.0,4.0625,1.0,16.8,4.3,2.0
2,"1,1,1-Trifluoroethane",b'420-46-2',CC(F)(F)F,3.464102,3.464102,4.0,2.0,4.0,4.0,0.8,...,10.502336,16,0,20.0,16.0,4.0625,1.0,14.6,10.0,0.0
3,"1,1,2,2-Tetrabromoethane",b'79-27-6',BrC(Br)C(Br)Br,3.932653,4.244375,6.0,2.0,4.0,6.0,1.0,...,42.711125,29,4,22.0,21.0,4.222222,1.444444,21.0,7.0,8.2
4,"1,1,2,2-Tetrachloroethane",b'79-34-5',ClC(Cl)C(Cl)Cl,3.932653,4.244375,6.0,2.0,4.0,6.0,1.0,...,20.736383,29,4,22.0,21.0,4.222222,1.444444,18.8,5.1,5.3


In [121]:
# These descriptors raise some errors in the code, so we remove them.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell re-runnable:
# a second run, or a run where a column was already filtered out, no longer raises KeyError.
DF = DF.drop(columns=['Lipinski', 'GhoseFilter'], errors='ignore')

In [119]:
# Write the combined table (dataset columns, filtered descriptors, targets) to CSV.
# NOTE(review): run this after the cell that drops 'Lipinski'/'GhoseFilter' so the saved
# file matches the in-memory DF; the recorded execution counts suggest it was run before.
DF.to_csv('data/HSP_descriptors_NO_ERRS_ZEROS.csv')