In [1]:
import pandas as pd
import numpy as np
import mordred
from rdkit import Chem
from mordred import Calculator, descriptors



In [2]:
# read the input table that holds the SMILES lists
df = pd.read_csv(filepath_or_buffer="...csv")

### Prepare rdkit molecules

In [3]:
# make rdkit mols from the SMILES column of the frame loaded above
# (this cell previously read from `b_nvts`, a name that is never defined in
# this notebook, so it failed with a NameError on a fresh kernel)
df['rdmol'] = df['smiles'].map(Chem.MolFromSmiles)

# drop molecules that rdkit cannot handle (MolFromSmiles gives None for those);
# the explicit copy keeps later column assignments on `df` from being treated
# as writes to a slice of the unfiltered frame
df = df.dropna(subset=['rdmol']).copy()

In [4]:
# derive an InChI string from every RDKit molecule
df['inchi'] = df['rdmol'].apply(Chem.MolToInchi)

In [6]:
# keep only the first row for each distinct InChI string
df = df.drop_duplicates(subset=["inchi"], keep="first")

### Calculate Mordred descriptors

In [11]:
# set up the Mordred calculator with the full descriptor set, 3D descriptors excluded
calc = Calculator(descriptors, ignore_3D=True)

In [12]:
%%time
# compute Mordred descriptors for all molecules (may take long)
md=calc.pandas(df['rdmol'])

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:55<00:00, 17.96it/s]


Wall time: 57.8 s


In [13]:
# replace mordred errors with NaNs
def _mordred_error_to_nan(value):
    """Return NaN for a mordred Missing/Error placeholder, otherwise the value unchanged."""
    # isinstance (rather than an exact type() match) also catches subclasses
    if isinstance(value, (mordred.error.Missing, mordred.error.Error)):
        return np.nan
    return value


# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated; look the method up on the class so either pandas
# generation works (and a column that happened to be called "map" cannot
# shadow the method).
_apply_elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
md = _apply_elementwise(md, _mordred_error_to_nan)

In [14]:
# discard every descriptor column that contains at least one NaN
md = md.dropna(axis="columns", how="any")

In [180]:
# concatenate info and descriptors
# pd.concat(axis=1) aligns rows on index labels. `df` keeps the gappy labels
# left behind by the earlier dropna/drop_duplicates, while `md` may carry a
# fresh 0..n-1 index (depends on the mordred version), which would pair
# descriptors with the wrong molecule and pad the result with NaN rows.
# `md` was computed from df['rdmol'] in row order, so align strictly by position.
assert len(df) == len(md), "df and md must describe the same molecules, row for row"
to_save = pd.concat(
    [
        df.drop(columns='rdmol').reset_index(drop=True),
        md.reset_index(drop=True),
    ],
    axis=1,
)

In [181]:
# peek at the first rows of the combined table
to_save.head()

Unnamed: 0,inchi,aryl,type,NV,Reaxys,Common,ABC,ABCGG,nAcid,nBase,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,InChI=1S/C9H12BClO3/c1-6(2)14-9-4-3-7(11)5-8(9...,aryl,ba,True,True,True,10.365564,9.738246,0,0,...,0.0,9.299175,45.12275,214.056802,8.232954,300,18,66.0,73.0,3.138889
1,InChI=1S/C8H10BFO2/c1-5-4-8(10)6(2)3-7(5)9(11)...,aryl,ba,True,True,True,8.91091,8.583028,0,0,...,0.0,9.296977,42.737439,168.075788,7.639809,187,17,58.0,66.0,2.666667
2,"InChI=1S/C12H16BFO2/c1-11(2)12(3,4)16-13(15-11...",aryl,bpin,True,True,True,12.630705,11.241455,0,0,...,7.195187,9.919115,63.748556,222.122738,6.941336,423,25,88.0,104.0,3.256944
3,"InChI=1S/C15H24BNO4/c1-13(2,3)19-12(18)17-10-8...",hetaryl,bpin,True,True,True,16.562115,15.535186,0,0,...,7.635787,10.17256,71.682278,293.179839,6.515108,899,32,116.0,136.0,4.1875
4,InChI=1S/C12H11BO3/c14-13(15)10-6-8-12(9-7-10)...,aryl,ba,True,True,True,12.199155,9.978922,0,0,...,0.0,9.370331,47.806046,214.080125,7.928894,489,20,78.0,87.0,3.611111


In [184]:
# write the combined table to CSV, leaving out the index column
to_save.to_csv(path_or_buf="...", index=False)