In [8]:
import pandas as pd
import numpy as np
import mordred
from rdkit import Chem
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors

In [3]:
# Load the combined yields table; its 'smi' column holds the SMILES strings
# used by the cells below.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/AndrzejZuranski/Dropbox/DataX_PU/Projects/Shutian/data/yields_combined_from_SKK.csv"
)

### Prepare RDKit molecules

In [4]:
# Parse every SMILES string into an RDKit molecule. RDKit hands back None for
# strings it cannot parse, which pandas treats as a missing value.
df['rdmol'] = df['smi'].map(Chem.MolFromSmiles)

# In one pass:
#   1. discard rows whose SMILES could not be parsed,
#   2. attach the InChI string of each remaining molecule,
#   3. keep only the first row for every distinct InChI.
df = (
    df.dropna(subset=['rdmol'])
      .assign(inchi=lambda frame: frame['rdmol'].map(Chem.MolToInchi))
      .drop_duplicates(subset="inchi")
)

#### Optimize geometry

In [9]:
%%time
df['rdmolH'] = df['rdmol'].map(Chem.AddHs)
_ = df['rdmolH'].map(AllChem.EmbedMolecule)
_ = df['rdmolH'].map(AllChem.MMFFOptimizeMolecule)

Wall time: 658 ms


### Calculate Mordred descriptors

In [10]:
# Set up a Mordred calculator with every registered descriptor, restricted to
# those that do not need 3D coordinates.
calc = Calculator(
    descriptors,
    ignore_3D=True,
)

In [19]:
%%time
# compute Mordred descriptors for all molecules (may take long)
md=calc.pandas(df['rdmol'])

100%|███████████████████████████████████████████████████████████████████████████████| 52/52 [00:02<00:00, 24.21it/s]


Wall time: 3.09 s


In [20]:
# Mordred stores Missing / Error objects in cells it could not compute.
# Keep every ordinary value and swap those placeholder objects for NaN so that
# pandas recognises them as missing data.
md = md.applymap(
    lambda cell: cell
    if type(cell) not in (mordred.error.Missing, mordred.error.Error)
    else np.nan
)

In [21]:
# Keep only the descriptors that were computed for every molecule: any column
# containing at least one NaN is removed.
md = md.dropna(axis='columns')

In [22]:
# add smi
# md rows follow the order of df['rdmol'], but the two frames do not
# necessarily share index labels: df keeps the gaps left by the dropna /
# drop_duplicates steps above, while md may carry a fresh 0..n-1 index
# (depending on the mordred version). Passing a Series would align on those
# labels and could misassign SMILES or fill them with NaN, so insert the raw
# values positionally instead.
md.insert(0, 'smi', df['smi'].values)

In [24]:
# Write the descriptor table (SMILES first, then the descriptors) to disk,
# leaving out the DataFrame index.
md.to_csv(
    path_or_buf="C:/Users/AndrzejZuranski/Dropbox/DataX_PU/Projects/Shutian/data/md_desc.csv",
    index=False,
)