# Preparation of the dataset from Esaki et al., 2019 (Mol. Inform.)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# Directory (relative to the notebook) that holds the raw SDF read below and
# receives the processed CSV written at the end.
DATAPATH = "../data/esaki2019"

In [None]:
from rdkit.Chem import SDMolSupplier
from rdkit import Chem

# Read the SDF and build a table with one row per readable record: the
# selected SD properties followed by a SMILES string generated by RDKit.
path = os.path.join(DATAPATH, "esaki2019.sdf")
suppl = SDMolSupplier(path)

cols = ["ChEMBL ID", "Compound name", "Dataset", "Observed", "Predicted"]
R = []
for i, mol in tqdm(enumerate(suppl)):
    # Entries that come back as None are skipped.
    if mol is not None:
        props = mol.GetPropsAsDict()
        # A property missing from the record raises KeyError here.
        r = [props[c] for c in cols]
        r.append(Chem.MolToSmiles(mol))
        R.append(r)

df = pd.DataFrame(R, columns=cols + ["smiles"])

In [None]:
# Binary label: 1 where "Observed" is exactly "Stable", 0 for anything else.
clf = [1 if o == "Stable" else 0 for o in df["Observed"].tolist()]
df["obs"] = clf

In [None]:
# Replace the space in the identifier column name with an underscore.
df = df.rename(columns={"ChEMBL ID": "ChEMBL_ID"})

In [None]:
#check invalid smiles

def invalid_mols(data, smi_col):
    """Drop rows whose SMILES string cannot be parsed by RDKit.

    The frame is modified in place and also returned, and the number of
    removed rows is printed.

    Args:
        data: pandas DataFrame containing a column of SMILES strings.
        smi_col: name of the column holding the SMILES strings.

    Returns:
        The same DataFrame object, without the rows whose SMILES made
        ``Chem.MolFromSmiles`` return ``None``.
    """
    smiles = data[smi_col].tolist()
    pre = len(data)

    # Positions (0-based, in row order) of SMILES that RDKit rejects. Only
    # the None check is needed, so the parsed molecules are not kept.
    bad_positions = [
        pos for pos, smi in enumerate(smiles)
        if Chem.MolFromSmiles(smi) is None
    ]

    # DataFrame.drop works on index *labels*, not positions. Translate the
    # positions through data.index so the right rows are removed even when
    # the frame does not carry a default 0..n-1 index (e.g. after filtering).
    # NOTE: if the index contains duplicate labels, every row sharing a
    # dropped label is removed.
    labels = [data.index[pos] for pos in bad_positions]
    if labels:
        data.drop(index=labels, inplace=True)

    post = len(data)
    print(str(pre - post))
    return data

# Remove rows whose SMILES RDKit cannot parse; the dropped count is printed.
df = invalid_mols(data=df, smi_col="smiles")

In [None]:
# Export only the identifier, SMILES and binary label columns, without the
# DataFrame index.
# NOTE(review): the output name says "2021" while the data folder and SDF say
# "2019" — confirm this is intended before renaming; downstream code may rely
# on the current file name.
out_cols = ["ChEMBL_ID", "smiles", "obs"]
out_path = os.path.join(DATAPATH, "pr_esaki2021.csv")
df[out_cols].to_csv(out_path, index=False)