In [34]:
import os
import pandas as pd
import rdkit
from standardiser import standardise
from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*')                                                                                                                                                           

data_dir = os.path.join('..', 'data')
np_dir = os.path.join(data_dir, 'all', 'NP2')
sd_dir = os.path.join(data_dir, 'all', 'SD2')

In [38]:
def molecule_loader(subfolder):
    sdf_paths = []
    mol_paths = []
    mol2_paths = []
    names = []
    for fn in os.listdir(subfolder):
        if fn.endswith(".sdf"):
            sdf_paths.append(os.path.join(subfolder, fn))
            names += [fn[:-4]]
        if fn.endswith(".mol"):
            mol_paths.append(os.path.join(subfolder, fn))
            names += [fn[:-4]]
        if fn.endswith(".mol2"):
            mol2_paths.append(os.path.join(subfolder, fn))
            names += [fn[:-5]]
    mols = []
    for sdf_path in sdf_paths:
        mols.append(rdkit.Chem.SDMolSupplier(sdf_path))
    for mol_path in mol_paths:
        mols.append(rdkit.Chem.MolFromMolFile(mol_path))
    for mol2_path in mol2_paths:
        mols.append(rdkit.Chem.MolFromMol2File(mol2_path))
    mols_ = []
    c = 0
    for i, mol in enumerate(mols):
        try:
            mol = standardise.run(mol)
            if mol is not None:
                mols_ += [(names[i], mol)]
        except:
            c += 1
            continue
    print("Number of non-standardized molecules (skipped) {0}".format(c))
    return mols_

np_mols = molecule_loader(sd_dir)
sd_mols = molecule_loader(np_dir)

def mols_to_table(mols, category):
    mols_ = []
    for name, mol in mols:
        mols_ += [(name, rdkit.Chem.MolToInchiKey(mol), rdkit.Chem.MolToSmiles(mol), category)]
    df = pd.DataFrame(mols_, columns=['file_name', "inchikey", 'smiles', "category"])
    return df

np_df = mols_to_table(np_mols, "natural")
sd_df = mols_to_table(sd_mols, "synthetic")

df = pd.concat([np_df, sd_df]).drop_duplicates().reset_index(drop=True)



Number of non-standardized molecules (skipped) 87
Number of non-standardized molecules (skipped) 150


In [40]:
df.to_csv(os.path.join(data_dir, 'all', 'all_molecules.csv'), index=False)

Unnamed: 0,file_name,inchikey,smiles,category
0,MCSJ21_0003,LYWZYDYFZJVNFA-KEBDBYFISA-N,C(=N/n1cnnc1)\c1c(-c2ccccc2)[nH]c2ccccc12,natural
1,MCSJ20_0003,ATJLFWUPTQUWHM-UHFFFAOYSA-N,CN(C)CCOc1ccc(CNC(=O)CSc2cn(C)c3ccccc23)cc1,natural
2,Orotic acid,ROPLRWWORDJINQ-UHFFFAOYSA-N,CN1C(=O)C(=O)c2cc(C(=O)O)ccc21,natural
3,Taribavirin Hydrochloride,ALEMQFKPROZQHR-UHFFFAOYSA-N,CCS(=O)(=O)N1CC(CC#N)(n2cc(-c3ncnc4[nH]ccc34)c...,natural
4,JBB19_0008,KVFCOBZWUCKJFB-UHFFFAOYSA-N,Cc1cccc(O)c1C(=O)Oc1cccc(C)c1C(=O)Oc1cncc(Cl)c1,natural
...,...,...,...,...
743,10-Hydroxyusambarens,FZKRWTVMKFSFSG-CYBMUJFWSA-N,CNc1ccc([C@H]2CCn3c2nc2ccccc2c3=O)cc1C(=O)OC,synthetic
744,J04_02,DQEAAVKYRCHQOQ-IEFPXSNCSA-N,C[C@H]1O[C@H](Oc2ccc3ccc(=O)oc3c2-c2c(O)ccc3cc...,synthetic
745,J18_21,WKOLLVMJNQIZCI-UHFFFAOYSA-N,COc1cc(C(=O)O)ccc1O,synthetic
746,J08_21,IQPNAANSBPBGFQ-UHFFFAOYSA-N,O=c1cc(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,synthetic
