In [2]:
import os
import pandas as pd
import rdkit
from standardiser import standardise
from rdkit import RDLogger
import shutil

RDLogger.DisableLog("rdApp.*")

data_dir = os.path.join("..", "data")
np_dir = os.path.join(data_dir, "original", "NP")
sd_dir = os.path.join(data_dir, "original", "SD")

# Data curation
The NP and SD folders contain duplicated and incorrect SMILES.

We will use the sd and np PCA files to extract the correct names and apply them to sort the folders into NP_curated and SD_curated

In [8]:
np = pd.read_csv(os.path.join(data_dir, "original", "PCA", "np_pca_cleaned.csv"))
sd = pd.read_csv(os.path.join(data_dir, "original", "PCA", "sd_pca_cleaned.csv"))

np_names = [name.replace('.****', '').replace('.mol', '') for name in np["Title"]]

sd_names = [name.replace('.****', '').replace('.mol', '') for name in sd["Title"]]

print(np.shape, len(np_names), len(set(np_names)))
print(sd.shape, len(sd_names), len(set(sd_names)))
print(len(set(np_names)-set(sd_names)), len(set(sd_names)-set(np_names)))

(626, 121) 626 626
(656, 121) 656 656
626 656


In [13]:
source_folder = os.path.join(data_dir, "original", 'NP')
destination_folder = os.path.join(data_dir, "original", 'NP_curated')
correct_files = []
for file_name in os.listdir(source_folder):
    base_name = os.path.splitext(file_name)[0]
    if base_name in np_names:
        correct_files.append(base_name)
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        shutil.copy2(source_path, destination_path)
num_files = len(os.listdir(destination_folder))

print(len(np_names),len(correct_files), num_files)

626 617 617


In [14]:
np_missing = list(set(np_names)-set(correct_files))
np_missing

['Ancistrotanzanine-C',
 '26,27-Dinorergosta-5,23-dien-3-ol,(3beta)',
 'Jubanine H',
 'Cryptobeilic-acid-C',
 '26,27-Dinorergost-5-ene-3,24-diol,(3beta)',
 'J02_12-E',
 'Androstan-17-one,3-ethyl-3-hydroxy,(5alpha)',
 '162727',
 'Annonidine-F',
 'J02_12-Z']

In [14]:
name_changes = {'Ancistrotanzanine-C': 'Ancistrotanzanine C',
 '26,27-Dinorergosta-5,23-dien-3-ol,(3beta)':'26,27-Dinorergosta-5,23-dien-3-ol,(3.beta)',
 'Cryptobeilic-acid-C':'Cryptobeilic acid C',
 '26,27-Dinorergost-5-ene-3,24-diol,(3beta)':'26,27-Dinorergost-5-ene-3,24-diol,(3.beta)',
 'J02_12-E':'J02_12_E',
 'Androstan-17-one,3-ethyl-3-hydroxy,(5alpha)':'Androstan-17-one, 3-ethyl-3-hydroxy,(5alpha)',
 'Annonidine-F':'Annonidine F',
 'J02_12-Z':'J02_12_Z',
 'Jubanine H': 'Jubanine B', 
 '162727': 'MCSJ37_0012'}

source_folder = os.path.join(data_dir, "original", 'NP')
destination_folder = os.path.join(data_dir, "original", 'NP_curated')
for k,v in name_changes.items():
    for file_name in os.listdir(source_folder):
        base_name = os.path.splitext(file_name)[0]
        if base_name in v:
            print(base_name)
            source_path = os.path.join(source_folder, file_name)
            destination_path = os.path.join(destination_folder, file_name)
            shutil.copy2(source_path, destination_path)
    num_files = len(os.listdir(destination_folder))

print(len(np_names), num_files)

Ancistrotanzanine C
26,27-Dinorergosta-5,23-dien-3-ol,(3.beta)
Cryptobeilic acid C
26,27-Dinorergost-5-ene-3,24-diol,(3.beta)
J02_12_E
Androstan-17-one, 3-ethyl-3-hydroxy,(5alpha)
Annonidine F
J02_12_Z
Jubanine B
MCSJ37_0012
626 626


In [13]:
from collections import defaultdict
directory_path = os.path.join(data_dir, "original", 'NP_curated')
file_dict = defaultdict(list)
for filename in os.listdir(directory_path):
    base_name = filename.replace('.mol', '').replace('.sdf', '').replace('.mol2', '')
    file_dict[base_name].append(filename)
duplicates = {key: value for key, value in file_dict.items() if len(value) > 1}
if duplicates:
    print("Duplicate files found:")
    for base_name, files in duplicates.items():
        print(f"Base name: {base_name} - Files: {', '.join(files)}")
else:
    print("No duplicates found.")
#manually delete the .sdf file

Duplicate files found:
Base name: MCSJ49_0002 - Files: MCSJ49_0002.sdf, MCSJ49_0002.mol


In [15]:
import shutil

source_folder = os.path.join(data_dir, "original", 'SD')
destination_folder = os.path.join(data_dir, "original", 'SD_curated')
correct_files = []
for file_name in os.listdir(source_folder):
    base_name = os.path.splitext(file_name)[0]
    if base_name in sd_names:
        correct_files.append(base_name)
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        shutil.copy2(source_path, destination_path)
num_files = len(os.listdir(destination_folder))

print(len(sd_names),len(correct_files), num_files)
sd_missing = list(set(sd_names)-set(correct_files))
sd_missing

656 653 653


['Orotic-acid',
 'Taribavirin-Hydrochloride',
 'LY411575',
 'Chloramphenicol-succinate']

In [10]:
not_found = []
name_changes = {'Orotic-acid': 'Orotic acid',
 'Taribavirin-Hydrochloride': 'Taribavirin Hydrochloride',
 'LY411575':'LY411575 ',
 'Chloramphenicol-succinate':'Chloramphenicol succinate'}

source_folder = os.path.join(data_dir, "original", 'SD')
destination_folder = os.path.join(data_dir, "original", 'SD_curated')
for k,v in name_changes.items():
    for file_name in os.listdir(source_folder):
        base_name = os.path.splitext(file_name)[0]
        if base_name in v:
            print(base_name)
            source_path = os.path.join(source_folder, file_name)
            destination_path = os.path.join(destination_folder, file_name)
            shutil.copy2(source_path, destination_path)
    num_files = len(os.listdir(destination_folder))

print(len(sd_names), num_files)

Orotic acid
Taribavirin Hydrochloride
LY411575 
Chloramphenicol succinate
656 657


In [12]:
import os
from collections import defaultdict
directory_path = os.path.join(data_dir, "original", 'SD_curated')
file_dict = defaultdict(list)
for filename in os.listdir(directory_path):
    base_name = filename.replace('.mol', '').replace('.sdf', '').replace('.mol2', '')
    file_dict[base_name].append(filename)
duplicates = {key: value for key, value in file_dict.items() if len(value) > 1}
if duplicates:
    print("Duplicate files found:")
    for base_name, files in duplicates.items():
        print(f"Base name: {base_name} - Files: {', '.join(files)}")
else:
    print("No duplicates found.")
#manually delete the .sdf file

Duplicate files found:
Base name: SA5_0007 - Files: SA5_0007.sdf, SA5_0007.mol


In [18]:
# FINAL Check
np_files = len(os.listdir(os.path.join(data_dir, "original", 'NP_curated')))
sd_files = len(os.listdir(os.path.join(data_dir, "original", 'SD_curated')))

print(np_files, sd_files)

626 656


## Curated list standardised

From the NP_curated and SD_curated we try to obtain only molecules that can be parsed by the standardiser (see scripts/00_parse_manually_curated_data.py)
We also keep the molecules that:
- Cannot be parsed by the standardiser
- Are duplicated (same molecule different stereochemistry, which is not taken into account at 2D level)

In [5]:
def molecule_loader(subfolder):
    sdf_paths = []

    for fn in os.listdir(subfolder):
        sdf_paths.append(os.path.join(subfolder, fn))

    mols = []
    paths = []
    names = []
    for sdf_path in sdf_paths:
        name = sdf_path.split("/")[-1][:-4]
        suppl = rdkit.Chem.SDMolSupplier(sdf_path)
        mols_ = [mol for mol in suppl if mol is not None]
        if len(mols_) == 0:
            continue
        if len(mols_) > 1:
            mols_ = [mols_[0]]
        mols += mols_
        paths += [sdf_path]
        names += [name]

    assert len(mols) == len(names)
    print("TOTAL MOLS", len(mols))
    mols_ = []
    non_parsed_mols = []
    c = 0
    for i, mol in enumerate(mols):
        try:
            mol = standardise.run(mol)
            if mol is not None:
                mols_ += [(names[i], mol)]
        except:
            c += 1
            non_parsed_mols += [names[i]]
            continue
    print(
        "Number of non-standardized molecules (skipped) {0}. File: {1}".format(
            c, subfolder
        )
    )
    return mols_, non_parsed_mols

In [6]:
mols, non_parsed_mols = molecule_loader("../data/original/pubchem_sdfs")

TOTAL MOLS 18
Number of non-standardized molecules (skipped) 18. File: ../data/original/pubchem_sdfs


## Final Dataset

After checking all molecules that could not be standardised initially (checking the original files from PubChem), we have decided to discard them. We will prepare a NP and SD folder with only the files that we process in all_molecules.csv to do the MOE analysis and Scaffold analysis only in these molecules, to be homogeneous 

In [7]:
df = pd.read_csv("../data/all_molecules.csv")

In [12]:
np_names = df[df["category"]=="natural"]["file_name"].tolist()
source_folder = os.path.join(data_dir, "original", 'NP_curated')
destination_folder = os.path.join(data_dir, "original", 'NP_final')
correct_files = []
for file_name in os.listdir(source_folder):
    base_name = os.path.splitext(file_name)[0]
    if base_name in np_names:
        correct_files.append(base_name)
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        shutil.copy2(source_path, destination_path)
num_files = len(os.listdir(destination_folder))

print(len(np_names),len(correct_files), num_files)

616 616 616


In [15]:
sd_names = df[df["category"]=="synthetic"]["file_name"].tolist()
source_folder = os.path.join(data_dir, "original", 'SD_curated')
destination_folder = os.path.join(data_dir, "original", 'SD_final')
correct_files = []
for file_name in os.listdir(source_folder):
    base_name = os.path.splitext(file_name)[0]
    if base_name in sd_names:
        correct_files.append(base_name)
        source_path = os.path.join(source_folder, file_name)
        destination_path = os.path.join(destination_folder, file_name)
        shutil.copy2(source_path, destination_path)
num_files = len(os.listdir(destination_folder))

print(len(sd_names),len(correct_files), num_files)

615 615 615
