In [38]:
import os
import pandas as pd
import numpy as np
from collections import OrderedDict
from pathos.multiprocessing import ProcessingPool as Pool
from rdkit import Chem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from rdkit.Chem.rdMolDescriptors import CalcNumHBA, CalcNumHBD, CalcNumRotatableBonds
from rdkit.Chem import Descriptors

In [1]:
# Folder from which the per-model `<prefix>_prop.csv` files are read.
file_folder = '/fileserver-gamma/chaoting/ML/cvae-transformer/Inference-Dataset/moses/uc-sampling/'

# File-name prefixes of the three result sets that are compared below.
file_prefix1, file_prefix2, file_prefix3 = 'vaetf1_-37', 'vaetf2_-38', 'vaetf3_-37'

In [22]:
# Training table; the first CSV column becomes the index.
train = pd.read_csv(
    '/fileserver-gamma/chaoting/ML/dataset/moses/raw/train.csv',
    index_col=[0],
)

In [6]:
# Load the `<prefix>_prop.csv` table of each result set, using the first
# CSV column as the index.
df1, df2, df3 = (
    pd.read_csv(os.path.join(file_folder, f'{prefix}_prop.csv'), index_col=[0])
    for prefix in (file_prefix1, file_prefix2, file_prefix3)
)

In [15]:
def murcko_scaffold(smi):
    """Return the Murcko scaffold SMILES of ``smi``, or ``None`` on failure.

    Args:
        smi: SMILES string passed to ``MurckoScaffoldSmiles(smiles=...)``.

    Returns:
        The scaffold SMILES string, or ``None`` when RDKit raises for this
        input (best-effort: one bad entry must not abort a whole map).
    """
    try:
        return MurckoScaffoldSmiles(smiles=smi)
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and a long-running map can be stopped.
        return None

In [19]:
# Compute the scaffold of every SMILES in each result set, using a pool of
# four workers per table.
scaffold_lists = []
for frame in (df1, df2, df3):
    with Pool(4) as pool:
        scaffold_lists.append(pool.map(murcko_scaffold, frame['SMILES']))
ms1, ms2, ms3 = scaffold_lists

In [27]:
# Unique training scaffolds, with missing (NaN) entries removed.
tms = {scaffold for scaffold in set(train['scaffold']) if pd.notna(scaffold)}

# murcko_scaffold() returns None when it cannot process a SMILES. Drop that
# placeholder so it is neither counted as a scaffold nor reported as a "new"
# scaffold when the sets are compared against the training scaffolds.
ms1 = {scaffold for scaffold in ms1 if scaffold is not None}
ms2 = {scaffold for scaffold in ms2 if scaffold is not None}
ms3 = {scaffold for scaffold in ms3 if scaffold is not None}

In [29]:
# Report how many distinct scaffolds each collection holds.
for caption, scaffolds in (
    ('# scaffold (train):', tms),
    ('# scaffold (gen1):', ms1),
    ('# scaffold (gen2):', ms2),
    ('# scaffold (gen3):', ms3),
):
    print(caption, len(scaffolds))

# murcko scaffold (train): 376593
# murcko scaffold (gen1): 19852
# murcko scaffold (gen2): 20014
# murcko scaffold (gen3): 19936


In [32]:
# Scaffolds produced by each result set that never occur in the training set.
# Each line is labelled with its own set (previously all three said "gen1").
print('# new scaffold (gen1):', len(ms1 - tms))
print('# new scaffold (gen2):', len(ms2 - tms))
print('# new scaffold (gen3):', len(ms3 - tms))

# new scaffold (gen1): 8145
# new scaffold (gen2): 8154
# new scaffold (gen3): 8220


In [36]:
def MW(mol):
    """Return ``Descriptors.MolWt(mol)``, RDKit's molecular-weight descriptor."""
    weight = Descriptors.MolWt(mol)
    return weight

def BertzCT(mol):
    """Return ``Chem.GraphDescriptors.BertzCT(mol)``, RDKit's BertzCT descriptor."""
    bertz_index = Chem.GraphDescriptors.BertzCT(mol)
    return bertz_index

def HBD(mol):
    """Return ``CalcNumHBD(mol)``, RDKit's hydrogen-bond-donor count."""
    donor_count = CalcNumHBD(mol)
    return donor_count

def HBA(mol):
    """Return ``CalcNumHBA(mol)``, RDKit's hydrogen-bond-acceptor count."""
    acceptor_count = CalcNumHBA(mol)
    return acceptor_count

def RBN(mol):
    """Return ``CalcNumRotatableBonds(mol)``, RDKit's rotatable-bond count."""
    rotatable_count = CalcNumRotatableBonds(mol)
    return rotatable_count

def logP(mol):
    """Return ``Descriptors.MolLogP(mol)``, RDKit's logP descriptor."""
    log_p = Descriptors.MolLogP(mol)
    return log_p

def tPSA(mol):
    """Return ``Descriptors.TPSA(mol)``, RDKit's topological polar surface area."""
    polar_area = Descriptors.TPSA(mol)
    return polar_area

# Property name -> function computing that property from a molecule.
# Insertion order determines the column order produced by mols_to_props().
property_fn = dict(
    MW=MW,
    BertzCT=BertzCT,
    HBD=HBD,
    HBA=HBA,
    RBN=RBN,
    logP=logP,
    tPSA=tPSA,
)

In [39]:
def get_mol(smi_or_mol):
    """Turn a SMILES string into an RDKit molecule (copied from molgpt).

    Anything that is not a ``str`` is handed back untouched. For strings,
    ``None`` is returned when the string is empty, when
    ``Chem.MolFromSmiles`` yields ``None``, or when ``Chem.SanitizeMol``
    raises ``ValueError``; otherwise the sanitized molecule is returned.
    """
    # Non-string inputs pass straight through.
    if not isinstance(smi_or_mol, str):
        return smi_or_mol
    if not smi_or_mol:
        return None

    parsed = Chem.MolFromSmiles(smi_or_mol)
    if parsed is not None:
        try:
            Chem.SanitizeMol(parsed)
        except ValueError:
            parsed = None
    return parsed

def mapper(fn, obj, n_jobs):
    """Apply ``fn`` to each element of ``obj`` and return the results.

    With ``n_jobs == 1`` the work is done in the current process and a list
    is returned; for any other value the call is delegated to
    ``Pool(n_jobs).map``.
    """
    if n_jobs != 1:
        with Pool(n_jobs) as pool:
            return pool.map(fn, obj)
    return [fn(item) for item in obj]

def mols_to_props(mols, property_fns, n_jobs=1,
                  col_names=None):
    """Evaluate every property function on every molecule.

    Args:
        mols: iterable of objects accepted by the property functions. It is
            materialised into a list once, so a one-shot iterator can still
            be consumed by every property function.
        property_fns: mapping of property name -> callable taking one element
            of ``mols``.
        n_jobs: number of workers. With ``1`` the functions run in the
            current process (the same convention ``mapper`` uses); any other
            value delegates to ``Pool(n_jobs).map``.
        col_names: optional sequence of column names, matched positionally
            to the entries of ``property_fns``; the mapping keys are used
            when it is ``None``.

    Returns:
        ``pd.DataFrame`` with one column per property function, in the
        iteration order of ``property_fns``.
    """
    mols = list(mols)
    props = OrderedDict()
    for i, (p, fn) in enumerate(property_fns.items()):
        name = p if col_names is None else col_names[i]
        # Not every callable has __name__ (e.g. functools.partial objects).
        print('property fn:', getattr(fn, '__name__', repr(fn)))
        if n_jobs == 1:
            # Avoid spawning worker processes for a serial run.
            props[name] = [fn(mol) for mol in mols]
        else:
            with Pool(n_jobs) as pool:
                props[name] = list(pool.map(fn, mols))
    return pd.DataFrame(props)

In [41]:
n_jobs = 16

# A set has no stable iteration order (string hashing is randomised per
# interpreter run), so sort before slicing to get the same 1000 training
# scaffolds on every run.
tms_mol = mapper(get_mol, sorted(tms)[:1000], n_jobs)
# get_mol() returns None for SMILES it cannot convert. The property functions
# are called on each entry directly, so drop those placeholders first.
tms_mol = [mol for mol in tms_mol if mol is not None]
props = mols_to_props(tms_mol, property_fn, n_jobs)

property fn: MW


KeyboardInterrupt: 