# This notebook screens the molecules in a multi-molecule .sdf input file against goodlist and badlist substructures.  The goodlist(s) and badlist(s) must also be in .sdf format.  It also removes molecules that contain rings smaller than a minimum size or bonds of a specified order.  If no goodlists or badlists are provided, the script will remove structures with 3- or 4-membered rings and any molecules containing triple bonds.

## Module imports below

In [1]:
from rdkit import Chem
import re

## Function definitions below

In [2]:
def bracketRemove(smi):
    """Strip square brackets that enclose exactly one character.

    Every ``[x]`` in the input, where ``x`` is a single word character
    (letter, digit or underscore), is replaced by ``x`` alone.  Bracketed
    groups holding more than one character are left as they are.  Intended
    for SMILES strings in which brackets around a lone atom symbol carry no
    useful information for this program.

    Parameters
    ----------
    smi : string
        Text (typically a SMILES string) to clean up.

    Returns
    -------
    The input string with brackets removed from around single characters.

    """
    single_in_brackets = re.compile(r"\[(\w)]")
    # keep only the captured character, dropping the surrounding brackets
    return single_in_brackets.sub(r'\1', smi)

In [3]:
def minRingSize(mol, ring_min):
    """Check that a molecule has no ring smaller than a given size.

    The ring atom tuples reported by ``mol.GetRingInfo().AtomRings()`` are
    inspected, and the molecule passes only if none of them has a length in
    the range ``3 .. ring_min - 1``.

    Parameters
    ----------
    mol : RDKit Mol object

    ring_min : int
        Smallest ring size that is acceptable.  E.g. with ring_min = 6, rings
        of size 3, 4 and 5 cause the check to fail.

    Returns
    -------
    True if no ring is below the minimum size (including when the molecule
    has no rings at all); False otherwise.

    """
    rings = mol.GetRingInfo().AtomRings()
    forbidden_sizes = range(3, ring_min)  # empty when ring_min <= 3
    return not any(len(ring) in forbidden_sizes for ring in rings)

In [4]:
def maxBonds(mol, bonds):
    """Report whether a molecule contains a bond of a given order.

    Each bond's order (``GetBondTypeAsDouble()``) is compared for equality
    with the requested value; the search stops at the first match.

    Parameters
    ----------
    mol : RDKit Mol object

    bonds : float
        Bond order to look for.  Converted with ``float()`` before comparing,
        so ints and numeric strings are accepted.

    Returns
    -------
    True if at least one bond in the molecule has exactly the specified bond
    order.  Returns False otherwise (including for a molecule with no bonds).

    """
    target_order = float(bonds)
    return any(
        bond.GetBondTypeAsDouble() == target_order
        for bond in mol.GetBonds()
    )

## Enter screening parameters in the cell below.  All filenames must be in .sdf format.  In the case of multiple goodlist and/or badlist files, enter each filename as a string between the square brackets, with a comma separating each string.

In [5]:
# Input .sdf file with the molecules to screen.
# NOTE(review): this string is handed to Chem.SDMolSupplier unchanged, so the
# square brackets are treated as literal filename characters, not a glob pattern.
in_file = "PC[0-3]H[3-9].sdf"
# Output .sdf file that receives the molecules surviving every screen.
out_file = "VAIL-P.sdf"
# .sdf files of substructures every output molecule must contain (empty list skips this screen).
goodlist = []
# .sdf files of substructures that disqualify a molecule (empty list skips this screen).
badlist = ["BadAaList_OB.sdf", "BadAromaticsList_OB.sdf", "BadRingList_OB.sdf", "allene.sdf"]
# Bond order passed to maxBonds(); molecules with a bond of exactly this order are removed.
max_bond = 3
# Minimum ring size passed to minRingSize(); molecules with rings of size 3 .. min_ring-1 are removed.
min_ring = 5

## Main program

In [6]:
# load in source data file
# SDMolSupplier yields None for any record RDKit cannot parse.  Those entries
# are dropped once here so that Chem.MolToSmiles below, and every later loop
# over `data`, never receives None.
records = list(Chem.SDMolSupplier(in_file))
data = [m for m in records if m is not None]
if len(data) != len(records):
    # make the loss visible instead of silently shrinking the input set
    print("Skipped {} unparseable record(s) in {}".format(len(records) - len(data), in_file))
data_set = {Chem.MolToSmiles(m) for m in data} # set of all input SMILES
bad_mols = set() # set of SMILES for molecules with unwanted substructures; output will lack these molecules

In [7]:
# perform ring size screening
# Record the SMILES of every molecule for which minRingSize() reports a ring
# below the configured minimum size.
bad_mols.update(
    Chem.MolToSmiles(candidate)
    for candidate in data
    if not minRingSize(candidate, min_ring)
)
len(bad_mols)

34

In [8]:
# perform bond order screening
# Record the SMILES of every molecule that maxBonds() reports as containing a
# bond of order max_bond.
bad_mols.update(
    Chem.MolToSmiles(candidate)
    for candidate in data
    if maxBonds(candidate, max_bond)
)
len(bad_mols)

43

In [10]:
# check for unwanted substructures
if badlist:
    # flatten the contents of every badlist file into a single list of patterns
    bad_substructs = [
        pattern
        for sdf_path in badlist
        for pattern in Chem.SDMolSupplier(sdf_path)
    ]
    for mol in data:
        # one matching pattern is enough to reject the molecule; any() stops at the first hit
        if any(mol.HasSubstructMatch(pattern) for pattern in bad_substructs):
            bad_mols.add(Chem.MolToSmiles(mol))
len(bad_mols)

48

In [11]:
# check for desired substructures (must be included in output)
if goodlist:
    # collapse all goodlist substructures into a list
    good_substructs = []
    # Iterate the notebook's own `goodlist` setting.  The previous
    # `args.goodlists` referred to an `args` object that is never defined in
    # this notebook, so any non-empty goodlist raised NameError.
    for x in goodlist:
        for y in Chem.SDMolSupplier(x):
            # round-trip through SMILES with single-atom brackets stripped before using as a pattern
            good_substructs.append(Chem.MolFromSmiles(bracketRemove(Chem.MolToSmiles(y))))
    for mol in data:
        # a molecule is rejected if it lacks any one of the goodlist substructures;
        # all() stops at the first missing substructure
        if not all(mol.HasSubstructMatch(good) for good in good_substructs):
            bad_mols.add(Chem.MolToSmiles(mol))
len(bad_mols)

48

In [12]:
# calculate and output filtered molecules
# Sort the surviving SMILES so the record order in the output file is the same
# on every run; plain set iteration order is not stable between sessions.
out_mols = sorted(data_set - bad_mols)
w = Chem.SDWriter(out_file)
try:
    for mol in out_mols:
        # each record is rebuilt from its SMILES string rather than taken from the input file
        m2 = Chem.MolFromSmiles(mol)
        w.write(m2)
finally:
    # always release the output file, even if a write fails part-way through
    w.close()

## No removal of allenes, need to investigate

In [None]:
# Reload the screened output file and display the molecule at index 10 for inspection.
results = Chem.SDMolSupplier("VAIL-P.sdf")
results[10]

In [None]:
# Load allene.sdf and test whether its first entry matches the inspected output molecule.
allene = Chem.SDMolSupplier("allene.sdf")
results[10].HasSubstructMatch(allene[0])

In [None]:
# Test the same molecule against a pattern built directly from a SMARTS string
# instead of one read from an .sdf file.
# NOTE(review): [CX4&D2] presumably targets an aliphatic carbon with four total
# connections and two explicit neighbours - confirm against the SMARTS reference.
results[10].HasSubstructMatch(Chem.MolFromSmarts("[CX4&D2]"))

In [None]:
# Show each loaded badlist pattern as SMILES, to see what the .sdf entries were read back as.
# Requires the badlist screening cell to have run with a non-empty badlist.
[Chem.MolToSmiles(x) for x in bad_substructs]

In [None]:
# broader allene definition, include aliphatic and aromatic C
allene = Chem.MolFromSmarts("[#6](=[*])=[*]")
w = Chem.SDWriter("allene.sdf")
w.write(allene)
w.close()