In [1]:
import pandas as pd
import mols2grid
#import useful_rdkit_utils as uru
from rdkit import Chem
from tqdm.auto import tqdm
from itertools import chain
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.rdMolDescriptors import CalcNumRings

In [2]:
# Ring carbon or ring sulfur double-bonded to a non-ring (R0) O, S, C or N, e.g. a ring carbonyl.
# tag_bonds_to_preserve uses this pattern to mark the matching double bonds as "protected".
ring_db_pat = Chem.MolFromSmarts("[#6R,#16R]=[OR0,SR0,CR0,NR0]")
# Any atom that is a member of a ring; used to decide whether a fragment contains a ring system.
ring_atom_pat = Chem.MolFromSmarts("[R]")

In [3]:
def fix_bond_stereo(mol):
    """Clear the stereo specification on double bonds that end in a terminal atom.

    A double bond is reset to STEREONONE when either of its two atoms reports a
    degree of 1 (via GetDegree), i.e. that atom's only neighbor is its double-bond
    partner. The molecule is modified in place.

    :param mol: input RDKit molecule
    :return: the same RDKit molecule, after the stereo flags have been updated
    """
    double_bonds = (b for b in mol.GetBonds() if b.GetBondType() == Chem.BondType.DOUBLE)
    for dbl in double_bonds:
        end_degrees = (dbl.GetBeginAtom().GetDegree(), dbl.GetEndAtom().GetDegree())
        # A degree-1 end means there is no second substituent to define cis/trans
        if 1 in end_degrees:
            dbl.SetStereo(Chem.BondStereo.STEREONONE)
    return mol

In [4]:
def tag_bonds_to_preserve(mol):
    """Give every bond a boolean "protected" property, True only for ring_db_pat matches.

    All bonds are first set to protected=False; the bond between the two atoms of
    each ring_db_pat match (ring carbonyls, etc.) is then set to protected=True.
    The molecule is modified in place.

    :param mol: input molecule
    :return: None
    """
    # Default: nothing is protected, but every bond carries the property
    for bond in mol.GetBonds():
        bond.SetBoolProp("protected", False)
    # ring_db_pat has two atoms, so each match is a pair of atom indices
    for ring_idx, exo_idx in mol.GetSubstructMatches(ring_db_pat):
        mol.GetBondBetweenAtoms(ring_idx, exo_idx).SetBoolProp("protected", True)

In [5]:
def cleave_linker_bonds(mol):
    """Cleave single bonds that are not in rings and not protected.

    A bond counts as protected when it carries the boolean property "protected"
    set to True (as assigned by tag_bonds_to_preserve). Bonds without that
    property are treated as unprotected, so the function also works on a molecule
    that has not been tagged.

    :param mol: input molecule
    :return: a new, sanitized molecule fragmented at the selected bonds, or the
        input molecule itself when there is nothing to cleave
    """
    frag_bond_list = [
        bnd.GetIdx()
        for bnd in mol.GetBonds()
        if bnd.GetBondType() == Chem.BondType.SINGLE
        and not bnd.IsInRing()
        # HasProp guard: GetBoolProp raises KeyError when the property was never set
        and not (bnd.HasProp("protected") and bnd.GetBoolProp("protected"))
    ]

    # FragmentOnBonds is only called when there is at least one bond to cut
    if not frag_bond_list:
        return mol

    frag_mol = Chem.FragmentOnBonds(mol, frag_bond_list)
    Chem.SanitizeMol(frag_mol)
    return frag_mol

In [6]:
def cleanup_fragments(mol, keep_dummy=False):
    """Split a fragmented molecule into its individual ring systems.

    Only fragments containing at least one ring atom are kept. Dummy atoms
    (atomic number 0) in a kept fragment are either labelled "R" (keep_dummy=True)
    or turned into hydrogens (keep_dummy=False); their isotope is reset to 0 in
    both cases. Hydrogens are then removed and terminal double-bond stereo is
    cleared with fix_bond_stereo.

    :param keep_dummy: retain dummy atoms
    :param mol: input molecule
    :return: a list of RDKit molecules, one per ring system
    """
    ring_system_list = []
    for piece in Chem.GetMolFrags(mol, asMols=True):
        # Acyclic fragments (linkers, substituents) are discarded
        if not piece.HasSubstructMatch(ring_atom_pat):
            continue
        dummy_atoms = [a for a in piece.GetAtoms() if a.GetAtomicNum() == 0]
        for dummy in dummy_atoms:
            if keep_dummy:
                dummy.SetProp("atomLabel", "R")
            else:
                dummy.SetAtomicNum(1)
            # presumably clears the isotope label left on the dummy by the fragmentation step
            dummy.SetIsotope(0)
        # Convert explicit Hs to implicit, then tidy up double-bond stereo
        ring_system_list.append(fix_bond_stereo(Chem.RemoveAllHs(piece)))
    return ring_system_list

In [7]:
def find_ring_systems(mol, keep_dummy=False, as_mols=False):
    """Find the ring systems for an RDKit molecule.

    The molecule's bonds are tagged in place with a "protected" property as a
    side effect of the search.

    :param as_mols: return results as molecules (otherwise return SMILES)
    :param keep_dummy: retain dummy atoms
    :param mol: input molecule; None (e.g. a SMILES that failed to parse) is accepted
    :return: a list with one entry per ring system, as SMILES strings by default
        or as RDKit molecules when as_mols is True; an empty list when mol is None
    """
    # A missing molecule has no ring systems; return early instead of failing
    # with an AttributeError inside tag_bonds_to_preserve
    if mol is None:
        return []
    tag_bonds_to_preserve(mol)
    frag_mol = cleave_linker_bonds(mol)
    ring_systems = cleanup_fragments(frag_mol, keep_dummy=keep_dummy)
    if as_mols:
        return ring_systems
    return [Chem.MolToSmiles(x) for x in ring_systems]

In this notebook we'll analyze a set of marketed drugs from the ChEMBL database and find the most commonly occurring ring systems. To do this, we'll follow these steps.
1. Read the drugs as SMILES
2. Convert the SMILES to RDKit Molecules
3. Identify the ring systems in the molecules
4. Collect the individual ring systems and count their frequencies

This analysis is similar to the one performed in Taylor, R. D., MacCoss, M., & Lawson, A. D. (2014). [Rings in drugs: Miniperspective](https://pubs.acs.org/doi/10.1021/jm4017625), Journal of Medicinal Chemistry, 57(14), 5845-5859.

Enable progress_apply in Pandas

In [8]:
# Register tqdm with pandas so that progress_apply is available in the cells below
tqdm.pandas()

### 1. Read drugs from ChEMBL as SMILES
Read the drugs from the ChEMBL database

In [9]:
# Space-delimited file read without a header row: first column is the SMILES, second the drug name
chembl_drugs = "../data/chembl_drugs.smi"
df = pd.read_csv(
    chembl_drugs,
    names=["SMILES", "Name"],
    sep=" ",
)

### 2. Convert the SMILES to RDKit Molecules
Add a molecule column to the dataframe

In [10]:
# Parse every SMILES into an RDKit molecule, showing a progress bar.
# NOTE(review): RDKit's MolFromSmiles returns None for a SMILES it cannot parse — confirm the input has none.
df['mol'] = df.SMILES.progress_apply(Chem.MolFromSmiles)

  0%|          | 0/1203 [00:00<?, ?it/s]

### 3. Identify the ring systems in the molecules
Find the ring systems in the ChEMBL drugs by applying the `find_ring_systems` function defined above to each molecule.

In [11]:
# One list of ring-system SMILES per molecule (find_ring_systems defaults to as_mols=False).
# Side effect: the molecules in df.mol get a "protected" property set on their bonds.
df['ring_systems'] = df.mol.progress_apply(find_ring_systems)

  0%|          | 0/1203 [00:00<?, ?it/s]

### 4. Collect the individual ring systems and count their frequencies
The ring_systems column in **df** is a list of lists.  We need to flatten that list so we can count the number of times each ring system occurs.  The **chain** method in the itertools package provides a convenient way to do this. 

In [12]:
# Lazily string the per-molecule lists together into one flat stream of ring-system SMILES
ring_list = chain.from_iterable(df.ring_systems.values)
ring_list

<itertools.chain at 0x7ba1931c13c0>

The **chain** method used above returns an iterator.  We can use that iterator to create a Pandas series. 

In [13]:
# ring_list is a one-shot iterator: building the Series consumes it, so re-running
# this cell without first re-running the cell that creates ring_list gives an empty Series
ring_series = pd.Series(ring_list)
ring_series

0                           c1ccccc1
1                           c1ccncc1
2              O=C1CC(=O)NC(=O)[N-]1
3              O=C1C=CC(=O)c2ccccc21
4       O=c1[nH]c(=O)c2[nH]cnc2[nH]1
                    ...             
2453                        c1ccccc1
2454                        c1ccccc1
2455       O=c1[nH]c(=O)c2ccsc2[nH]1
2456                        c1ccnnc1
2457                        c1ccccc1
Length: 2458, dtype: object

Now that we have a Pandas series, we can use the value_counts method to count the occurrences of the different ring systems.

In [14]:
# Number of occurrences of each distinct ring-system SMILES, most frequent first
ring_series.value_counts()

c1ccccc1                               911
c1ccncc1                                94
C1CNCCN1                                83
C1CCNCC1                                74
C1CC1                                   48
                                      ... 
c1ccn2ncnc2c1                            1
C1=NCCO1                                 1
c1cn2c(cnc3[nH]ccc32)n1                  1
O=C1CC[C@]23CCCCC[C@H](CCC2)[C@H]13      1
O=c1[nH]nc2c3c(cccc13)NCC2               1
Name: count, Length: 415, dtype: int64

In order to make the **value_counts** output easier to work with, we'll convert it into a dataframe. 

In [15]:
# Turn the counts into a two-column dataframe: the ring-system SMILES and how often it occurs
ring_df = (
    ring_series
    .value_counts()
    .rename_axis("SMILES")
    .reset_index(name="Count")
)
ring_df

Unnamed: 0,SMILES,Count
0,c1ccccc1,911
1,c1ccncc1,94
2,C1CNCCN1,83
3,C1CCNCC1,74
4,C1CC1,48
...,...,...
410,c1ccn2ncnc2c1,1
411,C1=NCCO1,1
412,c1cn2c(cnc3[nH]ccc32)n1,1
413,O=C1CC[C@]23CCCCC[C@H](CCC2)[C@H]13,1


Now that we have our results in a dataframe, we can use mols2grid to display the chemical structures of the ring systems along with the associated counts. 

In [16]:
# Draw the ring systems as a grid of structures built from the SMILES column;
# presumably subset limits each grid cell to the image and its Count, and
# selection=False turns off the selection checkboxes — confirm against the mols2grid docs
mols2grid.display(ring_df,smiles_col="SMILES",subset=["img","Count"],selection=False)

MolGridWidget()