In [39]:
import random
from pathlib import Path

import numpy as np
from rdkit import Chem
from tqdm.auto import tqdm

SMIS_TOP_1k = "../data/2_molpal/6p9l_hts_results/smis-top1k.txt"
p_top_1k = Path(SMIS_TOP_1k)

# Each non-blank line of the file is parsed as one SMILES string. Blank lines
# are skipped because they would otherwise be turned into atom-less molecules.
smis_read = [line for line in p_top_1k.read_text().splitlines() if line.strip()]

# Chem.MolFromSmiles returns None (instead of raising) when it cannot parse a
# string; a None entry would crash the ring/substructure filters further down,
# so such entries are dropped here and reported.
parsed = [(smi, Chem.MolFromSmiles(smi)) for smi in tqdm(smis_read)]
n_unparsable = sum(mol is None for _, mol in parsed)
if n_unparsable:
    print(f"skipped {n_unparsable} of {len(parsed)} SMILES that RDKit could not parse")

# `smis` and `mols` stay index-aligned: both only hold the successfully parsed entries.
smis = [smi for smi, mol in parsed if mol is not None]
mols = [mol for _, mol in parsed if mol is not None]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [2]:
def get_ring_systems(mol, includeSpiro: bool = False):
    """Group the rings of ``mol`` into ring systems (sets of atom indices).

    Adapted from https://rdkit.org/docs/Cookbook.html#count-ring-systems

    Rings are visited in the order given by ``mol.GetRingInfo().AtomRings()``.
    Each ring is merged with every already-known system it overlaps enough
    with: at least two shared atoms by default, or at least one shared atom
    when ``includeSpiro`` is true.

    Parameters
    ----------
    mol
        object exposing ``GetRingInfo().AtomRings()``
    includeSpiro : bool
        whether rings sharing a single atom count as the same system

    Returns
    -------
    list of set
        one set of atom indices per ring system; empty if there are no rings
    """
    # Minimum number of shared atoms required to merge a ring into a system.
    min_shared = 1 if includeSpiro else 2

    ring_systems = []
    for ring_atoms in mol.GetRingInfo().AtomRings():
        merged = set(ring_atoms)
        untouched = []
        for existing in ring_systems:
            if len(merged & existing) >= min_shared:
                # Absorb the overlapping system; later comparisons see the grown set.
                merged |= existing
            else:
                untouched.append(existing)
        # Systems that were not absorbed keep their order; the merged one goes last.
        ring_systems = untouched + [merged]

    return ring_systems

In [3]:
def get_Ncycles(mol: Chem.Mol, include_spiro: bool = False):
    """Count how many rings belong to each ring system of ``mol``.

    Parameters
    ----------
    mol : Chem.Mol
        the molecule to analyse
    include_spiro : bool
        forwarded to ``get_ring_systems``; when true, rings sharing a single
        atom are treated as part of the same ring system

    Returns
    -------
    list of int
        one count per ring system, in the order returned by
        ``get_ring_systems``. The list is empty when the molecule has no rings.
    """
    # A ring is attributed to a system when it shares more than this many atoms
    # with it: more than one atom normally, at least one when spiro links count.
    min_common_atoms = 0 if include_spiro else 1

    ring_atomss = [set(atoms) for atoms in mol.GetRingInfo().AtomRings()]
    # Computed once and reused below (previously the result was discarded and
    # the ring systems were recomputed inside the comprehension).
    ring_systems = get_ring_systems(mol, include_spiro)

    return [
        sum(len(atoms & rs) > min_common_atoms for atoms in ring_atomss)
        for rs in ring_systems
    ]

In [26]:
def filter_Ncycles(mols, N: int = 4):
    """Keep the molecules whose largest ring system has fewer than ``N`` rings.

    Parameters
    ----------
    mols
        iterable of molecules accepted by ``get_Ncycles``
    N : int
        exclusive upper bound on the number of rings in any single ring system

    Returns
    -------
    list
        the molecules that pass the filter, in input order
    """
    kept = []
    # Single pass so that one-shot iterables (generators) are handled correctly.
    for mol in tqdm(mols):
        # get_Ncycles returns an empty list for a molecule without rings;
        # default=0 treats that case as "zero rings" instead of letting max() raise.
        if max(get_Ncycles(mol), default=0) < N:
            kept.append(mol)

    return kept

In [27]:
def filter_SMARTS(mols, smarts: str):
    """Remove the molecules that contain the substructure described by ``smarts``.

    Parameters
    ----------
    mols
        iterable of molecules exposing ``HasSubstructMatch``
    smarts : str
        SMARTS pattern of the substructure to exclude

    Returns
    -------
    list
        the molecules that do NOT match the pattern, in input order

    Raises
    ------
    ValueError
        if ``smarts`` cannot be parsed into a query molecule
    """
    substructure = Chem.MolFromSmarts(smarts)
    # MolFromSmarts signals a parse failure by returning None; fail early with a
    # clear message rather than inside HasSubstructMatch.
    if substructure is None:
        raise ValueError(f"could not parse SMARTS pattern: {smarts!r}")

    return [mol for mol in mols if not mol.HasSubstructMatch(substructure)]

In [37]:
# Stage 1: keep only molecules whose largest ring system has fewer than 4 rings.
mols_no_4cycles = filter_Ncycles(mols, N=4)

# Stage 2: drop molecules matching the pattern below, i.e. an N(=O)=O or
# [N+](=O)[O-] group bonded to a non-oxygen atom.
mols_no_4_cycles_nitros = filter_SMARTS(
    mols_no_4cycles,
    smarts="[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",
)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [46]:
# Write the surviving molecules as SMILES (one per line, no trailing newline)
# to a sibling of the input file whose ".txt" suffix is replaced by ".filtered.txt".
with open(p_top_1k.with_suffix(".filtered.txt"), "w") as f:
    # `write`, not `writelines`: the joined text is a single string, and
    # `writelines` would iterate over it one character at a time.
    f.write("\n".join(Chem.MolToSmiles(mol) for mol in mols_no_4_cycles_nitros))