# Download the PubChem IDs, Filter Them
Get the SMILES strings from PubChem and then filter down based on certain criteria

In [1]:
import gzip
import os
from collections.abc import Iterator
from functools import partial
from multiprocessing.pool import Pool
from pathlib import Path
from shutil import copyfileobj
from tempfile import TemporaryDirectory

import requests
import yaml
from more_itertools import batched, peekable
from rdkit import RDLogger, Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm

Configuration

In [2]:
# Number of SMILES strings handed to a worker process per screening task
batch_size = 1000
# Upper limit on molecular weight (g/mol), passed to the screening function
max_molwt = 300
# YAML file holding the remaining screening criteria
criteria_path = 'criteria/criteria-v3.1.yml'

Suppress complaints from RDKit

In [3]:
# Raise the RDKit logger threshold to CRITICAL so lower-severity messages are not printed
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

## Screening Function
A function that removes undesired molecules based on molecular weight, elemental composition, substructures, and the amount of conjugation

In [4]:
def screen_molecules(
    to_screen: list[str],
    max_molecular_weight: float,
    forbidden_smarts: list[str],
    required_smarts: list[str],
    allowed_elements: list[str],
    min_conjugation: int,
    allow_disconnected: bool,
) -> list[str]:
    """Screen molecules by molecular weight, composition, substructure, and conjugation

    Args:
        to_screen: List of SMILES strings to screen
        max_molecular_weight: Maximum molecular weight (g/mol)
        forbidden_smarts: List of SMARTS that cannot appear in a molecule
        required_smarts: List of SMARTS that must all appear in the molecule
        allowed_elements: List of allowed element symbols
        min_conjugation: Minimum number of conjugated multiple bonds
        allow_disconnected: Whether to allow SMILES that contain a "." (non-bonded fragments)
    Returns:
        List of SMILES strings which pass, in the order they were supplied
    Raises:
        ValueError: If one of the SMARTS strings cannot be parsed
    """

    def parse_patterns(smarts_list, label):
        """Parse SMARTS strings once, failing loudly on an invalid pattern

        A pattern which does not parse comes back as ``None``; matching against it
        would raise for every molecule and silently empty the output.
        """
        patterns = []
        for smarts in smarts_list:
            pattern = Chem.MolFromSmarts(smarts)
            if pattern is None:
                raise ValueError(f'Could not parse {label} SMARTS: {smarts!r}')
            patterns.append(pattern)
        return patterns

    def count_conj(mol) -> int:
        """Count the conjugated bonds which are multiple bonds after kekulization

        Assumes they are all part of the same group

        Args:
            mol: Molecule to evaluate. It is kekulized in place
        Returns:
            Number of conjugated bonds with a bond order of at least 2
        """
        # Record conjugation before the bond types are rewritten
        is_conj = [bond.GetIsConjugated() for bond in mol.GetBonds()]
        if not any(is_conj):
            return 0

        # Kekulize modifies the molecule in place; clearing the aromatic flags makes
        #  aromatic bonds show up as explicit single/double bonds
        Chem.Kekulize(mol, clearAromaticFlags=True)
        return sum(
            conj and bond.GetBondTypeAsDouble() >= 2
            for conj, bond in zip(is_conj, mol.GetBonds())
        )

    # Pre-compute everything which does not depend on the molecule
    forbidden_patterns = parse_patterns(forbidden_smarts, 'forbidden')
    required_patterns = parse_patterns(required_smarts, 'required')
    allowed = set(allowed_elements)

    passed = []
    for smiles in to_screen:
        # Cheap string check first: a "." marks a non-bonded connection
        if not allow_disconnected and '.' in smiles:
            continue

        # Skip if molecule does not parse
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        # Skip if molecular weight is above the threshold
        if Descriptors.MolWt(mol) > max_molecular_weight:
            continue

        # Skip if it contains a disallowed element
        if any(atom.GetSymbol() not in allowed for atom in mol.GetAtoms()):
            continue

        # Skip if it contains a forbidden group or lacks a required one.
        #  A molecule whose matching fails is dropped rather than aborting the batch,
        #  but interrupts (e.g., KeyboardInterrupt) are no longer swallowed
        try:
            if any(mol.HasSubstructMatch(p) for p in forbidden_patterns):
                continue
            if not all(mol.HasSubstructMatch(p) for p in required_patterns):
                continue
        except Exception:
            continue

        # Skip if it does not have enough conjugated multiple bonds.
        #  Must stay last: it kekulizes `mol` in place
        if count_conj(mol) < min_conjugation:
            continue

        passed.append(smiles)

    return passed

Load our criteria

In [5]:
# Parse the YAML criteria file into a dictionary, then display it
criteria = yaml.safe_load(Path(criteria_path).read_text())
criteria

{'allowed_elements': ['C', 'H', 'O', 'N', 'F', 'S', 'P', 'Cl', 'Br'],
 'forbidden_smarts': ['[CX3](=O)[OX1H0-,OX2H1]',
  '[CX3](=O)[OX2H1]',
  '[#6][#6][OX2H]',
  '[CX3](=[OX1])N=[CX3](=[OX1])',
  'O[CX4][F,Cl,Br,I]'],
 'required_smarts': ['a'],
 'min_conjugation': 3,
 'allow_disconnected': False}

Pin them to the function

In [6]:
# Bind the weight limit and every entry of the criteria dictionary as keyword arguments,
#  leaving the list of SMILES strings as the only argument left to supply
screen_fun = partial(screen_molecules, max_molecular_weight=max_molwt, **criteria)

## Make functions to iterate from PubChem Data Files
PubChem supplies a mapping of their "Compound ID" to a SMILES string and synonyms as separate files. 

The data files are hosted on an [FTP server](https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/). We can access them via HTTP requests.

In [7]:
def get_smiles_strings(
    url: str = 'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-SMILES.gz',
    timeout: float = 60.,
) -> Iterator[str]:
    """Iterate over all of the SMILES strings in PubChem

    Downloads the gzipped, tab-separated table of compound IDs and SMILES strings
    into a temporary directory and then streams it line by line. The temporary
    copy lives until the generator is exhausted or closed.

    Args:
        url: Location of the gzipped ID-to-SMILES table
        timeout: Seconds to wait for the server to connect or to send more data
    Yields:
        SMILES string of a molecule
    Raises:
        requests.HTTPError: If the server answers with an error status
    """
    with TemporaryDirectory(prefix='smiles') as tmp:
        file_path = Path(tmp) / 'smiles.gz'
        with requests.get(url, stream=True, timeout=timeout) as req:
            # Fail here rather than saving an error page and handing it to gzip
            req.raise_for_status()
            with file_path.open('wb') as fo:
                copyfileobj(req.raw, fo)

        with gzip.open(file_path, 'rt') as fp:
            for line in fp:
                # Strip only the newline: a final line without one must keep its last character
                line = line.rstrip('\n')
                if not line:
                    continue
                _, smiles = line.split('\t', 1)
                yield smiles
# Wrap the generator so that the first entry can be inspected without consuming it
smiles_iter = peekable(get_smiles_strings())
# Sanity check that the download and parsing produce the expected first SMILES string
assert smiles_iter.peek() == 'CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C'

Make sure we can screen

In [8]:
# Smoke test on the first entry (peeked, so it stays in the iterator);
#  an empty list means that molecule was screened out
screen_fun([smiles_iter.peek()])

[]

## Filter PubChem
Downselect in parallel

In [9]:
# Name the output after the criteria file (extension removed, whatever its length)
#  and the molecular weight limit
output_path = Path('output') / f'pubchem-{Path(criteria_path).stem}-molwt={max_molwt}.smi'
# Create the output directory up front so that opening the file for writing cannot fail on it
output_path.parent.mkdir(parents=True, exist_ok=True)
print(f'Writing to: {output_path}')

Writing to: output/pubchem-criteria-v3.1-molwt=300.smi


In [None]:
# Use at most 8 workers; cpu_count() may return None when the count cannot be determined
n_workers = min(os.cpu_count() or 1, 8)

# The progress bar counts molecules which pass the screen, not molecules examined
with Pool(n_workers) as pool, output_path.open('w') as fp, tqdm() as pbar:
    # Batches are screened in parallel and written in whatever order they finish
    for smiles_batch in pool.imap_unordered(screen_fun, batched(smiles_iter, batch_size)):
        for smiles in smiles_batch:
            print(smiles.strip(), file=fp)
        # One update per batch keeps progress-bar overhead out of the inner loop
        pbar.update(len(smiles_batch))

16736370it [1:28:13, 1633.94it/s] 