Skip to content

Commit

Permalink
Skip scaffold generation on invalid molecules (#3520)
Browse files Browse the repository at this point in the history
* fix scaffold splitter for invalid molecules
  • Loading branch information
arunppsg committed Aug 14, 2023
1 parent 6403e46 commit df89c79
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 13 deletions.
37 changes: 24 additions & 13 deletions deepchem/splits/splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@
Contains an abstract base class that supports chemically aware data splits.
"""
import inspect
import itertools
import logging
import os
import random
import tempfile
import itertools
import logging
from typing import Any, Dict, List, Iterator, Optional, Sequence, Tuple
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union

import deepchem as dc
import numpy as np
import pandas as pd

import deepchem as dc
from deepchem.data import Dataset, DiskDataset
from deepchem.utils import get_print_threshold

Expand Down Expand Up @@ -1274,7 +1273,8 @@ def split(
return train_inds, valid_inds, test_inds


def _generate_scaffold(smiles: str, include_chirality: bool = False) -> str:
def _generate_scaffold(smiles: str,
include_chirality: bool = False) -> Union[str, None]:
"""Compute the Bemis-Murcko scaffold for a SMILES string.
Bemis-Murcko scaffolds are described in DOI: 10.1021/jm9602928.
Expand Down Expand Up @@ -1309,6 +1309,12 @@ def _generate_scaffold(smiles: str, include_chirality: bool = False) -> str:
raise ImportError("This function requires RDKit to be installed.")

mol = Chem.MolFromSmiles(smiles)
if mol is None:
logger.info(
'Not generating scaffold for smiles %s - invalid smiles string' %
smiles)
return None

scaffold = MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
return scaffold

Expand Down Expand Up @@ -1502,9 +1508,13 @@ class ScaffoldSplitter(Splitter):
.. [1] Bemis, Guy W., and Mark A. Murcko. "The properties of known drugs.
1. Molecular frameworks." Journal of medicinal chemistry 39.15 (1996): 2887-2893.
Note
----
This class requires RDKit to be installed.
Notes
-----
- This class requires RDKit to be installed.
- When the SMILES representation of a molecule is invalid, the splitter skips
the datapoint, i.e., the molecule will not be included in any of the splits.
"""

def split(
Expand Down Expand Up @@ -1588,10 +1598,11 @@ def generate_scaffolds(self,
if ind % log_every_n == 0:
logger.info("Generating scaffold %d/%d" % (ind, data_len))
scaffold = _generate_scaffold(smiles)
if scaffold not in scaffolds:
scaffolds[scaffold] = [ind]
else:
scaffolds[scaffold].append(ind)
if scaffold is not None:
if scaffold not in scaffolds:
scaffolds[scaffold] = [ind]
else:
scaffolds[scaffold].append(ind)

# Sort from largest to smallest scaffold sets
scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
Expand Down
11 changes: 11 additions & 0 deletions deepchem/splits/tests/test_scaffold_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,14 @@ def test_scaffolds(self):
# has to be smaller than or equal to the total number of molecules
scaffolds_separate_cnt = len(scaffolds_separate)
self.assertTrue(scaffolds_separate_cnt <= train_dataset.X.shape[0])

def test_generate_scaffold(self):
    """Check scaffold generation for a valid and an invalid SMILES string.

    A parseable SMILES must yield its Bemis-Murcko scaffold, while a SMILES
    that cannot be parsed into a molecule must yield ``None`` instead of
    raising.
    """
    from deepchem.splits.splitters import _generate_scaffold

    valid_smiles = r's1cc(nc1\[N]=C(\N)N)C'
    scaffold = _generate_scaffold(valid_smiles)
    # assertEqual (rather than assertTrue on ==) reports both values on failure.
    self.assertEqual(scaffold, 'c1cscn1')

    # Invalid because valence for atom 5 N is greater than permitted (4)
    invalid_smiles = r's1cc(nc1\[NH]=C(\N)N)C'
    scaffold = _generate_scaffold(invalid_smiles)
    self.assertIsNone(scaffold)

0 comments on commit df89c79

Please sign in to comment.