From 067cc2d8e9c3d79acc34198ca07bb5ee1a8e3c8c Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Wed, 8 Jul 2020 18:07:11 -0700 Subject: [PATCH 1/8] changes --- deepchem/feat/__init__.py | 5 +- deepchem/feat/atomic_coordinates.py | 4 -- deepchem/feat/base_classes.py | 98 ++++++++++++++++++++++---- deepchem/feat/basic.py | 40 +++++++---- deepchem/feat/coulomb_matrices.py | 66 ++++++++++++----- deepchem/feat/fingerprints.py | 30 +++++--- deepchem/feat/graph_features.py | 60 ++++++++++++---- deepchem/feat/materials_featurizers.py | 22 +++--- deepchem/feat/one_hot.py | 78 +++++++++++--------- deepchem/feat/raw_featurizer.py | 38 ++++++++-- deepchem/feat/rdkit_grid_featurizer.py | 40 +++++------ deepchem/feat/smiles_featurizers.py | 40 +++++++++-- deepchem/feat/tests/test_basic.py | 32 +++++++++ docs/featurizers.rst | 4 ++ 14 files changed, 409 insertions(+), 148 deletions(-) diff --git a/deepchem/feat/__init__.py b/deepchem/feat/__init__.py index 2e5e6870bf..c0ca417c3c 100644 --- a/deepchem/feat/__init__.py +++ b/deepchem/feat/__init__.py @@ -1,11 +1,8 @@ """ Making it easy to import in classes. """ -__author__ = "Bharath Ramsundar" -__copyright__ = "Copyright 2016, Stanford University" -__license__ = "MIT" - from deepchem.feat.base_classes import Featurizer +from deepchem.feat.base_classes import MolecularFeaturizer from deepchem.feat.base_classes import ComplexFeaturizer from deepchem.feat.base_classes import UserDefinedFeaturizer from deepchem.feat.graph_features import ConvMolFeaturizer diff --git a/deepchem/feat/atomic_coordinates.py b/deepchem/feat/atomic_coordinates.py index ae1db807f5..e17d575570 100644 --- a/deepchem/feat/atomic_coordinates.py +++ b/deepchem/feat/atomic_coordinates.py @@ -1,10 +1,6 @@ """ Atomic coordinate featurizer. 
""" -__author__ = "Joseph Gomes and Bharath Ramsundar" -__copyright__ = "Copyright 2016, Stanford University" -__license__ = "MIT" - import logging import numpy as np from deepchem.utils.save import log diff --git a/deepchem/feat/base_classes.py b/deepchem/feat/base_classes.py index f7585f0fa4..3745421e00 100644 --- a/deepchem/feat/base_classes.py +++ b/deepchem/feat/base_classes.py @@ -6,9 +6,7 @@ import numpy as np import multiprocessing -__author__ = "Steven Kearnes" -__copyright__ = "Copyright 2014, Stanford University" -__license__ = "BSD 3-clause" +logger = logging.getLogger(__name__) def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message): @@ -16,6 +14,53 @@ def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message): return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file) +class Featurizer(object): + """Abstract class for calculating a set of features for a datapoint. + + This class is abstract and cannot be invoked directly. You'll + likely only interact with this class if you're a developer. In + that case, you might want to make a child class which + implements the `_featurize` method for calculating features for + a single datapoints if you'd like to make a featurizer for a + new datatype. + """ + + def featurize(self, datapoints, log_every_n=1000): + """Calculate features for datapoints. + + Parameters + ---------- + datapoints: object + Any blob of data you like. Subclasss should instantiate this. + + Returns + ------- + A numpy array containing a featurized representation of + `datapoints`. + """ + datapoints = list(datapoints) + features = [] + for i, point in enumerate(datapoints): + if point is not None: + features.append(self._featurize(point)) + else: + features.append(np.array([])) + + features = np.asarray(features) + return features + + def __call__(self, datapoints): + """Calculate features for datapoints. + + Parameters + ---------- + datapoints: object + Any blob of data you like. 
Subclasss should instantiate + this. + """ + return self.featurize(datapoints) + + class ComplexFeaturizer(object): """" Abstract class for calculating features for mol/protein complexes. @@ -73,27 +118,56 @@ def _featurize_complex(self, mol_pdb, complex_pdb): raise NotImplementedError('Featurizer is not defined.') -class Featurizer(object): - """ - Abstract class for calculating a set of features for a molecule. +class MolecularFeaturizer(object): + """Abstract class for calculating a set of features for a + molecule. - Child classes implement the _featurize method for calculating features - for a single molecule. + The defining feature of a `MolecularFeaturizer` is that it + uses SMILES strings and RDKIT molecule objecgs to represent + small molecules. All other featurizers which are subclasses of + this class should plan to process input which comes as smiles + strings or RDKIT molecules. + + Child classes need to implement the _featurize method for + calculating features for a single molecule. + + Note + ---- + In general, subclasses of this class will require RDKit to be installed. """ def featurize(self, mols, verbose=True, log_every_n=1000): - """ - Calculate features for molecules. + """Calculate features for molecules. Parameters ---------- mols : iterable - RDKit Mol objects. + RDKit Mol, or SMILES string, or filename for + mol2/sdf/pdb/pdbqt file. + + Returns + ------- + A numpy array containing a featurized representation of + `datapoints`. """ - mols = list(mols) + try: + from rdkit import Chem + from rdkit.Chem.rdchem import Mol + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") + # Special case handling of single molecule + if isinstance(mols, str) or isinstance(mols, Mol): + mols = [mols] + else: + # Convert iterables to list + mols = list(mols) features = [] for i, mol in enumerate(mols): if mol is not None: + # Process only case of SMILES strings. 
+ if isinstance(mol, str): + # mol must be a SMILES string so parse + mol = Chem.MolFromSmiles(mol) features.append(self._featurize(mol)) else: features.append(np.array([])) diff --git a/deepchem/feat/basic.py b/deepchem/feat/basic.py index 7afea234da..8731ea07ea 100644 --- a/deepchem/feat/basic.py +++ b/deepchem/feat/basic.py @@ -1,16 +1,16 @@ """ Basic molecular features. """ -__author__ = "Steven Kearnes" -__copyright__ = "Copyright 2014, Stanford University" -__license__ = "MIT" -from deepchem.feat import Featurizer +from deepchem.feat.base_classes import MolecularFeaturizer -class MolecularWeight(Featurizer): - """ - Molecular weight. +class MolecularWeight(MolecularFeaturizer): + """Molecular weight. + + Note + ---- + This class requires RDKit to be installed. """ name = ['mw', 'molecular_weight'] @@ -23,18 +23,26 @@ def _featurize(self, mol): mol : RDKit Mol Molecule. """ - from rdkit.Chem import Descriptors + try: + from rdkit.Chem import Descriptors + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") wt = Descriptors.ExactMolWt(mol) wt = [wt] return wt -class RDKitDescriptors(Featurizer): - """ - RDKit descriptors. +class RDKitDescriptors(MolecularFeaturizer): + """RDKit descriptors. + + This class comptues a list of chemical descriptors using RDKit. See http://rdkit.org/docs/GettingStartedInPython.html #list-of-available-descriptors. + + Note + ---- + This class requires RDKit to be installed. 
""" name = 'descriptors' @@ -69,9 +77,12 @@ class RDKitDescriptors(Featurizer): ]) def __init__(self): + try: + from rdkit.Chem import Descriptors + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") self.descriptors = [] self.descList = [] - from rdkit.Chem import Descriptors for descriptor, function in Descriptors.descList: if descriptor in self.allowedDescriptors: self.descriptors.append(descriptor) @@ -85,6 +96,11 @@ def _featurize(self, mol): ---------- mol : RDKit Mol Molecule. + + Returns + ------- + rval: np.ndarray + Vector of RDKit descriptors for `mol` """ rval = [] for desc_name, function in self.descList: diff --git a/deepchem/feat/coulomb_matrices.py b/deepchem/feat/coulomb_matrices.py index c61204e20c..e6135d16dc 100644 --- a/deepchem/feat/coulomb_matrices.py +++ b/deepchem/feat/coulomb_matrices.py @@ -3,21 +3,27 @@ See Montavon et al., _New Journal of Physics_ __15__ (2013) 095003. """ -__author__ = "Steven Kearnes" -__copyright__ = "Copyright 2014, Stanford University" -__license__ = "MIT" - import numpy as np import deepchem as dc -from deepchem.feat import Featurizer +from deepchem.feat.base_classes import MolecularFeaturizer from deepchem.utils import pad_array from deepchem.feat.atomic_coordinates import AtomicCoordinates -class BPSymmetryFunctionInput(Featurizer): - """ - Calculate Symmetry Function for each atom in the molecules - Methods described in https://journals.aps.org/prl/pdf/10.1103/PhysRevLett.98.146401 +class BPSymmetryFunctionInput(MolecularFeaturizer): + """Calculate Symmetry Function for each atom in the molecules + + This method is described in [1]_ + + References + ---------- + .. [1] Behler, Jörg, and Michele Parrinello. "Generalized neural-network + representation of high-dimensional potential-energy surfaces." Physical + review letters 98.14 (2007): 146401. + + Note + ---- + This class requires RDKit to be installed. 
""" def __init__(self, max_atoms): @@ -34,9 +40,11 @@ def _featurize(self, mol): return np.pad(features, ((0, self.max_atoms - n_atoms), (0, 0)), 'constant') -class CoulombMatrix(Featurizer): - """ - Calculate Coulomb matrices for molecules. +class CoulombMatrix(MolecularFeaturizer): + """Calculate Coulomb matrices for molecules. + + Coulomb matrices provide a representation of the electronic structure of a + molecule. This method is described in [1]_. Parameters ---------- @@ -55,14 +63,24 @@ class CoulombMatrix(Featurizer): seed : int, optional Random seed. - Example: - + Example + ------- >>> featurizers = dc.feat.CoulombMatrix(max_atoms=23) >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv >>> tasks = ["atomization_energy"] >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers) >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS Reading structures from deepchem/feat/tests/data/water.sdf. + + References + ---------- + .. [1] Montavon, Grégoire, et al. "Learning invariant representations of + molecules for atomization energy prediction." Advances in neural information + processing systems. 2012. + + Note + ---- + This class requires RDKit to be installed. """ conformers = True name = 'coulomb_matrix' @@ -74,6 +92,10 @@ def __init__(self, upper_tri=False, n_samples=1, seed=None): + try: + from rdkit import Chem + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") self.max_atoms = int(max_atoms) self.remove_hydrogens = remove_hydrogens self.randomize = randomize @@ -196,8 +218,10 @@ def get_interatomic_distances(conf): class CoulombMatrixEig(CoulombMatrix): - """ - Calculate the eigenvales of Coulomb matrices for molecules. + """Calculate the eigenvalues of Coulomb matrices for molecules. + + This featurizer computes the eigenvalues of the Coulomb matrices for provided + molecules. Coulomb matrices are described in [1]_. 
Parameters ---------- @@ -214,14 +238,20 @@ class CoulombMatrixEig(CoulombMatrix): seed : int, optional Random seed. - Example: - + Example + ------- >>> featurizers = dc.feat.CoulombMatrixEig(max_atoms=23) >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv >>> tasks = ["atomization_energy"] >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers) >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS Reading structures from deepchem/feat/tests/data/water.sdf. + + References + ---------- + .. [1] Montavon, Grégoire, et al. "Learning invariant representations of + molecules for atomization energy prediction." Advances in neural information + processing systems. 2012. """ conformers = True diff --git a/deepchem/feat/fingerprints.py b/deepchem/feat/fingerprints.py index 75b62c4ac5..015403868c 100644 --- a/deepchem/feat/fingerprints.py +++ b/deepchem/feat/fingerprints.py @@ -1,16 +1,15 @@ """ Topological fingerprints. """ -__author__ = "Steven Kearnes" -__copyright__ = "Copyright 2014, Stanford University" -__license__ = "MIT" +from deepchem.feat.base_classes import MolecularFeaturizer -from deepchem.feat import Featurizer +class CircularFingerprint(MolecularFeaturizer): + """Circular (Morgan) fingerprints. -class CircularFingerprint(Featurizer): - """ - Circular (Morgan) fingerprints. + Extended Connectivity Circular Fingerprints compute a bag-of-words style + representation of a molecule by breaking it into local neighborhoods and + hashing into a bit vector of the specified size. See [1]_ for more details. Parameters ---------- @@ -31,6 +30,15 @@ class CircularFingerprint(Featurizer): smiles : bool, optional (default False) Whether to calculate SMILES strings for fragment IDs (only applicable when calculating sparse fingerprints). + + References + ---------- + .. [1] Rogers, David, and Mathew Hahn. "Extended-connectivity fingerprints." + Journal of chemical information and modeling 50.5 (2010): 742-754. 
+ + Note + ---- + This class requires RDKit to be installed. """ name = 'circular' @@ -42,6 +50,11 @@ def __init__(self, features=False, sparse=False, smiles=False): + try: + from rdkit import Chem + from rdkit.Chem import rdMolDescriptors + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") self.radius = radius self.size = size self.chiral = chiral @@ -51,8 +64,7 @@ def __init__(self, self.smiles = smiles def _featurize(self, mol): - """ - Calculate circular fingerprint. + """Calculate circular fingerprint. Parameters ---------- diff --git a/deepchem/feat/graph_features.py b/deepchem/feat/graph_features.py index f6ce39de78..39e722f07d 100644 --- a/deepchem/feat/graph_features.py +++ b/deepchem/feat/graph_features.py @@ -1,7 +1,6 @@ import numpy as np - import deepchem as dc -from deepchem.feat import Featurizer +from deepchem.feat.base_classes import MolecularFeaturizer from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates from deepchem.feat.mol_graphs import ConvMol, WeaveMol from deepchem.data import DiskDataset @@ -242,8 +241,15 @@ def bond_features(bond, use_chirality=False): ---------- use_chirality: bool, optional If true, use chirality information. + + Note + ---- + This method requires RDKit to be installed. """ - from rdkit import Chem + try: + from rdkit import Chem + except ModuleNotFoundError: + raise ValueError("This method requires RDKit to be installed.") bt = bond.GetBondType() bond_feats = [ bt == Chem.rdchem.BondType.SINGLE, bt == Chem.rdchem.BondType.DOUBLE, @@ -268,16 +274,20 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6, Parameters ---------- - mol: TODO - TODO + mol: RDKit Mol + Molecule to compute features on. edge_list: list - List of edges t oconsider + List of edges to consider canon_adj_list: list TODO bt_len: int, optional TODO graph_distance: bool, optional - TODO + If true, use graph distance between molecules. Else use euclidean distance. 
+ + Note + ---- + This method requires RDKit to be installed. """ if graph_distance: max_distance = 7 @@ -334,10 +344,21 @@ def find_distance(a1, num_atoms, canon_adj_list, max_distance=7): return distance -class ConvMolFeaturizer(Featurizer): - """This class implements the featurization to implement graph convolutions from the Duvenaud graph convolution paper +class ConvMolFeaturizer(MolecularFeaturizer): + """This class implements the featurization to implement Duvenaud graph convolutions. + + Duvenaud graph convolutions [1]_ construct a vector of descriptors for each + atom in a molecule. The featurizer computes that vector of local descriptors. -Duvenaud, David K., et al. "Convolutional networks on graphs for learning molecular fingerprints." Advances in neural information processing systems. 2015. + References + --------- + .. [1] Duvenaud, David K., et al. "Convolutional networks on graphs for + learning molecular fingerprints." Advances in neural information processing + systems. 2015. + + Note + ---- + This class requires RDKit to be installed. """ name = ['conv_mol'] @@ -448,10 +469,23 @@ def __eq__(self, other): tuple(self.atom_properties) == tuple(other.atom_properties) -class WeaveFeaturizer(Featurizer): - """This class implements the featurization to implement Weave convolutions from the Google graph convolution paper. +class WeaveFeaturizer(MolecularFeaturizer): + """This class implements the featurization to implement Weave convolutions. + + Weave convolutions were introduced in [1]_. Unlike Duvenaud graph + convolutions, weave convolutions require a quadratic matrix of interaction + descriptors for each pair of atoms. These extra descriptors may provide for + additional descriptive power but at the cost of a larger featurized dataset. + + References + ---------- + .. [1] Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond + fingerprints." Journal of computer-aided molecular design 30.8 (2016): + 595-608. - Kearnes, Steven, et al. 
"Molecular graph convolutions: moving beyond fingerprints." Journal of computer-aided molecular design 30.8 (2016): 595-608. + Note + ---- + This class requires RDKit to be installed. """ name = ['weave_mol'] diff --git a/deepchem/feat/materials_featurizers.py b/deepchem/feat/materials_featurizers.py index 05797130ba..3e9e9e3730 100644 --- a/deepchem/feat/materials_featurizers.py +++ b/deepchem/feat/materials_featurizers.py @@ -23,17 +23,19 @@ class ElementPropertyFingerprint(Featurizer): matminer. It may be useful when only crystal compositions are available (and not 3D coordinates). + See references [1]_ [2]_ [3]_ [4]_ for more details. + References ---------- - MagPie data: Ward, L. et al. npj Comput Mater 2, 16028 (2016). - https://doi.org/10.1038/npjcompumats.2016.28 + .. [1] MagPie data: Ward, L. et al. npj Comput Mater 2, 16028 (2016). + https://doi.org/10.1038/npjcompumats.2016.28 - Deml data: Deml, A. et al. Physical Review B 93, 085142 (2016). - 10.1103/PhysRevB.93.085142 + .. [2] Deml data: Deml, A. et al. Physical Review B 93, 085142 (2016). + 10.1103/PhysRevB.93.085142 - Matminer: Ward, L. et al. Comput. Mater. Sci. 152, 60-69 (2018). + .. [3] Matminer: Ward, L. et al. Comput. Mater. Sci. 152, 60-69 (2018). - Pymatgen: Ong, S.P. et al. Comput. Mater. Sci. 68, 314-319 (2013). + .. [4] Pymatgen: Ong, S.P. et al. Comput. Mater. Sci. 68, 314-319 (2013). """ @@ -101,9 +103,11 @@ class SineCoulombMatrix(Featurizer): matminer. It may be useful when crystal structures with 3D coordinates are available. + See [1]_ for more details. + References ---------- - Faber et al. Inter. J. Quantum Chem. 115, 16, 2015. + .. [1] Faber et al. Inter. J. Quantum Chem. 115, 16, 2015. """ @@ -177,9 +181,11 @@ class StructureGraphFeaturizer(Featurizer): be useful when 3D coordinates are available and when using graph network models and crystal graph convolutional networks. + See [1]_ for more details. + References ---------- - T. Xie and J. C. Grossman, Phys. Rev. Lett. 
120, 2018. + .. [1] T. Xie and J. C. Grossman, Phys. Rev. Lett. 120, 2018. """ diff --git a/deepchem/feat/one_hot.py b/deepchem/feat/one_hot.py index ffb82d6745..60592699cb 100644 --- a/deepchem/feat/one_hot.py +++ b/deepchem/feat/one_hot.py @@ -1,5 +1,5 @@ import numpy as np -from deepchem.feat import Featurizer +from deepchem.feat.base_classes import MolecularFeaturizer zinc_charset = [ ' ', '#', ')', '(', '+', '-', '/', '1', '3', '2', '5', '4', '7', '6', '8', @@ -8,13 +8,21 @@ ] -class OneHotFeaturizer(Featurizer): - """ - NOTE(LESWING) Not Thread Safe in initialization of charset +class OneHotFeaturizer(MolecularFeaturizer): + """Encodes a molecule as a one-hot array. + + This featurizer takes a molecule and encodes its Smiles string as a one-hot + array. + + Note + ---- + This class requires RDKit to be installed. Note that this featurizer is not + Thread Safe in initialization of charset """ def __init__(self, charset=None, padlength=120): - """ + """Initialize featurizer. + Parameters ---------- charset: obj:`list` of obj:`str` @@ -22,38 +30,40 @@ def __init__(self, charset=None, padlength=120): padlength: int length to pad the smile strings to """ + try: + from rdkit import Chem + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") self.charset = charset self.pad_length = padlength - def featurize(self, mols, verbose=True, log_every_n=1000): - """ + def _featurize(self, mol): + """Compute one-hot featurization of this molecule. + Parameters ---------- - mols: obj - List of rdkit Molecule Objects - verbose: bool - How much logging - log_every_n: - How often to log - Returns + mol : RDKit Mol + Molecule. 
+ Returns ------- - obj - numpy array of features + rval: np.ndarray + Vector of RDKit descriptors for `mol` """ from rdkit import Chem - smiles = [Chem.MolToSmiles(mol) for mol in mols] + smiles = Chem.MolToSmiles(mol) if self.charset is None: self.charset = self._create_charset(smiles) return np.array([self.one_hot_encoded(smile) for smile in smiles]) def one_hot_array(self, i): - """ - Create a one hot array with bit i set to 1 + """Create a one hot array with bit i set to 1 + Parameters ---------- i: int bit to set to 1 + Returns ------- obj:`list` of obj:`int` @@ -62,25 +72,26 @@ def one_hot_array(self, i): return [int(x) for x in [ix == i for ix in range(len(self.charset))]] def one_hot_index(self, c): - """ - TODO(LESWING) replace with map lookup vs linear scan + """Compute one-hot index of charater. + Parameters ---------- - c + c: char character whose index we want + Returns ------- - int - index of c in self.charset + index of c in self.charset """ return self.charset.index(c) def pad_smile(self, smile): - """ - Pad A Smile String to self.pad_length + """Pad a smile string to `self.pad_length` + Parameters ---------- smile: str + The smiles string to be padded. 
Returns ------- @@ -91,8 +102,8 @@ def pad_smile(self, smile): return smile.ljust(self.pad_length) def one_hot_encoded(self, smile): - """ - One Hot Encode an entire SMILE string + """One Hot Encode an entire SMILE string + Parameters ---------- smile: str @@ -100,16 +111,15 @@ def one_hot_encoded(self, smile): Returns ------- - object - np.array of one hot encoded arrays for each character in smile + np.array of one hot encoded arrays for each character in smile """ return np.array([ self.one_hot_array(self.one_hot_index(x)) for x in self.pad_smile(smile) ]) def untransform(self, z): - """ - Convert from one hot representation back to SMILE + """Convert from one hot representation back to SMILE + Parameters ---------- z: obj:`list` @@ -129,8 +139,8 @@ def untransform(self, z): return z1 def _create_charset(self, smiles): - """ - create the charset from smiles + """Create the charset from smiles + Parameters ---------- smiles: obj:`list` of obj:`str` diff --git a/deepchem/feat/raw_featurizer.py b/deepchem/feat/raw_featurizer.py index e0980e7113..82a91cb3d3 100644 --- a/deepchem/feat/raw_featurizer.py +++ b/deepchem/feat/raw_featurizer.py @@ -1,14 +1,44 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- -from deepchem.feat import Featurizer +from deepchem.feat.base_classes import MolecularFeaturizer -class RawFeaturizer(Featurizer): +class RawFeaturizer(MolecularFeaturizer): + """Encodes a molecule as a SMILES string or RDKit mol. + + This featurizer can be useful when you're trying to transform a large + collection of RDKit mol objects as Smiles strings, or alternatively as a + "no-op" featurizer in your molecular pipeline. + + Note + ---- + This class requires RDKit to be installed. + """ def __init__(self, smiles=False): + """Initialize this featurizer. + + Parameter + --------- + smiles: bool, optional (default False) + If True, encode this molecule as a SMILES string. Else as a RDKit mol. 
+ """ + try: + from rdkit import Chem + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") self.smiles = smiles def _featurize(self, mol): + """Calculate either smiles string or pass through raw molecule. + + Parameters + ---------- + mol : RDKit Mol + Molecule. + + Returns + ------- + Smiles string or raw molecule. + """ from rdkit import Chem if self.smiles: return Chem.MolToSmiles(mol) diff --git a/deepchem/feat/rdkit_grid_featurizer.py b/deepchem/feat/rdkit_grid_featurizer.py index 644a55b51e..4f307a74ca 100644 --- a/deepchem/feat/rdkit_grid_featurizer.py +++ b/deepchem/feat/rdkit_grid_featurizer.py @@ -1,7 +1,3 @@ -__author__ = "Bharath Ramsundar, Evan Feinberg, and Karl Leswing" -__copyright__ = "Copyright 2016, Stanford University" -__license__ = "MIT" - import logging import os import shutil @@ -19,9 +15,6 @@ from copy import deepcopy from deepchem.feat import ComplexFeaturizer from deepchem.utils.save import log -""" -TODO(LESWING) add sanitization with rdkit upgrade to 2017.* -""" def compute_centroid(coordinates): @@ -53,22 +46,23 @@ def generate_random__unit_vector(): def generate_random_rotation_matrix(): - """ - 1. Generate a random unit vector u, randomly sampled from the unit - 3-sphere (see function generate_random__unit_vector() for details) - 2. Generate a second random unit vector v - a. If absolute value of u \dot v > 0.99, repeat. - (This is important for numerical stability. Intuition: we want them to - be as linearly independent as possible or else the orthogonalized - version of v will be much shorter in magnitude compared to u. I assume - in Stack they took this from Gram-Schmidt orthogonalization?) - b. v" = v - (u \dot v)*u, i.e. subtract out the component of v that's in - u's direction - c. normalize v" (this isn"t in Stack but I assume it must be done) - 3. find w = u \cross v" - 4. u, v", and w will form the columns of a rotation matrix, R. 
The - intuition is that u, v" and w are, respectively, what the standard basis - vectors e1, e2, and e3 will be mapped to under the transformation. + """Generate a random rotation matrix in 3D. + + 1. Generate a random unit vector u, randomly sampled from the unit + 3-sphere (see function generate_random__unit_vector() for details) + 2. Generate a second random unit vector v + a. If absolute value of u \dot v > 0.99, repeat. + (This is important for numerical stability. Intuition: we want them to + be as linearly independent as possible or else the orthogonalized + version of v will be much shorter in magnitude compared to u. I assume + in Stack they took this from Gram-Schmidt orthogonalization?) + b. v" = v - (u \dot v)*u, i.e. subtract out the component of v that's in + u's direction + c. normalize v" (this isn"t in Stack but I assume it must be done) + 3. find w = u \cross v" + 4. u, v", and w will form the columns of a rotation matrix, R. The + intuition is that u, v" and w are, respectively, what the standard basis + vectors e1, e2, and e3 will be mapped to under the transformation. """ u = generate_random__unit_vector() v = generate_random__unit_vector() diff --git a/deepchem/feat/smiles_featurizers.py b/deepchem/feat/smiles_featurizers.py index 86451ec8b9..bb4e7470b1 100644 --- a/deepchem/feat/smiles_featurizers.py +++ b/deepchem/feat/smiles_featurizers.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from deepchem.feat import Featurizer +from deepchem.feat.base_classes import MolecularFeaturizer PAD_TOKEN = "" OUT_OF_VOCAB_TOKEN = "" @@ -50,10 +50,10 @@ def create_char_to_idx(filename, return char_to_idx -class SmilesToSeq(Featurizer): +class SmilesToSeq(MolecularFeaturizer): """ SmilesToSeq Featurizer takes a SMILES string, and turns it into a sequence. - Details taken from https://arxiv.org/abs/1712.02734. + Details taken from [1]_. 
SMILES strings smaller than a specified max length (max_len) are padded using the PAD token while those larger than the max length are not considered. Based @@ -62,10 +62,20 @@ class SmilesToSeq(Featurizer): mapping, the SMILES characters are turned into indices and the resulting sequence of indices serves as the input for an embedding layer. + References + ---------- + .. [1] Goh, Garrett B., et al. "Using rule-based labels for weak supervised + learning: a ChemNet for transferable chemical property prediction." + Proceedings of the 24th ACM SIGKDD International Conference on Knowledge + Discovery & Data Mining. 2018. + + Note + ---- + This class requires RDKit to be installed. """ def __init__(self, char_to_idx, max_len=250, pad_len=10, **kwargs): - """ + """Initialize this class. Parameters ---------- char_to_idx: dict @@ -75,6 +85,10 @@ def __init__(self, char_to_idx, max_len=250, pad_len=10, **kwargs): pad_len: int, default 10 Amount of padding to add on either side of the SMILES seq """ + try: + from rdkit import Chem + except ModuleNotFoundError: + raise ValueError("This class requires RDKit to be installed.") self.max_len = max_len self.char_to_idx = char_to_idx self.idx_to_char = {idx: letter for letter, idx in self.char_to_idx.items()} @@ -128,10 +142,11 @@ def _featurize(self, mol): return smile_seq -class SmilesToImage(Featurizer): - """ +class SmilesToImage(MolecularFeaturizer): + """Convert Smiles string to an image. + SmilesToImage Featurizer takes a SMILES string, and turns it into an image. - Details taken from https://arxiv.org/abs/1712.02734. + Details taken from [1]_. The default size of for the image is 80 x 80. Two image modes are currently supported - std & engd. std is the gray scale specification, @@ -143,6 +158,17 @@ class SmilesToImage(Featurizer): The coordinates of all atoms are computed, and lines are drawn between atoms to indicate bonds. 
For the respective channels, the atom and bond positions are set to the property values as mentioned in the paper. + + References + ---------- + .. [1] Goh, Garrett B., et al. "Using rule-based labels for weak supervised + learning: a ChemNet for transferable chemical property prediction." + Proceedings of the 24th ACM SIGKDD International Conference on Knowledge + Discovery & Data Mining. 2018. + + Note + ---- + This class requires RDKit to be installed. """ def __init__(self, diff --git a/deepchem/feat/tests/test_basic.py b/deepchem/feat/tests/test_basic.py index 6c78387159..8a4395f849 100644 --- a/deepchem/feat/tests/test_basic.py +++ b/deepchem/feat/tests/test_basic.py @@ -27,6 +27,18 @@ def testMW(self): """ assert np.allclose(self.engine([self.mol]), 180, atol=0.1) + def test_MW_on_smiles(self): + """ + Test MW invocation on smiles." + """ + assert np.allclose(self.engine('CC(=O)OC1=CC=CC=C1C(=O)O'), 180, atol=0.1) + + def test_MW_on_mol(self): + """ + Test MW invocation on RDKit mol." + """ + assert np.allclose(self.engine(self.mol), 180, atol=0.1) + class TestRDKitDescriptors(unittest.TestCase): """ @@ -51,3 +63,23 @@ def testRDKitDescriptors(self): descriptors[0, self.engine.descriptors.index('ExactMolWt')], 180, atol=0.1) + + def testRDKitDescriptorsOnSmiles(self): + """ + Test invocation on raw smiles. + """ + descriptors = self.engine('CC(=O)OC1=CC=CC=C1C(=O)O') + assert np.allclose( + descriptors[0, self.engine.descriptors.index('ExactMolWt')], + 180, + atol=0.1) + + def testRDKitDescriptorsOnMol(self): + """ + Test invocation on RDKit mol. + """ + descriptors = self.engine(self.mol) + assert np.allclose( + descriptors[0, self.engine.descriptors.index('ExactMolWt')], + 180, + atol=0.1) diff --git a/docs/featurizers.rst b/docs/featurizers.rst index 7e08dc831c..bd735c0be1 100644 --- a/docs/featurizers.rst +++ b/docs/featurizers.rst @@ -35,6 +35,10 @@ MolecularFeaturizer Molecular Featurizers are those that work with datasets of molecules. +.. 
autoclass:: deepchem.feat.MolecularFeaturizer + :members: + + ConvMolFeaturizer ^^^^^^^^^^^^^^^^^ From eeb43651f874e4cdba912dafc979eec33fb9fa09 Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Wed, 8 Jul 2020 18:17:15 -0700 Subject: [PATCH 2/8] Changes --- deepchem/feat/base_classes.py | 3 +-- deepchem/feat/graph_features.py | 5 +++-- deepchem/feat/smiles_featurizers.py | 23 ++++++++++++----------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/deepchem/feat/base_classes.py b/deepchem/feat/base_classes.py index 3745421e00..4f2c28aadb 100644 --- a/deepchem/feat/base_classes.py +++ b/deepchem/feat/base_classes.py @@ -35,8 +35,7 @@ def featurize(self, datapoints, log_every_n=1000): Returns ------- - A numpy array containing a featurized representation of - `datapoints`. + A numpy array containing a featurized representation of `datapoints`. """ datapoints = list(datapoints) features = [] diff --git a/deepchem/feat/graph_features.py b/deepchem/feat/graph_features.py index 39e722f07d..b1f2e5d75f 100644 --- a/deepchem/feat/graph_features.py +++ b/deepchem/feat/graph_features.py @@ -352,9 +352,10 @@ class ConvMolFeaturizer(MolecularFeaturizer): References --------- + .. [1] Duvenaud, David K., et al. "Convolutional networks on graphs for - learning molecular fingerprints." Advances in neural information processing - systems. 2015. + learning molecular fingerprints." Advances in neural information + processing systems. 2015. Note ---- diff --git a/deepchem/feat/smiles_featurizers.py b/deepchem/feat/smiles_featurizers.py index bb4e7470b1..b31d93bf18 100644 --- a/deepchem/feat/smiles_featurizers.py +++ b/deepchem/feat/smiles_featurizers.py @@ -21,17 +21,17 @@ def create_char_to_idx(filename, verbose=False): """Creates a dictionary with character to index mapping. 
- Parameters - ---------- - filename: str, - Name of the file containing the SMILES strings - max_len: int, default 250 - Maximum allowed length of the SMILES string - smiles_field: str, default smiles - Field indicating the SMILES strings int the file. - verbose: bool, default True - Whether to print the progress - """ + Parameters + ---------- + filename: str, + Name of the file containing the SMILES strings + max_len: int, default 250 + Maximum allowed length of the SMILES string + smiles_field: str, default smiles + Field indicating the SMILES strings int the file. + verbose: bool, default True + Whether to print the progress + """ smiles_df = pd.read_csv(filename) char_set = set() for smile in smiles_df[smiles_field]: @@ -76,6 +76,7 @@ class SmilesToSeq(MolecularFeaturizer): def __init__(self, char_to_idx, max_len=250, pad_len=10, **kwargs): """Initialize this class. + Parameters ---------- char_to_idx: dict From 76b6a1aba5b556b4da9e28932b17d1814e195134 Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Thu, 9 Jul 2020 15:02:44 -0700 Subject: [PATCH 3/8] Changes --- deepchem/feat/base_classes.py | 4 ++-- deepchem/feat/basic.py | 16 ++++++++++++---- deepchem/feat/raw_featurizer.py | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/deepchem/feat/base_classes.py b/deepchem/feat/base_classes.py index 4f2c28aadb..4033575826 100644 --- a/deepchem/feat/base_classes.py +++ b/deepchem/feat/base_classes.py @@ -117,12 +117,12 @@ def _featurize_complex(self, mol_pdb, complex_pdb): raise NotImplementedError('Featurizer is not defined.') -class MolecularFeaturizer(object): +class MolecularFeaturizer(Featurizer): """Abstract class for calculating a set of features for a molecule. The defining feature of a `MolecularFeaturizer` is that it - uses SMILES strings and RDKIT molecule objecgs to represent + uses SMILES strings and RDKIT molecule objects to represent small molecules. 
All other featurizers which are subclasses of this class should plan to process input which comes as smiles strings or RDKIT molecules. diff --git a/deepchem/feat/basic.py b/deepchem/feat/basic.py index 8731ea07ea..c89aa12a62 100644 --- a/deepchem/feat/basic.py +++ b/deepchem/feat/basic.py @@ -2,6 +2,7 @@ Basic molecular features. """ +import numpy as np from deepchem.feat.base_classes import MolecularFeaturizer @@ -12,7 +13,6 @@ class MolecularWeight(MolecularFeaturizer): ---- This class requires RDKit to be installed. """ - name = ['mw', 'molecular_weight'] def _featurize(self, mol): """ @@ -22,6 +22,10 @@ def _featurize(self, mol): ---------- mol : RDKit Mol Molecule. + + Returns + ------- + np.ndarray of length 1 containing the molecular weight. """ try: from rdkit.Chem import Descriptors @@ -29,7 +33,7 @@ def _featurize(self, mol): raise ValueError("This class requires RDKit to be installed.") wt = Descriptors.ExactMolWt(mol) wt = [wt] - return wt + return np.asarray(wt) class RDKitDescriptors(MolecularFeaturizer): @@ -40,11 +44,15 @@ class RDKitDescriptors(MolecularFeaturizer): See http://rdkit.org/docs/GettingStartedInPython.html #list-of-available-descriptors. + Attributes + ---------- + descriptors: list + List of RDKit descriptor names used in this class. + Note ---- This class requires RDKit to be installed. """ - name = 'descriptors' # (ytz): This is done to avoid future compatibility issues like inclusion of # the 3D descriptors or changing the feature size. @@ -105,4 +113,4 @@ def _featurize(self, mol): rval = [] for desc_name, function in self.descList: rval.append(function(mol)) - return rval + return np.asarray(rval) diff --git a/deepchem/feat/raw_featurizer.py b/deepchem/feat/raw_featurizer.py index 82a91cb3d3..24dba2a65f 100644 --- a/deepchem/feat/raw_featurizer.py +++ b/deepchem/feat/raw_featurizer.py @@ -16,8 +16,8 @@ class RawFeaturizer(MolecularFeaturizer): def __init__(self, smiles=False): """Initialize this featurizer. 
- Parameter - --------- + Parameters + ---------- smiles: bool, optional (default False) If True, encode this molecule as a SMILES string. Else as a RDKit mol. """ From 2beebfa2c32d13f61f6aa401ba0b762100544c07 Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Thu, 9 Jul 2020 20:20:19 -0700 Subject: [PATCH 4/8] Changes --- deepchem/feat/base_classes.py | 18 +- deepchem/feat/basic.py | 6 +- deepchem/feat/coulomb_matrices.py | 50 +++++- deepchem/feat/graph_features.py | 246 ++++++++++++++++++++++++---- deepchem/feat/one_hot.py | 8 +- deepchem/feat/smiles_featurizers.py | 4 + docs/dataclasses.rst | 20 +++ docs/featurizers.rst | 41 +++++ docs/index.rst | 1 + 9 files changed, 343 insertions(+), 51 deletions(-) create mode 100644 docs/dataclasses.rst diff --git a/deepchem/feat/base_classes.py b/deepchem/feat/base_classes.py index 4033575826..09a4b66e60 100644 --- a/deepchem/feat/base_classes.py +++ b/deepchem/feat/base_classes.py @@ -30,8 +30,10 @@ def featurize(self, datapoints, log_every_n=1000): Parameters ---------- - datapoints: object - Any blob of data you like. Subclasss should instantiate this. + datapoints: iterable + A sequence of objects that you'd like to featurize. Subclasses of + `Featurizer` should implement the `_featurize` method that featurizes + objects in the sequence. Returns ------- @@ -40,6 +42,8 @@ def featurize(self, datapoints, log_every_n=1000): datapoints = list(datapoints) features = [] for i, point in enumerate(datapoints): + if i % log_every_n == 0: + logger.info("Featurizing datapoint %i" % i) if point is not None: features.append(self._featurize(point)) else: @@ -135,14 +139,14 @@ class MolecularFeaturizer(Featurizer): In general, subclasses of this class will require RDKit to be installed. """ - def featurize(self, mols, verbose=True, log_every_n=1000): + def featurize(self, mols, log_every_n=1000): """Calculate features for molecules.
Parameters ---------- - mols : iterable - RDKit Mol, or SMILES string, or filename for - mol2/sdf/pdb/pdbqt file. + mols : RDKit Mol / SMILES string /iterable + RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES + strings. Returns ------- @@ -162,6 +166,8 @@ def featurize(self, mols, verbose=True, log_every_n=1000): mols = list(mols) features = [] for i, mol in enumerate(mols): + if i % log_every_n == 0: + logger.info("Featurizing datapoint %i" % i) if mol is not None: # Process only case of SMILES strings. if isinstance(mol, str): diff --git a/deepchem/feat/basic.py b/deepchem/feat/basic.py index c89aa12a62..086e2392c1 100644 --- a/deepchem/feat/basic.py +++ b/deepchem/feat/basic.py @@ -46,8 +46,8 @@ class RDKitDescriptors(MolecularFeaturizer): Attributes ---------- - descriptors: list - List of RDKit descriptor names used in this class. + descriptors: np.ndarray + 1D array of RDKit descriptor names used in this class. Note ---- @@ -108,7 +108,7 @@ def _featurize(self, mol): Returns ------- rval: np.ndarray - Vector of RDKit descriptors for `mol` + 1D array of RDKit descriptors for `mol` """ rval = [] for desc_name, function in self.descList: diff --git a/deepchem/feat/coulomb_matrices.py b/deepchem/feat/coulomb_matrices.py index e6135d16dc..e4b1707a53 100644 --- a/deepchem/feat/coulomb_matrices.py +++ b/deepchem/feat/coulomb_matrices.py @@ -27,6 +27,14 @@ class BPSymmetryFunctionInput(MolecularFeaturizer): """ def __init__(self, max_atoms): + """Initialize this featurizer. + + Parameters + ---------- + max_atoms: int + The maximum number of atoms expected for molecules this featurizer will + process. + """ self.max_atoms = max_atoms def _featurize(self, mol): @@ -92,6 +100,24 @@ def __init__(self, upper_tri=False, n_samples=1, seed=None): + """Initialize this featurizer. + + Parameters + ---------- + max_atoms: int + The maximum number of atoms expected for molecules this featurizer will + process. 
+ remove_hydrogens: bool, optional (default False) + If True, remove hydrogens before processing them. + randomize: bool, optional (default False) + If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices. + upper_tri: bool, optional (default False) + Generate only upper triangle part of Coulomb matrices. + n_samples: int, optional (default 1) + If `randomize` is set to True, the number of random samples to draw. + seed: int, optional (default None) + Random seed to use. + """ try: from rdkit import Chem except ModuleNotFoundError: @@ -163,9 +189,7 @@ def coulomb_matrix(self, mol): return rval def randomize_coulomb_matrix(self, m): - """ - Randomize a Coulomb matrix as decribed in Montavon et al., - New Journal of Physics, 15, (2013), 095003: + """Randomize a Coulomb matrix as described in [1]_: 1. Compute row norms for M in a vector row_norms. 2. Sample a zero-mean unit-variance noise vector e with dimension @@ -181,6 +205,10 @@ def randomize_coulomb_matrix(self, m): Number of random matrices to generate. seed : int, optional Random seed. + + References + ---------- + .. [1] Montavon et al., New Journal of Physics, 15, (2013), 095003 """ rval = [] row_norms = np.asarray([np.linalg.norm(row) for row in m], dtype=float) @@ -263,6 +291,22 @@ def __init__(self, randomize=False, n_samples=1, seed=None): + """Initialize this featurizer. + + Parameters + ---------- + max_atoms: int + The maximum number of atoms expected for molecules this featurizer will + process. + remove_hydrogens: bool, optional (default False) + If True, remove hydrogens before processing them. + randomize: bool, optional (default False) + If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices. + n_samples: int, optional (default 1) + If `randomize` is set to True, the number of random samples to draw. + seed: int, optional (default None) + Random seed to use.
+ """ self.max_atoms = int(max_atoms) self.remove_hydrogens = remove_hydrogens self.randomize = randomize diff --git a/deepchem/feat/graph_features.py b/deepchem/feat/graph_features.py index b1f2e5d75f..82e01e37c0 100644 --- a/deepchem/feat/graph_features.py +++ b/deepchem/feat/graph_features.py @@ -1,3 +1,4 @@ +import enum import numpy as np import deepchem as dc from deepchem.feat.base_classes import MolecularFeaturizer @@ -14,21 +15,73 @@ def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message): def one_of_k_encoding(x, allowable_set): + """Encodes elements of a provided set as integers. + + Parameters + ---------- + x: object + Must be present in `allowable_set`. + allowable_set: list + List of allowable quantities. + + Example + ------- + >>> import deepchem as dc + >>> dc.feat.graph_features.one_of_k_encoding("a", ["a", "b", "c"]) + [True, False, False] + + Raises + ------ + `ValueError` if `x` is not in `allowable_set`. + """ if x not in allowable_set: - raise Exception("input {0} not in allowable set{1}:".format( + raise ValueError("input {0} not in allowable set{1}:".format( x, allowable_set)) return list(map(lambda s: x == s, allowable_set)) def one_of_k_encoding_unk(x, allowable_set): - """Maps inputs not in the allowable set to the last element.""" + """Maps inputs not in the allowable set to the last element. + + Unlike `one_of_k_encoding`, if `x` is not in `allowable_set`, this method + pretends that `x` is the last element of `allowable_set`. + + Parameters + ---------- + x: object + Must be present in `allowable_set`. + allowable_set: list + List of allowable quantities. 
+ + Examples + -------- + >>> dc.feat.graph_features.one_of_k_encoding_unk("s", ["a", "b", "c"]) + [False, False, True] + """ if x not in allowable_set: x = allowable_set[-1] return list(map(lambda s: x == s, allowable_set)) def get_intervals(l): - """For list of lists, gets the cumulative products of the lengths""" + """For list of lists, gets the cumulative products of the lengths + + Note that we add 1 to the lengths of all lists (to avoid an empty list + propagating a 0). + + Parameters + ---------- + l: list of lists + Returns the cumulative product of these lengths. + + Examples + -------- + >>> dc.feat.graph_features.get_intervals([[1], [1, 2], [1, 2, 3]]) + [1, 3, 12] + + >>> dc.feat.graph_features.get_intervals([[1], [], [1, 2], [1, 2, 3]]) + >>> [1, 1, 3, 12] + """ intervals = len(l) * [0] # Initalize with 1 intervals[0] = 1 @@ -39,37 +92,59 @@ def get_intervals(l): def safe_index(l, e): - """Gets the index of e in l, providing an index of len(l) if not found""" + """Gets the index of e in l, providing an index of len(l) if not found + + Parameters + ---------- + l: list + List of values + e: object + Object to check whether `e` is in `l` + + Examples + -------- + >>> dc.feat.graph_features.safe_index([1, 2, 3], 1) + 0 + >>> dc.feat.graph_features.safe_index([1, 2, 3], 7) + 3 + """ try: return l.index(e) except: return len(l) -possible_atom_list = [ - 'C', 'N', 'O', 'S', 'F', 'P', 'Cl', 'Mg', 'Na', 'Br', 'Fe', 'Ca', 'Cu', - 'Mc', 'Pd', 'Pb', 'K', 'I', 'Al', 'Ni', 'Mn' -] -possible_numH_list = [0, 1, 2, 3, 4] -possible_valence_list = [0, 1, 2, 3, 4, 5, 6] -possible_formal_charge_list = [-3, -2, -1, 0, 1, 2, 3] -# To avoid importing rdkit, this is a placeholder list of the correct -# length. 
These will be replaced with rdkit HybridizationType below -possible_hybridization_list = ["SP", "SP2", "SP3", "SP3D", "SP3D2"] -possible_number_radical_e_list = [0, 1, 2] -possible_chirality_list = ['R', 'S'] - -reference_lists = [ - possible_atom_list, possible_numH_list, possible_valence_list, - possible_formal_charge_list, possible_number_radical_e_list, - possible_hybridization_list, possible_chirality_list -] - -intervals = get_intervals(reference_lists) -# We use E-Z notation for stereochemistry -# https://en.wikipedia.org/wiki/E%E2%80%93Z_notation -possible_bond_stereo = ["STEREONONE", "STEREOANY", "STEREOZ", "STEREOE"] -bond_fdim_base = 6 +class GraphConvConstants(enum.Enum): + """Allowed Atom Types.""" + possible_atom_list = [ + 'C', 'N', 'O', 'S', 'F', 'P', 'Cl', 'Mg', 'Na', 'Br', 'Fe', 'Ca', 'Cu', + 'Mc', 'Pd', 'Pb', 'K', 'I', 'Al', 'Ni', 'Mn' + ] + """Allowed Numbers of Hydrogens""" + possible_numH_list = [0, 1, 2, 3, 4] + """Allowed Valences for Atoms""" + possible_valence_list = [0, 1, 2, 3, 4, 5, 6] + """Allowed Formal Charges for Atoms""" + possible_formal_charge_list = [-3, -2, -1, 0, 1, 2, 3] + """This is a placeholder for documentation. These will be replaced with corresponding values of the rdkit HybridizationType""" + possible_hybridization_list = ["SP", "SP2", "SP3", "SP3D", "SP3D2"] + """Allowed number of radical electrons.""" + possible_number_radical_e_list = [0, 1, 2] + """Allowed types of Chirality""" + possible_chirality_list = ['R', 'S'] + """The set of all values allowed.""" + reference_lists = [ + possible_atom_list, possible_numH_list, possible_valence_list, + possible_formal_charge_list, possible_number_radical_e_list, + possible_hybridization_list, possible_chirality_list + ] + """The number of different values that can be taken. See `get_intervals()`""" + intervals = get_intervals(reference_lists) + """Possible stereochemistry. 
We use E-Z notation for stereochemistry + https://en.wikipedia.org/wiki/E%E2%80%93Z_notation""" + possible_bond_stereo = ["STEREONONE", "STEREOANY", "STEREOZ", "STEREOE"] + """Number of different bond types not counting stereochemistry.""" + bond_fdim_base = 6 def get_feature_list(atom): @@ -79,10 +154,39 @@ def get_feature_list(atom): ---------- atom: RDKit.rdchem.Atom Atom to get features for + + Examples + -------- + >>> from rdkit import Chem + >>> mol = Chem.MolFromSmiles("C") + >>> atom = mol.GetAtoms()[0] + >>> dc.feat.graph_features.get_feature_list(atom) + [0, 4, 4, 3, 0, 2] + + Note + ---- + This method requires RDKit to be installed. + + Returns + ------- + features: list + List of length 6. The i-th value in this list provides the index of the + atom in the corresponding feature value list. The 6 feature values lists + for this function are `[GraphConvConstants.possible_atom_list, + GraphConvConstants.possible_numH_list, + GraphConvConstants.possible_valence_list, + GraphConvConstants.possible_formal_charge_list, + GraphConvConstants.possible_num_radical_e_list]`. 
""" + possible_atom_list = GraphConvConstants.possible_atom_list + possible_numH_list = GraphConvConstants.possible_numH_list + possible_valence_list = GraphConvConstants.possible_valence_list + possible_formal_charge_list = GraphConvConstants.possible_formal_charge_list + possible_number_radical_e_list = GraphConvConstants.possible_number_radical_e_list + possible_hybridization_list = GraphConvConstants.possible_hybridization_list # Replace the hybridization from rdkit import Chem - global possible_hybridization_list + #global possible_hybridization_list possible_hybridization_list = [ Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2, Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.SP3D, @@ -101,7 +205,20 @@ def get_feature_list(atom): def features_to_id(features, intervals): - """Convert list of features into index using spacings provided in intervals""" + """Convert list of features into index using spacings provided in intervals + + Parameters + ---------- + features: list + List of features as returned by `get_feature_list()` + intervals: list + List of intervals as returned by `get_intervals()` + + Returns + ------- + id: int + The index in a feature vector given by the given set of features. + """ id = 0 for k in range(len(intervals)): id += features[k] * intervals[k] @@ -112,6 +229,20 @@ def features_to_id(features, intervals): def id_to_features(id, intervals): + """Given an index in a feature vector, return the original set of features. + + Parameters + ---------- + id: int + The index in a feature vector given by the given set of features. + intervals: list + List of intervals as returned by `get_intervals()` + + Returns + ------- + features: list + List of features as returned by `get_feature_list()` + """ features = 6 * [0] # Correct for null @@ -133,6 +264,11 @@ def atom_to_id(atom): ---------- atom: RDKit.rdchem.Atom Atom to convert to ids. 
+ + Returns + ------- + id: int + The index in a feature vector given by the given set of features. """ features = get_feature_list(atom) return features_to_id(features, intervals) @@ -154,6 +290,10 @@ def atom_features(atom, If true, model hydrogens explicitly use_chirality: bool, optional If true, use chirality information. + + Returns + ------- + np.ndarray of per-atom features. """ if bool_id_feat: return np.array([atom_to_id(atom)]) @@ -245,6 +385,12 @@ def bond_features(bond, use_chirality=False): Note ---- This method requires RDKit to be installed. + + Returns + ------- + bond_feats: np.ndarray + Array of bond features. This is a 1-D array of length 6 if `use_chirality` + is `False` else of length 10 with chirality encoded. """ try: from rdkit import Chem @@ -278,16 +424,24 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6, Molecule to compute features on. edge_list: list List of edges to consider - canon_adj_list: list - TODO - bt_len: int, optional - TODO - graph_distance: bool, optional + canon_adj_list: list of lists + `canon_adj_list[i]` is a list of the atom indices that atom `i` shares a + bond with. This list is symmetrical so if `j in canon_adj_list[i]` then `i in + canon_adj_list[j]`. + bt_len: int, optional (default 6) + The number of different bond types to consider. + graph_distance: bool, optional (default True) If true, use graph distance between molecules. Else use euclidean distance. Note ---- This method requires RDKit to be installed. + + Returns + ------- + features: np.ndarray + Of shape `(N, N, bt_len + max_distance + 1)`. This is the array of pairwise + features for all atom pairs. """ if graph_distance: max_distance = 7 @@ -326,6 +480,28 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6, def find_distance(a1, num_atoms, canon_adj_list, max_distance=7): + """Computes distances from provided atom. + + Parameters + ---------- + a1: RDKit atom + The source atom to compute distances from.
+ num_atoms: int + The total number of atoms. + canon_adj_list: list of lists + `canon_adj_list[i]` is a list of the atom indices that atom `i` shares a + bond with. This list is symmetrical so if `j in canon_adj_list[i]` then `i in + canon_adj_list[j]`. + max_distance: int, optional (default 7) + The max distance to search. + + Returns + ------- + distances: np.ndarray + Of shape `(num_atoms, max_distance)`. Provides a one-hot encoding of the + distances. That is, `distances[i]` is a one-hot encoding of the distance + from `a1` to atom `i`. + """ distance = np.zeros((num_atoms, max_distance)) radial = 0 # atoms `radial` bonds away from `a1` diff --git a/deepchem/feat/one_hot.py b/deepchem/feat/one_hot.py index 60592699cb..73d50b471a 100644 --- a/deepchem/feat/one_hot.py +++ b/deepchem/feat/one_hot.py @@ -25,10 +25,10 @@ def __init__(self, charset=None, padlength=120): Parameters ---------- - charset: obj:`list` of obj:`str` - Each string is length 1 - padlength: int - length to pad the smile strings to + charset: list of str, optional (default None) + A list of strings, where each string is length 1. + padlength: int, optional (default 120) + length to pad the smile strings to. """ try: from rdkit import Chem diff --git a/deepchem/feat/smiles_featurizers.py b/deepchem/feat/smiles_featurizers.py index b31d93bf18..54699c5c93 100644 --- a/deepchem/feat/smiles_featurizers.py +++ b/deepchem/feat/smiles_featurizers.py @@ -31,6 +31,10 @@ def create_char_to_idx(filename, Field indicating the SMILES strings int the file. verbose: bool, default True Whether to print the progress + + Returns + ------- + A dictionary mapping characters to their integer indexes. """ smiles_df = pd.read_csv(filename) char_set = set() diff --git a/docs/dataclasses.rst b/docs/dataclasses.rst new file mode 100644 index 0000000000..e33ee2dd76 --- /dev/null +++ b/docs/dataclasses.rst @@ -0,0 +1,20 @@ +Data Classes +============ +DeepChem featurizers often transform members into "data classes".
These are +classes that hold all the information needed to train a model on that data +point. Models then transform these into the tensors for training in their +:code:`default_generator` methods. + +Graph Convolutions +------------------ + +These classes document the data classes for graph convolutions. We plan to simplify these classes into a joint data representation for all graph convolutions in a future version of DeepChem, so these APIs may not remain stable. + +.. autoclass:: deepchem.feat.mol_graphs.ConvMol + :members: + +.. autoclass:: deepchem.feat.mol_graphs.MultiConvMol + :members: + +.. autoclass:: deepchem.feat.mol_graphs.WeaveMol + :members: diff --git a/docs/featurizers.rst b/docs/featurizers.rst index bd735c0be1..4012a52117 100644 --- a/docs/featurizers.rst +++ b/docs/featurizers.rst @@ -38,6 +38,47 @@ Molecular Featurizers are those that work with datasets of molecules. .. autoclass:: deepchem.feat.MolecularFeaturizer :members: +Here are some constants that are used by the graph convolutional featurizers for molecules. + +.. autoclass:: deepchem.feat.graph_features.GraphConvConstants + :members: + :undoc-members: + +There are a number of helper methods used by the graph convolutional classes which we document here. + +.. autofunction:: deepchem.feat.graph_features.one_of_k_encoding + +.. autofunction:: deepchem.feat.graph_features.one_of_k_encoding_unk + +.. autofunction:: deepchem.feat.graph_features.get_intervals + +.. autofunction:: deepchem.feat.graph_features.safe_index + +.. autofunction:: deepchem.feat.graph_features.get_feature_list + +.. autofunction:: deepchem.feat.graph_features.features_to_id + +.. autofunction:: deepchem.feat.graph_features.id_to_features + +.. autofunction:: deepchem.feat.graph_features.atom_to_id + +This function helps compute distances between atoms from a given base atom. + +.. 
autofunction:: deepchem.feat.graph_features.find_distance + +This function is important and computes per-atom feature vectors used by +graph convolutional featurizers. + +.. autofunction:: deepchem.feat.graph_features.atom_features + +This function computes the bond features used by graph convolutional +featurizers. + +.. autofunction:: deepchem.feat.graph_features.bond_features + +This function computes atom-atom features (for atom pairs which may not have bonds between them.) + +.. autofunction:: deepchem.feat.graph_features.pair_features ConvMolFeaturizer ^^^^^^^^^^^^^^^^^ diff --git a/docs/index.rst b/docs/index.rst index 236b152293..ef80b56f16 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -129,6 +129,7 @@ discussions about research, development or any general questions. If you'd like Datasets Data Loaders Featurizers + Data Classes Splitters Transformers Models From b66503dd86725558850d6b7e241b8aa55856c972 Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Thu, 9 Jul 2020 21:40:29 -0700 Subject: [PATCH 5/8] fix --- deepchem/feat/graph_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepchem/feat/graph_features.py b/deepchem/feat/graph_features.py index 82e01e37c0..6469645d08 100644 --- a/deepchem/feat/graph_features.py +++ b/deepchem/feat/graph_features.py @@ -690,9 +690,9 @@ def __init__(self, graph_distance=True, explicit_H=False, # If uses use_chirality self.use_chirality = use_chirality if self.use_chirality: - self.bt_len = bond_fdim_base + len(possible_bond_stereo) + self.bt_len = GraphConvConstants.bond_fdim_base + len(possible_bond_stereo) else: - self.bt_len = bond_fdim_base + self.bt_len = GraphConvConstants.bond_fdim_base def _featurize(self, mol): """Encodes mol as a WeaveMol object.""" From e8d4765b7441c2324a7a318c299af23098f178aa Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Thu, 9 Jul 2020 22:53:25 -0700 Subject: [PATCH 6/8] Change --- deepchem/feat/graph_features.py | 7 ++++--- 
deepchem/models/graph_models.py | 4 ---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/deepchem/feat/graph_features.py b/deepchem/feat/graph_features.py index 6469645d08..cde036f237 100644 --- a/deepchem/feat/graph_features.py +++ b/deepchem/feat/graph_features.py @@ -114,7 +114,7 @@ def safe_index(l, e): return len(l) -class GraphConvConstants(enum.Enum): +class GraphConvConstants(object): """Allowed Atom Types.""" possible_atom_list = [ 'C', 'N', 'O', 'S', 'F', 'P', 'Cl', 'Mg', 'Na', 'Br', 'Fe', 'Ca', 'Cu', @@ -690,9 +690,10 @@ def __init__(self, graph_distance=True, explicit_H=False, # If uses use_chirality self.use_chirality = use_chirality if self.use_chirality: - self.bt_len = GraphConvConstants.bond_fdim_base + len(possible_bond_stereo) + self.bt_len = int( + GraphConvConstants.bond_fdim_base) + len(possible_bond_stereo) else: - self.bt_len = GraphConvConstants.bond_fdim_base + self.bt_len = int(GraphConvConstants.bond_fdim_base) def _featurize(self, mol): """Encodes mol as a WeaveMol object.""" diff --git a/deepchem/models/graph_models.py b/deepchem/models/graph_models.py index dc13f7ed86..f4f71fae72 100644 --- a/deepchem/models/graph_models.py +++ b/deepchem/models/graph_models.py @@ -430,10 +430,6 @@ def __init__(self, if dropout is None or dropout == 0.0: raise ValueError('Dropout must be included to predict uncertainty') - ############################################ - print("self.dropout") - print(self.dropout) - ############################################ # Build the model. 
atom_features = Input(shape=(self.n_atom_feat,)) From ae9b5670fe0e2a0b4fbbc84aa003b6588a64882e Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Sat, 11 Jul 2020 15:32:29 -0700 Subject: [PATCH 7/8] Changes --- deepchem/feat/base_classes.py | 32 ++++++++++++++++++-------------- deepchem/feat/graph_features.py | 2 +- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/deepchem/feat/base_classes.py b/deepchem/feat/base_classes.py index 09a4b66e60..576a94c9b6 100644 --- a/deepchem/feat/base_classes.py +++ b/deepchem/feat/base_classes.py @@ -44,9 +44,11 @@ def featurize(self, datapoints, log_every_n=1000): for i, point in enumerate(datapoints): if i % log_every_n == 0: logger.info("Featurizing datapoint %i" % i) - if point is not None: + try: features.append(self._featurize(point)) - else: + except: + logger.warning( + "Failed to featurize datapoint %d. Appending empty array") features.append(np.array([])) features = np.asarray(features) @@ -139,12 +141,12 @@ class MolecularFeaturizer(Featurizer): In general, subclasses of this class will require RDKit to be installed. """ - def featurize(self, mols, log_every_n=1000): + def featurize(self, molecules, log_every_n=1000): """Calculate features for molecules. Parameters ---------- - mols : RDKit Mol / SMILES string /iterable + molecules: RDKit Mol / SMILES string /iterable RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES strings. 
@@ -159,22 +161,24 @@ def featurize(self, mols, log_every_n=1000): except ModuleNotFoundError: raise ValueError("This class requires RDKit to be installed.") # Special case handling of single molecule - if isinstance(mols, str) or isinstance(mols, Mol): - mols = [mols] + if isinstance(molecules, str) or isinstance(molecules, Mol): + molecules = [molecules] else: # Convert iterables to list - mols = list(mols) + molecutes = list(molecules) features = [] - for i, mol in enumerate(mols): + for i, mol in enumerate(molecules): if i % log_every_n == 0: logger.info("Featurizing datapoint %i" % i) - if mol is not None: + try: # Process only case of SMILES strings. if isinstance(mol, str): # mol must be a SMILES string so parse mol = Chem.MolFromSmiles(mol) features.append(self._featurize(mol)) - else: + except: + logger.warning( + "Failed to featurize datapoint %d. Appending empty array") features.append(np.array([])) features = np.asarray(features) @@ -191,16 +195,16 @@ def _featurize(self, mol): """ raise NotImplementedError('Featurizer is not defined.') - def __call__(self, mols): + def __call__(self, molecules): """ Calculate features for molecules. Parameters ---------- - mols : iterable - RDKit Mol objects. + molecules: iterable + An iterable yielding RDKit Mol objects or SMILES strings. 
""" - return self.featurize(mols) + return self.featurize(molecules) class UserDefinedFeaturizer(Featurizer): diff --git a/deepchem/feat/graph_features.py b/deepchem/feat/graph_features.py index cde036f237..1079f1ddf9 100644 --- a/deepchem/feat/graph_features.py +++ b/deepchem/feat/graph_features.py @@ -115,7 +115,7 @@ def safe_index(l, e): class GraphConvConstants(object): - """Allowed Atom Types.""" + """This class defines a collection of constants which are useful for graph convolutions on molecules.""" possible_atom_list = [ 'C', 'N', 'O', 'S', 'F', 'P', 'Cl', 'Mg', 'Na', 'Br', 'Fe', 'Ca', 'Cu', 'Mc', 'Pd', 'Pb', 'K', 'I', 'Al', 'Ni', 'Mn' From d7fa74b8d2c4c8423f736cd561a8be2fa34559dd Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Sat, 11 Jul 2020 16:25:13 -0700 Subject: [PATCH 8/8] fix --- deepchem/feat/graph_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepchem/feat/graph_features.py b/deepchem/feat/graph_features.py index 1079f1ddf9..15aadf060a 100644 --- a/deepchem/feat/graph_features.py +++ b/deepchem/feat/graph_features.py @@ -80,7 +80,7 @@ def get_intervals(l): [1, 3, 12] >>> dc.feat.graph_features.get_intervals([[1], [], [1, 2], [1, 2, 3]]) - >>> [1, 1, 3, 12] + [1, 1, 3, 12] """ intervals = len(l) * [0] # Initalize with 1