Skip to content

Commit

Permalink
Merge d7fa74b into 99831b7
Browse files Browse the repository at this point in the history
  • Loading branch information
rbharath committed Jul 11, 2020
2 parents 99831b7 + d7fa74b commit 4d2b032
Show file tree
Hide file tree
Showing 17 changed files with 784 additions and 221 deletions.
5 changes: 1 addition & 4 deletions deepchem/feat/__init__.py
@@ -1,11 +1,8 @@
"""
Making it easy to import in classes.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

from deepchem.feat.base_classes import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.feat.base_classes import ComplexFeaturizer
from deepchem.feat.base_classes import UserDefinedFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
Expand Down
4 changes: 0 additions & 4 deletions deepchem/feat/atomic_coordinates.py
@@ -1,10 +1,6 @@
"""
Atomic coordinate featurizer.
"""
__author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import numpy as np
from deepchem.utils.save import log
Expand Down
125 changes: 104 additions & 21 deletions deepchem/feat/base_classes.py
Expand Up @@ -6,16 +6,66 @@
import numpy as np
import multiprocessing

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"
logger = logging.getLogger(__name__)


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
    """Log a message, then delegate to `featurizer._featurize_complex`.

    Parameters
    ----------
    featurizer: object
        Any object exposing a `_featurize_complex(mol_pdb_file,
        protein_pdb_file)` method.
    mol_pdb_file: object
        Forwarded unchanged as the first argument (presumably a path to the
        molecule's PDB file -- confirm against callers).
    protein_pdb_file: object
        Forwarded unchanged as the second argument (presumably a path to the
        protein's PDB file -- confirm against callers).
    log_message: str
        Message emitted at INFO level through the root `logging` module
        before featurization starts.

    Returns
    -------
    Whatever `featurizer._featurize_complex` returns.
    """
    logging.info(log_message)
    complex_features = featurizer._featurize_complex(mol_pdb_file,
                                                     protein_pdb_file)
    return complex_features


class Featurizer(object):
    """Abstract class for calculating a set of features for a datapoint.

    This class is abstract and is not meant to be used directly. You'll
    likely only interact with this class if you're a developer. In that
    case, you might want to make a child class which implements the
    `_featurize` method for calculating features for a single datapoint
    if you'd like to make a featurizer for a new datatype.
    """

    def featurize(self, datapoints, log_every_n=1000):
        """Calculate features for datapoints.

        Parameters
        ----------
        datapoints: iterable
            A sequence of objects that you'd like to featurize. Subclasses
            of `Featurizer` should implement the `_featurize` method that
            featurizes a single object from the sequence.
        log_every_n: int, default 1000
            A progress message is logged at INFO level for every
            `log_every_n`-th datapoint (starting with index 0).

        Returns
        -------
        np.ndarray
            `np.asarray` applied to the list of per-datapoint features, in
            input order. A datapoint whose `_featurize` call raised an
            exception contributes an empty array instead.
        """
        datapoints = list(datapoints)
        features = []
        for i, point in enumerate(datapoints):
            if i % log_every_n == 0:
                logger.info("Featurizing datapoint %i", i)
            try:
                features.append(self._featurize(point))
            except Exception:
                # Best-effort: one bad datapoint must not abort the whole
                # run, but KeyboardInterrupt/SystemExit still propagate.
                # The index is passed so the warning names the datapoint.
                logger.warning(
                    "Failed to featurize datapoint %d. Appending empty array",
                    i)
                features.append(np.array([]))

        return np.asarray(features)

    def __call__(self, datapoints):
        """Calculate features for datapoints.

        Shorthand for `self.featurize(datapoints)` using the default
        logging interval.

        Parameters
        ----------
        datapoints: iterable
            The objects to featurize; what they are is defined by the
            subclass's `_featurize` implementation.

        Returns
        -------
        np.ndarray
            The value returned by `featurize`.
        """
        return self.featurize(datapoints)


class ComplexFeaturizer(object):
""""
Abstract class for calculating features for mol/protein complexes.
Expand Down Expand Up @@ -73,29 +123,62 @@ def _featurize_complex(self, mol_pdb, complex_pdb):
raise NotImplementedError('Featurizer is not defined.')


class Featurizer(object):
"""
Abstract class for calculating a set of features for a molecule.
class MolecularFeaturizer(Featurizer):
"""Abstract class for calculating a set of features for a
molecule.
Child classes implement the _featurize method for calculating features
for a single molecule.
The defining feature of a `MolecularFeaturizer` is that it
uses SMILES strings and RDKIT molecule objects to represent
small molecules. All other featurizers which are subclasses of
this class should plan to process input which comes as smiles
strings or RDKIT molecules.
Child classes need to implement the _featurize method for
calculating features for a single molecule.
Note
----
In general, subclasses of this class will require RDKit to be installed.
"""

def featurize(self, mols, verbose=True, log_every_n=1000):
"""
Calculate features for molecules.
def featurize(self, molecules, log_every_n=1000):
"""Calculate features for molecules.
Parameters
----------
mols : iterable
RDKit Mol objects.
molecules: RDKit Mol / SMILES string /iterable
RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
strings.
Returns
-------
A numpy array containing a featurized representation of
`molecules`.
"""
mols = list(mols)
try:
from rdkit import Chem
from rdkit.Chem.rdchem import Mol
except ModuleNotFoundError:
raise ValueError("This class requires RDKit to be installed.")
# Special case handling of single molecule
if isinstance(molecules, str) or isinstance(molecules, Mol):
molecules = [molecules]
else:
# Convert iterables to list
molecutes = list(molecules)
features = []
for i, mol in enumerate(mols):
if mol is not None:
for i, mol in enumerate(molecules):
if i % log_every_n == 0:
logger.info("Featurizing datapoint %i" % i)
try:
# Process only case of SMILES strings.
if isinstance(mol, str):
# mol must be a SMILES string so parse
mol = Chem.MolFromSmiles(mol)
features.append(self._featurize(mol))
else:
except:
logger.warning(
"Failed to featurize datapoint %d. Appending empty array")
features.append(np.array([]))

features = np.asarray(features)
Expand All @@ -112,16 +195,16 @@ def _featurize(self, mol):
"""
raise NotImplementedError('Featurizer is not defined.')

def __call__(self, mols):
def __call__(self, molecules):
"""
Calculate features for molecules.
Parameters
----------
mols : iterable
RDKit Mol objects.
molecules: iterable
An iterable yielding RDKit Mol objects or SMILES strings.
"""
return self.featurize(mols)
return self.featurize(molecules)


class UserDefinedFeaturizer(Featurizer):
Expand Down
56 changes: 40 additions & 16 deletions deepchem/feat/basic.py
@@ -1,18 +1,18 @@
"""
Basic molecular features.
"""
__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "MIT"

from deepchem.feat import Featurizer
import numpy as np
from deepchem.feat.base_classes import MolecularFeaturizer


class MolecularWeight(Featurizer):
"""
Molecular weight.
class MolecularWeight(MolecularFeaturizer):
"""Molecular weight.
Note
----
This class requires RDKit to be installed.
"""
name = ['mw', 'molecular_weight']

def _featurize(self, mol):
"""
Expand All @@ -22,21 +22,37 @@ def _featurize(self, mol):
----------
mol : RDKit Mol
Molecule.
Returns
-------
np.ndarray of length 1 containing the molecular weight.
"""
from rdkit.Chem import Descriptors
try:
from rdkit.Chem import Descriptors
except ModuleNotFoundError:
raise ValueError("This class requires RDKit to be installed.")
wt = Descriptors.ExactMolWt(mol)
wt = [wt]
return wt
return np.asarray(wt)


class RDKitDescriptors(Featurizer):
"""
RDKit descriptors.
class RDKitDescriptors(MolecularFeaturizer):
"""RDKit descriptors.
This class computes a list of chemical descriptors using RDKit.
See http://rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors.
Attributes
----------
descriptors: np.ndarray
1D array of RDKit descriptor names used in this class.
Note
----
This class requires RDKit to be installed.
"""
name = 'descriptors'

# (ytz): This is done to avoid future compatibility issues like inclusion of
# the 3D descriptors or changing the feature size.
Expand Down Expand Up @@ -69,9 +85,12 @@ class RDKitDescriptors(Featurizer):
])

def __init__(self):
try:
from rdkit.Chem import Descriptors
except ModuleNotFoundError:
raise ValueError("This class requires RDKit to be installed.")
self.descriptors = []
self.descList = []
from rdkit.Chem import Descriptors
for descriptor, function in Descriptors.descList:
if descriptor in self.allowedDescriptors:
self.descriptors.append(descriptor)
Expand All @@ -85,8 +104,13 @@ def _featurize(self, mol):
----------
mol : RDKit Mol
Molecule.
Returns
-------
rval: np.ndarray
1D array of RDKit descriptors for `mol`
"""
rval = []
for desc_name, function in self.descList:
rval.append(function(mol))
return rval
return np.asarray(rval)

0 comments on commit 4d2b032

Please sign in to comment.