In [1]:
from chemprop.featurizers.molgraph.molecule import SimpleMoleculeMolGraphFeaturizer

Example molecule

In [2]:
from rdkit import Chem

# Ethane, written as a SMILES string, is the molecule featurized throughout this notebook.
ethane_smiles = "CC"
mol_to_featurize = Chem.MolFromSmiles(ethane_smiles)

### MolGraph Featurizers

A `MolGraph` represents the graph featurization of a molecule. It is made of atom features (`V`), bond features (`E`), and a mapping between atoms and bonds (`edge_index` and `rev_edge_index`). A `MolGraph` is created by calling a `SimpleMoleculeMolGraphFeaturizer` on a molecule.

In [3]:
# Construct the featurizer with no arguments, then call it on the molecule.
featurizer = SimpleMoleculeMolGraphFeaturizer()
default_molgraph = featurizer(mol_to_featurize)

# A bare name as the last expression makes the notebook display the MolGraph.
default_molgraph

MolGraph(V=array([[0.     , 0.     , 0.     , 0.     , 0.     , 1.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        1.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        1.     , 0.     , 1.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 1.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 1.     , 0.     , 0.     , 0.     ,
        0.     , 0.12011],
       [0.     , 0.     , 0.     , 0.     , 0.     , 1.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.  

The atom and bond featurizers used to create the atom and bond features are customizable. See the atom featurizer and bond featurizer notebooks for details.

In [4]:
from chemprop.featurizers import MultiHotAtomFeaturizer, MultiHotBondFeaturizer

# Build the custom atom and bond featurizers first ...
atom_featurizer = MultiHotAtomFeaturizer.organic()
bond_featurizer = MultiHotBondFeaturizer(stereos=[0, 1, 2, 3, 4])

# ... then hand both to the molgraph featurizer in place of its defaults.
custom_featurizers = {
    "atom_featurizer": atom_featurizer,
    "bond_featurizer": bond_featurizer,
}
featurizer = SimpleMoleculeMolGraphFeaturizer(**custom_featurizers)

custom_molgraph = featurizer(mol_to_featurize)
custom_molgraph

MolGraph(V=array([[0.     , 0.     , 1.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 1.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 1.     , 0.     , 1.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 1.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 1.     , 0.     ,
        0.     , 0.12011],
       [0.     , 0.     , 1.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 1.     , 0.     , 0.     , 0.     ,
        0.     , 0.     , 0.     , 1.     , 0.     , 1.     , 0.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 1.     ,
        0.     , 0.     , 0.     , 0.     , 0.     , 1.     , 0.     ,
        0.     , 0.12011]], dtype=float32), E=array([[0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
   

### Extra atom and bond features

If your datapoints have extra atom or bond features, the molgraph featurizer needs to know the length of those extra features when it is created. This ensures that molecules without heavy atoms (molecular hydrogen) are featurized correctly and that the bond feature array has the correct shape. The extra features themselves are then passed to the featurizer when a molecule is featurized.

In [5]:
# Example datapoints with extra atom and bond features. See the datapoints notebook for more details.
import numpy as np
from chemprop.data import MoleculeDatapoint

# Use a seeded generator so the example values (and every output derived from
# them) are identical on each Restart Kernel -> Run All.
SEED = 0
rng = np.random.default_rng(SEED)

y = rng.random((1, 1))

# Sizes of the example arrays: one row per atom / per bond of the example
# molecule, one column per extra feature.
n_atoms = 2
n_bonds = 1
n_extra_atom_features = 3
n_extra_bond_features = 4

# Uniform values in [0, 1) stand in for real extra features.
extra_atom_features = rng.random((n_atoms, n_extra_atom_features))
extra_bond_features = rng.random((n_bonds, n_extra_bond_features))
# Attach the extra atom (V_f) and bond (E_f) feature arrays to the example datapoint.
datapoint = MoleculeDatapoint(
    mol_to_featurize,
    y,
    V_f=extra_atom_features,
    E_f=extra_bond_features,
)

In [6]:
# The featurizer is told the width of the extra features when it is created ...
featurizer = SimpleMoleculeMolGraphFeaturizer(
    extra_atom_fdim=n_extra_atom_features,
    extra_bond_fdim=n_extra_bond_features,
)

# ... and receives the extra feature arrays themselves at call time.
molgraph_with_extras = featurizer(
    mol_to_featurize,
    atom_features_extra=datapoint.V_f,
    bond_features_extra=datapoint.E_f,
)
molgraph_with_extras

MolGraph(V=array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.12011   , 0.167

For most use cases, the dataset automatically handles the featurization, including passing extra atom and bond features. See the datasets notebook for details about using custom featurizers.