Skip to content

Commit

Permalink
Merge 139792e into d33cc47
Browse files Browse the repository at this point in the history
  • Loading branch information
rbharath committed Sep 1, 2020
2 parents d33cc47 + 139792e commit 4709fba
Show file tree
Hide file tree
Showing 9 changed files with 610 additions and 147 deletions.
239 changes: 187 additions & 52 deletions deepchem/feat/graph_features.py
Expand Up @@ -5,6 +5,8 @@
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.data import DiskDataset
import logging
from typing import Optional, List
from deepchem.utils.typing import RDKitMol, RDKitAtom


def one_of_k_encoding(x, allowable_set):
Expand Down Expand Up @@ -398,12 +400,75 @@ def bond_features(bond, use_chirality=False):
]
if use_chirality:
bond_feats = bond_feats + one_of_k_encoding_unk(
str(bond.GetStereo()), possible_bond_stereo)
str(bond.GetStereo()), GraphConvCoonstants.possible_bond_stereo)
return np.array(bond_feats)


def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
graph_distance=True):
def max_pair_distance_pairs(mol: RDKitMol,
max_pair_distance: Optional[int]) -> np.ndarray:
"""Helper method which finds atom pairs within max_pair_distance graph distance.
This helper method is used to find atoms which are within max_pair_distance
graph_distance of one another. This is done by using the fact that the
powers of an adjacency matrix encode path connectivity information. In
particular, if `adj` is the adjacency matrix, then `adj**k` has a nonzero
value at `(i, j)` if and only if there exists a path of graph distance `k`
between `i` and `j`. To find all atoms within `max_pair_distance` of each
other, we can compute the adjacency matrix powers `[adj, adj**2,
...,adj**max_pair_distance]` and find pairs which are nonzero in any of
these matrices. Since adjacency matrices and their powers are positive
numbers, this is simply the nonzero elements of `adj + adj**2 + ... +
adj**max_pair_distance`.
Parameters
----------
mol: rdkit.Chem.rdchem.Mol
RDKit molecules
max_pair_distance: Optional[int], (default None)
This value can be a positive integer or None. This
parameter determines the maximum graph distance at which pair
features are computed. For example, if `max_pair_distance==2`,
then pair features are computed only for atoms at most graph
distance 2 apart. If `max_pair_distance` is `None`, all pairs are
considered (effectively infinite `max_pair_distance`)
Returns
-------
np.ndarray
Of shape `(2, num_pairs)` where `num_pairs` is the total number of pairs
within `max_pair_distance` of one another.
"""
from rdkit import Chem
from rdkit.Chem import rdmolops
N = len(mol.GetAtoms())
if (max_pair_distance is None or max_pair_distance >= N):
max_distance = N
elif max_pair_distance is not None and max_pair_distance <= 0:
raise ValueError(
"max_pair_distance must either be a positive integer or None")
elif max_pair_distance is not None:
max_distance = max_pair_distance
adj = rdmolops.GetAdjacencyMatrix(mol)
# Handle edge case of self-pairs (i, i)
sum_adj = np.eye(N)
for i in range(max_distance):
# Increment by 1 since we don't want 0-indexing
power = i + 1
sum_adj += np.linalg.matrix_power(adj, power)
nonzero_locs = np.where(sum_adj != 0)
num_pairs = len(nonzero_locs[0])
# This creates a matrix of shape (2, num_pairs)
pair_edges = np.reshape(np.array(list(zip(nonzero_locs))), (2, num_pairs))
return pair_edges


def pair_features(mol: RDKitMol,
bond_features_map: dict,
bond_adj_list: List,
bt_len: int = 6,
graph_distance: bool = True,
max_pair_distance: Optional[int] = None) -> np.ndarray:
"""Helper method used to compute atom pair feature vectors.
Many different featurization methods compute atom pair features
Expand All @@ -415,16 +480,26 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
----------
mol: RDKit Mol
Molecule to compute features on.
edge_list: list
List of edges to consider
canon_adj_list: list of lists
`canon_adj_list[i]` is a list of the atom indices that atom `i` shares a
list. This list is symmetrical so if `j in canon_adj_list[i]` then `i in
canon_adj_list[j]`.
bond_features_map: dict
Dictionary that maps pairs of atom ids (say `(2, 3)` for a bond between
atoms 2 and 3) to the features for the bond between them.
bond_adj_list: list of lists
`bond_adj_list[i]` is a list of the atom indices that atom `i` shares a
bond with . This list is symmetrical so if `j in bond_adj_list[i]` then `i
in bond_adj_list[j]`.
bt_len: int, optional (default 6)
The number of different bond types to consider.
graph_distance: bool, optional (default True)
If true, use graph distance between molecules. Else use euclidean distance.
If true, use graph distance between molecules. Else use euclidean
distance. The specified `mol` must have a conformer. Atomic
positions will be retrieved by calling `mol.getConformer(0)`.
max_pair_distance: Optional[int], (default None)
This value can be a positive integer or None. This
parameter determines the maximum graph distance at which pair
features are computed. For example, if `max_pair_distance==2`,
then pair features are computed only for atoms at most graph
distance 2 apart. If `max_pair_distance` is `None`, all pairs are
considered (effectively infinite `max_pair_distance`)
Note
----
Expand All @@ -433,32 +508,65 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
Returns
-------
features: np.ndarray
Of shape `(N, N, bt_len + max_distance + 1)`. This is the array of pairwise
features for all atom pairs.
Of shape `(N_edges, bt_len + max_distance + 1)`. This is the array
of pairwise features for all atom pairs, where N_edges is the
number of edges within max_pair_distance of one another in this
molecules.
pair_edges: np.ndarray
Of shape `(2, num_pairs)` where `num_pairs` is the total number of
pairs within `max_pair_distance` of one another.
"""
if graph_distance:
max_distance = 7
else:
max_distance = 1
N = mol.GetNumAtoms()
features = np.zeros((N, N, bt_len + max_distance + 1))
pair_edges = max_pair_distance_pairs(mol, max_pair_distance)
num_pairs = pair_edges.shape[1]
N_edges = pair_edges.shape[1]
features = np.zeros((N_edges, bt_len + max_distance + 1))
# Get mapping
mapping = {}
for n in range(N_edges):
a1, a2 = pair_edges[:, n]
mapping[(int(a1), int(a2))] = n
num_atoms = mol.GetNumAtoms()
rings = mol.GetRingInfo().AtomRings()
for a1 in range(num_atoms):
for a2 in canon_adj_list[a1]:
for a2 in bond_adj_list[a1]:
# first `bt_len` features are bond features(if applicable)
features[a1, a2, :bt_len] = np.asarray(
edge_list[tuple(sorted((a1, a2)))], dtype=float)
if (int(a1), int(a2)) not in mapping:
raise ValueError(
"Malformed molecule with bonds not in specified graph distance.")
else:
n = mapping[(int(a1), int(a2))]
features[n, :bt_len] = np.asarray(
bond_features_map[tuple(sorted((a1, a2)))], dtype=float)
for ring in rings:
if a1 in ring:
# `bt_len`-th feature is if the pair of atoms are in the same ring
features[a1, ring, bt_len] = 1
features[a1, a1, bt_len] = 0.
for a2 in ring:
if (int(a1), int(a2)) not in mapping:
# For ring pairs outside max pairs distance continue
continue
else:
n = mapping[(int(a1), int(a2))]
# `bt_len`-th feature is if the pair of atoms are in the same ring
if a2 == a1:
features[n, bt_len] = 0
else:
features[n, bt_len] = 1
# graph distance between two atoms
if graph_distance:
# distance is a matrix of 1-hot encoded distances for all atoms
distance = find_distance(
a1, num_atoms, canon_adj_list, max_distance=max_distance)
features[a1, :, bt_len + 1:] = distance
a1, num_atoms, bond_adj_list, max_distance=max_distance)
for a2 in range(num_atoms):
if (int(a1), int(a2)) not in mapping:
# For ring pairs outside max pairs distance continue
continue
else:
n = mapping[(int(a1), int(a2))]
features[n, bt_len + 1:] = distance[a2]
# Euclidean distance between atoms
if not graph_distance:
coords = np.zeros((N, 3))
Expand All @@ -469,10 +577,11 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
np.stack([coords] * N, axis=1) - \
np.stack([coords] * N, axis=0)), axis=2))

return features
return features, pair_edges


def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
def find_distance(a1: RDKitAtom, num_atoms: int, bond_adj_list,
max_distance=7) -> np.ndarray:
"""Computes distances from provided atom.
Parameters
Expand All @@ -481,10 +590,10 @@ def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
The source atom to compute distances from.
num_atoms: int
The total number of atoms.
canon_adj_list: list of lists
`canon_adj_list[i]` is a list of the atom indices that atom `i` shares a
list. This list is symmetrical so if `j in canon_adj_list[i]` then `i in
canon_adj_list[j]`.
bond_adj_list: list of lists
`bond_adj_list[i]` is a list of the atom indices that atom `i` shares a
bond with. This list is symmetrical so if `j in bond_adj_list[i]` then `i in
bond_adj_list[j]`.
max_distance: int, optional (default 7)
The max distance to search.
Expand All @@ -498,7 +607,7 @@ def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
distance = np.zeros((num_atoms, max_distance))
radial = 0
# atoms `radial` bonds away from `a1`
adj_list = set(canon_adj_list[a1])
adj_list = set(bond_adj_list[a1])
# atoms less than `radial` bonds away
all_list = set([a1])
while radial < max_distance:
Expand All @@ -507,7 +616,7 @@ def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
# find atoms `radial`+1 bonds away
next_adj = set()
for adj in adj_list:
next_adj.update(canon_adj_list[adj])
next_adj.update(bond_adj_list[adj])
adj_list = next_adj - all_list
radial = radial + 1
return distance
Expand Down Expand Up @@ -647,6 +756,14 @@ class WeaveFeaturizer(MolecularFeaturizer):
descriptors for each pair of atoms. These extra descriptors may provide for
additional descriptive power but at the cost of a larger featurized dataset.
Examples
--------
>>> import deepchem as dc
>>> mols = ["C", "CCC"]
>>> featurizer = dc.feat.WeaveFeaturizer()
>>> X = featurizer.featurize(mols)
References
----------
.. [1] Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond
Expand All @@ -660,18 +777,31 @@ class WeaveFeaturizer(MolecularFeaturizer):

name = ['weave_mol']

def __init__(self, graph_distance=True, explicit_H=False,
use_chirality=False):
"""
def __init__(self,
graph_distance: bool = True,
explicit_H: bool = False,
use_chirality: bool = False,
max_pair_distance: Optional[int] = None):
"""Initialize this featurizer with set parameters.
Parameters
----------
graph_distance: bool, optional
If true, use graph distance. Otherwise, use Euclidean
distance.
explicit_H: bool, optional
graph_distance: bool, (default True)
If True, use graph distance for distance features. Otherwise, use
Euclidean distance. Note that this means that molecules that this
featurizer is invoked on must have valid conformer information if this
option is set.
explicit_H: bool, (default False)
If true, model hydrogens in the molecule.
use_chirality: bool, optional
use_chirality: bool, (default False)
If true, use chiral information in the featurization
max_pair_distance: Optional[int], (default None)
This value can be a positive integer or None. This
parameter determines the maximum graph distance at which pair
features are computed. For example, if `max_pair_distance==2`,
then pair features are computed only for atoms at most graph
distance 2 apart. If `max_pair_distance` is `None`, all pairs are
considered (effectively infinite `max_pair_distance`)
"""
# Distance is either graph distance(True) or Euclidean distance(False,
# only support datasets providing Cartesian coordinates)
Expand All @@ -682,9 +812,13 @@ def __init__(self, graph_distance=True, explicit_H=False,
self.explicit_H = explicit_H
# If uses use_chirality
self.use_chirality = use_chirality
if isinstance(max_pair_distance, int) and max_pair_distance <= 0:
raise ValueError(
"max_pair_distance must either be a positive integer or None")
self.max_pair_distance = max_pair_distance
if self.use_chirality:
self.bt_len = int(
GraphConvConstants.bond_fdim_base) + len(possible_bond_stereo)
self.bt_len = int(GraphConvConstants.bond_fdim_base) + len(
GraphConvConstants.possible_bond_stereo)
else:
self.bt_len = int(GraphConvConstants.bond_fdim_base)

Expand All @@ -704,27 +838,28 @@ def _featurize(self, mol):
nodes = np.vstack(nodes)

# Get bond lists
edge_list = {}
bond_features_map = {}
for b in mol.GetBonds():
edge_list[tuple(sorted([b.GetBeginAtomIdx(),
b.GetEndAtomIdx()]))] = bond_features(
b, use_chirality=self.use_chirality)
bond_features_map[tuple(sorted([b.GetBeginAtomIdx(),
b.GetEndAtomIdx()]))] = bond_features(
b, use_chirality=self.use_chirality)

# Get canonical adjacency list
canon_adj_list = [[] for mol_id in range(len(nodes))]
for edge in edge_list.keys():
canon_adj_list[edge[0]].append(edge[1])
canon_adj_list[edge[1]].append(edge[0])
bond_adj_list = [[] for mol_id in range(len(nodes))]
for bond in bond_features_map.keys():
bond_adj_list[bond[0]].append(bond[1])
bond_adj_list[bond[1]].append(bond[0])

# Calculate pair features
pairs = pair_features(
pairs, pair_edges = pair_features(
mol,
edge_list,
canon_adj_list,
bond_features_map,
bond_adj_list,
bt_len=self.bt_len,
graph_distance=self.graph_distance)
graph_distance=self.graph_distance,
max_pair_distance=self.max_pair_distance)

return WeaveMol(nodes, pairs)
return WeaveMol(nodes, pairs, pair_edges)


class AtomicConvFeaturizer(ComplexNeighborListFragmentAtomicCoordinates):
Expand Down
17 changes: 10 additions & 7 deletions deepchem/feat/mol_graphs.py
@@ -1,10 +1,6 @@
"""
Data Structures used to represented molecules for convolutions.
"""
__author__ = "Han Altae-Tran and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import csv
import random
import numpy as np
Expand Down Expand Up @@ -375,16 +371,23 @@ class WeaveMol(object):
"""Molecular featurization object for weave convolutions.
These objects are produced by WeaveFeaturizer, and feed into
WeaveModel. The underlying implementation is inspired by:
WeaveModel. The underlying implementation is inspired by [1]_.
Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints." Journal of computer-aided molecular design 30.8 (2016): 595-608.
References
----------
.. [1] Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints." Journal of computer-aided molecular design 30.8 (2016): 595-608.
"""

def __init__(self, nodes, pairs):
def __init__(self, nodes, pairs, pair_edges):
self.nodes = nodes
self.pairs = pairs
self.num_atoms = self.nodes.shape[0]
self.n_features = self.nodes.shape[1]
self.pair_edges = pair_edges

def get_pair_edges(self):
return self.pair_edges

def get_pair_features(self):
return self.pairs
Expand Down

0 comments on commit 4709fba

Please sign in to comment.