Skip to content

Commit

Permalink
Merge branch 'deepchem:master' into gbdt-earlystop
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronrockmenezes committed Nov 1, 2023
2 parents 1eb7694 + 56adfb8 commit 921f57e
Show file tree
Hide file tree
Showing 71 changed files with 5,169 additions and 557 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
cp requirements/env.yml env.yml
- name: Install all dependencies using micromamba
uses: mamba-org/provision-with-micromamba@main
uses: mamba-org/setup-micromamba@main
with:
environment-file: env.yml
environment-name: deepchem
Expand Down
6 changes: 5 additions & 1 deletion deepchem/data/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,11 @@ def _featurize_shard(self,
features = [
elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
]
return np.array(features), valid_inds
try:
return np.array(features), valid_inds
except ValueError as e:
logger.warning("Exception message: {}".format(e))
return np.asarray(features, dtype=object), valid_inds


class FASTALoader(DataLoader):
Expand Down
4 changes: 2 additions & 2 deletions deepchem/feat/dft_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
import numpy as np

# dqc dependencies
import dqc
from dqc.system.mol import Mol
from dqc.system.base_system import BaseSystem
from deepchem.utils.dftutils import KSCalc, BaseGrid
from deepchem.utils.dft_utils import parse_moldesc


class DFTSystem():
Expand Down Expand Up @@ -68,7 +68,7 @@ def get_dqc_mol(self, pos_reqgrad: bool = False) -> BaseSystem:
mol
DQC mol object
"""
atomzs, atomposs = dqc.parse_moldesc(self.moldesc)
atomzs, atomposs = parse_moldesc(self.moldesc)
if pos_reqgrad:
atomposs.requires_grad_()
mol = Mol(self.moldesc, self.basis, spin=self.spin, charge=self.charge)
Expand Down
35 changes: 33 additions & 2 deletions deepchem/feat/molecule_featurizers/grover_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
The adaptation is based on https://github.com/tencent-ailab/grover/blob/0421d97a5e1bd1b59d1923e3afd556afbe4ff782/grover/data/molgraph.py
"""
from typing import Optional
from typing import Optional, List
import numpy as np
from deepchem.feat.graph_data import GraphData
from deepchem.feat.molecule_featurizers import RDKitDescriptors
Expand Down Expand Up @@ -117,12 +117,22 @@ def _get_atom_features(self, atom, mol):
def _make_mol_graph(self, mol: RDKitMol) -> GraphData:
from deepchem.feat.molecule_featurizers.dmpnn_featurizer import bond_features
smiles = Chem.MolToSmiles(mol)
n_atoms = mol.GetNumAtoms() # number of atoms
f_atoms = [] # mapping from atom index to atom features
f_bonds = [
] # mapping from bond index to concat(from_atom, bond) features
edge_index = []

n_atoms = mol.GetNumAtoms() # number of atoms
n_bonds = 0 # number of bonds
a2b: List[List[int]] = [
] # mapping from atom index to incoming bond indices
b2a = [
] # mapping from bond index to the index of the atom the bond is coming from
b2revb = [] # mapping from bond index to the index of the reverse bond

for _ in range(n_atoms):
a2b.append([])

for _, atom in enumerate(mol.GetAtoms()):
f_atoms.append(self._get_atom_features(atom, mol))

Expand All @@ -143,9 +153,30 @@ def _make_mol_graph(self, mol: RDKitMol) -> GraphData:

edge_index.extend([[a1, a2], [a2, a1]])

b1 = n_bonds # b1: bond id
b2 = b1 + 1 # b2: reverse bond id
# add mapping between bond b1 and atom a2 (destination atom)
a2b[a2].append(b1) # b1 = a1 --> a2
# add mapping between bond id and atom id (a1)
b2a.append(a1)
# add mapping between bond id and atom a1 (source atom)
a2b[a1].append(b2) # b2 = a2 --> a1
b2a.append(a2)
# update index on bond and reverse bond mappings
b2revb.append(b2)
b2revb.append(b1)
n_bonds += 2

molgraph = GraphData(node_features=np.asarray(f_atoms),
edge_index=np.asarray(edge_index).T,
edge_features=np.asarray(f_bonds),
b1=b1,
b2=b2,
a2b=a2b,
b2a=b2a,
b2revb=b2revb,
n_bonds=n_bonds,
n_atoms=n_atoms,
smiles=smiles)
return molgraph

Expand Down
91 changes: 91 additions & 0 deletions deepchem/feat/vocabulary_builders/grover_vocab.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import logging
import numpy as np
import pandas as pd
from typing import Dict, Optional
from collections import Counter
from rdkit import Chem
Expand Down Expand Up @@ -109,6 +110,49 @@ def build(self, dataset: Dataset, log_every_n: int = 1000) -> None:
self.stoi = self._make_reverse_mapping(self.itos)
logger.info('Completed building of atom vocabulary')

def build_from_csv(self,
                   csv_path: str,
                   smiles_field: str,
                   log_every_n: int = 1000) -> None:
    """Builds atom vocabulary from a csv file of SMILES strings.

    The csv file is read in chunks so that arbitrarily large files can
    be processed without loading them fully into memory.

    Parameters
    ----------
    csv_path: str
        Path to csv file containing smiles string
    smiles_field: str
        Name of column containing smiles string
    log_every_n: int, default 1000
        Logs vocabulary building progress every `log_every_n` steps.
    """
    counter: Dict[str, int] = Counter()
    logger.info('Starting to build atom vocabulary')
    chunksize = 8196
    for chunk_idx, df in enumerate(pd.read_csv(csv_path,
                                               chunksize=chunksize)):
        # Enumerate rows positionally instead of using the DataFrame
        # index: with chunked reads the default index already continues
        # across chunks, so `chunk_idx * chunksize + index` would
        # over-count the molecule number for every chunk after the first
        # (and would break entirely for non-integer indexes).
        for pos, (_, row) in enumerate(df.iterrows()):
            mol_index = chunk_idx * chunksize + pos
            if mol_index % log_every_n == 0:
                logger.info(
                    'Computing contextual property of atoms in molecule %i'
                    % mol_index)
            smiles = row[smiles_field]
            mol = Chem.MolFromSmiles(smiles)
            for atom in mol.GetAtoms():
                v = self.atom_to_vocab(mol, atom)
                counter[v] += 1

    logger.info('Completed enumeration of atom contextual properties.')
    # sort first alphabetically, then by frequency; the second sort is
    # stable, so ties in frequency keep their alphabetical order
    words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0])
    words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True)
    for word, _ in words_and_frequencies:
        # when a maximum size was set, cap the vocabulary at that size
        if len(self.itos) == self.size:
            break
        self.itos.append(word)
    if self.size is None:
        self.size = len(self.itos)
    self.stoi = self._make_reverse_mapping(self.itos)
    logger.info('Completed building of atom vocabulary')

def save(self, fname: str) -> None:
"""Saves a vocabulary in json format
Expand Down Expand Up @@ -310,6 +354,53 @@ def build(self, dataset: Dataset, log_every_n: int = 1000) -> None:
self.stoi = self._make_reverse_mapping(self.itos)
logger.info('Completed building of bond vocabulary')

def build_from_csv(self,
                   csv_path: str,
                   smiles_field: str,
                   log_every_n: int = 1000) -> None:
    """Builds bond vocabulary from a csv file of SMILES strings.

    The csv file is read in chunks so that arbitrarily large files can
    be processed without loading them fully into memory.

    Parameters
    ----------
    csv_path: str
        Path to csv file containing smiles string
    smiles_field: str
        Name of column containing smiles string
    log_every_n: int, default 1000
        Logs vocabulary building progress every `log_every_n` steps.
    """
    counter: Dict[str, int] = Counter()
    logger.info('Starting to build bond vocabulary')
    chunksize = 8196
    for chunk_idx, df in enumerate(pd.read_csv(csv_path,
                                               chunksize=chunksize)):
        # Enumerate rows positionally to get a correct global molecule
        # number. The previous condition `(row_count + i) % log_every_n`
        # added the chunk index `i` instead of the row offset, so the
        # progress log fired at arbitrary rows rather than every
        # `log_every_n` molecules.
        for pos, (_, row) in enumerate(df.iterrows()):
            mol_index = chunk_idx * chunksize + pos
            if mol_index % log_every_n == 0:
                logger.info(
                    'Computing contextual property of bonds in molecule %i'
                    % mol_index)

            smiles = row[smiles_field]

            mol = Chem.MolFromSmiles(smiles)
            for bond in mol.GetBonds():
                v = self.bond_to_vocab(mol, bond)
                counter[v] += 1

    logger.info('Completed enumeration of bond contextual properties.')

    # sort first alphabetically, then by frequency; the second sort is
    # stable, so ties in frequency keep their alphabetical order
    words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0])
    words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True)
    for word, _ in words_and_frequencies:
        # when a maximum size was set, cap the vocabulary at that size
        if len(self.itos) == self.size:
            break
        self.itos.append(word)
    if self.size is None:
        self.size = len(self.itos)
    self.stoi = self._make_reverse_mapping(self.itos)
    logger.info('Completed building of bond vocabulary')

def save(self, fname: str) -> None:
"""Saves a vocabulary in json format
Expand Down
56 changes: 56 additions & 0 deletions deepchem/feat/vocabulary_builders/tests/test_grover_vocab.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import tempfile
import os
import pandas as pd
from rdkit import Chem
import deepchem as dc

Expand Down Expand Up @@ -36,6 +38,30 @@ def testGroverAtomVocabularyBuilder():
assert vocab.size == len(vocab.itos)


def test_grover_atom_vocabulary_build_from_csv(tmpdir):
    # building the atom vocabulary directly from a csv of SMILES strings
    from deepchem.feat.vocabulary_builders.grover_vocab import GroverAtomVocabularyBuilder
    vocab = GroverAtomVocabularyBuilder()
    frame = pd.DataFrame({'X': ['CC(=O)C', 'CCC']})
    csv_path = os.path.join(tmpdir, 'temp.csv')
    frame.to_csv(csv_path)

    vocab.build_from_csv(csv_path, smiles_field='X')
    expected_tokens = [
        '<pad>', '<other>', 'C_C-SINGLE1', 'C_C-SINGLE2',
        'C_C-SINGLE2_O-DOUBLE1', 'O_C-DOUBLE1'
    ]
    assert vocab.itos == expected_tokens
    assert vocab.stoi == {
        token: index for index, token in enumerate(expected_tokens)
    }


def testGroverBondVocabularyBuilder():
from deepchem.feat.vocabulary_builders.grover_vocab import GroverBondVocabularyBuilder
file = tempfile.NamedTemporaryFile()
Expand Down Expand Up @@ -77,6 +103,36 @@ def testGroverBondVocabularyBuilder():
assert vocab.size == len(vocab.itos)


def test_grover_bond_vocabulary_build_from_csv(tmpdir):
    # building the bond vocabulary directly from a csv of SMILES strings
    from deepchem.feat.vocabulary_builders.grover_vocab import GroverBondVocabularyBuilder
    vocab = GroverBondVocabularyBuilder()
    frame = pd.DataFrame({'X': ['CC(=O)C', 'CCC']})
    csv_path = os.path.join(tmpdir, 'temp.csv')
    frame.to_csv(csv_path)

    vocab.build_from_csv(csv_path, smiles_field='X')

    expected_tokens = [
        '<pad>', '<other>',
        '(SINGLE-STEREONONE-NONE)_C-(DOUBLE-STEREONONE-NONE)1_C-(SINGLE-STEREONONE-NONE)1',
        '(SINGLE-STEREONONE-NONE)_C-(SINGLE-STEREONONE-NONE)1',
        '(DOUBLE-STEREONONE-NONE)_C-(SINGLE-STEREONONE-NONE)2'
    ]
    assert vocab.itos == expected_tokens
    assert vocab.stoi == {
        token: index for index, token in enumerate(expected_tokens)
    }


def testGroverAtomVocabTokenizer():
from deepchem.feat.vocabulary_builders.grover_vocab import GroverAtomVocabularyBuilder, GroverAtomVocabTokenizer
file = tempfile.NamedTemporaryFile()
Expand Down
27 changes: 18 additions & 9 deletions deepchem/models/keras_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,8 @@ def fit(self,
Returns
-------
The average loss over the most recent checkpoint interval
float
The average loss over the most recent checkpoint interval
"""
return self.fit_generator(
self.default_generator(dataset,
Expand Down Expand Up @@ -403,7 +404,8 @@ def fit_generator(self,
Returns
-------
The average loss over the most recent checkpoint interval
float
The average loss over the most recent checkpoint interval
"""
if not isinstance(callbacks, SequenceCollection):
callbacks = [callbacks]
Expand Down Expand Up @@ -557,7 +559,8 @@ def fit_on_batch(self,
Returns
-------
the loss on the batch
float
the loss on the batch
"""
self._ensure_built()
dataset = NumpyDataset(X, y, w)
Expand Down Expand Up @@ -738,7 +741,10 @@ def predict_on_generator(
If specified, all outputs of this type will be retrieved
from the model. If output_types is specified, outputs must
be None.
Returns:
Returns
-------
OneOrMany[np.ndarray]
a NumPy array of the model produces a single output, or a list of arrays
if it produces multiple outputs
"""
Expand Down Expand Up @@ -768,8 +774,9 @@ def predict_on_batch(
Returns
-------
a NumPy array of the model produces a single output, or a list of arrays
if it produces multiple outputs
OneOrMany[np.ndarray]
a NumPy array of the model produces a single output, or a list of arrays
if it produces multiple outputs
"""
dataset = NumpyDataset(X=X, y=None)
return self.predict(dataset, transformers, outputs)
Expand Down Expand Up @@ -797,9 +804,11 @@ def predict_uncertainty_on_batch(
Returns
-------
for each output, a tuple (y_pred, y_std) where y_pred is the predicted
value of the output, and each element of y_std estimates the standard
deviation of the corresponding element of y_pred
OneOrMany[Tuple[y_pred, y_std]]
y_pred: np.ndarray
predicted value of the output
y_std: np.ndarray
standard deviation of the corresponding element of y_pred
"""
dataset = NumpyDataset(X=X, y=None)
return self.predict_uncertainty(dataset, masks)
Expand Down
Loading

0 comments on commit 921f57e

Please sign in to comment.