Skip to content

Commit

Permalink
Merge pull request #2033 from peastman/load
Browse files Browse the repository at this point in the history
Optimizations to data loading
  • Loading branch information
Bharath Ramsundar committed Jul 21, 2020
2 parents 0c5b2b8 + 624dfb1 commit 138522e
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 46 deletions.
42 changes: 14 additions & 28 deletions deepchem/data/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,27 +44,13 @@ def _convert_df_to_numpy(df, tasks):
n_samples = df.shape[0]
n_tasks = len(tasks)

time1 = time.time()
y = np.hstack(
[np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
time2 = time.time()

w = np.ones((n_samples, n_tasks))
missing = np.zeros_like(y).astype(int)
feature_shape = None

for ind in range(n_samples):
for task in range(n_tasks):
if y[ind, task] == "":
missing[ind, task] = 1

# ids = df[id_field].values
# Set missing data to have weight zero
for ind in range(n_samples):
for task in range(n_tasks):
if missing[ind, task]:
y[ind, task] = 0.
w[ind, task] = 0.
if y.dtype.kind in ['O', 'U']:
missing = (y == '')
y[missing] = 0
w[missing] = 0

return y.astype(float), w.astype(float)

Expand Down Expand Up @@ -198,7 +184,7 @@ class DataLoader(object):
of `DataLoader` is specialized to handle one type of input data so
you will have to pick the loader class suitable for your input data
type.
Note that it isn't necessary to use a data loader to process input
data. You can directly use `Featurizer` objects to featurize
provided input into numpy arrays, but note that this calculation
Expand Down Expand Up @@ -352,7 +338,7 @@ def _get_shards(self, inputs, shard_size):
If you chose to override `create_dataset()` directly you don't
need to override this helper method.
Parameters
----------
inputs: list
Expand All @@ -375,7 +361,7 @@ def _featurize_shard(self, shard):

class CSVLoader(DataLoader):
"""
Creates `Dataset` objects from input CSV files.
Creates `Dataset` objects from input CSV files.
This class provides conveniences to load data from CSV files.
It's possible to directly featurize data from CSV files using
Expand All @@ -397,7 +383,7 @@ def __init__(self,
tasks: list[str]
List of task names
smiles_field: str, optional
Name of field that holds smiles string
Name of field that holds smiles string
id_field: str, optional
Name of field that holds sample identifier
featurizer: dc.feat.Featurizer, optional
Expand Down Expand Up @@ -459,7 +445,7 @@ def _featurize_shard(self, shard):

class JsonLoader(DataLoader):
"""
Creates `Dataset` objects from input json files.
Creates `Dataset` objects from input json files.
This class provides conveniences to load data from json files.
It's possible to directly featurize data from json files using
Expand All @@ -481,7 +467,7 @@ class JsonLoader(DataLoader):
>> loader = JsonLoader(tasks=['task'], feature_field='sample_data',
label_field='task', weight_field='weight', id_field='sample_name')
>> dataset = loader.create_dataset('file.json')
"""

def __init__(self,
Expand Down Expand Up @@ -614,7 +600,7 @@ def _featurize_df(self,
"""Featurize individual samples in dataframe.
Helper that given a featurizer that operates on individual
samples, computes & adds features for that sample to the
samples, computes & adds features for that sample to the
features dataframe.
Parameters
Expand Down Expand Up @@ -652,7 +638,7 @@ def _featurize_df(self,

class SDFLoader(DataLoader):
"""
Creates `Dataset` from SDF input files.
Creates `Dataset` from SDF input files.
This class provides conveniences to load data from SDF files.
"""
Expand Down Expand Up @@ -727,7 +713,7 @@ def create_dataset(self,
Name of directory where featurized data is stored.
shard_size: int, optional
For now, this argument is ignored and each FASTA file gets its
own shard.
own shard.
Returns
-------
Expand Down Expand Up @@ -935,7 +921,7 @@ class InMemoryLoader(DataLoader):
4
Here's an example with both datapoints and labels
>>> import deepchem as dc
>>> smiles = ["C", "CC", "CCC", "CCCC"]
>>> labels = [1, 0, 1, 0]
Expand Down
25 changes: 7 additions & 18 deletions deepchem/feat/coulomb_matrices.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class BPSymmetryFunctionInput(MolecularFeaturizer):
"""Calculate Symmetry Function for each atom in the molecules
This method is described in [1]_
This method is described in [1]_
References
----------
Expand Down Expand Up @@ -168,16 +168,8 @@ def coulomb_matrix(self, mol):
rval = []
for conf in mol.GetConformers():
d = self.get_interatomic_distances(conf)
m = np.zeros((n_atoms, n_atoms))
for i in range(mol.GetNumAtoms()):
for j in range(mol.GetNumAtoms()):
if i == j:
m[i, j] = 0.5 * z[i]**2.4
elif i < j:
m[i, j] = (z[i] * z[j]) / d[i, j]
m[j, i] = m[i, j]
else:
continue
m = np.outer(z, z) / d
m[range(n_atoms), range(n_atoms)] = 0.5 * np.array(z)**2.4
if self.randomize:
for random_m in self.randomize_coulomb_matrix(m):
random_m = pad_array(random_m, self.max_atoms)
Expand Down Expand Up @@ -236,12 +228,9 @@ def get_interatomic_distances(conf):
] # Convert AtomPositions from Angstrom to bohr (atomic units)
d = np.zeros((n_atoms, n_atoms), dtype=float)
for i in range(n_atoms):
for j in range(n_atoms):
if i < j:
d[i, j] = coords[i].Distance(coords[j])
d[j, i] = d[i, j]
else:
continue
for j in range(i):
d[i, j] = coords[i].Distance(coords[j])
d[j, i] = d[i, j]
return d


Expand Down Expand Up @@ -319,7 +308,7 @@ def _featurize(self, mol):
"""
Calculate eigenvalues of Coulomb matrix for molecules. Eigenvalues
are returned sorted by absolute value in descending order and padded
by max_atoms.
by max_atoms.
Parameters
----------
Expand Down

0 comments on commit 138522e

Please sign in to comment.