Merge pull request #2033 from peastman/load

Optimizations to data loading
deepchem · Jul 21, 2020 · 138522e · 138522e
2 parents 0c5b2b8 + 624dfb1
commit 138522e
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 46 deletions.
diff --git a/deepchem/data/data_loader.py b/deepchem/data/data_loader.py
@@ -44,27 +44,13 @@ def _convert_df_to_numpy(df, tasks):
   n_samples = df.shape[0]
   n_tasks = len(tasks)
 
-  time1 = time.time()
   y = np.hstack(
       [np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
-  time2 = time.time()
-
   w = np.ones((n_samples, n_tasks))
-  missing = np.zeros_like(y).astype(int)
-  feature_shape = None
-
-  for ind in range(n_samples):
-    for task in range(n_tasks):
-      if y[ind, task] == "":
-        missing[ind, task] = 1
-
-  # ids = df[id_field].values
-  # Set missing data to have weight zero
-  for ind in range(n_samples):
-    for task in range(n_tasks):
-      if missing[ind, task]:
-        y[ind, task] = 0.
-        w[ind, task] = 0.
+  if y.dtype.kind in ['O', 'U']:
+    missing = (y == '')
+    y[missing] = 0
+    w[missing] = 0
 
   return y.astype(float), w.astype(float)
 
@@ -198,7 +184,7 @@ class DataLoader(object):
   of `DataLoader` is specialized to handle one type of input data so
   you will have to pick the loader class suitable for your input data
   type.
-  
+
   Note that it isn't necessary to use a data loader to process input
   data. You can directly use `Featurizer` objects to featurize
   provided input into numpy arrays, but note that this calculation
@@ -352,7 +338,7 @@ def _get_shards(self, inputs, shard_size):
 
     If you chose to override `create_dataset()` directly you don't
     need to override this helper method.
-    
+
     Parameters
     ----------
     inputs: list
@@ -375,7 +361,7 @@ def _featurize_shard(self, shard):
 
 class CSVLoader(DataLoader):
   """
-  Creates `Dataset` objects from input CSF files. 
+  Creates `Dataset` objects from input CSF files.
 
   This class provides conveniences to load data from CSV files.
   It's possible to directly featurize data from CSV files using
@@ -397,7 +383,7 @@ def __init__(self,
     tasks: list[str]
       List of task names
     smiles_field: str, optional
-      Name of field that holds smiles string 
+      Name of field that holds smiles string
     id_field: str, optional
       Name of field that holds sample identifier
     featurizer: dc.feat.Featurizer, optional
@@ -459,7 +445,7 @@ def _featurize_shard(self, shard):
 
 class JsonLoader(DataLoader):
   """
-  Creates `Dataset` objects from input json files. 
+  Creates `Dataset` objects from input json files.
 
   This class provides conveniences to load data from json files.
   It's possible to directly featurize data from json files using
@@ -481,7 +467,7 @@ class JsonLoader(DataLoader):
   >> loader = JsonLoader(tasks=['task'], feature_field='sample_data',
       label_field='task', weight_field='weight', id_field='sample_name')
   >> dataset = loader.create_dataset('file.json')
-  
+
   """
 
   def __init__(self,
@@ -614,7 +600,7 @@ def _featurize_df(self,
     """Featurize individual samples in dataframe.
 
     Helper that given a featurizer that operates on individual
-    samples, computes & adds features for that sample to the 
+    samples, computes & adds features for that sample to the
     features dataframe.
 
     Parameters
@@ -652,7 +638,7 @@ def _featurize_df(self,
 
 class SDFLoader(DataLoader):
   """
-  Creates `Dataset` from SDF input files. 
+  Creates `Dataset` from SDF input files.
 
   This class provides conveniences to load data from SDF files.
   """
@@ -727,7 +713,7 @@ def create_dataset(self,
       Name of directory where featurized data is stored.
     shard_size: int, optional
       For now, this argument is ignored and each FASTA file gets its
-      own shard. 
+      own shard.
 
     Returns
     -------
@@ -935,7 +921,7 @@ class InMemoryLoader(DataLoader):
   4
 
   Here's an example with both datapoints and labels
-  
+
   >>> import deepchem as dc
   >>> smiles = ["C", "CC", "CCC", "CCCC"]
   >>> labels = [1, 0, 1, 0]

diff --git a/deepchem/feat/coulomb_matrices.py b/deepchem/feat/coulomb_matrices.py
@@ -13,7 +13,7 @@
 class BPSymmetryFunctionInput(MolecularFeaturizer):
   """Calculate Symmetry Function for each atom in the molecules
 
-  This method is described in [1]_ 
+  This method is described in [1]_
 
   References
   ----------
@@ -168,16 +168,8 @@ def coulomb_matrix(self, mol):
     rval = []
     for conf in mol.GetConformers():
       d = self.get_interatomic_distances(conf)
-      m = np.zeros((n_atoms, n_atoms))
-      for i in range(mol.GetNumAtoms()):
-        for j in range(mol.GetNumAtoms()):
-          if i == j:
-            m[i, j] = 0.5 * z[i]**2.4
-          elif i < j:
-            m[i, j] = (z[i] * z[j]) / d[i, j]
-            m[j, i] = m[i, j]
-          else:
-            continue
+      m = np.outer(z, z) / d
+      m[range(n_atoms), range(n_atoms)] = 0.5 * np.array(z)**2.4
       if self.randomize:
         for random_m in self.randomize_coulomb_matrix(m):
           random_m = pad_array(random_m, self.max_atoms)
@@ -236,12 +228,9 @@ def get_interatomic_distances(conf):
     ]  # Convert AtomPositions from Angstrom to bohr (atomic units)
     d = np.zeros((n_atoms, n_atoms), dtype=float)
     for i in range(n_atoms):
-      for j in range(n_atoms):
-        if i < j:
-          d[i, j] = coords[i].Distance(coords[j])
-          d[j, i] = d[i, j]
-        else:
-          continue
+      for j in range(i):
+        d[i, j] = coords[i].Distance(coords[j])
+        d[j, i] = d[i, j]
     return d
 
 
@@ -319,7 +308,7 @@ def _featurize(self, mol):
     """
     Calculate eigenvalues of Coulomb matrix for molecules. Eigenvalues
     are returned sorted by absolute value in descending order and padded
-    by max_atoms. 
+    by max_atoms.
 
     Parameters
     ----------