Commit efd56f5: yapf fixes

arunppsg committed Apr 24, 2022
1 parent 505b693 commit efd56f5
Showing 6 changed files with 94 additions and 66 deletions.
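
Note: these are mechanical reformatting changes produced by the yapf formatter; no behavior changes. As a minimal sketch of reproducing this kind of output (the 'google' style name and the snippet are illustrative assumptions, not taken from this commit), yapf can be driven from Python:

```python
# Sketch: reformat a code snippet with yapf's programmatic API.
# FormatCode returns a (formatted_source, changed) tuple.
from yapf.yapflib.yapf_api import FormatCode

snippet = ("y = np.hstack(\n"
           "    [np.reshape(np.array(df[task].values), (n_samples, 1))"
           " for task in tasks])\n")
formatted, changed = FormatCode(snippet, style_config='google')  # style assumed
print(changed)    # True when yapf rewrote the input
print(formatted)  # comparable to the post-commit form in the first hunk below
```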
61 changes: 35 additions & 26 deletions deepchem/data/data_loader.py
@@ -47,8 +47,9 @@ def _convert_df_to_numpy(df: pd.DataFrame,
   n_samples = df.shape[0]
   n_tasks = len(tasks)
 
-  y = np.hstack(
-      [np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
+  y = np.hstack([
+      np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks
+  ])
   w = np.ones((n_samples, n_tasks))
   if y.dtype.kind in ['O', 'U']:
     missing = (y == '')
@@ -507,8 +508,8 @@ def _featurize_shard(self,
     shard[feature_fields] = shard[feature_fields].apply(pd.to_numeric)
     X_shard = shard[feature_fields].to_numpy()
     time2 = time.time()
-    logger.info(
-        "TIMING: user specified processing took %0.3f s" % (time2 - time1))
+    logger.info("TIMING: user specified processing took %0.3f s" %
+                (time2 - time1))
     return (X_shard, np.ones(len(X_shard), dtype=bool))


@@ -796,7 +797,9 @@ def create_dataset(self,
       elif extension == ".zip":
         zip_dir = tempfile.mkdtemp()
         unzip_file(input_file, zip_dir)
-        zip_files = [os.path.join(zip_dir, name) for name in os.listdir(zip_dir)]
+        zip_files = [
+            os.path.join(zip_dir, name) for name in os.listdir(zip_dir)
+        ]
         for zip_file in zip_files:
           _, extension = os.path.splitext(zip_file)
           extension = extension.lower()
@@ -848,11 +851,10 @@ def _get_shards(self, input_files: List[str],
     Iterator[pd.DataFrame]
       Iterator over shards
     """
-    return load_sdf_files(
-        input_files=input_files,
-        clean_mols=self.sanitize,
-        tasks=self.tasks,
-        shard_size=shard_size)
+    return load_sdf_files(input_files=input_files,
+                          clean_mols=self.sanitize,
+                          tasks=self.tasks,
+                          shard_size=shard_size)
 
   def _featurize_shard(self,
                        shard: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
@@ -876,7 +878,12 @@ def _featurize_shard(self,
     """
     pos_cols = ['pos_x', 'pos_y', 'pos_z']
     if set(pos_cols).issubset(shard.columns):
-      features = [elt for elt in self.featurizer(shard[self.mol_field], pos_x=shard['pos_x'], pos_y=shard['pos_y'], pos_z=shard['pos_z'])]
+      features = [
+          elt for elt in self.featurizer(shard[self.mol_field],
+                                         pos_x=shard['pos_x'],
+                                         pos_y=shard['pos_y'],
+                                         pos_z=shard['pos_z'])
+      ]
     else:
       features = [elt for elt in self.featurizer(shard[self.mol_field])]
     valid_inds = np.array(
@@ -952,11 +959,12 @@ def __init__(self,
     self.user_specified_features = None
 
     # Handle special featurizer cases
-    if isinstance(featurizer, UserDefinedFeaturizer):  # User defined featurizer
+    if isinstance(featurizer,
+                  UserDefinedFeaturizer):  # User defined featurizer
       self.user_specified_features = featurizer.feature_fields
     elif featurizer is None:  # Default featurizer
-      featurizer = OneHotFeaturizer(
-          charset=["A", "C", "T", "G"], max_length=None)
+      featurizer = OneHotFeaturizer(charset=["A", "C", "T", "G"],
+                                    max_length=None)
 
     # Set self.featurizer
     self.featurizer = featurizer
@@ -1160,16 +1168,17 @@ def create_dataset(self,
 
     if in_memory:
       if data_dir is None:
-        return NumpyDataset(
-            load_image_files(image_files), y=labels, w=weights, ids=image_files)
+        return NumpyDataset(load_image_files(image_files),
+                            y=labels,
+                            w=weights,
+                            ids=image_files)
       else:
-        dataset = DiskDataset.from_numpy(
-            load_image_files(image_files),
-            y=labels,
-            w=weights,
-            ids=image_files,
-            tasks=self.tasks,
-            data_dir=data_dir)
+        dataset = DiskDataset.from_numpy(load_image_files(image_files),
+                                         y=labels,
+                                         w=weights,
+                                         ids=image_files,
+                                         tasks=self.tasks,
+                                         data_dir=data_dir)
         if shard_size is not None:
           dataset.reshard(shard_size)
         return dataset
@@ -1313,8 +1322,8 @@ def _get_shards(self, inputs: List,
 
   # FIXME: Signature of "_featurize_shard" incompatible with supertype "DataLoader"
   def _featurize_shard(  # type: ignore[override]
-      self, shard: List, global_index: int
-  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+      self, shard: List, global_index: int) -> Tuple[np.ndarray, np.ndarray,
+                                                     np.ndarray, np.ndarray]:
     """Featurizes a shard of an input data.
 
     Parameters
@@ -1340,7 +1349,7 @@ def _featurize_shard(  # type: ignore[override]
     n_tasks = len(self.tasks)
     for i, entry in enumerate(shard):
       if not isinstance(entry, tuple):
-        entry = (entry,)
+        entry = (entry, )
      if len(entry) > 4:
        raise ValueError(
            "Entry is malformed and must be of length 1-4 containing featurization_input"
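
For context, a minimal usage sketch of the SDF loading path touched above; the file name, task name, and featurizer choice are hypothetical:

```python
# Sketch: SDFLoader shards input via load_sdf_files and featurizes each shard.
import deepchem as dc

loader = dc.data.SDFLoader(tasks=["logP"],
                           featurizer=dc.feat.MolGraphConvFeaturizer(),
                           sanitize=True)
dataset = loader.create_dataset("molecules.sdf", shard_size=1024)
print(dataset.X.shape, dataset.y.shape)
```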
32 changes: 17 additions & 15 deletions deepchem/feat/molecule_featurizers/mol_graph_conv_featurizer.py
@@ -26,9 +26,10 @@
 from deepchem.utils.rdkit_utils import compute_pairwise_ring_info
 
 
-def _construct_atom_feature(
-    atom: RDKitAtom, h_bond_infos: List[Tuple[int, str]], use_chirality: bool,
-    use_partial_charge: bool) -> np.ndarray:
+def _construct_atom_feature(atom: RDKitAtom, h_bond_infos: List[Tuple[int,
+                                                                      str]],
+                            use_chirality: bool,
+                            use_partial_charge: bool) -> np.ndarray:
   """Construct an atom feature from a RDKit atom object.
 
   Parameters
@@ -58,8 +59,8 @@ def _construct_atom_feature(
   degree = get_atom_total_degree_one_hot(atom)
   total_num_Hs = get_atom_total_num_Hs_one_hot(atom)
   atom_feat = np.concatenate([
-      atom_type, formal_charge, hybridization, acceptor_donor, aromatic, degree,
-      total_num_Hs
+      atom_type, formal_charge, hybridization, acceptor_donor, aromatic,
+      degree, total_num_Hs
   ])
 
   if use_chirality:
@@ -230,7 +231,7 @@ def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
     # load_sdf_files returns pos as strings but user can also specify
     # numpy arrays for atom coordinates
     pos = []
-    if 'pos_x' in kwargs and 'pos_y' in kwargs and 'pos_z' in kwargs:
+    if 'pos_x' in kwargs and 'pos_y' in kwargs and 'pos_z' in kwargs:
       if isinstance(kwargs['pos_x'], str):
         pos_x = eval(kwargs['pos_x'])
       elif isinstance(kwargs['pos_x'], np.ndarray):
@@ -246,11 +247,10 @@ def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
 
       for x, y, z in zip(pos_x, pos_y, pos_z):
         pos.append([x, y, z])
-    return GraphData(
-        node_features=atom_features,
-        edge_index=np.asarray([src, dest], dtype=int),
-        edge_features=bond_features,
-        pos=np.asarray(pos))
+    return GraphData(node_features=atom_features,
+                     edge_index=np.asarray([src, dest], dtype=int),
+                     edge_features=bond_features,
+                     pos=np.asarray(pos))
 
 
 class PagtnMolGraphFeaturizer(MolecularFeaturizer):
@@ -348,11 +348,13 @@ def _pagtn_atom_featurizer(self, atom: RDKitAtom) -> np.ndarray:
       numpy vector of atom features.
     """
     atom_type = get_atom_type_one_hot(atom, self.SYMBOLS, False)
-    formal_charge = get_atom_formal_charge_one_hot(
-        atom, include_unknown_set=False)
+    formal_charge = get_atom_formal_charge_one_hot(atom,
+                                                   include_unknown_set=False)
     degree = get_atom_total_degree_one_hot(atom, list(range(11)), False)
-    exp_valence = get_atom_explicit_valence_one_hot(atom, list(range(7)), False)
-    imp_valence = get_atom_implicit_valence_one_hot(atom, list(range(6)), False)
+    exp_valence = get_atom_explicit_valence_one_hot(atom, list(range(7)),
+                                                    False)
+    imp_valence = get_atom_implicit_valence_one_hot(atom, list(range(6)),
+                                                    False)
     armoticity = get_atom_is_in_aromatic_one_hot(atom)
     atom_feat = np.concatenate([
         atom_type, formal_charge, degree, exp_valence, imp_valence, armoticity
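
A usage sketch of the two featurizers whose helpers are reformatted above; the SMILES input is an illustrative assumption:

```python
# Sketch: featurize a SMILES string with each featurizer touched in this file.
import deepchem as dc

mol_feat = dc.feat.MolGraphConvFeaturizer(use_edges=True)
pagtn_feat = dc.feat.PagtnMolGraphFeaturizer(max_length=5)
graphs = mol_feat.featurize(["CCO"])
pagtn_graphs = pagtn_feat.featurize(["CCO"])
print(graphs[0].node_features.shape, pagtn_graphs[0].node_features.shape)
```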
10 changes: 7 additions & 3 deletions deepchem/feat/tests/test_mol_graph_conv_featurizer.py
@@ -74,18 +74,22 @@ def test_featurizer_with_use_partial_charge(self):
 
   def test_featurizer_with_pos_kwargs(self):
     # Test featurizer with atom 3-D coordinates as kwargs
-    smiles = ["C1=CC=CN=C1", "CC"]
+    smiles = ["C1=CC=CN=C1", "CC"]
     pos_x = [np.random.randn(6), np.random.randn(2)]
     pos_y, pos_z = pos_x, pos_x
     featurizer = MolGraphConvFeaturizer()
-    graph_feat = featurizer.featurize(smiles, pos_x=pos_x, pos_y=pos_y, pos_z=pos_z)
+    graph_feat = featurizer.featurize(smiles,
+                                      pos_x=pos_x,
+                                      pos_y=pos_y,
+                                      pos_z=pos_z)
 
     assert len(graph_feat) == 2
     assert graph_feat[0].num_nodes == 6
     assert graph_feat[0].pos.shape == (6, 3)
-    assert graph_feat[1].num_nodes == 2
+    assert graph_feat[1].num_nodes == 2
     assert graph_feat[1].pos.shape == (2, 3)
 
 
 class TestPagtnMolGraphConvFeaturizer(unittest.TestCase):
 
   def test_default_featurizer(self):
52 changes: 32 additions & 20 deletions deepchem/utils/data_utils.py
@@ -275,7 +275,9 @@ def load_sdf_files(input_files: List[str],
 
       if shard_size is not None and len(df_rows) == shard_size:
         if has_csv:
-          mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol', 'pos_x', 'pos_y', 'pos_z'))
+          mol_df = pd.DataFrame(df_rows,
+                                columns=('mol_id', 'smiles', 'mol', 'pos_x',
+                                         'pos_y', 'pos_z'))
           raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
           yield pd.concat([mol_df, raw_df], axis=1, join='inner')
         else:
@@ -284,21 +286,25 @@
           # tasks above, they occur after `tasks` here.
           # FIXME Ideally, we should use something like a dictionary here to keep it independent
           # of column ordering.
-          mol_df = pd.DataFrame(
-              df_rows, columns=('mol_id', 'smiles', 'mol') + tuple(tasks) + ('pos_x', 'pos_y', 'pos_z'))
+          mol_df = pd.DataFrame(df_rows,
+                                columns=('mol_id', 'smiles', 'mol') +
+                                tuple(tasks) + ('pos_x', 'pos_y', 'pos_z'))
           yield mol_df
         # Reset aggregator
         df_rows = []
 
     # Handle final leftovers for this file
     if len(df_rows) > 0:
       if has_csv:
-        mol_df = pd.DataFrame(df_rows, columns=('mol_id', 'smiles', 'mol', 'pos_x', 'pos_y', 'pos_z'))
+        mol_df = pd.DataFrame(df_rows,
+                              columns=('mol_id', 'smiles', 'mol', 'pos_x',
+                                       'pos_y', 'pos_z'))
         raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
         yield pd.concat([mol_df, raw_df], axis=1, join='inner')
       else:
-        mol_df = pd.DataFrame(
-            df_rows, columns=('mol_id', 'smiles', 'mol') + tuple(tasks) + ('pos_x', 'pos_y', 'pos_z'))
+        mol_df = pd.DataFrame(df_rows,
+                              columns=('mol_id', 'smiles', 'mol') +
+                              tuple(tasks) + ('pos_x', 'pos_y', 'pos_z'))
         yield mol_df
       df_rows = []
@@ -327,15 +333,16 @@ def load_csv_files(input_files: List[str],
     else:
       logger.info("About to start loading CSV from %s" % input_file)
       for df in pd.read_csv(input_file, chunksize=shard_size):
-        logger.info(
-            "Loading shard %d of size %s." % (shard_num, str(shard_size)))
+        logger.info("Loading shard %d of size %s." %
+                    (shard_num, str(shard_size)))
         df = df.replace(np.nan, str(""), regex=True)
         shard_num += 1
         yield df
 
 
-def load_json_files(input_files: List[str],
-                    shard_size: Optional[int] = None) -> Iterator[pd.DataFrame]:
+def load_json_files(
+    input_files: List[str],
+    shard_size: Optional[int] = None) -> Iterator[pd.DataFrame]:
   """Load data as pandas dataframe.
 
   Parameters
@@ -361,10 +368,12 @@ def load_json_files(input_files: List[str],
       yield pd.read_json(input_file, orient='records', lines=True)
     else:
       logger.info("About to start loading json from %s." % input_file)
-      for df in pd.read_json(
-          input_file, orient='records', chunksize=shard_size, lines=True):
-        logger.info(
-            "Loading shard %d of size %s." % (shard_num, str(shard_size)))
+      for df in pd.read_json(input_file,
+                             orient='records',
+                             chunksize=shard_size,
+                             lines=True):
+        logger.info("Loading shard %d of size %s." %
+                    (shard_num, str(shard_size)))
         df = df.replace(np.nan, str(""), regex=True)
         shard_num += 1
         yield df
@@ -519,9 +528,11 @@ def load_from_disk(filename: str) -> Any:
     raise ValueError("Unrecognized filetype for %s" % filename)
 
 
-def load_dataset_from_disk(save_dir: str) -> Tuple[bool, Optional[Tuple[
-    "dc.data.DiskDataset", "dc.data.DiskDataset", "dc.data.DiskDataset"]], List[
-        "dc.trans.Transformer"]]:
+def load_dataset_from_disk(
+    save_dir: str
+) -> Tuple[bool, Optional[Tuple["dc.data.DiskDataset", "dc.data.DiskDataset",
+                                "dc.data.DiskDataset"]],
+           List["dc.trans.Transformer"]]:
   """Loads MoleculeNet train/valid/test/transformers from disk.
 
   Expects that data was saved using `save_dataset_to_disk` below. Expects the
@@ -571,9 +582,10 @@ def load_dataset_from_disk(save_dir: str) -> Tuple[bool, Optional[Tuple[
   return loaded, all_dataset, transformers
 
 
-def save_dataset_to_disk(
-    save_dir: str, train: "dc.data.DiskDataset", valid: "dc.data.DiskDataset",
-    test: "dc.data.DiskDataset", transformers: List["dc.trans.Transformer"]):
+def save_dataset_to_disk(save_dir: str, train: "dc.data.DiskDataset",
+                         valid: "dc.data.DiskDataset",
+                         test: "dc.data.DiskDataset",
+                         transformers: List["dc.trans.Transformer"]):
   """Utility used by MoleculeNet to save train/valid/test datasets.
 
   This utility function saves a train/valid/test split of a dataset along
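
A round-trip sketch of the MoleculeNet helpers reformatted above; the tiny random datasets, the /tmp directory, and the empty transformer list are assumptions:

```python
# Sketch: save a train/valid/test split to disk and load it back.
import os
import numpy as np
import deepchem as dc
from deepchem.utils.data_utils import (load_dataset_from_disk,
                                       save_dataset_to_disk)


def tiny_dataset(seed: int) -> "dc.data.DiskDataset":
  rng = np.random.default_rng(seed)
  return dc.data.DiskDataset.from_numpy(rng.random((4, 3)), rng.random((4, 1)))


train, valid, test = tiny_dataset(0), tiny_dataset(1), tiny_dataset(2)
os.makedirs("/tmp/dc_split", exist_ok=True)
save_dataset_to_disk("/tmp/dc_split", train, valid, test, transformers=[])
loaded, all_dataset, transformers = load_dataset_from_disk("/tmp/dc_split")
assert loaded  # True when the train/valid/test directories were all found
```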
1 change: 1 addition & 0 deletions deepchem/utils/test/test_data_utils.py
@@ -1,6 +1,7 @@
 import unittest
 from deepchem.utils.data_utils import load_sdf_files
 
+
 class TestFileLoading(unittest.TestCase):
 
   def test_load_sdf_files(self):
4 changes: 2 additions & 2 deletions deepchem/utils/test/test_genomics_utils.py
@@ -40,8 +40,8 @@ def test_encode_fasta_sequence(self):
     # Test it's possible to load a sequence with an aribrary alphabet from a fasta file.
     fname = os.path.join(self.current_dir, "./assets/example.fasta")
 
-    encoded_seqs = dc.utils.genomics_utils.encode_bio_sequence(
-        fname, letters=LETTERS)
+    encoded_seqs = dc.utils.genomics_utils.encode_bio_sequence(fname,
+                                                               letters=LETTERS)
     expected = np.expand_dims(
         np.array([
             [[1, 0], [0, 1], [0, 0]],
