Skip to content

Commit

Permalink
RDS-867: Use float32 for feature arrays in dgan, and add more debug logs
Browse files Browse the repository at this point in the history
GitOrigin-RevId: 20c8f6d85b3df408549e6b64227a95bd36730b93
  • Loading branch information
kboyd committed Mar 14, 2024
1 parent 5c9534a commit 5be7b76
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 13 deletions.
54 changes: 48 additions & 6 deletions src/gretel_synthetics/timeseries_dgan/dgan.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
synthetic_attributes, synthetic_features = model.generate_numpy(1000)
"""


from __future__ import annotations

import abc
Expand Down Expand Up @@ -215,6 +214,16 @@ def train_numpy(
passing *output params at initialization or because train_* was
called previously.
"""
logging.info(
f"features shape={features.shape}, dtype={features.dtype}",
extra={"user_log": True},
)
if attributes is not None:
logging.info(
f"attributes shape={attributes.shape}, dtype={attributes.dtype}",
extra={"user_log": True},
)

if attributes is not None:
if attributes.shape[0] != features.shape[0]:
raise InternalError(
Expand Down Expand Up @@ -263,6 +272,9 @@ def train_numpy(
feature_types.append(OutputType.DISCRETE)

if not self.is_built:
logger.info(
"Determining outputs metadata from input data", extra={"user_log": True}
)
attribute_outputs, feature_outputs = create_outputs_from_data(
attributes,
features,
Expand All @@ -272,7 +284,7 @@ def train_numpy(
apply_feature_scaling=self.config.apply_feature_scaling,
apply_example_scaling=self.config.apply_example_scaling,
)

logger.info("Building DGAN networks", extra={"user_log": True})
self._build(
attribute_outputs,
feature_outputs,
Expand All @@ -290,6 +302,10 @@ def train_numpy(
# category). To ensure we have none of these problematic nans, we
# will interpolate to replace nans with actual float values, but if
# we have too many nans in an example, interpolation is unreliable.
logger.info(
f"Checking for nans in the {len(continuous_features_ind)} numeric columns",
extra={"user_log": True},
)

# Find valid examples based on minimal number of nans.
valid_examples = validation_check(
Expand All @@ -301,12 +317,17 @@ def train_numpy(
if attributes is not None:
attributes = attributes[valid_examples]

logger.info(
"Applying linear interpolations for nans (does not mean nans are present)",
extra={"user_log": True},
)
# Apply linear interpolations to replace nans for continuous
# features:
features[:, :, continuous_features_ind] = nan_linear_interpolation(
features[:, :, continuous_features_ind].astype("float")
)

logger.info("Creating encoded array of features", extra={"user_log": True})
if self.additional_attribute_outputs:
(
internal_features,
Expand All @@ -326,24 +347,40 @@ def train_numpy(
np.full((internal_features.shape[0], 1), np.nan)
)

logger.info("Creating encoded array of attributes", extra={"user_log": True})
internal_attributes = transform(
attributes,
self.attribute_outputs,
variable_dim_index=1,
num_examples=internal_features.shape[0],
)

logger.info(
f"internal_features shape={internal_features.shape}, dtype={internal_features.dtype}",
extra={"user_log": True},
)
logger.info(
f"internal_additional_attributes shape={internal_additional_attributes.shape}, dtype={internal_additional_attributes.dtype}",
extra={"user_log": True},
)
logger.info(
f"internal_attributes shape={internal_attributes.shape}, dtype={internal_attributes.dtype}",
extra={"user_log": True},
)

if self.attribute_outputs and np.any(np.isnan(internal_attributes)):
raise InternalError(
f"NaN found in internal attributes. {NAN_ERROR_MESSAGE}"
)

logger.info("Creating TensorDataset", extra={"user_log": True})
dataset = TensorDataset(
torch.Tensor(internal_attributes),
torch.Tensor(internal_additional_attributes),
torch.Tensor(internal_features),
)

logger.info("Calling _train()", extra={"user_log": True})
self._train(dataset, progress_callback=progress_callback)

def train_dataframe(
Expand Down Expand Up @@ -471,8 +508,12 @@ def train_dataframe(
f"df_style param must be an enum value DfStyle ('wide' or 'long'), received '{df_style}'"
)

logger.info(
"Converting from DataFrame to numpy arrays", extra={"user_log": True}
)
attributes, features = self.data_frame_converter.convert(df)

logger.info("Calling train_numpy()", extra={"user_log": True})
self.train_numpy(
attributes=attributes,
features=features,
Expand Down Expand Up @@ -530,9 +571,11 @@ def generate_numpy(
# In [4]: np.array([1,2,3,4]) == None
# Out[4]: array([False, False, False, False])
internal_data = tuple(
np.concatenate(d, axis=0)
if not (np.array(d) == None).any() # noqa
else None
(
np.concatenate(d, axis=0)
if not (np.array(d) == None).any() # noqa
else None
)
for d in zip(*internal_data_list)
)

Expand Down Expand Up @@ -1651,7 +1694,6 @@ def validation_check(
consecutive_nans_max: int = 5,
consecutive_nans_ratio_cutoff: float = 0.05,
) -> np.array:

"""Checks if continuous features of examples are valid.
Returns a 1-d numpy array of booleans with shape (#examples) indicating
Expand Down
9 changes: 4 additions & 5 deletions src/gretel_synthetics/timeseries_dgan/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ def transform(
additional_attribute_parts = []
parts = []
if original_data is None:
return np.full((num_examples, 1), np.nan)
return np.full((num_examples, 1), np.nan, dtype=np.float32)

for index, output in enumerate(outputs):
# NOTE: isinstance(output, DiscreteOutput) does not work consistently
Expand Down Expand Up @@ -663,11 +663,11 @@ def transform(

if additional_attribute_parts:
return (
np.concatenate(parts, axis=variable_dim_index, dtype="float"),
np.concatenate(additional_attribute_parts, axis=1, dtype="float"),
np.concatenate(parts, axis=variable_dim_index, dtype=np.float32),
np.concatenate(additional_attribute_parts, axis=1, dtype=np.float32),
)
else:
return np.concatenate(parts, axis=variable_dim_index, dtype="float")
return np.concatenate(parts, axis=variable_dim_index, dtype=np.float32)


def inverse_transform(
Expand Down Expand Up @@ -776,7 +776,6 @@ def inverse_transform(


def create_additional_attribute_outputs(feature_outputs: List[Output]) -> List[Output]:

"""Create outputs for midpoint and half ranges.
Returns list of additional attribute metadata. For each feature with
Expand Down
8 changes: 6 additions & 2 deletions tests/timeseries_dgan/test_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def test_transform_and_inverse_attributes(normalization):
assert transformed.shape == (n, 9)

inversed = inverse_transform(transformed, outputs, 1)
np.testing.assert_allclose(inversed, attributes)
np.testing.assert_allclose(inversed, attributes, rtol=1e-04)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -357,4 +357,8 @@ def test_transform_and_inverse_features(normalization):
assert additional_attributes.shape == (100, 4)

inversed = inverse_transform(transformed, outputs, 2, additional_attributes)
np.testing.assert_allclose(inversed, features)
# TODO: 1e-04 seems too lax a tolerance for float32, but values very
# close to 0.0 fail the check at 1e-05, so we use 1e-04 for now to
# reduce flakiness. There may be something we can do in the calculations
# to reduce the error; alternatively, a small absolute tolerance (atol)
# may suit near-zero values better than loosening rtol.
np.testing.assert_allclose(inversed, features, rtol=1e-04)

0 comments on commit 5be7b76

Please sign in to comment.