RDS-867: Use float32 for feature arrays in dgan, and add more debug logs

GitOrigin-RevId: 20c8f6d85b3df408549e6b64227a95bd36730b93
gretelai · Mar 14, 2024 · 5be7b76 · 5be7b76
1 parent 5c9534a
commit 5be7b76
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 13 deletions.
diff --git a/src/gretel_synthetics/timeseries_dgan/dgan.py b/src/gretel_synthetics/timeseries_dgan/dgan.py
@@ -44,7 +44,6 @@
    synthetic_attributes, synthetic_features = model.generate_numpy(1000)
 """
 
-
 from __future__ import annotations
 
 import abc
@@ -215,6 +214,16 @@ def train_numpy(
                 passing *output params at initialization or because train_* was
                 called previously.
         """
+        logging.info(
+            f"features shape={features.shape}, dtype={features.dtype}",
+            extra={"user_log": True},
+        )
+        if attributes is not None:
+            logging.info(
+                f"attributes shape={attributes.shape}, dtype={attributes.dtype}",
+                extra={"user_log": True},
+            )
+
         if attributes is not None:
             if attributes.shape[0] != features.shape[0]:
                 raise InternalError(
@@ -263,6 +272,9 @@ def train_numpy(
                     feature_types.append(OutputType.DISCRETE)
 
         if not self.is_built:
+            logger.info(
+                "Determining outputs metadata from input data", extra={"user_log": True}
+            )
             attribute_outputs, feature_outputs = create_outputs_from_data(
                 attributes,
                 features,
@@ -272,7 +284,7 @@ def train_numpy(
                 apply_feature_scaling=self.config.apply_feature_scaling,
                 apply_example_scaling=self.config.apply_example_scaling,
             )
-
+            logger.info("Building DGAN networks", extra={"user_log": True})
             self._build(
                 attribute_outputs,
                 feature_outputs,
@@ -290,6 +302,10 @@ def train_numpy(
             # category). To ensure we have none of these problematic nans, we
             # will interpolate to replace nans with actual float values, but if
             # we have too many nans in an example interpolation is unreliable.
+            logger.info(
+                f"Checking for nans in the {len(continuous_features_ind)} numeric columns",
+                extra={"user_log": True},
+            )
 
             # Find valid examples based on minimal number of nans.
             valid_examples = validation_check(
@@ -301,12 +317,17 @@ def train_numpy(
             if attributes is not None:
                 attributes = attributes[valid_examples]
 
+            logger.info(
+                "Applying linear interpolations for nans (does not mean nans are present)",
+                extra={"user_log": True},
+            )
             # Apply linear interpolations to replace nans for continuous
             # features:
             features[:, :, continuous_features_ind] = nan_linear_interpolation(
                 features[:, :, continuous_features_ind].astype("float")
             )
 
+        logger.info("Creating encoded array of features", extra={"user_log": True})
         if self.additional_attribute_outputs:
             (
                 internal_features,
@@ -326,24 +347,40 @@ def train_numpy(
                 np.full((internal_features.shape[0], 1), np.nan)
             )
 
+        logger.info("Creating encoded array of attributes", extra={"user_log": True})
         internal_attributes = transform(
             attributes,
             self.attribute_outputs,
             variable_dim_index=1,
             num_examples=internal_features.shape[0],
         )
 
+        logger.info(
+            f"internal_features shape={internal_features.shape}, dtype={internal_features.dtype}",
+            extra={"user_log": True},
+        )
+        logger.info(
+            f"internal_additional_attributes shape={internal_additional_attributes.shape}, dtype={internal_additional_attributes.dtype}",
+            extra={"user_log": True},
+        )
+        logger.info(
+            f"internal_attributes shape={internal_attributes.shape}, dtype={internal_attributes.dtype}",
+            extra={"user_log": True},
+        )
+
         if self.attribute_outputs and np.any(np.isnan(internal_attributes)):
             raise InternalError(
                 f"NaN found in internal attributes. {NAN_ERROR_MESSAGE}"
             )
 
+        logger.info("Creating TensorDataset", extra={"user_log": True})
         dataset = TensorDataset(
             torch.Tensor(internal_attributes),
             torch.Tensor(internal_additional_attributes),
             torch.Tensor(internal_features),
         )
 
+        logger.info("Calling _train()", extra={"user_log": True})
         self._train(dataset, progress_callback=progress_callback)
 
     def train_dataframe(
@@ -471,8 +508,12 @@ def train_dataframe(
                     f"df_style param must be an enum value DfStyle ('wide' or 'long'), received '{df_style}'"
                 )
 
+        logger.info(
+            "Converting from DataFrame to numpy arrays", extra={"user_log": True}
+        )
         attributes, features = self.data_frame_converter.convert(df)
 
+        logger.info("Calling train_numpy()", extra={"user_log": True})
         self.train_numpy(
             attributes=attributes,
             features=features,
@@ -530,9 +571,11 @@ def generate_numpy(
             # In [4]: np.array([1,2,3,4]) == None
             # Out[4]: array([False, False, False, False])
             internal_data = tuple(
-                np.concatenate(d, axis=0)
-                if not (np.array(d) == None).any()  # noqa
-                else None
+                (
+                    np.concatenate(d, axis=0)
+                    if not (np.array(d) == None).any()  # noqa
+                    else None
+                )
                 for d in zip(*internal_data_list)
             )
 
@@ -1651,7 +1694,6 @@ def validation_check(
     consecutive_nans_max: int = 5,
     consecutive_nans_ratio_cutoff: float = 0.05,
 ) -> np.array:
-
     """Checks if continuous features of examples are valid.
 
     Returns a 1-d numpy array of booleans with shape (#examples) indicating

diff --git a/src/gretel_synthetics/timeseries_dgan/transformations.py b/src/gretel_synthetics/timeseries_dgan/transformations.py
@@ -583,7 +583,7 @@ def transform(
     additional_attribute_parts = []
     parts = []
     if original_data is None:
-        return np.full((num_examples, 1), np.nan)
+        return np.full((num_examples, 1), np.nan, dtype=np.float32)
 
     for index, output in enumerate(outputs):
         # NOTE: isinstance(output, DiscreteOutput) does not work consistently
@@ -663,11 +663,11 @@ def transform(
 
     if additional_attribute_parts:
         return (
-            np.concatenate(parts, axis=variable_dim_index, dtype="float"),
-            np.concatenate(additional_attribute_parts, axis=1, dtype="float"),
+            np.concatenate(parts, axis=variable_dim_index, dtype=np.float32),
+            np.concatenate(additional_attribute_parts, axis=1, dtype=np.float32),
         )
     else:
-        return np.concatenate(parts, axis=variable_dim_index, dtype="float")
+        return np.concatenate(parts, axis=variable_dim_index, dtype=np.float32)
 
 
 def inverse_transform(
@@ -776,7 +776,6 @@ def inverse_transform(
 
 
 def create_additional_attribute_outputs(feature_outputs: List[Output]) -> List[Output]:
-
     """Create outputs for midpoint and half ranges.
 
     Returns list of additional attribute metadata. For each feature with

diff --git a/tests/timeseries_dgan/test_transformations.py b/tests/timeseries_dgan/test_transformations.py
@@ -326,7 +326,7 @@ def test_transform_and_inverse_attributes(normalization):
     assert transformed.shape == (n, 9)
 
     inversed = inverse_transform(transformed, outputs, 1)
-    np.testing.assert_allclose(inversed, attributes)
+    np.testing.assert_allclose(inversed, attributes, rtol=1e-04)
 
 
 @pytest.mark.parametrize(
@@ -357,4 +357,8 @@ def test_transform_and_inverse_features(normalization):
     assert additional_attributes.shape == (100, 4)
 
     inversed = inverse_transform(transformed, outputs, 2, additional_attributes)
-    np.testing.assert_allclose(inversed, features)
+    # TODO: rtol=1e-04 seems too lax a tolerance for float32, but values very
+    # close to 0.0 fail the check at rtol=1e-05, so use 1e-04 for now to
+    # reduce flakiness. There may be something we can do in the calculations
+    # to reduce the error.
+    np.testing.assert_allclose(inversed, features, rtol=1e-04)