Marjan/rds 464
* Apply the validation check and interpolation before transformation

* remove nans when scaling continuous features

* removed the nan replacement with 0 for continuous features

* remove nans when scaling

* added minor updates to the docstring

* docstring updates

* monogretel check style pass

* updated the docstring (variable name change)

* added tests for NaNs in continuous features

* corrected the nan_linear_interpolation function; added updates based on the PR reviews

* Updates based on the PR comments

* corrected a wrong assertion

* style check added

* minor function name change; updated docstrings

* function name change

* style check added

GitOrigin-RevId: 177156df1036edc20ce47f421cd664a957c45b93
Marjan-emd committed Dec 2, 2022
1 parent eebaf95 commit a1acefc
Showing 3 changed files with 300 additions and 15 deletions.
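Before the diffs, a minimal sketch of the preprocessing order this commit introduces in train_numpy (the toy shapes and data below are illustrative assumptions, not part of the diff): continuous features are validated first, invalid examples are dropped, and the remaining NaNs are linearly interpolated before transform is applied.

import numpy as np

from gretel_synthetics.timeseries_dgan.dgan import (
    nan_linear_interpolation,
    validation_check,
)

# Toy batch: 4 examples, 20 time steps, 2 continuous features.
features = np.random.rand(4, 20, 2)
features[0, 5, 0] = np.nan    # fixable: one isolated NaN (5% < 10% cutoff)
features[1, 2:9, 1] = np.nan  # invalid: 35% NaNs and a run of 7 consecutive NaNs

valid = validation_check(features)  # -> [True, False, True, True]
features = features[valid]          # drop invalid examples entirely
features = nan_linear_interpolation(features)  # fill remaining NaNs
assert not np.isnan(features).any()  # safe to hand off to transform(...)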
140 changes: 134 additions & 6 deletions src/gretel_synthetics/timeseries_dgan/dgan.py
@@ -240,15 +240,31 @@ def train_numpy(
                feature_outputs,
            )

        continuous_features_ind = [
            ind
            for ind, val in enumerate(self.feature_outputs)
            if "ContinuousOutput" in str(val.__class__)
        ]

        valid_examples = validation_check(
            features[:, :, continuous_features_ind].astype("float")
        )

        # Only using valid examples for the entire dataset.
        features = features[valid_examples]

        # Apply linear interpolation for continuous features:
        features[:, :, continuous_features_ind] = nan_linear_interpolation(
            features[:, :, continuous_features_ind].astype("float")
        )

        if attributes is not None:
            attributes = attributes[valid_examples]

        if self.additional_attribute_outputs:
            (
                internal_features,
                internal_additional_attributes,
            ) = transform(features, self.feature_outputs, variable_dim_index=2)

            if np.any(np.isnan(internal_features)):
                raise ValueError(f"NaN found in internal features. {NAN_ERROR_MESSAGE}")

            if np.any(np.isnan(internal_additional_attributes)):
                raise ValueError(
                    f"NaN found in internal additional attributes. {NAN_ERROR_MESSAGE}"

@@ -262,9 +278,6 @@
                np.full((internal_features.shape[0], 1), np.nan)
            )

-        if np.any(np.isnan(internal_features)):
-            raise ValueError(f"NaN found in internal features. {NAN_ERROR_MESSAGE}")

        internal_attributes = transform(
            attributes,
            self.attribute_outputs,
@@ -1498,3 +1511,118 @@ def _state_dict(self) -> Dict:
    "WideDataFrameConverter": _WideDataFrameConverter,
    "LongDataFrameConverter": _LongDataFrameConverter,
}


def find_max_consecutive_nans(array: np.ndarray) -> int:
    """Returns the maximum number of consecutive NaNs in an array.

    Args:
        array: 1-d numpy array of a time series for one example.

    Returns:
        max_cons_nan: The maximum number of consecutive NaNs in the time series array.
    """
    # NaN run lengths are the index differences between consecutive non-NaN
    # values (with sentinels at both ends), minus one.
    max_cons_nan = np.max(
        np.diff(np.concatenate(([-1], np.where(~np.isnan(array))[0], [len(array)]))) - 1
    )
    return max_cons_nan
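A quick illustration of the index-difference trick (the example data is ours, not from the commit): with sentinels at -1 and len(array), the gaps between consecutive non-NaN indices, minus one, are exactly the NaN run lengths.

import numpy as np

a = np.array([np.nan, 1.0, np.nan, np.nan, np.nan, 2.0, np.nan])
# Non-NaN indices: [1, 5]; with sentinels: [-1, 1, 5, 7].
# np.diff(...) - 1 -> [1, 3, 1], so the longest NaN run has length 3.
assert find_max_consecutive_nans(a) == 3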


def validation_check(
    array: np.ndarray,
    invalid_examples_ratio_cutoff: float = 0.5,
    nans_ratio_cutoff: float = 0.1,
    consecutive_nans_max: int = 5,
    consecutive_nans_ratio_cutoff: float = 0.05,
) -> np.ndarray:
    """Checks if the continuous features of examples are valid.

    Returns a 1-d numpy array of booleans with shape (#examples) indicating
    valid examples.

    Examples with continuous features fall into 3 categories: good, valid
    (fixable), and invalid (non-fixable).
    - "Good" examples have no NaNs.
    - "Valid" examples have a low percentage of NaNs and a below-threshold
      number of consecutive NaNs.
    - "Invalid" examples are the rest; they are marked False in the returned
      array and later omitted from training. If there are too many of them,
      an error is raised.

    Args:
        array: 3-d numpy array of continuous features with shape
            (#examples, max_sequence_length, #continuous features).
        invalid_examples_ratio_cutoff: Error out if the ratio of invalid
            examples in the dataset is higher than this value.
        nans_ratio_cutoff: If the percentage of NaNs for any continuous
            feature in an example is greater than this value, the example is
            invalid.
        consecutive_nans_max: If the maximum number of consecutive NaNs in a
            continuous feature is greater than this number, then that example
            is invalid.
        consecutive_nans_ratio_cutoff: If the maximum number of consecutive
            NaNs in a continuous feature is greater than this ratio times the
            length of the example (number of samples), then the example is
            invalid.

    Returns:
        valid_examples: 1-d numpy array of booleans with shape (#examples)
            indicating valid examples.
    """
    # Check the NaN ratio per example and feature.
    # nan_ratio_feature is a 2-d numpy array of shape (#examples, #features).
    nan_ratio_feature = np.mean(np.isnan(array), axis=1)
    nan_ratio = nan_ratio_feature < nans_ratio_cutoff

    # Check the maximum number of consecutive NaN values per example and feature.
    # cons_nans_feature is a 2-d numpy array of shape (#examples, #features).
    cons_nans_feature = np.apply_along_axis(find_max_consecutive_nans, 1, array)
    cons_nans = cons_nans_feature < min(
        consecutive_nans_max,
        max(2, int(consecutive_nans_ratio_cutoff * array.shape[1])),
    )

    # Both checks above must pass for every feature, otherwise the example is
    # invalid.
    valid_examples_per_feature = np.logical_and(nan_ratio, cons_nans)
    valid_examples = np.all(valid_examples_per_feature, axis=1)

    if np.mean(~valid_examples) > invalid_examples_ratio_cutoff:
        raise ValueError(
            f"More than {100 * invalid_examples_ratio_cutoff}% of the examples have invalid continuous features. Please reduce the ratio of NaNs and try again!"
        )

    if (~valid_examples).any():
        logger.warning(
            f"There are {sum(~valid_examples)} examples that have too many NaN values in numeric features, accounting for {np.mean(~valid_examples) * 100}% of all examples. These invalid examples will be omitted from training.",
            extra={"user_log": True},
        )

    return valid_examples
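A worked example of how the thresholds combine under the defaults (our illustration, not from the commit): for sequences of length 100, the consecutive-NaN threshold is min(5, max(2, int(0.05 * 100))) = 5, so an example is invalid if any feature has 10% or more NaNs, or a run of 5 or more consecutive NaNs.

import numpy as np

x = np.random.rand(10, 100, 1)
x[0, :15, 0] = np.nan              # 15% NaNs -> invalid (ratio >= 0.1)
x[1, 40:45, 0] = np.nan            # run of 5 consecutive NaNs -> invalid
x[2, 7, 0] = x[2, 30, 0] = np.nan  # two isolated NaNs -> still valid (fixable)

mask = validation_check(x)
assert not mask[0] and not mask[1] and mask[2]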


def nan_linear_interpolation(arrays: np.ndarray) -> np.ndarray:
    """Replaces all NaNs via linear interpolation.

    Args:
        arrays: 3-d numpy array of continuous features, with shape
            (#examples, max_sequence_length, #continuous features).

    Returns:
        arrays: 3-d numpy array where NaNs are replaced via linear
            interpolation.
    """
    examples = arrays.shape[0]
    features = arrays.shape[2]

    for exp in range(examples):
        for f in range(features):
            array = arrays[exp, :, f]
            if np.isnan(array).any():
                nans = np.isnan(array)
                ind_func = lambda z: z.nonzero()[0]
                array[nans] = np.interp(ind_func(nans), ind_func(~nans), array[~nans])

    return arrays
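Since np.interp clamps rather than extrapolates, leading and trailing NaNs are filled with the nearest observed value, while interior NaNs get true linear interpolation. A small sketch with illustrative data:

import numpy as np

arr = np.array([[[np.nan], [2.0], [np.nan], [6.0], [np.nan]]])  # shape (1, 5, 1)
out = nan_linear_interpolation(arr)
# Boundary NaNs take the nearest value; the interior NaN is interpolated.
assert np.allclose(out[0, :, 0], [2.0, 2.0, 4.0, 6.0, 6.0])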
4 changes: 2 additions & 2 deletions src/gretel_synthetics/timeseries_dgan/transformations.py
@@ -327,8 +327,8 @@ def _fit(self, column):
            column: 1-d numpy array
        """
        column = column.astype("float")
-        self.global_min = np.min(column)
-        self.global_max = np.max(column)
        self.global_min = np.nanmin(column)
        self.global_max = np.nanmax(column)

    def _transform(self, column: np.ndarray) -> np.ndarray:
        """Apply continuous variable encoding/scaling.
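The switch to np.nanmin/np.nanmax matters because a single NaN makes np.min/np.max return NaN, which would poison the scaler's global range; the NaN-aware variants ignore missing values. A one-liner to see the difference (illustrative values):

import numpy as np

col = np.array([1.0, np.nan, 3.0])
print(np.min(col), np.max(col))        # nan nan  -> unusable scaling range
print(np.nanmin(col), np.nanmax(col))  # 1.0 3.0  -> correct range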
171 changes: 164 additions & 7 deletions tests/timeseries_dgan/test_dgan.py
@@ -16,6 +16,9 @@
    _LongDataFrameConverter,
    _WideDataFrameConverter,
    DGAN,
    find_max_consecutive_nans,
    nan_linear_interpolation,
    validation_check,
)
from gretel_synthetics.timeseries_dgan.transformations import (
    BinaryEncodedOutput,
@@ -617,18 +620,119 @@ def test_train_dataframe_long_no_attributes_no_example_id(config: DGANConfig):
    assert synthetic_df["example_id"].value_counts()[0] == config.max_sequence_len


-def test_train_numpy_nans(config: DGANConfig, feature_data):
-    features, feature_types = feature_data
-    # Insert a NaN
-    features[11, 3, 1] = np.NaN
def test_find_max_consecutive_nans():
    # Check the output of the find_max_consecutive_nans function. We create a
    # 1-d random array, insert NaN runs at different locations and of different
    # lengths, and test the maximum number of consecutive NaNs in the data.
    n = 50
    features = np.random.rand(n)
    features[0:5] = features[7:21] = features[30:40] = features[-2:] = np.nan

-    dg = DGAN(config=config)
    assert find_max_consecutive_nans(features) == 14

    features = np.random.rand(n)
    features[0:12] = features[20:22] = features[-3:] = np.nan

    assert find_max_consecutive_nans(features) == 12

    features = np.random.rand(n)
    features[0:8] = features[20:22] = features[-17:] = np.nan

    assert find_max_consecutive_nans(features) == 17


def test_nan_linear_interpolation():
    # Check the functionality and output of the nan_linear_interpolation
    # function, inserting NaN runs of different lengths at different locations
    # in a 3-d array. np.interp pads values at the beginning and the end of an
    # array rather than extrapolating.

    features = np.array(
        [
            [[0.0, 1.0, 2.0], [np.nan, 7, 5.0], [np.nan, 4, 8.0], [8.0, 10.0, np.nan]],
            [
                [np.nan, 13.0, 14.0],
                [np.nan, 16.0, 17.0],
                [18.0, 19.0, 20.0],
                [21.0, 22.0, 23.0],
            ],
        ]
    )

    features = nan_linear_interpolation(features)

    assert (features[0, 1:3, 0] == np.array([8 / 3, (8 / 3 + 8) / 2])).all()
    assert (np.diff(features[0, 2:, 2]) == 0).all()
    assert (np.diff(features[1, 0:3, 0]) == 0).all()
    assert np.isnan(features).sum() == 0
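To spell out the first assertion: feature 0 of example 0 is [0, NaN, NaN, 8], so interpolating over known indices [0, 3] with values [0, 8] gives 8/3 at index 1 and 16/3 = (8/3 + 8)/2 at index 2. The next two assertions check that boundary NaNs are padded with a constant value, so neighboring differences are zero.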


def test_validation_check():

    # Check the functionality and output of the validation check for 3
    # scenarios:
    # 1. Erroring out when the ratio of invalid examples is too high.
    # 2. Dropping invalid examples when the ratio is lower.
    # 3. Keeping the fixable (valid) examples.

    n = 50
    # Set NaNs for feature 2, time points 2 and 3, in the first 26 examples.
    # All of those examples are considered invalid, and the check raises an
    # error since there are too many invalid examples.
    invalid_examples = np.random.rand(n, 20, 3)
    invalid_examples[0:26, 2:4, 2] = np.nan
    with pytest.raises(ValueError, match="NaN"):
-        dg.train_numpy(features=features, feature_types=feature_types)
        validation_check(invalid_examples)

    # Set NaNs for various features. Features 1 and 2 have fixable (valid)
    # examples, while feature 0 has 10 invalid examples that should be dropped
    # (too many consecutive NaNs).
    invalid_examples_dropped = np.random.rand(n, 20, 3)
    invalid_examples_dropped[0:2, 2:3, 2] = np.nan
    invalid_examples_dropped[20:30, 10:20, 0] = np.nan
    invalid_examples_dropped[30:40, 15, 1] = np.nan

    test_boolean = np.array([True] * n)
    test_boolean[20:30] = False
    assert (validation_check(invalid_examples_dropped) == test_boolean).all()

    # Insert a small number of NaNs for each feature; none of the examples
    # should be dropped by the check.
    valid_examples = np.random.rand(n, 20, 3)
    valid_examples[5:7, 2, 2] = np.nan
    valid_examples[15:20, 15, 0] = np.nan
    valid_examples[-5:, 8, 1] = np.nan
    assert validation_check(valid_examples).all()


def test_train_numpy_nans(config: DGANConfig):
    # Check the functionality of train_numpy when continuous features contain
    # NaNs. Since the interpolation is done before the transformation, we check
    # that no NaNs are generated.

    n = 100
    features = np.concatenate(
        (
            np.random.randint(0, 4, size=(n, 20, 1)),
            np.random.rand(n, 20, 1),
            np.random.rand(n, 20, 1),
        ),
        axis=2,
    )
    feature_types = [OutputType.DISCRETE, OutputType.CONTINUOUS, OutputType.CONTINUOUS]
    # Insert sparse NaNs in continuous feature #1.
    features[11, 3, 1] = features[65:73, 17, 1] = np.NaN
    # Insert consecutive NaNs in continuous feature #2.
    features[5:10, 2:4, 2] = features[80:90, 4:10, 2] = np.NaN

    dg = DGAN(config=config)
    dg.train_numpy(features=features, feature_types=feature_types)
    synthetic_attributes, synthetic_features = dg.generate_numpy(50)

    assert synthetic_attributes is None
    assert np.isnan(synthetic_features).sum() == 0


-def test_train_dataframe_nans(config: DGANConfig):
def test_train_dataframe_wide_nans_all_invalid_examples(config: DGANConfig):
    # Check the functionality of train_dataframe (wide format) when all
    # examples are invalid.
    n = 50
    df = pd.DataFrame(
        {
@@ -647,6 +751,59 @@ def test_train_dataframe_nans(config: DGANConfig):
        dg.train_dataframe(df=df, df_style=DfStyle.WIDE)


def test_train_dataframe_wide_nans_some_valid_examples(config: DGANConfig):
    # Check the functionality of train_dataframe (wide format) when some
    # examples contain NaNs but are still valid.
    n = 50
    df = pd.DataFrame(
        {
            "2022-01-01": np.random.rand(n),
            "2022-02-01": np.random.rand(n),
            "2022-03-01": np.random.rand(n),
            "2022-04-01": np.random.rand(n),
            "2022-05-01": np.random.rand(n),
            "2022-06-01": np.random.rand(n),
            "2022-07-01": np.random.rand(n),
            "2022-08-01": np.random.rand(n),
            "2022-09-01": np.random.rand(n),
            "2022-10-01": np.random.rand(n),
            "2022-11-01": np.random.rand(n),
            "2022-12-01": np.nan,
        }
    )

    config.max_sequence_len = 12
    config.sample_len = 1

    dg = DGAN(config=config)
    dg.train_dataframe(df=df, df_style=DfStyle.WIDE)
    synthetic_df = dg.generate_dataframe(30)

    assert not pd.isna(synthetic_df).any().any()


def test_train_dataframe_long_nans(config: DGANConfig):
    n = 50
    df = pd.DataFrame(
        {
            "example_id": np.repeat(range(n), 20),
            "f1": np.random.rand(20 * n),
            "f2": np.random.rand(20 * n),
        }
    )

    df.iloc[0, 2] = df.iloc[500, 1] = df.iloc[900, 1] = np.nan
    dg = DGAN(config=config)

    dg.train_dataframe(
        df=df,
        example_id_column="example_id",
        df_style=DfStyle.LONG,
    )

    synthetic_df = dg.generate_dataframe(n)
    assert not pd.isna(synthetic_df).any().any()


@pytest.mark.parametrize("binary_encoder_cutoff", [150, 1])
def test_train_dataframe_with_strings(config: DGANConfig, binary_encoder_cutoff):
    n = 50
