Marjan/rds 464
* Apply the validation check and interpolation before transformation

* remove nans when scaling continuous features

* removed the nan replacement with 0 for continuous features

* remove nans when scaling

* added minor updates to the docstring

* docstring updates

* monogretel check style pass

* updated the docstring (variable name change)

* added tests for NaNs in continuous features

* corrected the nan_linear_interpolation function; added updates based on the PR reviews

* Updates based on the PR comments

* corrected a wrong assertion

* style check added

* minor function name change; updated docstrings

* function name change

* style check added

GitOrigin-RevId: 177156df1036edc20ce47f421cd664a957c45b93
Marjan-emd committed Dec 2, 2022
1 parent eebaf95 commit a1acefc
Showing 3 changed files with 300 additions and 15 deletions.
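Before the diffs, a minimal sketch of the preprocessing order this commit introduces in train_numpy (the toy shapes and data below are illustrative assumptions, not part of the diff): continuous features are validated first, invalid examples are dropped, and the remaining NaNs are linearly interpolated before transform is applied.

import numpy as np

from gretel_synthetics.timeseries_dgan.dgan import (
    nan_linear_interpolation,
    validation_check,
)

# Toy batch: 4 examples, 20 time steps, 2 continuous features.
features = np.random.rand(4, 20, 2)
features[0, 5, 0] = np.nan    # fixable: one isolated NaN (5% < 10% cutoff)
features[1, 2:9, 1] = np.nan  # invalid: 35% NaNs and a run of 7 consecutive NaNs

valid = validation_check(features)  # -> [True, False, True, True]
features = features[valid]          # drop invalid examples entirely
features = nan_linear_interpolation(features)  # fill remaining NaNs
assert not np.isnan(features).any()  # safe to hand off to transform(...)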
140 changes: 134 additions & 6 deletions src/gretel_synthetics/timeseries_dgan/dgan.py
@@ -240,15 +240,31 @@ def train_numpy(
                feature_outputs,
            )

        continuous_features_ind = [
            ind
            for ind, val in enumerate(self.feature_outputs)
            if "ContinuousOutput" in str(val.__class__)
        ]

        valid_examples = validation_check(
            features[:, :, continuous_features_ind].astype("float")
        )

        # Only using valid examples for the entire dataset.
        features = features[valid_examples]

        # Apply linear interpolation for continuous features:
        features[:, :, continuous_features_ind] = nan_linear_interpolation(
            features[:, :, continuous_features_ind].astype("float")
        )

        if attributes is not None:
            attributes = attributes[valid_examples]

        if self.additional_attribute_outputs:
            (
                internal_features,
                internal_additional_attributes,
            ) = transform(features, self.feature_outputs, variable_dim_index=2)

            if np.any(np.isnan(internal_features)):
                raise ValueError(f"NaN found in internal features. {NAN_ERROR_MESSAGE}")

            if np.any(np.isnan(internal_additional_attributes)):
                raise ValueError(
                    f"NaN found in internal additional attributes. {NAN_ERROR_MESSAGE}"

@@ -262,9 +278,6 @@
                np.full((internal_features.shape[0], 1), np.nan)
            )

-        if np.any(np.isnan(internal_features)):
-            raise ValueError(f"NaN found in internal features. {NAN_ERROR_MESSAGE}")

        internal_attributes = transform(
            attributes,
            self.attribute_outputs,
@@ -1498,3 +1511,118 @@ def _state_dict(self) -> Dict:
    "WideDataFrameConverter": _WideDataFrameConverter,
    "LongDataFrameConverter": _LongDataFrameConverter,
}


def find_max_consecutive_nans(array: np.ndarray) -> int:
    """Returns the maximum number of consecutive NaNs in an array.

    Args:
        array: 1-d numpy array of a time series for one example.

    Returns:
        max_cons_nan: The maximum number of consecutive NaNs in the time series array.
    """
    # NaN run lengths are the index differences between consecutive non-NaN
    # values (with sentinels at both ends), minus one.
    max_cons_nan = np.max(
        np.diff(np.concatenate(([-1], np.where(~np.isnan(array))[0], [len(array)]))) - 1
    )
    return max_cons_nan
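A quick illustration of the index-difference trick (the example data is ours, not from the commit): with sentinels at -1 and len(array), the gaps between consecutive non-NaN indices, minus one, are exactly the NaN run lengths.

import numpy as np

a = np.array([np.nan, 1.0, np.nan, np.nan, np.nan, 2.0, np.nan])
# Non-NaN indices: [1, 5]; with sentinels: [-1, 1, 5, 7].
# np.diff(...) - 1 -> [1, 3, 1], so the longest NaN run has length 3.
assert find_max_consecutive_nans(a) == 3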


def validation_check(
    array: np.ndarray,
    invalid_examples_ratio_cutoff: float = 0.5,
    nans_ratio_cutoff: float = 0.1,
    consecutive_nans_max: int = 5,
    consecutive_nans_ratio_cutoff: float = 0.05,
) -> np.ndarray:
    """Checks if the continuous features of examples are valid.

    Returns a 1-d numpy array of booleans with shape (#examples) indicating
    valid examples.

    Examples with continuous features fall into 3 categories: good, valid
    (fixable), and invalid (non-fixable).
    - "Good" examples have no NaNs.
    - "Valid" examples have a low percentage of NaNs and a below-threshold
      number of consecutive NaNs.
    - "Invalid" examples are the rest; they are marked False in the returned
      array and later omitted from training. If there are too many of them,
      an error is raised.

    Args:
        array: 3-d numpy array of continuous features with shape
            (#examples, max_sequence_length, #continuous features).
        invalid_examples_ratio_cutoff: Error out if the ratio of invalid
            examples in the dataset is higher than this value.
        nans_ratio_cutoff: If the percentage of NaNs for any continuous
            feature in an example is greater than this value, the example is
            invalid.
        consecutive_nans_max: If the maximum number of consecutive NaNs in a
            continuous feature is greater than this number, then that example
            is invalid.
        consecutive_nans_ratio_cutoff: If the maximum number of consecutive
            NaNs in a continuous feature is greater than this ratio times the
            length of the example (number of samples), then the example is
            invalid.

    Returns:
        valid_examples: 1-d numpy array of booleans with shape (#examples)
            indicating valid examples.
    """
    # Check the NaN ratio per example and feature.
    # nan_ratio_feature is a 2-d numpy array of shape (#examples, #features).
    nan_ratio_feature = np.mean(np.isnan(array), axis=1)
    nan_ratio = nan_ratio_feature < nans_ratio_cutoff

    # Check the maximum number of consecutive NaN values per example and feature.
    # cons_nans_feature is a 2-d numpy array of shape (#examples, #features).
    cons_nans_feature = np.apply_along_axis(find_max_consecutive_nans, 1, array)
    cons_nans = cons_nans_feature < min(
        consecutive_nans_max,
        max(2, int(consecutive_nans_ratio_cutoff * array.shape[1])),
    )

    # Both checks above must pass for every feature, otherwise the example is
    # invalid.
    valid_examples_per_feature = np.logical_and(nan_ratio, cons_nans)
    valid_examples = np.all(valid_examples_per_feature, axis=1)

    if np.mean(~valid_examples) > invalid_examples_ratio_cutoff:
        raise ValueError(
            f"More than {100 * invalid_examples_ratio_cutoff}% of the examples have invalid continuous features. Please reduce the ratio of NaNs and try again!"
        )

    if (~valid_examples).any():
        logger.warning(
            f"There are {sum(~valid_examples)} examples that have too many NaN values in numeric features, accounting for {np.mean(~valid_examples) * 100}% of all examples. These invalid examples will be omitted from training.",
            extra={"user_log": True},
        )

    return valid_examples
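A worked example of how the thresholds combine under the defaults (our illustration, not from the commit): for sequences of length 100, the consecutive-NaN threshold is min(5, max(2, int(0.05 * 100))) = 5, so an example is invalid if any feature has 10% or more NaNs, or a run of 5 or more consecutive NaNs.

import numpy as np

x = np.random.rand(10, 100, 1)
x[0, :15, 0] = np.nan              # 15% NaNs -> invalid (ratio >= 0.1)
x[1, 40:45, 0] = np.nan            # run of 5 consecutive NaNs -> invalid
x[2, 7, 0] = x[2, 30, 0] = np.nan  # two isolated NaNs -> still valid (fixable)

mask = validation_check(x)
assert not mask[0] and not mask[1] and mask[2]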


def nan_linear_interpolation(arrays: np.ndarray) -> np.ndarray:
    """Replaces all NaNs via linear interpolation.

    Args:
        arrays: 3-d numpy array of continuous features, with shape
            (#examples, max_sequence_length, #continuous features).

    Returns:
        arrays: 3-d numpy array where NaNs are replaced via linear
            interpolation.
    """
    examples = arrays.shape[0]
    features = arrays.shape[2]

    for exp in range(examples):
        for f in range(features):
            array = arrays[exp, :, f]
            if np.isnan(array).any():
                nans = np.isnan(array)
                ind_func = lambda z: z.nonzero()[0]
                array[nans] = np.interp(ind_func(nans), ind_func(~nans), array[~nans])

    return arrays
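Since np.interp clamps rather than extrapolates, leading and trailing NaNs are filled with the nearest observed value, while interior NaNs get true linear interpolation. A small sketch with illustrative data:

import numpy as np

arr = np.array([[[np.nan], [2.0], [np.nan], [6.0], [np.nan]]])  # shape (1, 5, 1)
out = nan_linear_interpolation(arr)
# Boundary NaNs take the nearest value; the interior NaN is interpolated.
assert np.allclose(out[0, :, 0], [2.0, 2.0, 4.0, 6.0, 6.0])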
4 changes: 2 additions & 2 deletions src/gretel_synthetics/timeseries_dgan/transformations.py
@@ -327,8 +327,8 @@ def _fit(self, column):
            column: 1-d numpy array
        """
        column = column.astype("float")
-        self.global_min = np.min(column)
-        self.global_max = np.max(column)
        self.global_min = np.nanmin(column)
        self.global_max = np.nanmax(column)

    def _transform(self, column: np.ndarray) -> np.ndarray:
        """Apply continuous variable encoding/scaling.
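The switch to np.nanmin/np.nanmax matters because a single NaN makes np.min/np.max return NaN, which would poison the scaler's global range; the NaN-aware variants ignore missing values. A one-liner to see the difference (illustrative values):

import numpy as np

col = np.array([1.0, np.nan, 3.0])
print(np.min(col), np.max(col))        # nan nan  -> unusable scaling range
print(np.nanmin(col), np.nanmax(col))  # 1.0 3.0  -> correct range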
171 changes: 164 additions & 7 deletions tests/timeseries_dgan/test_dgan.py
@@ -16,6 +16,9 @@
    _LongDataFrameConverter,
    _WideDataFrameConverter,
    DGAN,
    find_max_consecutive_nans,
    nan_linear_interpolation,
    validation_check,
)
from gretel_synthetics.timeseries_dgan.transformations import (
    BinaryEncodedOutput,
@@ -617,18 +620,119 @@ def test_train_dataframe_long_no_attributes_no_example_id(config: DGANConfig):
    assert synthetic_df["example_id"].value_counts()[0] == config.max_sequence_len


-def test_train_numpy_nans(config: DGANConfig, feature_data):
-    features, feature_types = feature_data
-    # Insert a NaN
-    features[11, 3, 1] = np.NaN
def test_find_max_consecutive_nans():
    # Check the output of the find_max_consecutive_nans function. We create a
    # 1-d random array, insert NaN runs at different locations and of different
    # lengths, and test the maximum number of consecutive NaNs in the data.
    n = 50
    features = np.random.rand(n)
    features[0:5] = features[7:21] = features[30:40] = features[-2:] = np.nan

-    dg = DGAN(config=config)
    assert find_max_consecutive_nans(features) == 14

    features = np.random.rand(n)
    features[0:12] = features[20:22] = features[-3:] = np.nan

    assert find_max_consecutive_nans(features) == 12

    features = np.random.rand(n)
    features[0:8] = features[20:22] = features[-17:] = np.nan

    assert find_max_consecutive_nans(features) == 17


def test_nan_linear_interpolation():
    # Check the functionality and output of the nan_linear_interpolation
    # function, inserting NaN runs of different lengths at different locations
    # in a 3-d array. np.interp pads values at the beginning and the end of an
    # array rather than extrapolating.

    features = np.array(
        [
            [[0.0, 1.0, 2.0], [np.nan, 7, 5.0], [np.nan, 4, 8.0], [8.0, 10.0, np.nan]],
            [
                [np.nan, 13.0, 14.0],
                [np.nan, 16.0, 17.0],
                [18.0, 19.0, 20.0],
                [21.0, 22.0, 23.0],
            ],
        ]
    )

    features = nan_linear_interpolation(features)

    assert (features[0, 1:3, 0] == np.array([8 / 3, (8 / 3 + 8) / 2])).all()
    assert (np.diff(features[0, 2:, 2]) == 0).all()
    assert (np.diff(features[1, 0:3, 0]) == 0).all()
    assert np.isnan(features).sum() == 0
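To spell out the first assertion: feature 0 of example 0 is [0, NaN, NaN, 8], so interpolating over known indices [0, 3] with values [0, 8] gives 8/3 at index 1 and 16/3 = (8/3 + 8)/2 at index 2. The next two assertions check that boundary NaNs are padded with a constant value, so neighboring differences are zero.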


def test_validation_check():

    # Check the functionality and output of the validation check for 3
    # scenarios:
    # 1. Erroring out when the ratio of invalid examples is too high.
    # 2. Dropping invalid examples when the ratio is lower.
    # 3. Keeping the fixable (valid) examples.

    n = 50
    # Set NaNs for feature 2, time points 2 and 3, in the first 26 examples.
    # All of those examples are considered invalid, and the check raises an
    # error since there are too many invalid examples.
    invalid_examples = np.random.rand(n, 20, 3)
    invalid_examples[0:26, 2:4, 2] = np.nan
    with pytest.raises(ValueError, match="NaN"):
-        dg.train_numpy(features=features, feature_types=feature_types)
        validation_check(invalid_examples)

    # Set NaNs for various features. Features 1 and 2 have fixable (valid)
    # examples, while feature 0 has 10 invalid examples that should be dropped
    # (too many consecutive NaNs).
    invalid_examples_dropped = np.random.rand(n, 20, 3)
    invalid_examples_dropped[0:2, 2:3, 2] = np.nan
    invalid_examples_dropped[20:30, 10:20, 0] = np.nan
    invalid_examples_dropped[30:40, 15, 1] = np.nan

    test_boolean = np.array([True] * n)
    test_boolean[20:30] = False
    assert (validation_check(invalid_examples_dropped) == test_boolean).all()

    # Insert a small number of NaNs for each feature; none of the examples
    # should be dropped by the check.
    valid_examples = np.random.rand(n, 20, 3)
    valid_examples[5:7, 2, 2] = np.nan
    valid_examples[15:20, 15, 0] = np.nan
    valid_examples[-5:, 8, 1] = np.nan
    assert validation_check(valid_examples).all()


def test_train_numpy_nans(config: DGANConfig):
    # Check the functionality of train_numpy when continuous features contain
    # NaNs. Since the interpolation is done before the transformation, we check
    # that no NaNs are generated.

    n = 100
    features = np.concatenate(
        (
            np.random.randint(0, 4, size=(n, 20, 1)),
            np.random.rand(n, 20, 1),
            np.random.rand(n, 20, 1),
        ),
        axis=2,
    )
    feature_types = [OutputType.DISCRETE, OutputType.CONTINUOUS, OutputType.CONTINUOUS]
    # Insert sparse NaNs in continuous feature #1.
    features[11, 3, 1] = features[65:73, 17, 1] = np.NaN
    # Insert consecutive NaNs in continuous feature #2.
    features[5:10, 2:4, 2] = features[80:90, 4:10, 2] = np.NaN

    dg = DGAN(config=config)
    dg.train_numpy(features=features, feature_types=feature_types)
    synthetic_attributes, synthetic_features = dg.generate_numpy(50)

    assert synthetic_attributes is None
    assert np.isnan(synthetic_features).sum() == 0


-def test_train_dataframe_nans(config: DGANConfig):
def test_train_dataframe_wide_nans_all_invalid_examples(config: DGANConfig):
    # Check the functionality of train_dataframe (wide format) when all
    # examples are invalid.
    n = 50
    df = pd.DataFrame(
        {
@@ -647,6 +751,59 @@ def test_train_dataframe_nans(config: DGANConfig):
        dg.train_dataframe(df=df, df_style=DfStyle.WIDE)


def test_train_dataframe_wide_nans_some_valid_examples(config: DGANConfig):
    # Check the functionality of train_dataframe (wide format) when some
    # examples contain NaNs but are still valid.
    n = 50
    df = pd.DataFrame(
        {
            "2022-01-01": np.random.rand(n),
            "2022-02-01": np.random.rand(n),
            "2022-03-01": np.random.rand(n),
            "2022-04-01": np.random.rand(n),
            "2022-05-01": np.random.rand(n),
            "2022-06-01": np.random.rand(n),
            "2022-07-01": np.random.rand(n),
            "2022-08-01": np.random.rand(n),
            "2022-09-01": np.random.rand(n),
            "2022-10-01": np.random.rand(n),
            "2022-11-01": np.random.rand(n),
            "2022-12-01": np.nan,
        }
    )

    config.max_sequence_len = 12
    config.sample_len = 1

    dg = DGAN(config=config)
    dg.train_dataframe(df=df, df_style=DfStyle.WIDE)
    synthetic_df = dg.generate_dataframe(30)

    assert not pd.isna(synthetic_df).any().any()


def test_train_dataframe_long_nans(config: DGANConfig):
    n = 50
    df = pd.DataFrame(
        {
            "example_id": np.repeat(range(n), 20),
            "f1": np.random.rand(20 * n),
            "f2": np.random.rand(20 * n),
        }
    )

    df.iloc[0, 2] = df.iloc[500, 1] = df.iloc[900, 1] = np.nan
    dg = DGAN(config=config)

    dg.train_dataframe(
        df=df,
        example_id_column="example_id",
        df_style=DfStyle.LONG,
    )

    synthetic_df = dg.generate_dataframe(n)
    assert not pd.isna(synthetic_df).any().any()


@pytest.mark.parametrize("binary_encoder_cutoff", [150, 1])
def test_train_dataframe_with_strings(config: DGANConfig, binary_encoder_cutoff):
    n = 50
