Skip to content

Commit

Permalink
Skip key error during attempt cast to int
Browse files Browse the repository at this point in the history
* Skip key error during attempt cast to int

* Add in more guardrails for column params

---------

Co-authored-by: John Myers <john@gretel.ai>
GitOrigin-RevId: b00d18c2336c7a72b37be615a8166db2cc21abea
  • Loading branch information
johntmyers and John Myers committed Jan 30, 2023
1 parent 0e72ab1 commit 44b1d77
Show file tree
Hide file tree
Showing 2 changed files with 186 additions and 57 deletions.
115 changes: 87 additions & 28 deletions src/gretel_synthetics/timeseries_dgan/dgan.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,34 @@
NumpyArrayTriple = Tuple[np.ndarray, np.ndarray, np.ndarray]

NAN_ERROR_MESSAGE = """
DGAN does not support NaNs, please remove NaNs before training. If there are no NaNs in your input data and you see this error, please create a support ticket.
DGAN does not support NaNs, please remove NaNs before training. If there are no NaNs in your input data and you see this error, please create a support ticket. # noqa
"""


def _discrete_cols_to_int(
df: pd.DataFrame, discrete_columns: Optional[List[str]]
) -> pd.DataFrame:
# Convert discrete columns to int where possible.
if discrete_columns is None:
return df

missing_discrete = set()
for c in discrete_columns:
try:
df[c] = df[c].astype("int")
except ValueError:
continue
except KeyError:
missing_discrete.add(c)

if missing_discrete:
logger.warning(
f"The following discrete columns ({missing_discrete}) were not in the generated DataFrame, you may want to ensure this is intended!" # noqa
)

return df


class DGAN:
"""
DoppelGANger model.
Expand Down Expand Up @@ -152,7 +176,7 @@ def train_numpy(
attributes: Optional[np.ndarray] = None,
attribute_types: Optional[List[OutputType]] = None,
progress_callback: Optional[Callable[[ProgressInfo]]] = None,
):
) -> None:
"""Train DGAN model on data in numpy arrays.
Training data is passed in 2 numpy arrays, one for attributes (2d) and
Expand Down Expand Up @@ -192,7 +216,7 @@ def train_numpy(
if attributes is not None:
if attributes.shape[0] != features.shape[0]:
raise RuntimeError(
"First dimension of attributes and features must be the same length, i.e., the number of training examples."
"First dimension of attributes and features must be the same length, i.e., the number of training examples." # noqa
)

if features.shape[1] != self.config.max_sequence_len:
Expand Down Expand Up @@ -318,7 +342,7 @@ def train_dataframe(
discrete_columns: Optional[List[str]] = None,
df_style: DfStyle = DfStyle.WIDE,
progress_callback: Optional[Callable[[ProgressInfo]]] = None,
):
) -> None:
"""Train DGAN model on data in pandas DataFrame.
Training data can be in either "wide" or "long" format. "Wide" format
Expand All @@ -331,14 +355,18 @@ def train_dataframe(
Args:
df: DataFrame of training data
attribute_columns: list of column names containing attributes, if None,
no attribute columns are used, Default: None
no attribute columns are used. Must be disjoint from
the feature columns.
feature_columns: list of column names containing features, if None
all non-attribute columns are used, Default: None
all non-attribute columns are used. Must be disjoint from
attribute columns.
example_id_column: column name used to split "long" format data
frame into multiple examples, if None, data is treated as a
single example
single example. This value must be unique from the other
column list parameters.
time_column: column name used to sort "long" format data frame,
if None, data frame order of rows/time points is used
if None, data frame order of rows/time points is used. This
value must be unique from the other column list parameters.
discrete_columns: column names (either attributes or features) to
use discrete, onehot encoding, discrete values must be integer
in [0,1,2,3...]
Expand All @@ -348,6 +376,13 @@ def train_dataframe(

if self.data_frame_converter is None:

# attribute columns should be disjoint from feature columns
if attribute_columns is not None and feature_columns is not None:
if set(attribute_columns).intersection(set(feature_columns)):
raise ValueError(
"The `attribute_columns` and `feature_columns` lists must not have overlapping values!"
)

if df_style == DfStyle.WIDE:
self.data_frame_converter = _WideDataFrameConverter.create(
df,
Expand All @@ -356,19 +391,44 @@ def train_dataframe(
discrete_columns=discrete_columns,
)
elif df_style == DfStyle.LONG:
if example_id_column == None and attribute_columns:
raise Exception(
"Please provide an example id column, auto-splitting not available with only attribute columns."
if time_column is not None and example_id_column is not None:
if time_column == example_id_column:
raise ValueError(
"The `time_column` and `example_id_column` values cannot be the same!"
)

if example_id_column is not None:
# It should not be contained in any other lists
other_columns = set()
if discrete_columns is not None:
other_columns.update(discrete_columns)
if feature_columns is not None:
other_columns.update(feature_columns)
if attribute_columns is not None:
other_columns.update(attribute_columns)

if (
example_id_column in other_columns
or time_column in other_columns
):
raise ValueError(
"The `example_id_column` and `time_column` must not be present in any other column lists!"
)

# neither of these should be in any of the other lists
if example_id_column is None and attribute_columns:
raise ValueError(
"Please provide an `example_id_column`, auto-splitting not available with only attribute columns." # noqa
)
if example_id_column == None and attribute_columns == None:
if example_id_column is None and attribute_columns is None:
logging.warning(
f"Example ID column not provided, DGAN will autosplit dataset into sequences of size {self.config.max_sequence_len}!"
f"The `example_id_column` was not provided, DGAN will autosplit dataset into sequences of size {self.config.max_sequence_len}!" # noqa
)
df = df[
: math.floor(len(df) / self.config.max_sequence_len)
* self.config.max_sequence_len
].copy()
if time_column != None:
if time_column is not None:
df[time_column] = pd.to_datetime(df[time_column])
df = df.sort_values(time_column)
example_id_column = "example_id"
Expand Down Expand Up @@ -440,8 +500,15 @@ def generate_numpy(
# Convert from list of tuples to tuple of lists with zip(*) and
# concatenate into single numpy arrays for attributes, additional
# attributes (if present), and features.
#
# NOTE: Despite linter complaints, the np.array(d) == None statement
# is special because it checks for None values along the array like so:
# In [4]: np.array([1,2,3,4]) == None
# Out[4]: array([False, False, False, False])
internal_data = tuple(
np.concatenate(d, axis=0) if not (np.array(d) == None).any() else None
np.concatenate(d, axis=0)
if not (np.array(d) == None).any() # noqa
else None
for d in zip(*internal_data_list)
)

Expand Down Expand Up @@ -1216,11 +1283,7 @@ def invert(
df = pd.DataFrame(data, columns=self._attribute_columns + self._feature_columns)

# Convert discrete columns to int where possible.
for c in self._discrete_columns:
try:
df[c] = df[c].astype("int")
except ValueError:
pass
df = _discrete_cols_to_int(df, self._discrete_columns)

# Ensure we match the original ordering
return df[self._df_column_order]
Expand Down Expand Up @@ -1502,11 +1565,7 @@ def invert(
)

# Convert discrete columns to int where possible.
for c in self._discrete_columns:
try:
df[c] = df[c].astype("int")
except ValueError:
pass
df = _discrete_cols_to_int(df, self._discrete_columns)

if self._example_id_column:
# Use [0,1,2,...] for example_id
Expand Down Expand Up @@ -1620,12 +1679,12 @@ def validation_check(

if np.mean(valid_examples) < invalid_examples_ratio_cutoff:
raise ValueError(
f"More than {100*invalid_examples_ratio_cutoff}% invalid examples in the continuous features. Please reduce the ratio of the NaNs and try again!"
f"More than {100*invalid_examples_ratio_cutoff}% invalid examples in the continuous features. Please reduce the ratio of the NaNs and try again!" # noqa
)

if (~valid_examples).any():
logger.warning(
f"There are {sum(~valid_examples)} examples that have too many nan values in numeric features, accounting for {np.mean(~valid_examples)*100}% of all examples. These invalid examples will be omitted from training.",
f"There are {sum(~valid_examples)} examples that have too many nan values in numeric features, accounting for {np.mean(~valid_examples)*100}% of all examples. These invalid examples will be omitted from training.", # noqa
extra={"user_log": True},
)

Expand All @@ -1652,7 +1711,7 @@ def nan_linear_interpolation(arrays: np.ndarray) -> np.ndarray:
array = arrays[exp, :, f]
if np.isnan(array).any():
nans = np.isnan(array)
ind_func = lambda z: z.nonzero()[0]
ind_func = lambda z: z.nonzero()[0] # noqa
array[nans] = np.interp(ind_func(nans), ind_func(~nans), array[~nans])

return arrays

0 comments on commit 44b1d77

Please sign in to comment.