Skip to content

Commit

Permalink
[BUGFIX] fix incorrect pandas top rows usage (#3091)
Browse files Browse the repository at this point in the history
* Fix a bug in how pandas subscripting is used to select a limited number of rows
* Added a test of pandas slicing in MapMetrics functions.
  • Loading branch information
alexsherstinsky committed Jul 21, 2021
1 parent c1ede6b commit 0a830b4
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 32 deletions.
55 changes: 23 additions & 32 deletions great_expectations/expectations/metrics/map_metric.py
Expand Up @@ -518,7 +518,7 @@ def _pandas_column_map_condition_values(
):
"""Return values from the specified domain that match the map-style metric in the metrics dictionary."""
(
boolean_map_unexpected_values,
boolean_mapped_unexpected_values,
compute_domain_kwargs,
accessor_domain_kwargs,
) = metrics["unexpected_condition"]
Expand Down Expand Up @@ -553,15 +553,14 @@ def _pandas_column_map_condition_values(

domain_values = df[column_name]

domain_values = domain_values[boolean_mapped_unexpected_values == True]

result_format = metric_value_kwargs["result_format"]

if result_format["result_format"] == "COMPLETE":
return list(domain_values[boolean_map_unexpected_values == True])
return list(domain_values)
else:
return list(
domain_values[boolean_map_unexpected_values == True][
: result_format["partial_unexpected_count"]
]
)
return list(domain_values[: result_format["partial_unexpected_count"]])


def _pandas_column_map_series_and_domain_values(
Expand All @@ -574,7 +573,7 @@ def _pandas_column_map_series_and_domain_values(
):
"""Return values from the specified domain that match the map-style metric in the metrics dictionary."""
(
boolean_map_unexpected_values,
boolean_mapped_unexpected_values,
compute_domain_kwargs,
accessor_domain_kwargs,
) = metrics["unexpected_condition"]
Expand Down Expand Up @@ -620,24 +619,20 @@ def _pandas_column_map_series_and_domain_values(

domain_values = df[column_name]

domain_values = domain_values[boolean_mapped_unexpected_values == True]
map_series = map_series[boolean_mapped_unexpected_values == True]

result_format = metric_value_kwargs["result_format"]

if result_format["result_format"] == "COMPLETE":
return (
list(domain_values[boolean_map_unexpected_values == True]),
list(map_series[boolean_map_unexpected_values == True]),
list(domain_values),
list(map_series),
)
else:
return (
list(
domain_values[boolean_map_unexpected_values == True][
: result_format["partial_unexpected_count"]
]
),
list(
map_series[boolean_map_unexpected_values == True][
: result_format["partial_unexpected_count"]
]
),
list(domain_values[: result_format["partial_unexpected_count"]]),
list(map_series[: result_format["partial_unexpected_count"]]),
)


Expand Down Expand Up @@ -682,14 +677,12 @@ def _pandas_map_condition_index(

result_format = metric_value_kwargs["result_format"]

df = df[boolean_mapped_unexpected_values]

if result_format["result_format"] == "COMPLETE":
return list(df[boolean_mapped_unexpected_values].index)
return list(df.index)

return list(
df[boolean_mapped_unexpected_values].index[
: result_format["partial_unexpected_count"]
]
)
return list(df.index[: result_format["partial_unexpected_count"]])


def _pandas_column_map_condition_value_counts(
Expand Down Expand Up @@ -803,12 +796,12 @@ def _pandas_map_condition_rows(

result_format = metric_value_kwargs["result_format"]

df = df[boolean_mapped_unexpected_values]

if result_format["result_format"] == "COMPLETE":
return df[boolean_mapped_unexpected_values]
return df

return df[boolean_mapped_unexpected_values][
result_format["partial_unexpected_count"]
]
return df.iloc[: result_format["partial_unexpected_count"]]


def _sqlalchemy_map_condition_unexpected_count_aggregate_fn(
Expand Down Expand Up @@ -1298,7 +1291,6 @@ def _register_metric_functions(cls):
metric_provider=_pandas_column_map_condition_value_counts,
metric_fn_type=MetricFunctionTypes.VALUE,
)

elif issubclass(engine, SqlAlchemyExecutionEngine):
register_metric(
metric_name=metric_name + ".condition",
Expand Down Expand Up @@ -1437,7 +1429,6 @@ def _register_metric_functions(cls):
metric_provider=_spark_column_map_condition_value_counts,
metric_fn_type=MetricFunctionTypes.VALUE,
)

elif metric_fn_type in [
MetricPartialFunctionTypes.MAP_SERIES,
MetricPartialFunctionTypes.MAP_FN,
Expand Down
19 changes: 19 additions & 0 deletions tests/expectations/metrics/test_core.py
Expand Up @@ -563,6 +563,25 @@ def test_map_unique_pd_column_exists():
assert list(metrics[condition_metric.id][0]) == [False, False, True, True, False]
assert metrics[unexpected_count_metric.id] == 2

unexpected_rows_metric = MetricConfiguration(
metric_name="column_values.unique.unexpected_rows",
metric_domain_kwargs={"column": "a"},
metric_value_kwargs={
"result_format": {"result_format": "SUMMARY", "partial_unexpected_count": 1}
},
metric_dependencies={
"unexpected_condition": condition_metric,
"table.columns": table_columns_metric,
},
)
results = engine.resolve_metrics(
metrics_to_resolve=(unexpected_rows_metric,), metrics=metrics
)
metrics.update(results)

assert metrics[unexpected_rows_metric.id]["a"].index == [2]
assert metrics[unexpected_rows_metric.id]["a"].values == [3]


def test_map_unique_pd_column_does_not_exist():
engine = build_pandas_engine(pd.DataFrame({"a": [1, 2, 3, 3, None]}))
Expand Down

0 comments on commit 0a830b4

Please sign in to comment.