New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MAINTENANCE] Performance improvement refactor for Spark unexpected values #3368
Changes from 28 commits
ec72b07
5ad62bc
778051e
4473092
a784897
66b5ae3
21eea41
a39aa19
abf47b8
a276eab
fea4e71
bd8fc07
8050ce9
522a3f3
fcb22b8
dd56b24
ce2105e
8b48961
2e2ff25
7a72aaa
19f3bd4
7adf3b9
b9bb7cf
099802a
1f9d8d3
93b5d25
80074e9
e1ddfee
341b769
d338d03
51f2005
753e44d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2338,8 +2338,11 @@ def _spark_map_condition_unexpected_count_value( | |
df = execution_engine.get_domain_records( | ||
domain_kwargs=domain_kwargs, | ||
) | ||
|
||
# withColumn is required to transform window functions returned by some metrics to boolean mask | ||
data = df.withColumn("__unexpected", unexpected_condition) | ||
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected")) | ||
|
||
return filtered.count() | ||
|
||
|
||
|
@@ -2373,17 +2376,9 @@ def _spark_column_map_condition_values( | |
message=f'Error: The column "{column_name}" in BatchData does not exist.' | ||
) | ||
|
||
data = ( | ||
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1)))) | ||
.withColumn("__unexpected", unexpected_condition) | ||
.orderBy(F.col("__row_number")) | ||
) | ||
|
||
filtered = ( | ||
data.filter(F.col("__unexpected") == True) | ||
.drop(F.col("__unexpected")) | ||
.drop(F.col("__row_number")) | ||
) | ||
# withColumn is required to transform window functions returned by some metrics to boolean mask | ||
data = df.withColumn("__unexpected", unexpected_condition) | ||
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected")) | ||
|
||
result_format = metric_value_kwargs["result_format"] | ||
if result_format["result_format"] == "COMPLETE": | ||
|
@@ -2411,7 +2406,6 @@ def _spark_column_map_condition_value_counts( | |
df = execution_engine.get_domain_records( | ||
domain_kwargs=compute_domain_kwargs, | ||
) | ||
data = df.withColumn("__unexpected", unexpected_condition) | ||
|
||
if "column" not in accessor_domain_kwargs: | ||
raise ValueError( | ||
|
@@ -2427,9 +2421,12 @@ def _spark_column_map_condition_value_counts( | |
message=f'Error: The column "{column_name}" in BatchData does not exist.' | ||
) | ||
|
||
# withColumn is required to transform window functions returned by some metrics to boolean mask | ||
data = df.withColumn("__unexpected", unexpected_condition) | ||
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected")) | ||
|
||
result_format = metric_value_kwargs["result_format"] | ||
|
||
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected")) | ||
value_counts = filtered.groupBy(F.col(column_name)).count() | ||
if result_format["result_format"] == "COMPLETE": | ||
rows = value_counts.collect() | ||
|
@@ -2458,17 +2455,9 @@ def _spark_map_condition_rows( | |
domain_kwargs=domain_kwargs, | ||
) | ||
|
||
data = ( | ||
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1)))) | ||
.withColumn("__unexpected", unexpected_condition) | ||
.orderBy(F.col("__row_number")) | ||
) | ||
|
||
filtered = ( | ||
data.filter(F.col("__unexpected") == True) | ||
.drop(F.col("__unexpected")) | ||
.drop(F.col("__row_number")) | ||
) | ||
# withColumn is required to transform window functions returned by some metrics to boolean mask | ||
data = df.withColumn("__unexpected", unexpected_condition) | ||
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected")) | ||
|
||
result_format = metric_value_kwargs["result_format"] | ||
|
||
|
@@ -2488,7 +2477,7 @@ def _spark_column_pair_map_condition_values( | |
): | ||
"""Return values from the specified domain that match the map-style metric in the metrics dictionary.""" | ||
( | ||
boolean_mapped_unexpected_values, | ||
unexpected_condition, | ||
compute_domain_kwargs, | ||
accessor_domain_kwargs, | ||
) = metrics["unexpected_condition"] | ||
|
@@ -2514,17 +2503,9 @@ def _spark_column_pair_map_condition_values( | |
message=f'Error: The column "{column_name}" in BatchData does not exist.' | ||
) | ||
|
||
data = ( | ||
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1)))) | ||
.withColumn("__unexpected", boolean_mapped_unexpected_values) | ||
.orderBy(F.col("__row_number")) | ||
) | ||
|
||
filtered = ( | ||
data.filter(F.col("__unexpected") == True) | ||
.drop(F.col("__unexpected")) | ||
.drop(F.col("__row_number")) | ||
) | ||
# withColumn is required to transform window functions returned by some metrics to boolean mask | ||
data = df.withColumn("__unexpected", unexpected_condition) | ||
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected")) | ||
|
||
result_format = metric_value_kwargs["result_format"] | ||
if result_format["result_format"] == "COMPLETE": | ||
|
@@ -2585,7 +2566,7 @@ def _spark_multicolumn_map_condition_values( | |
): | ||
"""Return values from the specified domain that match the map-style metric in the metrics dictionary.""" | ||
( | ||
boolean_mapped_unexpected_values, | ||
unexpected_condition, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @NathanFarmer Could we place keep this variable named There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason for changing this was that metrics are not returning booleans in all cases. Cases that return a window function would fail for the 1-line solution There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @NathanFarmer I cannot find all the line numbers involved (the UI here is confusing, but I follow your logic). Thank you! P.S.: Should we standardize all cases to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alexsherstinsky If we were to standardize all cases to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @NathanFarmer Perhaps we need to clean up our code? Can you please help us out here -- I see in Pandas, SQL, and Spark the pattern that in some cases defines There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alexsherstinsky It is correct that the metric always returns an
to:
for consistency in this single case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @NathanFarmer Thanks -- at least it is consistent, and we can revise later. We now need to solve the remaining problem: the equivalency of results, when the sort order is not enforced. If we can do this, then it is a big gain! |
||
compute_domain_kwargs, | ||
accessor_domain_kwargs, | ||
) = metrics["unexpected_condition"] | ||
|
@@ -2613,17 +2594,9 @@ def _spark_multicolumn_map_condition_values( | |
message=f'Error: The column "{column_name}" in BatchData does not exist.' | ||
) | ||
|
||
data = ( | ||
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1)))) | ||
.withColumn("__unexpected", boolean_mapped_unexpected_values) | ||
.orderBy(F.col("__row_number")) | ||
) | ||
|
||
filtered = ( | ||
data.filter(F.col("__unexpected") == True) | ||
.drop(F.col("__unexpected")) | ||
.drop(F.col("__row_number")) | ||
) | ||
# withColumn is required to transform window functions returned by some metrics to boolean mask | ||
data = df.withColumn("__unexpected", unexpected_condition) | ||
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected")) | ||
|
||
column_selector = [F.col(column_name) for column_name in column_list] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1766,6 +1766,30 @@ def generate_expectation_tests( | |
return parametrized_tests | ||
|
||
|
||
def sort_unexpected_values(test_value_list, result_value_list): | ||
# check if value can be sorted; if so, sort so arbitrary ordering of results does not cause failure | ||
if (isinstance(test_value_list, list)) & (len(test_value_list) >= 1): | ||
# __lt__ is not implemented for python dictionaries making sorting trickier | ||
# in our case, we will sort on the values for each key sequentially | ||
if isinstance(test_value_list[0], dict): | ||
test_value_list = sorted( | ||
test_value_list, | ||
key=lambda x: tuple(x[k] for k in list(test_value_list[0].keys())), | ||
) | ||
result_value_list = sorted( | ||
result_value_list, | ||
key=lambda x: tuple(x[k] for k in list(test_value_list[0].keys())), | ||
) | ||
# if python built-in class has __lt__ then sorting can always work this way | ||
elif type(test_value_list[0].__lt__(test_value_list[0])) != type( | ||
NotImplemented | ||
): | ||
test_value_list = sorted(test_value_list, key=lambda x: str(x)) | ||
result_value_list = sorted(result_value_list, key=lambda x: str(x)) | ||
|
||
return test_value_list, result_value_list | ||
|
||
|
||
def evaluate_json_test(data_asset, expectation_type, test): | ||
""" | ||
This method will evaluate the result of a test build using the Great Expectations json test format. | ||
|
@@ -1879,7 +1903,32 @@ def evaluate_json_test_cfe(validator, expectation_type, test): | |
|
||
def check_json_test_result(test, result, data_asset=None): | ||
# Check results | ||
if test["exact_match_out"] is True: | ||
# For Spark we cannot guarantee the order in which values are returned, so we sort for testing purposes | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @NathanFarmer Naive thought: Why not sort for all backends? I do not see detriment in pre-sorting these unexpected values in a well-defined order for all backends indiscriminately. What do you think? Thanks! |
||
if (test["exact_match_out"] is True) and isinstance( | ||
data_asset, (SparkDFDataset, SparkDFBatchData) | ||
): | ||
if ("unexpected_list" in result["result"]) and ( | ||
"unexpected_list" in test["out"]["result"] | ||
): | ||
( | ||
test["out"]["result"]["unexpected_list"], | ||
result["result"]["unexpected_list"], | ||
) = sort_unexpected_values( | ||
test["out"]["result"]["unexpected_list"], | ||
result["result"]["unexpected_list"], | ||
) | ||
if ("partial_unexpected_list" in result["result"]) and ( | ||
"partial_unexpected_list" in test["out"]["result"] | ||
): | ||
( | ||
test["out"]["result"]["partial_unexpected_list"], | ||
result["result"]["partial_unexpected_list"], | ||
) = sort_unexpected_values( | ||
test["out"]["result"]["partial_unexpected_list"], | ||
result["result"]["partial_unexpected_list"], | ||
) | ||
assert result == expectationValidationResultSchema.load(test["out"]) | ||
elif test["exact_match_out"] is True: | ||
assert result == expectationValidationResultSchema.load(test["out"]) | ||
else: | ||
# Convert result to json since our tests are reading from json so cannot easily contain richer types (e.g. NaN) | ||
|
@@ -1928,13 +1977,9 @@ def check_json_test_result(test, result, data_asset=None): | |
assert result["result"]["unexpected_index_list"] == value | ||
|
||
elif key == "unexpected_list": | ||
# check if value can be sorted; if so, sort so arbitrary ordering of results does not cause failure | ||
if (isinstance(value, list)) & (len(value) >= 1): | ||
if type(value[0].__lt__(value[0])) != type(NotImplemented): | ||
value = sorted(value, key=lambda x: str(x)) | ||
result["result"]["unexpected_list"] = sorted( | ||
result["result"]["unexpected_list"], key=lambda x: str(x) | ||
) | ||
value, result["result"]["unexpected_list"] = sort_unexpected_values( | ||
value, result["result"]["unexpected_list"] | ||
) | ||
|
||
assert result["result"]["unexpected_list"] == value, ( | ||
"expected " | ||
|
@@ -1943,6 +1988,21 @@ def check_json_test_result(test, result, data_asset=None): | |
+ str(result["result"]["unexpected_list"]) | ||
) | ||
|
||
elif key == "partial_unexpected_list": | ||
( | ||
value, | ||
result["result"]["partial_unexpected_list"], | ||
) = sort_unexpected_values( | ||
value, result["result"]["partial_unexpected_list"] | ||
) | ||
|
||
assert result["result"]["partial_unexpected_list"] == value, ( | ||
"expected " | ||
+ str(value) | ||
+ " but got " | ||
+ str(result["result"]["partial_unexpected_list"]) | ||
) | ||
|
||
elif key == "details": | ||
assert result["result"]["details"] == value | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@NathanFarmer I believe that we can leave the
filtered =
statement where it was (because there could be an exception raised earlier, so there is no need to have it before). Would you agree or not? Thanks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alexsherstinsky Sure, we can leave this line where it is, but then I would advocate to move lines 2408-2409 down below as well for consistency.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@NathanFarmer The way I am seeing it now seems good. Thank you.