great-expectations · NathanFarmer · Sep 21, 2021 · Sep 8, 2021 · Sep 8, 2021 · Sep 16, 2021
diff --git a/great_expectations/expectations/metrics/map_metric_provider.py b/great_expectations/expectations/metrics/map_metric_provider.py
@@ -2477,7 +2477,7 @@ def _spark_column_pair_map_condition_values(
 ):
     """Return values from the specified domain that match the map-style metric in the metrics dictionary."""
     (
-        boolean_mapped_unexpected_values,
+        unexpected_condition,
         compute_domain_kwargs,
         accessor_domain_kwargs,
     ) = metrics["unexpected_condition"]
@@ -2503,7 +2503,9 @@ def _spark_column_pair_map_condition_values(
                 message=f'Error: The column "{column_name}" in BatchData does not exist.'
             )
 
-    filtered = df.filter(boolean_mapped_unexpected_values)
+    # withColumn is required to turn window functions returned by some metrics into a boolean mask
+    data = df.withColumn("__unexpected", unexpected_condition)
+    filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))
 
     result_format = metric_value_kwargs["result_format"]
     if result_format["result_format"] == "COMPLETE":

diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py
@@ -1766,6 +1766,30 @@ def generate_expectation_tests(
     return parametrized_tests
 
 
+def sort_unexpected_values(test_value_list, result_value_list):
+    """Sort expected and observed unexpected-value lists with one shared key.
+
+    Arbitrary result ordering must not fail a test. If the expected value is
+    not a non-empty list, or its first element is unorderable, return as-is.
+    """
+    # `and` short-circuits, so len() is never called on a non-list such as None
+    if not (isinstance(test_value_list, list) and test_value_list):
+        return test_value_list, result_value_list
+    first = test_value_list[0]
+    if isinstance(first, dict):
+        # dicts have no __lt__; sort on each key's value, key order fixed once
+        keys = tuple(first)
+        def sort_key(row):
+            return tuple(row[k] for k in keys)
+    elif first.__lt__(first) is not NotImplemented:
+        # built-in types implementing __lt__ can always be ordered via str()
+        sort_key = str
+    else:
+        return test_value_list, result_value_list
+    expected = sorted(test_value_list, key=sort_key)
+    return expected, sorted(result_value_list, key=sort_key)
+
+
 def evaluate_json_test(data_asset, expectation_type, test):
     """
     This method will evaluate the result of a test build using the Great Expectations json test format.
@@ -1879,7 +1903,32 @@ def evaluate_json_test_cfe(validator, expectation_type, test):
 
 def check_json_test_result(test, result, data_asset=None):
     # Check results
-    if test["exact_match_out"] is True:
+    # For Spark we cannot guarantee the order in which values are returned, so we sort for testing purposes
+    if (test["exact_match_out"] is True) and isinstance(
+        data_asset, (SparkDFDataset, SparkDFBatchData)
+    ):
+        if ("unexpected_list" in result["result"]) and (
+            "unexpected_list" in test["out"]["result"]
+        ):
+            (
+                test["out"]["result"]["unexpected_list"],
+                result["result"]["unexpected_list"],
+            ) = sort_unexpected_values(
+                test["out"]["result"]["unexpected_list"],
+                result["result"]["unexpected_list"],
+            )
+        if ("partial_unexpected_list" in result["result"]) and (
+            "partial_unexpected_list" in test["out"]["result"]
+        ):
+            (
+                test["out"]["result"]["partial_unexpected_list"],
+                result["result"]["partial_unexpected_list"],
+            ) = sort_unexpected_values(
+                test["out"]["result"]["partial_unexpected_list"],
+                result["result"]["partial_unexpected_list"],
+            )
+        assert result == expectationValidationResultSchema.load(test["out"])
+    elif test["exact_match_out"] is True:
         assert result == expectationValidationResultSchema.load(test["out"])
     else:
         # Convert result to json since our tests are reading from json so cannot easily contain richer types (e.g. NaN)
@@ -1928,25 +1977,9 @@ def check_json_test_result(test, result, data_asset=None):
                     assert result["result"]["unexpected_index_list"] == value
 
             elif key == "unexpected_list":
-                # check if value can be sorted; if so, sort so arbitrary ordering of results does not cause failure
-                if (isinstance(value, list)) & (len(value) >= 1):
-                    # __lt__ is not implemented for python dictionaries making sorting trickier
-                    # in our case, we will sort on the values for each key sequentially
-                    if isinstance(value[0], dict):
-                        value = sorted(
-                            value,
-                            key=lambda x: tuple(x[k] for k in list(value[0].keys())),
-                        )
-                        result["result"]["unexpected_list"] = sorted(
-                            result["result"]["unexpected_list"],
-                            key=lambda x: tuple(x[k] for k in list(value[0].keys())),
-                        )
-                    # if python built-in class has __lt__ then sorting can always work this way
-                    elif type(value[0].__lt__(value[0])) != type(NotImplemented):
-                        value = sorted(value, key=lambda x: str(x))
-                        result["result"]["unexpected_list"] = sorted(
-                            result["result"]["unexpected_list"], key=lambda x: str(x)
-                        )
+                value, result["result"]["unexpected_list"] = sort_unexpected_values(
+                    value, result["result"]["unexpected_list"]
+                )
 
                 assert result["result"]["unexpected_list"] == value, (
                     "expected "
@@ -1955,6 +1988,21 @@ def check_json_test_result(test, result, data_asset=None):
                     + str(result["result"]["unexpected_list"])
                 )
 
+            elif key == "partial_unexpected_list":
+                (
+                    value,
+                    result["result"]["partial_unexpected_list"],
+                ) = sort_unexpected_values(
+                    value, result["result"]["partial_unexpected_list"]
+                )
+
+                assert result["result"]["partial_unexpected_list"] == value, (
+                    "expected "
+                    + str(value)
+                    + " but got "
+                    + str(result["result"]["partial_unexpected_list"])
+                )
+
             elif key == "details":
                 assert result["result"]["details"] == value
 

diff --git a/...s/test_definitions/multicolumn_map_expectations/expect_compound_columns_to_be_unique.json b/...s/test_definitions/multicolumn_map_expectations/expect_compound_columns_to_be_unique.json
@@ -213,7 +213,7 @@
     },{
       "title": "unexpected_values_exact_match_out_without_unexpected_index_list",
       "exact_match_out" : true,
-      "only_for": ["sqlalchemy"],
+      "suppress_test_for": ["pandas"],
       "in": {
         "column_list": ["a", "b"]
       },