[MAINTENANCE] Performance improvement refactor for Spark unexpected values #3368

Merged
Changes shown below are from 10 of the 32 commits.

Commits
ec72b07
[MAINTENANCE] Performance improvement refactor for helper _spark_colu…
Sep 8, 2021
5ad62bc
Merge branch 'develop' into working-branch/DEVREL-154/improve_SparkDF…
NathanFarmer Sep 8, 2021
778051e
Merge branch 'develop' into working-branch/DEVREL-154/improve_SparkDF…
NathanFarmer Sep 16, 2021
4473092
[MAINTENANCE] Performance improvement refactor for helper _spark_colu…
Sep 16, 2021
a784897
Merge branch 'working-branch/DEVREL-154/improve_SparkDFExecutionEngin…
Sep 16, 2021
66b5ae3
Change log
Sep 16, 2021
21eea41
[MAINTENANCE] This test no longer applies to spark because we stopped…
Sep 17, 2021
a39aa19
[MAINTENANCE] Remove all sorting logic from spark provider helpers (#…
Sep 17, 2021
abf47b8
[MAINTENANCE] Sort dictionaries in tests for comparisons (#3368).
Sep 17, 2021
a276eab
Merge branch 'develop' into working-branch/DEVREL-154/improve_SparkDF…
NathanFarmer Sep 17, 2021
fea4e71
Linting
Sep 17, 2021
bd8fc07
Merge branch 'working-branch/DEVREL-154/improve_SparkDFExecutionEngin…
Sep 17, 2021
8050ce9
Clean up
Sep 17, 2021
522a3f3
[MAINTENANCE] Incorrect source of __lt__ in comment (#3368).
Sep 17, 2021
fcb22b8
[MAINTENANCE] Clarify how sorting works for each data type (#3368).
Sep 17, 2021
dd56b24
[MAINTENANCE] Lambda instead of itemgetter for consistency/simplicity…
Sep 17, 2021
ce2105e
Linting
Sep 17, 2021
8b48961
Accidentally re-used variable name
Sep 17, 2021
2e2ff25
Linting
Sep 17, 2021
7a72aaa
[MAINTENANCE] Change final use of boolean_mapped_unexpected_values to…
Sep 21, 2021
19f3bd4
[MAINTENANCE] Helper function for sorting unexpected_values during te…
Sep 21, 2021
7adf3b9
[MAINTENANCE] When exact_match_out is True we still need to sort unex…
Sep 21, 2021
b9bb7cf
[MAINTENANCE] Moved sort logic into helper function (#3368).
Sep 21, 2021
099802a
Cleanup
Sep 21, 2021
1f9d8d3
[MAINTENANCE] Sort should also be applied to partial_unexpected_list …
Sep 21, 2021
93b5d25
[MAINTENANCE] Revert broken test back to its original state (#3368).
Sep 21, 2021
80074e9
Linting
Sep 21, 2021
e1ddfee
Merge branch 'develop' into working-branch/DEVREL-154/improve_SparkDF…
NathanFarmer Sep 21, 2021
341b769
[MAINTENANCE] Consolidate sorting to make it clear that we do it whet…
Sep 21, 2021
d338d03
Merge branch 'working-branch/DEVREL-154/improve_SparkDFExecutionEngin…
Sep 21, 2021
51f2005
Linting
Sep 21, 2021
753e44d
Merge branch 'develop' into working-branch/DEVREL-154/improve_SparkDF…
NathanFarmer Sep 21, 2021
2 changes: 1 addition & 1 deletion docs_rtd/changelog.rst
@@ -7,6 +7,7 @@ Changelog

develop
-----------------
* [MAINTENANCE] Spark performance improvement for metrics that return unexpected values (#3368)

0.13.34
-----------------
@@ -64,7 +65,6 @@ develop
* [MAINTENANCE] Tests for RuntimeDataConnector at Datasource-level (Spark and Pandas) (#3318)
* [MAINTENANCE] Various doc patches (#3326)
* [MAINTENANCE] clean up imports and method signatures (#3337)
>>>>>>> 9208de453238af6d673aa9184c865b8422165172

0.13.31
-----------------
61 changes: 15 additions & 46 deletions great_expectations/expectations/metrics/map_metric_provider.py
@@ -2338,6 +2338,7 @@ def _spark_map_condition_unexpected_count_value(
df = execution_engine.get_domain_records(
domain_kwargs=domain_kwargs,
)
# withColumn is required to transform window functions returned by some metrics to boolean mask
data = df.withColumn("__unexpected", unexpected_condition)
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))
return filtered.count()
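
For illustration, a minimal standalone sketch of the withColumn/filter/drop pattern used above; the SparkSession, column name, and sample values are assumptions for the example, not taken from this PR:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame([(1,), (2,), (5,), (7,)], ["a"])

    # Stand-in for the condition a metric would return.
    unexpected_condition = F.col("a") > 4

    # Materialize the condition as a boolean column, filter on it, then drop it.
    data = df.withColumn("__unexpected", unexpected_condition)
    filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))
    print(filtered.count())  # 2
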
@@ -2373,17 +2374,9 @@ def _spark_column_map_condition_values(
message=f'Error: The column "{column_name}" in BatchData does not exist.'
)

data = (
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1))))
.withColumn("__unexpected", unexpected_condition)
.orderBy(F.col("__row_number"))
)

filtered = (
data.filter(F.col("__unexpected") == True)
.drop(F.col("__unexpected"))
.drop(F.col("__row_number"))
)
# withColumn is required to transform window functions returned by some metrics to boolean mask
data = df.withColumn("__unexpected", unexpected_condition)
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))

result_format = metric_value_kwargs["result_format"]
if result_format["result_format"] == "COMPLETE":
@@ -2411,7 +2404,10 @@ def _spark_column_map_condition_value_counts(
df = execution_engine.get_domain_records(
domain_kwargs=compute_domain_kwargs,
)

# withColumn is required to transform window functions returned by some metrics to boolean mask
data = df.withColumn("__unexpected", unexpected_condition)
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))

if "column" not in accessor_domain_kwargs:
raise ValueError(
@@ -2429,7 +2425,6 @@ def _spark_column_map_condition_value_counts(

result_format = metric_value_kwargs["result_format"]

filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))
Contributor:

@NathanFarmer I believe that we can leave the filtered = statement where it was (because an exception could be raised earlier, so there is no need to have it before). Would you agree or not? Thanks.

Contributor Author:

@alexsherstinsky Sure, we can leave this line where it is, but then I would advocate to move lines 2408-2409 down below as well for consistency.

Contributor:

@NathanFarmer The way I am seeing it now seems good. Thank you.

value_counts = filtered.groupBy(F.col(column_name)).count()
if result_format["result_format"] == "COMPLETE":
rows = value_counts.collect()
@@ -2458,17 +2453,9 @@ def _spark_map_condition_rows(
domain_kwargs=domain_kwargs,
)

data = (
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1))))
.withColumn("__unexpected", unexpected_condition)
.orderBy(F.col("__row_number"))
)

filtered = (
data.filter(F.col("__unexpected") == True)
.drop(F.col("__unexpected"))
.drop(F.col("__row_number"))
)
# withColumn is required to transform window functions returned by some metrics to boolean mask
data = df.withColumn("__unexpected", unexpected_condition)
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))

result_format = metric_value_kwargs["result_format"]

@@ -2514,17 +2501,7 @@ def _spark_column_pair_map_condition_values(
message=f'Error: The column "{column_name}" in BatchData does not exist.'
)

data = (
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1))))
.withColumn("__unexpected", boolean_mapped_unexpected_values)
.orderBy(F.col("__row_number"))
)

filtered = (
data.filter(F.col("__unexpected") == True)
.drop(F.col("__unexpected"))
.drop(F.col("__row_number"))
)
filtered = df.filter(boolean_mapped_unexpected_values)
Contributor:

@NathanFarmer Can we please preserve the pattern for having data = df.withColumn("__unexpected", boolean_mapped_unexpected_values) first -- and then followed by the filter on F.col("__unexpected") -- for consistency and readability purposes (plus for potential extensibility needs). Thank you!

Contributor Author:

See my other comments about that pattern being measurably slower. We can discuss the tradeoffs in our face-to-face testing discussion.
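
For reference, a hedged sketch of the two variants being weighed here, assuming a condition that is already a plain boolean Column; the DataFrame, column name, and values are illustrative, not from this PR:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame([(1,), (None,), (3,)], ["a"])

    # Illustrative stand-in for the metric's boolean condition.
    boolean_mapped_unexpected_values = F.col("a").isNull()

    # One-line variant: pass the boolean Column straight to filter (faster per the
    # measurements cited in this thread, but only valid for plain boolean conditions).
    fast = df.filter(boolean_mapped_unexpected_values)

    # Two-line variant: materialize a helper column first (works for all conditions,
    # including window expressions, at the cost of the extra column).
    general = (
        df.withColumn("__unexpected", boolean_mapped_unexpected_values)
        .filter(F.col("__unexpected") == True)
        .drop(F.col("__unexpected"))
    )

    assert fast.count() == general.count() == 1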


result_format = metric_value_kwargs["result_format"]
if result_format["result_format"] == "COMPLETE":
@@ -2585,7 +2562,7 @@ def _spark_multicolumn_map_condition_values(
):
"""Return values from the specified domain that match the map-style metric in the metrics dictionary."""
(
boolean_mapped_unexpected_values,
unexpected_condition,
Contributor:

@NathanFarmer Could we please keep this variable named boolean_mapped_unexpected_values, as it was before? (I was following the previous style in such methods across all execution engines and would like to keep it consistent, unless you have a strong reason for changing it now.) Thanks!

Contributor Author:

The reason for changing this was that metrics are not returning booleans in all cases. Cases that return a window function would fail for the 1-line solution filtered = df.filter(boolean_mapped_unexpected_values). The 2-line solution using withColumn creates a new boolean mapped column from this variable. You can see how I left it alone on line 2504 because it actually is a boolean mapping for all cases there.

Contributor:

@NathanFarmer I cannot find all the line numbers involved (the UI here is confusing, but I follow your logic). Thank you! P.S.: Should we standardize all cases to use unexpected_condition? Or do we want to preserve a special spot and variable name for the situations where it will always be strictly-boolean mapped? Thoughts welcome. Thank you.

Contributor Author:

@alexsherstinsky If we were to standardize all cases to use unexpected_condition that would future-proof metrics that return a window function. The tradeoff is that using boolean_mapped_unexpected_values with df.filter (1-line solution) is measurably faster where it is possible. I would propose that we leave boolean_mapped_unexpected_values as-is in places where it is not currently needed, and if a new metric requires it to be changed, it can be done at that time.

Contributor:

@NathanFarmer Perhaps we need to clean up our code? Can you please help us out here -- I see in Pandas, SQL, and Spark the pattern that in some cases defines unexpected_condition and in other cases defines boolean_mapped_unexpected_values as variable names, but ultimately these variables are used the same way: in a WHERE type clause. So are we simply misnaming this variable in one case for spark, because it is truly an (unexpected) condition and not merely boolean-mapped (unexpected) values? If this is correct, then please go ahead and fix as you deem appropriate. Thanks so much!

Contributor Author:

@alexsherstinsky It is correct that the metric always returns an unexpected_condition for Spark. So for semantic correctness, I have changed the one remaining use of Spark boolean_mapped_unexpected_values to unexpected_condition. In most Spark cases unexpected_condition can be passed directly to a WHERE clause, except in the case of a window function. I went ahead and changed:

    data = df.filter(unexpected_condition)

to:

    data = df.withColumn("__unexpected", unexpected_condition)
    filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))

for consistency in this single case.
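
A short sketch of the window-function case being described, assuming an illustrative grouping column and threshold (neither is from this PR); Spark does not allow a window expression directly in a WHERE/filter clause, which is why the withColumn step is needed:

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame([("x", 1), ("x", 9), ("y", 2)], ["grp", "a"])

    # A condition built from a window function is not a plain boolean column.
    unexpected_condition = F.sum("a").over(Window.partitionBy("grp")) > 5

    # df.filter(unexpected_condition) would raise an AnalysisException here, so the
    # condition is first evaluated into a real column, then filtered on and dropped.
    data = df.withColumn("__unexpected", unexpected_condition)
    filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))
    filtered.show()  # both rows from group "x"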

Contributor:

@NathanFarmer Thanks -- at least it is consistent, and we can revise later. We now need to solve the remaining problem: the equivalency of results, when the sort order is not enforced. If we can do this, then it is a big gain!

compute_domain_kwargs,
accessor_domain_kwargs,
) = metrics["unexpected_condition"]
@@ -2613,17 +2590,9 @@ def _spark_multicolumn_map_condition_values(
message=f'Error: The column "{column_name}" in BatchData does not exist.'
)

data = (
df.withColumn("__row_number", F.row_number().over(Window.orderBy(F.lit(1))))
.withColumn("__unexpected", boolean_mapped_unexpected_values)
.orderBy(F.col("__row_number"))
)

filtered = (
data.filter(F.col("__unexpected") == True)
.drop(F.col("__unexpected"))
.drop(F.col("__row_number"))
)
# withColumn is required to transform window functions returned by some metrics to boolean mask
data = df.withColumn("__unexpected", unexpected_condition)
filtered = data.filter(F.col("__unexpected") == True).drop(F.col("__unexpected"))

column_selector = [F.col(column_name) for column_name in column_list]

8 changes: 8 additions & 0 deletions great_expectations/self_check/util.py
@@ -10,6 +10,7 @@
from functools import wraps
from types import ModuleType
from typing import Dict, List, Optional, Union
from operator import itemgetter

import numpy as np
import pandas as pd
@@ -1930,6 +1931,13 @@ def check_json_test_result(test, result, data_asset=None):
elif key == "unexpected_list":
# check if value can be sorted; if so, sort so arbitrary ordering of results does not cause failure
if (isinstance(value, list)) & (len(value) >= 1):
# dictionary handling isn't implemented in great_expectations.core.data_context_key.__lt__
Contributor:

@NathanFarmer I ❤️ the idea here; I have two questions about it:

  1. Would it be better to follow up on your comment and update great_expectations.core.data_context_key.__lt__ or not?
  2. If we must do this "consistent sort" here, is there a strong justification for using itemgetter instead of just lambda? If at all possible, I would prefer the lambda (for simplicity).

Thank you! I love how this idea absolves us from the need for the computationally-prohibitive row_number()!
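
For context on the "computationally-prohibitive" remark: the pattern this PR removes assigns a global row number over an unpartitioned window. A minimal sketch of that removed pattern, with an illustrative DataFrame not taken from the PR:

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame([(i,) for i in range(10)], ["a"])

    # Window.orderBy(F.lit(1)) has no partitioning, so Spark logs a warning that all
    # data is moved to a single partition before row numbers can be assigned; this is
    # the source of the performance penalty on large DataFrames.
    numbered = df.withColumn(
        "__row_number", F.row_number().over(Window.orderBy(F.lit(1)))
    )
    numbered.show()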

Contributor Author:

I clarified the __lt__ comment (it is actually looking at the built-in Python class that is passed). I also changed itemgetter to lambda.

# but values still need to be sorted since spark metrics return unordered
if isinstance(value[0], dict):
value = sorted(value, key=itemgetter(*list(value[0].keys())))
result["result"]["unexpected_list"] = sorted(
result["result"]["unexpected_list"], key=itemgetter(*list(value[0].keys()))
)
if type(value[0].__lt__(value[0])) != type(NotImplemented):
value = sorted(value, key=lambda x: str(x))
result["result"]["unexpected_list"] = sorted(
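
A small standalone illustration of the comparison-sorting idea from this hunk, in plain Python; the dictionaries are made up, and the exact key function in the final merged code may differ from both variants shown:

    from operator import itemgetter

    unexpected_list = [{"a": 3, "b": "z"}, {"a": 1, "b": "y"}]
    keys = list(unexpected_list[0].keys())

    # itemgetter-based key, as shown in the diff at this commit ...
    by_itemgetter = sorted(unexpected_list, key=itemgetter(*keys))

    # ... and an equivalent lambda-based key, the style the author says he switched to.
    by_lambda = sorted(unexpected_list, key=lambda d: tuple(d[k] for k in keys))

    assert by_itemgetter == by_lambda  # same deterministic order either way
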
@@ -213,7 +213,7 @@
},{
"title": "unexpected_values_exact_match_out_without_unexpected_index_list",
"exact_match_out" : true,
"suppress_test_for": ["pandas"],
"only_for": ["sqlalchemy"],
Contributor Author:

"exact_match_out" : true will no longer work for spark since we are no longer sorting results

Contributor:

@NathanFarmer I feel that we must make it (or an equivalent, additional test) work for Spark. The existing test excludes Pandas, because unexpected_index_list only applies to Pandas; however, the functionality must work for the other execution engines. Since you introduced the "force-sorting" of both sides in the assertion in self_check/util.py, why would the existing test not work? To me, having this test is critical. Thank you!

Contributor Author:

In self_check/util.py#L1882, when test["exact_match_out"] is True, all of the test sort logic is skipped. I cannot think of an analog to this test that would work.

Contributor Author:

Note: There is a test above this one for Pandas. It is possible that, if we sort the data in the test harness, it will work for Spark, but I don't think even that is guaranteed.

Contributor:

@NathanFarmer I feel that we should discuss this in order to find a solution for testing the functionality for the Spark engine. The reasoning is that the addition of row_number() and sorting on it was key to making Spark work. However, I believe that you are pointing out that the sort order of the results is the only difference, the output being the same as expected otherwise. If this is the case, then we need to figure out how to show that in a test. If I am not understanding this correctly, then we might have to disable the expectation for Spark if it is qualitatively wrong without the use of row_number() (and the corresponding performance penalty). Thank you!

Contributor Author:

@alexsherstinsky I was able to make changes to self_check/util.py to address this and reverted this test back to its original state. Please let me know your thoughts on the code, but the short version is we are now sorting unexpected_values and partial_unexpected_values in both cases where test["exact_match_out"] is True or False on Spark dataframes.
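
A hypothetical sketch of a helper along the lines described here; the name _sort_unexpected_values and its exact shape are assumptions, not the merged great_expectations implementation. The idea is to sort both sides of the comparison so Spark's nondeterministic row order cannot fail the assertion, whether exact_match_out is True or False:

    def _sort_unexpected_values(expected_list, actual_list):
        # Hypothetical helper, not the merged implementation.
        if isinstance(expected_list, list) and len(expected_list) >= 1:
            if isinstance(expected_list[0], dict):
                keys = list(expected_list[0].keys())
                sort_key = lambda d: tuple(str(d[k]) for k in keys)
            else:
                sort_key = lambda x: str(x)
            return sorted(expected_list, key=sort_key), sorted(actual_list, key=sort_key)
        return expected_list, actual_list

Presumably such a helper would be applied to both unexpected_list and partial_unexpected_list before the comparison, per the commit messages above.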

"in": {
"column_list": ["a", "b"]
},