From 5c7b3a2715c33ff801162d24caa610b2c4b93a07 Mon Sep 17 00:00:00 2001
From: James Campbell
Date: Tue, 13 Aug 2019 09:16:46 -0400
Subject: [PATCH 1/2] Fix typo in CLI message

---
 great_expectations/cli/datasource.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/great_expectations/cli/datasource.py b/great_expectations/cli/datasource.py
index 5681eb3a3dbe..0c507aad783c 100644
--- a/great_expectations/cli/datasource.py
+++ b/great_expectations/cli/datasource.py
@@ -302,7 +302,7 @@ def profile_datasource(context, data_source_name, data_assets=None, profile_all_
     else:
         cli_message(
-            "Okay, skipping HTML documentation for now.`."
+            "Okay, skipping HTML documentation for now."
         )

From 27f2a7d3b20977ce4b55f2becb1aa5fb7f3c1ef2 Mon Sep 17 00:00:00 2001
From: James Campbell
Date: Tue, 13 Aug 2019 09:22:30 -0400
Subject: [PATCH 2/2] Improve performance for cardinality detection by only counting distinct - do not get all value counts unless requested

---
 great_expectations/dataset/sparkdf_dataset.py        | 12 ++++++++++--
 great_expectations/profile/basic_dataset_profiler.py |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/great_expectations/dataset/sparkdf_dataset.py b/great_expectations/dataset/sparkdf_dataset.py
index 4195d0f07877..b60df354cfa4 100644
--- a/great_expectations/dataset/sparkdf_dataset.py
+++ b/great_expectations/dataset/sparkdf_dataset.py
@@ -20,7 +20,15 @@ logger = logging.getLogger(__name__)

 try:
-    from pyspark.sql.functions import udf, col, lit, stddev_samp, length as length_, when, year, count
+    from pyspark.sql.functions import (
+        udf, col, lit,
+        stddev_samp,
+        length as length_,
+        when,
+        year,
+        count,
+        countDistinct
+    )
     import pyspark.sql.types as sparktypes
     from pyspark.ml.feature import Bucketizer
     from pyspark.sql import Window
@@ -268,7 +276,7 @@ def get_column_value_counts(self, column):
         return series

     def get_column_unique_count(self, column):
-        return self.get_column_value_counts(column).shape[0]
+        return self.spark_df.agg(countDistinct(column)).collect()[0][0]

     def get_column_modes(self, column):
         """leverages computation done in _get_column_value_counts"""

diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py
index c489f97c1a76..2e8cc01b91db 100644
--- a/great_expectations/profile/basic_dataset_profiler.py
+++ b/great_expectations/profile/basic_dataset_profiler.py
@@ -61,7 +61,7 @@ def _get_column_cardinality(cls, df, column):
                 'result']['observed_value']
             pct_unique = df.expect_column_proportion_of_unique_values_to_be_between(
                 column, None, None)['result']['observed_value']
-        except KeyError: # if observed_value value is not set
+        except KeyError:  # if observed_value value is not set
             logger.exception("Failed to get cardinality of column {0:s} - continuing...".format(column))

         if num_unique is None or num_unique == 0 or pct_unique is None: