From 5602398bd594087b000d96512ae52ea18b782d64 Mon Sep 17 00:00:00 2001 From: Aylr Date: Fri, 26 Jul 2019 16:01:56 -0600 Subject: [PATCH 1/3] * disable and enable evaluation to speed up profiling * print out columns so users see something happening rather than waiting --- .../profile/basic_dataset_profiler.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index d386963eeb57..9b33aef088b2 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -16,9 +16,18 @@ class BasicDatasetProfiler(DatasetProfiler): such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate. """ + @classmethod + def _enable_evaluation(cls, df): + df._config["interactive_evaluation"] = True + + @classmethod + def _disable_evaluation(cls, df): + df._config["interactive_evaluation"] = False + @classmethod def _get_column_type(cls, df, column): # list of types is used to support pandas and sqlalchemy + cls._enable_evaluation(df) try: if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.INT_TYPE_NAMES)))["success"]: type_ = "int" @@ -38,13 +47,14 @@ def _get_column_type(cls, df, column): except NotImplementedError: type_ = "unknown" + cls._disable_evaluation(df) return type_ @classmethod def _get_column_cardinality(cls, df, column): - num_unique = None pct_unique = None + cls._enable_evaluation(df) try: num_unique = df.expect_column_unique_value_count_to_be_between(column, None, None)[ @@ -84,20 +94,24 @@ def _get_column_cardinality(cls, df, column): cardinality = "many" # print('col: {0:s}, num_unique: {1:s}, pct_unique: {2:s}, card: {3:s}'.format(column, str(num_unique), str(pct_unique), cardinality)) + cls._disable_evaluation(df) + return cardinality @classmethod def _profile(cls, dataset): - - df = dataset df.set_default_expectation_argument("catch_exceptions", True) df.expect_table_row_count_to_be_between(min_value=0, max_value=None) df.expect_table_columns_to_match_ordered_list(None) + cls._disable_evaluation(df) - for column in df.get_table_columns(): + columns = df.get_table_columns() + number_of_columns = len(columns) + for i, column in enumerate(columns): + print(f" Preparing column {i} of {number_of_columns}: {column}") if column == 'sizes': print("sizes") @@ -179,4 +193,5 @@ def _profile(cls, dataset): # print(column, type_, cardinality) pass + cls._enable_evaluation(df) return df.get_expectation_suite(suppress_warnings=True, discard_failed_expectations=False) From ee99d25afc882c589bf768482333d5381887b644 Mon Sep 17 00:00:00 2001 From: Aylr Date: Tue, 30 Jul 2019 13:08:07 -0600 Subject: [PATCH 2/3] * added config getter/setter to DataAsset --- great_expectations/data_asset/data_asset.py | 6 ++++++ .../profile/basic_dataset_profiler.py | 20 ++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/great_expectations/data_asset/data_asset.py b/great_expectations/data_asset/data_asset.py index 789a845d4617..fcdd00f0b4f2 100644 --- a/great_expectations/data_asset/data_asset.py +++ b/great_expectations/data_asset/data_asset.py @@ -603,6 +603,12 @@ def remove_expectation(self, else: return expectation + def set_config_value(self, key, value): + self._config[key] = value + + def get_config_value(self, key): + return self._config[key] + def get_batch_kwargs(self): return self._batch_kwargs diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index 9b33aef088b2..78b1b0b98bbf 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -16,18 +16,10 @@ class BasicDatasetProfiler(DatasetProfiler): such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate. """ - @classmethod - def _enable_evaluation(cls, df): - df._config["interactive_evaluation"] = True - - @classmethod - def _disable_evaluation(cls, df): - df._config["interactive_evaluation"] = False - @classmethod def _get_column_type(cls, df, column): # list of types is used to support pandas and sqlalchemy - cls._enable_evaluation(df) + df.set_config_value("interactive_evaluation", True) try: if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.INT_TYPE_NAMES)))["success"]: type_ = "int" @@ -47,14 +39,14 @@ def _get_column_type(cls, df, column): except NotImplementedError: type_ = "unknown" - cls._disable_evaluation(df) + df.set_config_value('interactive_evaluation', False) return type_ @classmethod def _get_column_cardinality(cls, df, column): num_unique = None pct_unique = None - cls._enable_evaluation(df) + df.set_config_value("interactive_evaluation", True) try: num_unique = df.expect_column_unique_value_count_to_be_between(column, None, None)[ @@ -94,7 +86,7 @@ def _get_column_cardinality(cls, df, column): cardinality = "many" # print('col: {0:s}, num_unique: {1:s}, pct_unique: {2:s}, card: {3:s}'.format(column, str(num_unique), str(pct_unique), cardinality)) - cls._disable_evaluation(df) + df.set_config_value('interactive_evaluation', False) return cardinality @@ -106,7 +98,7 @@ def _profile(cls, dataset): df.expect_table_row_count_to_be_between(min_value=0, max_value=None) df.expect_table_columns_to_match_ordered_list(None) - cls._disable_evaluation(df) + df.set_config_value('interactive_evaluation', False) columns = df.get_table_columns() number_of_columns = len(columns) @@ -193,5 +185,5 @@ def _profile(cls, dataset): # print(column, type_, cardinality) pass - cls._enable_evaluation(df) + df.set_config_value("interactive_evaluation", True) return df.get_expectation_suite(suppress_warnings=True, discard_failed_expectations=False) From c91b1eb73685906a70fa1bf7ce19334d3cc0203e Mon Sep 17 00:00:00 2001 From: Aylr Date: Thu, 1 Aug 2019 14:40:13 -0600 Subject: [PATCH 3/3] * support legacy python string formatting * proper logger for output --- great_expectations/profile/basic_dataset_profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index 78b1b0b98bbf..0e8d6dd47c10 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -103,7 +103,7 @@ def _profile(cls, dataset): columns = df.get_table_columns() number_of_columns = len(columns) for i, column in enumerate(columns): - print(f" Preparing column {i} of {number_of_columns}: {column}") + logger.info(" Preparing column {} of {}: {}".format(i, number_of_columns, column)) if column == 'sizes': print("sizes")