Commit

Merge pull request #1063 from great-expectations/aylr/cli-profile-logging

cli profile logging
Aylr committed Feb 12, 2020
2 parents e7f5273 + 4fc21cc commit f504158
Showing 11 changed files with 828 additions and 770 deletions.
2 changes: 1 addition & 1 deletion great_expectations/cli/cli_logging.py
@@ -10,7 +10,7 @@
"great_expectations.dataset.sqlalchemy_dataset"
).setLevel(logging.CRITICAL)
logging.getLogger(
"great_expectations.profile.basic_dataset_profiler"
"great_expectations.profile.sample_expectations_dataset_profiler"
).setLevel(logging.CRITICAL)

# Take over the entire GE module logging namespace when running CLI
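For reference, the suppression above leans on Python's hierarchical logger names; a minimal standalone sketch (module names as in this diff, the WARNING level is illustrative):

import logging

# Silence one noisy module's logger; all other loggers are unaffected.
logging.getLogger(
    "great_expectations.profile.sample_expectations_dataset_profiler"
).setLevel(logging.CRITICAL)

# Logger names form a dotted hierarchy, so configuring the parent
# "great_expectations" logger is how the CLI takes over the whole GE
# namespace at once.
logging.getLogger("great_expectations").setLevel(logging.WARNING)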
15 changes: 10 additions & 5 deletions great_expectations/cli/datasource.py
@@ -16,8 +16,7 @@
_offer_to_install_new_template,
cli_message,
)
-from great_expectations.data_context.types.resource_identifiers import ValidationResultIdentifier, \
-ExpectationSuiteIdentifier
+from great_expectations.data_context.types.resource_identifiers import ValidationResultIdentifier
from great_expectations.datasource import (
PandasDatasource,
SparkDFDatasource,
@@ -27,9 +26,8 @@
ManualBatchKwargsGenerator,
)
from great_expectations.exceptions import DatasourceInitializationError
-from great_expectations.profile.basic_dataset_profiler import (
-SampleExpectationsDatasetProfiler,
-)
+from great_expectations.profile.sample_expectations_dataset_profiler import \
+SampleExpectationsDatasetProfiler

from great_expectations.validator.validator import Validator
from great_expectations.core import ExpectationSuite
@@ -177,7 +175,9 @@ def datasource_profile(datasource, generator_name, data_assets, profile_all_data
return

if batch_kwargs is not None:
# TODO refactor out json load check in suite edit and add here
batch_kwargs = json.loads(batch_kwargs)
# TODO refactor batch load check in suite edit and add here

if datasource is None:
datasources = [_datasource["name"] for _datasource in context.list_datasources()]
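A minimal sketch of the JSON handling above: on the command line, batch kwargs arrive as a JSON string and must be decoded into a dict before use (the keys and values here are illustrative):

import json

raw_batch_kwargs = '{"path": "my_data.csv", "datasource": "my_datasource"}'
batch_kwargs = json.loads(raw_batch_kwargs)
# -> {'path': 'my_data.csv', 'datasource': 'my_datasource'}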
@@ -1107,6 +1107,11 @@ def profile_datasource(
open_docs=False,
):
""""Profile a named datasource using the specified context"""
# Note we are explicitly not using a logger in all CLI output to have
# more control over console UI.
logging.getLogger(
"great_expectations.profile.basic_dataset_profiler"
).setLevel(logging.INFO)
msg_intro = """
<cyan>========== Profiling ==========</cyan>
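Read together with the cli_logging.py change above, the logging behavior is a toggle: the profiler's logger sits at CRITICAL during ordinary CLI commands and is raised to INFO only for profiling, so progress messages reach the console. A sketch of that pattern (the try/finally restore is an illustration, not something this commit adds):

import logging

profiler_logger = logging.getLogger(
    "great_expectations.profile.basic_dataset_profiler"
)

previous_level = profiler_logger.level
profiler_logger.setLevel(logging.INFO)  # surface profiling progress
try:
    ...  # run the profiler here
finally:
    profiler_logger.setLevel(previous_level)  # back to the quiet CLI default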
295 changes: 1 addition & 294 deletions great_expectations/profile/basic_dataset_profiler.py
@@ -1,5 +1,5 @@
import logging
from dateutil.parser import parse

# Gross legacy python 2 hacks
try:
ModuleNotFoundError
@@ -12,8 +12,6 @@
OperationalError = RuntimeError

from .base import DatasetProfiler
-from ..dataset.util import build_categorical_partition_object
-import datetime

logger = logging.getLogger(__name__)

@@ -233,294 +231,3 @@ def _profile(cls, dataset):
expectation_suite.meta["columns"] = meta_columns

return expectation_suite


class SampleExpectationsDatasetProfiler(BasicDatasetProfilerBase):
"""The goal of SampleExpectationsDatasetProfiler is to generate an expectation suite that
contains one instance of every interesting expectation type.
This expectation suite is intended to serve as a demo of the expressive power of expectations
and provide a service similar to the one the expectations glossary documentation page
provides, but on users' own data.
Ranges of acceptable values in the expectations created by this profiler (e.g., min/max
of the median in expect_column_median_to_be_between) are created only to demonstrate
the functionality and should not be taken as the actual ranges outside which the data
should be considered incorrect.
"""

@classmethod
def _get_column_type_with_caching(cls, dataset, column_name, cache):
column_cache_entry = cache.get(column_name)
if not column_cache_entry:
column_cache_entry = {}
cache[column_name] = column_cache_entry
column_type = column_cache_entry.get("type")
if not column_type:
column_type = cls._get_column_type(dataset, column_name)
column_cache_entry["type"] = column_type
# remove the expectation
dataset.remove_expectation(expectation_type="expect_column_values_to_be_in_type_list")
dataset.set_config_value('interactive_evaluation', True)

return column_type


@classmethod
def _get_column_cardinality_with_caching(cls, dataset, column_name, cache):
column_cache_entry = cache.get(column_name)
if not column_cache_entry:
column_cache_entry = {}
cache[column_name] = column_cache_entry
column_cardinality = column_cache_entry.get("cardinality")
if not column_cardinality:
column_cardinality = cls._get_column_cardinality(dataset, column_name)
column_cache_entry["cardinality"] = column_cardinality
# remove the expectations
dataset.remove_expectation(expectation_type="expect_column_unique_value_count_to_be_between")
dataset.remove_expectation(expectation_type="expect_column_proportion_of_unique_values_to_be_between")
dataset.set_config_value('interactive_evaluation', True)

return column_cardinality

@classmethod
def _create_expectations_for_low_card_column(cls, dataset, column, column_cache):
cls._create_non_nullity_expectations(dataset, column)

value_set = \
dataset.expect_column_distinct_values_to_be_in_set(column, value_set=None, result_format="SUMMARY").result[
"observed_value"]
dataset.expect_column_distinct_values_to_be_in_set(column, value_set=value_set, result_format="SUMMARY")

if cls._get_column_cardinality_with_caching(dataset, column, column_cache) in ["two", "very few"]:
partition_object = build_categorical_partition_object(dataset, column)
dataset.expect_column_kl_divergence_to_be_less_than(column, partition_object=partition_object,
threshold=0.6, catch_exceptions=True)

@classmethod
def _create_non_nullity_expectations(cls, dataset, column):
not_null_result = dataset.expect_column_values_to_not_be_null(column)
if not not_null_result.success:
mostly_value = max(0.001, (100.0 - not_null_result.result["unexpected_percent"] - 10) / 100.0)
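# e.g. with 25% nulls: mostly = (100.0 - 25.0 - 10) / 100.0 = 0.65, i.e.
# require at least 65% of values to be non-null, leaving ten points of slack.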
dataset.expect_column_values_to_not_be_null(column, mostly=mostly_value)

@classmethod
def _create_expectations_for_numeric_column(cls, dataset, column):
cls._create_non_nullity_expectations(dataset, column)

value = \
dataset.expect_column_min_to_be_between(column, min_value=None, max_value=None, result_format="SUMMARY").result[
"observed_value"]
value = dataset.expect_column_min_to_be_between(column, min_value=value - 1, max_value=value + 1)

value = \
dataset.expect_column_max_to_be_between(column, min_value=None, max_value=None, result_format="SUMMARY").result[
"observed_value"]
value = dataset.expect_column_max_to_be_between(column, min_value=value - 1, max_value=value + 1)

value = dataset.expect_column_mean_to_be_between(column, min_value=None, max_value=None,
result_format="SUMMARY").result["observed_value"]
dataset.expect_column_mean_to_be_between(column, min_value=value - 1, max_value=value + 1)

value = dataset.expect_column_median_to_be_between(column, min_value=None, max_value=None,
result_format="SUMMARY").result["observed_value"]
dataset.expect_column_median_to_be_between(column, min_value=value - 1, max_value=value + 1)

result = dataset.expect_column_quantile_values_to_be_between(
column,
quantile_ranges={
"quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
"value_ranges": [
[None, None],
[None, None],
[None, None],
[None, None],
[None, None],
],
},
result_format="SUMMARY",
catch_exceptions=True
)
if result.exception_info:
# TODO quantiles are not implemented correctly on sqlite, and likely other sql dialects
logger.warning(result.exception_info["exception_traceback"])
else:
dataset.set_config_value('interactive_evaluation', False)
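# With interactive evaluation off, the expectation below is recorded in
# the suite without being re-evaluated against the data.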
dataset.expect_column_quantile_values_to_be_between(
column,
quantile_ranges={
"quantiles": result.result["observed_value"]["quantiles"],
"value_ranges": [
[v - 1, v + 1] for v in
result.result["observed_value"]["values"]
],
},
catch_exceptions=True
)
dataset.set_config_value('interactive_evaluation', True)

@classmethod
def _create_expectations_for_string_column(cls, dataset, column):
cls._create_non_nullity_expectations(dataset, column)
dataset.expect_column_value_lengths_to_be_between(column, min_value=1)


@classmethod
def _find_next_low_card_column(cls, dataset, columns, profiled_columns, column_cache):
for column in columns:
if column in profiled_columns["low_card"]:
continue
cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
if cardinality in ["two", "very few", "few"]:
return column

return None


@classmethod
def _find_next_numeric_column(cls, dataset, columns, profiled_columns, column_cache):
for column in columns:
if column in profiled_columns["numeric"]:
continue
if column.lower().strip() == "id" or column.lower().strip().find("_id") > -1:
continue

cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
type = cls._get_column_type_with_caching(dataset, column, column_cache)

if cardinality in ["many", "very many", "unique"] and type in ["int", "float"]:
return column

return None

@classmethod
def _find_next_string_column(cls, dataset, columns, profiled_columns, column_cache):
for column in columns:
if column in profiled_columns["string"]:
continue

cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
type = cls._get_column_type_with_caching(dataset, column, column_cache)

if cardinality in ["many", "very many", "unique"] and type in ["string", "unknown"]:
return column

return None

@classmethod
def _find_next_datetime_column(cls, dataset, columns, profiled_columns, column_cache):
for column in columns:
if column in profiled_columns["datetime"]:
continue

cardinality = cls._get_column_cardinality_with_caching(dataset, column, column_cache)
type = cls._get_column_type_with_caching(dataset, column, column_cache)

if cardinality in ["many", "very many", "unique"] and type in ["datetime"]:
return column

return None

@classmethod
def _create_expectations_for_datetime_column(cls, dataset, column):
cls._create_non_nullity_expectations(dataset, column)

min_value = \
dataset.expect_column_min_to_be_between(column, min_value=None, max_value=None, result_format="SUMMARY").result[
"observed_value"]

if min_value is not None:
dataset.remove_expectation(expectation_type="expect_column_min_to_be_between", column=column)
try:
min_value = min_value + datetime.timedelta(days=-365)
except OverflowError as o_err:
min_value = datetime.datetime.min
except TypeError as o_err:
min_value = parse(min_value) + datetime.timedelta(days=-365)


max_value = \
dataset.expect_column_max_to_be_between(column, min_value=None, max_value=None, result_format="SUMMARY").result[
"observed_value"]
if max_value is not None:
dataset.remove_expectation(expectation_type="expect_column_max_to_be_between", column=column)
try:
max_value = max_value + datetime.timedelta(days=365)
except OverflowError as o_err:
max_value = datetime.datetime.max
except TypeError as o_err:
max_value = parse(max_value) + datetime.timedelta(days=365)

if min_value is not None or max_value is not None:
dataset.expect_column_values_to_be_between(column, min_value, max_value, parse_strings_as_datetimes=True)


@classmethod
def _profile(cls, dataset):
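# Overall flow: pin down the table shape first, then pick at most one
# representative column per category (low cardinality, numeric, string,
# datetime) and generate example expectations for it.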

dataset.set_default_expectation_argument("catch_exceptions", False)

value = dataset.expect_table_row_count_to_be_between(min_value=0, max_value=None).result["observed_value"]
dataset.expect_table_row_count_to_be_between(min_value=max(0, value-10), max_value=value+10)

dataset.set_config_value('interactive_evaluation', True)

columns = dataset.get_table_columns()

dataset.expect_table_column_count_to_equal(len(columns))
dataset.expect_table_columns_to_match_ordered_list(columns)

meta_columns = {}
for column in columns:
meta_columns[column] = {"description": ""}

column_cache = {}
profiled_columns = {
"numeric": [],
"low_card": [],
"string": [],
"datetime": []
}

column = cls._find_next_low_card_column(dataset, columns, profiled_columns, column_cache)
if column:
cls._create_expectations_for_low_card_column(dataset, column, column_cache)
profiled_columns["low_card"].append(column)


column = cls._find_next_numeric_column(dataset, columns, profiled_columns, column_cache)
if column:
cls._create_expectations_for_numeric_column(dataset, column)
profiled_columns["numeric"].append(column)


column = cls._find_next_string_column(dataset, columns, profiled_columns, column_cache)
if column:
cls._create_expectations_for_string_column(dataset, column)
profiled_columns["string"].append(column)

column = cls._find_next_datetime_column(dataset, columns, profiled_columns, column_cache)
if column:
cls._create_expectations_for_datetime_column(dataset, column)
profiled_columns["datetime"].append(column)


expectation_suite = dataset.get_expectation_suite(suppress_warnings=True, discard_failed_expectations=True)
if not expectation_suite.meta:
expectation_suite.meta = {"columns": meta_columns, "notes": {""}}
else:
expectation_suite.meta["columns"] = meta_columns

expectation_suite.meta["notes"] = {
"format": "markdown",
"content": [
"""#### This is an _example_ suite
- This suite was made by quickly glancing at 1000 rows of your data.
- This is **not a production suite**. It is meant to show examples of expectations.
- Because this suite was auto-generated using a very basic profiler that does not know your data like you do, many of the expectations may not be meaningful.
"""
]
}

return expectation_suite
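For orientation, a hedged usage sketch of the relocated profiler; ge.read_csv and the file name are illustrative, and _profile is called directly only because it is the hook shown in this diff (the public entry point is the profile classmethod on the profiler base class, whose return shape varies across GE versions):

import great_expectations as ge
from great_expectations.profile.sample_expectations_dataset_profiler import \
    SampleExpectationsDatasetProfiler

# Load a sample of data as a Great Expectations dataset (illustrative path).
dataset = ge.read_csv("my_data.csv")

# Build a demo suite containing one example of each expectation type.
suite = SampleExpectationsDatasetProfiler._profile(dataset)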