[FEATURE] Enable self-initializing `ExpectColumnUniqueValueCountToBeBetween` (#4902)

* feat: init commit

* chore: port over existing self-initializing config

* fix: fix import statement

* chore: misc updates

* test: start integration test

* test: uncomment remaining portion

* feat: misc updates per convo with Alex

* test: set seed

* chore: write docstring

* test: add clarifying comment
cdkini committed Apr 20, 2022
1 parent b67ac9a commit 40940b6
Showing 5 changed files with 218 additions and 17 deletions.
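At the call site, the change enables usage along these lines (a minimal, illustrative sketch: `context` is assumed to be an existing DataContext, the suite name is made up, and the datasource and column names are borrowed from the new test at the bottom of this commit):

from great_expectations.core.batch import BatchRequest

# Minimal sketch of the new self-initializing ("auto") mode.
validator = context.get_validator(
    batch_request=BatchRequest(
        datasource_name="taxi_pandas",
        data_connector_name="monthly",
        data_asset_name="my_reports",
    ),
    expectation_suite_name="tmp.profiler_suite",
)

# With auto=True, min_value/max_value are not supplied by the caller; the
# Expectation's default RuleBasedProfilerConfig estimates them from all
# batches loaded by the Validator.
result = validator.expect_column_unique_value_count_to_be_between(
    column="pickup_location_id",
    auto=True,
)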
great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py
@@ -1,4 +1,4 @@
- from typing import Dict, Optional
+ from typing import Dict, List, Optional

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.execution_engine import ExecutionEngine
@@ -12,6 +12,18 @@
parse_row_condition_string_pandas_engine,
substitute_none_for_missing,
)
from great_expectations.rule_based_profiler.config.base import (
ParameterBuilderConfig,
RuleBasedProfilerConfig,
)
from great_expectations.rule_based_profiler.types.parameter_container import (
DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY,
FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER,
FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY,
PARAMETER_KEY,
VARIABLES_KEY,
)


class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
@@ -83,6 +95,73 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
success_keys = (
"min_value",
"max_value",
"auto",
"profiler_config",
)

column_unique_values_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
module_name="great_expectations.rule_based_profiler.parameter_builder",
class_name="NumericMetricRangeMultiBatchParameterBuilder",
name="column_unique_values_range_estimator",
metric_name="column.distinct_values.count",
metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
metric_value_kwargs=None,
enforce_numeric_metric=True,
replace_nan_with_zero=True,
reduce_scalar_metric=True,
false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
estimator=f"{VARIABLES_KEY}estimator",
num_bootstrap_samples=f"{VARIABLES_KEY}num_bootstrap_samples",
bootstrap_random_seed=f"{VARIABLES_KEY}bootstrap_random_seed",
truncate_values=f"{VARIABLES_KEY}truncate_values",
round_decimals=f"{VARIABLES_KEY}round_decimals",
evaluation_parameter_builder_configs=None,
json_serialize=True,
)
validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
column_unique_values_range_estimator_parameter_builder_config,
]
default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
name="expect_column_unique_value_count_to_be_between", # Convention: use "expectation_type" as profiler name.
config_version=1.0,
variables={
"mostly": 1.0,
"strict_min": False,
"strict_max": False,
"false_positive_rate": 0.05,
"estimator": "bootstrap",
"num_bootstrap_samples": 9999,
"bootstrap_random_seed": None,
"truncate_values": {
"lower_bound": 0,
"upper_bound": None,
},
"round_decimals": 0,
},
rules={
"default_expect_column_unique_values_to_be_between_rule": {
"domain_builder": {
"class_name": "ColumnDomainBuilder",
"module_name": "great_expectations.rule_based_profiler.domain_builder",
},
"expectation_configuration_builders": [
{
"expectation_type": "expect_column_unique_value_count_to_be_between",
"class_name": "DefaultExpectationConfigurationBuilder",
"module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
"validation_parameter_builder_configs": validation_parameter_builder_configs,
"column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
"min_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
"max_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
"strict_min": f"{VARIABLES_KEY}strict_min",
"strict_max": f"{VARIABLES_KEY}strict_max",
"meta": {
"profiler_details": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
},
},
],
},
},
)

# Default values
@@ -94,6 +173,8 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
"result_format": "BASIC",
"include_config": True,
"catch_exceptions": False,
"auto": False,
"profiler_config": default_profiler_config,
}
args_keys = (
"column",
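A note on the f-string references in the new profiler config above: assuming the sentinel constants in parameter_container.py carry their usual values (PARAMETER_KEY = "$parameter.", VARIABLES_KEY = "$variables.", separator ".", value key "value", metadata key "details", and DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME = "$domain.domain_kwargs"), the builder arguments resolve roughly as follows:

# Approximate resolved values of the f-string arguments above, under the
# assumed sentinel values noted in the lead-in:
resolved_builder_args = {
    "column": "$domain.domain_kwargs.column",
    "min_value": "$parameter.column_unique_values_range_estimator.value[0]",
    "max_value": "$parameter.column_unique_values_range_estimator.value[1]",
    "strict_min": "$variables.strict_min",
    "strict_max": "$variables.strict_max",
    "meta": {
        "profiler_details": "$parameter.column_unique_values_range_estimator.details",
    },
}

At run time the rule-based profiler substitutes batch-derived values for these fully qualified names, so [0] and [1] select the estimated lower and upper bounds of the metric range.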
great_expectations/expectations/core/expect_column_values_to_be_between.py
@@ -108,7 +108,7 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
"profiler_config",
)

- min_estimato_parameter_builder_config: ParameterBuilderConfig = (
+ min_estimator_parameter_builder_config: ParameterBuilderConfig = (
ParameterBuilderConfig(
module_name="great_expectations.rule_based_profiler.parameter_builder",
class_name="MetricMultiBatchParameterBuilder",
@@ -123,7 +123,7 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
json_serialize=True,
)
)
- max_estimato_parameter_builder_config: ParameterBuilderConfig = (
+ max_estimator_parameter_builder_config: ParameterBuilderConfig = (
ParameterBuilderConfig(
module_name="great_expectations.rule_based_profiler.parameter_builder",
class_name="MetricMultiBatchParameterBuilder",
@@ -139,8 +139,8 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
)
)
validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
- min_estimato_parameter_builder_config,
- max_estimato_parameter_builder_config,
+ min_estimator_parameter_builder_config,
+ max_estimator_parameter_builder_config,
]
default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
name="expect_column_values_to_be_between", # Convention: use "expectation_type" as profiler name.
@@ -163,15 +163,15 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
"module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
"validation_parameter_builder_configs": validation_parameter_builder_configs,
"column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
"min_value": f"{PARAMETER_KEY}{min_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
"max_value": f"{PARAMETER_KEY}{max_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
"min_value": f"{PARAMETER_KEY}{min_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
"max_value": f"{PARAMETER_KEY}{max_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
"mostly": f"{VARIABLES_KEY}mostly",
"strict_min": f"{VARIABLES_KEY}strict_min",
"strict_max": f"{VARIABLES_KEY}strict_max",
"meta": {
"profiler_details": {
"min_estimator": f"{PARAMETER_KEY}{min_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
"max_estimator": f"{PARAMETER_KEY}{max_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
"min_estimator": f"{PARAMETER_KEY}{min_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
"max_estimator": f"{PARAMETER_KEY}{max_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
},
},
},
26 changes: 22 additions & 4 deletions great_expectations/expectations/expectation.py
@@ -8,15 +8,12 @@
from collections import Counter
from copy import deepcopy
from inspect import isabstract
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
from dateutil.parser import parse
- from numpy import negative

from great_expectations import __version__ as ge_version
- from great_expectations import execution_engine
- from great_expectations.core import expectation_configuration
from great_expectations.core.batch import Batch
from great_expectations.core.expectation_configuration import (
ExpectationConfiguration,
@@ -58,6 +55,7 @@
from great_expectations.expectations.registry import (
_registered_metrics,
_registered_renderers,
+ get_expectation_impl,
get_metric_kwargs,
register_expectation,
register_renderer,
@@ -75,6 +73,7 @@
renderedAtomicValueSchema,
)
from great_expectations.render.util import num_to_str
+ from great_expectations.rule_based_profiler.config.base import RuleBasedProfilerConfig
from great_expectations.self_check.util import (
evaluate_json_test_cfe,
generate_expectation_tests,
@@ -2406,3 +2405,22 @@ def _format_map_output(
return return_obj

raise ValueError(f"Unknown result_format {result_format['result_format']}.")


def get_default_profiler_config_for_expectation_type(
expectation_type: str,
) -> Optional[RuleBasedProfilerConfig]:
"""Retrieves the default profiler config as defined within a given Expectation.
Args:
expectation_type (str): The name of the Expectation to parse
Returns:
The default profiler config within the target Expectation.
If not available, returns None.
"""
expectation_impl = get_expectation_impl(expectation_name=expectation_type)
profiler_config: Optional[
RuleBasedProfilerConfig
] = expectation_impl.default_kwarg_values.get("profiler_config")
return profiler_config
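A short usage sketch for the new helper (this mirrors what the new test fixture further down does; the seed value here is illustrative):

# Fetch an Expectation's default profiler config and override a variable
# before validation. Note this mutates the shared default config defined
# on the Expectation class -- which is exactly how the new test fixture
# pins the bootstrap seed.
profiler_config = get_default_profiler_config_for_expectation_type(
    expectation_type="expect_column_unique_value_count_to_be_between"
)
if profiler_config is not None and profiler_config.variables is not None:
    profiler_config.variables["bootstrap_random_seed"] = 12345  # illustrative seed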
2 changes: 1 addition & 1 deletion scripts/check_type_hint_coverage.py
@@ -3,7 +3,7 @@
from typing import Dict, List, Optional

TYPE_HINT_ERROR_THRESHOLD: int = (
- 2840 # This number is to be reduced as we annotate more functions!
+ 2839 # This number is to be reduced as we annotate more functions!
)


tests/rule_based_profiler/test_profiler_user_workflows.py
@@ -1,7 +1,7 @@
import datetime
import uuid
from numbers import Number
- from typing import Any, Dict, List, Optional, Tuple, cast
+ from typing import Any, Callable, Dict, List, Optional, Tuple, cast
from unittest import mock

import numpy as np
@@ -17,6 +17,9 @@
from great_expectations.core import ExpectationSuite, ExpectationValidationResult
from great_expectations.core.batch import BatchRequest
from great_expectations.datasource import DataConnector, Datasource
+ from great_expectations.expectations.expectation import (
+ get_default_profiler_config_for_expectation_type,
+ )
from great_expectations.expectations.registry import get_expectation_impl
from great_expectations.rule_based_profiler.config.base import (
RuleBasedProfilerConfig,
@@ -31,10 +34,23 @@
usage_stats_invalid_messages_exist,
)
from tests.rule_based_profiler.conftest import ATOL, RTOL
+ from tests.rule_based_profiler.parameter_builder.conftest import RANDOM_SEED

yaml = YAML()


@pytest.fixture
def set_consistent_seed_within_expectation_default_profiler_config() -> Callable:
def _set_seed(expectation_type: str):
default_profiler: Optional[
RuleBasedProfilerConfig
] = get_default_profiler_config_for_expectation_type(expectation_type)
assert default_profiler is not None and default_profiler.variables is not None
default_profiler.variables["bootstrap_random_seed"] = RANDOM_SEED

return _set_seed


def test_alice_columnar_table_single_batch_batches_are_accessible(
alice_columnar_table_single_batch_context,
alice_columnar_table_single_batch,
@@ -2236,8 +2252,6 @@ def test_quentin_expect_column_max_to_be_between_auto_yes_default_profiler_confi

result: ExpectationValidationResult

- custom_profiler_config: RuleBasedProfilerConfig

suite: ExpectationSuite

expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}"
@@ -2317,3 +2331,91 @@ def test_quentin_expect_column_max_to_be_between_auto_yes_default_profiler_confi
atol=atol,
err_msg=f"Actual value of {max_value_actual} differs from expected value of {max_value_expected} by more than {atol + rtol * abs(max_value_expected)} tolerance.",
)


@pytest.mark.skipif(
version.parse(np.version.version) < version.parse("1.21.0"),
reason="requires numpy version 1.21.0 or newer",
)
@freeze_time("09/26/2019 13:42:41")
def test_quentin_expect_column_unique_value_count_to_be_between_auto_yes_default_profiler_config_yes_custom_profiler_config_no(
quentin_columnar_table_multi_batch_data_context,
set_consistent_seed_within_expectation_default_profiler_config: Callable,
) -> None:
context: DataContext = quentin_columnar_table_multi_batch_data_context

result: ExpectationValidationResult

suite: ExpectationSuite

expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}"
try:
# noinspection PyUnusedLocal
suite = context.get_expectation_suite(
expectation_suite_name=expectation_suite_name
)
except ge_exceptions.DataContextError:
suite = context.create_expectation_suite(
expectation_suite_name=expectation_suite_name
)
print(f'Created ExpectationSuite "{suite.expectation_suite_name}".')

batch_request: dict = {
"datasource_name": "taxi_pandas",
"data_connector_name": "monthly",
"data_asset_name": "my_reports",
}

validator: Validator = context.get_validator(
batch_request=BatchRequest(**batch_request),
expectation_suite_name=expectation_suite_name,
)
assert len(validator.batches) == 36

# Utilize a consistent seed to deal with probabilistic nature of this feature
set_consistent_seed_within_expectation_default_profiler_config(
"expect_column_unique_value_count_to_be_between"
)

test_cases: Tuple[Tuple[str, int, int], ...] = (
("pickup_location_id", 118, 212),
("dropoff_location_id", 190, 236),
)

for column_name, min_value_expected, max_value_expected in test_cases:

# Use all batches, loaded by Validator, for estimating Expectation argument values.
result = validator.expect_column_unique_value_count_to_be_between(
column=column_name,
result_format="SUMMARY",
include_config=True,
auto=True,
)
assert result.success

key: str
value: Any
expectation_config_kwargs: dict = {
key: value
for key, value in result.expectation_config["kwargs"].items()
if key
not in [
"min_value",
"max_value",
]
}
assert expectation_config_kwargs == {
"column": column_name,
"strict_min": False,
"strict_max": False,
"result_format": "SUMMARY",
"include_config": True,
"auto": True,
"batch_id": "84000630d1b69a0fe870c94fb26a32bc",
}

min_value_actual: int = result.expectation_config["kwargs"]["min_value"]
assert min_value_actual == min_value_expected

max_value_actual: int = result.expectation_config["kwargs"]["max_value"]
assert max_value_actual == max_value_expected
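On why the fixture pins bootstrap_random_seed: the "bootstrap" estimator resamples per-batch metric values, so unseeded runs yield slightly different [min_value, max_value] ranges. Below is a self-contained sketch of the underlying idea, illustrative only; GE's NumericMetricRangeMultiBatchParameterBuilder is considerably more involved, and the sample values are hypothetical.

import numpy as np

def bootstrap_range(per_batch_values, false_positive_rate=0.05,
                    n_resamples=9999, seed=None):
    # Resample the per-batch metric values with replacement, collect a
    # statistic (here, the mean) from each resample, and take the central
    # 1 - false_positive_rate mass of that distribution as the range.
    rng = np.random.default_rng(seed)
    values = np.asarray(per_batch_values, dtype=float)
    stats = np.array([
        rng.choice(values, size=values.size, replace=True).mean()
        for _ in range(n_resamples)
    ])
    lower = np.quantile(stats, false_positive_rate / 2.0)
    upper = np.quantile(stats, 1.0 - false_positive_rate / 2.0)
    return lower, upper

# Hypothetical per-batch distinct counts; a fixed seed makes the
# estimated range reproducible across test runs.
print(bootstrap_range([118, 125, 131, 140, 122], seed=0))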
