diff --git a/great_expectations/core/batch.py b/great_expectations/core/batch.py index 0dd175e41b9b..7995fdc5e2ae 100644 --- a/great_expectations/core/batch.py +++ b/great_expectations/core/batch.py @@ -628,9 +628,10 @@ def head(self, n_rows=5, fetch_all=False): return self._data.execution_engine.resolve_metrics((metric,))[metric.id] +# TODO: ALEX -- Make this helper utility of general use. def materialize_batch_request( batch_request: Optional[Union[BatchRequestBase, dict]] = None, -) -> Optional[Union[BatchRequest, RuntimeBatchRequest]]: +) -> Optional[BatchRequestBase]: effective_batch_request: dict = get_batch_request_as_dict( batch_request=batch_request ) diff --git a/great_expectations/rule_based_profiler/data_assistant/data_assistant.py b/great_expectations/rule_based_profiler/data_assistant/data_assistant.py index c1a21de8bbfa..f7e1d213f445 100644 --- a/great_expectations/rule_based_profiler/data_assistant/data_assistant.py +++ b/great_expectations/rule_based_profiler/data_assistant/data_assistant.py @@ -3,7 +3,6 @@ from great_expectations.core import ExpectationSuite from great_expectations.core.batch import Batch, BatchRequestBase -from great_expectations.data_context import BaseDataContext from great_expectations.execution_engine.execution_engine import MetricDomainTypes from great_expectations.rule_based_profiler.domain_builder import DomainBuilder from great_expectations.rule_based_profiler.expectation_configuration_builder import ( @@ -15,9 +14,6 @@ from great_expectations.rule_based_profiler.helpers.util import ( convert_variables_to_dict, ) -from great_expectations.rule_based_profiler.helpers.util import ( - get_validator as get_validator_using_batch_list_or_batch_request, -) from great_expectations.rule_based_profiler.parameter_builder import ParameterBuilder from great_expectations.rule_based_profiler.rule import Rule from great_expectations.rule_based_profiler.rule_based_profiler import ( @@ -29,6 +25,7 @@ DataAssistantResult, ) from great_expectations.util import measure_execution_time +from great_expectations.validator.validator import Validator class DataAssistant(ABC): @@ -40,8 +37,7 @@ class DataAssistant(ABC): data_assistant: DataAssistant = VolumeDataAssistant( name="my_volume_data_assistant", - batch_request=batch_request, - data_context=context, + validator=validator, ) result: DataAssistantResult = data_assistant.run() @@ -56,8 +52,7 @@ class DataAssistant(ABC): def __init__( self, name: str, - batch_request: Union[BatchRequestBase, dict], - data_context: BaseDataContext = None, + validator: Validator, ): """ DataAssistant subclasses guide "RuleBasedProfiler" to contain Rule configurations to embody profiling behaviors, @@ -66,29 +61,17 @@ def __init__( and overall "ExpectationSuite" object, immediately available for validating underlying data "Batch" objects. Args: - name: the name of this DataAssistant object. - batch_request: specified for querying data Batch objects. 
- data_context: DataContext + name: the name of this DataAssistant object + validator: Validator object, containing loaded Batch objects as well as Expectation and Metric operations """ self._name = name - - self._data_context = data_context - - self._validator = get_validator_using_batch_list_or_batch_request( - purpose=self.name, - data_context=self.data_context, - batch_list=None, - batch_request=batch_request, - domain=None, - variables=None, - parameters=None, - ) + self._validator = validator self._profiler = RuleBasedProfiler( name=self.name, config_version=1.0, variables=None, - data_context=self.data_context, + data_context=self._validator.data_context, ) self._build_profiler() @@ -164,7 +147,8 @@ def run( Args: expectation_suite: An existing "ExpectationSuite" to update expectation_suite_name: A name for returned "ExpectationSuite" - include_citation: Whether or not to include the Profiler config in the metadata for "ExpectationSuite" produced by "RuleBasedProfiler" + include_citation: Flag, which controls whether or not the effective Profiler configuration should be included + as a citation in the metadata of the "ExpectationSuite" computed and returned by "RuleBasedProfiler" Returns: DataAssistantResult: The result object for the DataAssistant @@ -192,10 +176,6 @@ def run( def name(self) -> str: return self._name - @property - def data_context(self) -> BaseDataContext: - return self._data_context - @property def profiler(self) -> BaseRuleBasedProfiler: return self._profiler @@ -268,8 +248,8 @@ def get_metrics_by_domain(self) -> Dict[Domain, Dict[str, ParameterNode]]: value of "DataAssistant.metrics_parameter_builders_by_domain_type" interface property and actual fully-qualified parameter names match interface properties of "ParameterBuilder" objects, corresponding to these "domain" types. - returns: - dictionaries of values for fully-qualified parameter names by domain for metrics, computed by "rulebasedprofiler" state.
+        Returns: + Dictionaries of values for fully-qualified parameter names by Domain for metrics, computed by "RuleBasedProfiler" """ # noinspection PyTypeChecker parameter_values_for_fully_qualified_parameter_names_by_domain: Dict[ @@ -326,7 +306,8 @@ def get_expectation_suite( Args: expectation_suite: An existing "ExpectationSuite" to update expectation_suite_name: A name for returned "ExpectationSuite" - include_citation: Whether or not to include the Profiler config in the metadata for "ExpectationSuite" produced by "RuleBasedProfiler" + include_citation: Flag, which controls whether or not the effective Profiler configuration should be included + as a citation in the metadata of the "ExpectationSuite" computed and returned by "RuleBasedProfiler" Returns: "ExpectationSuite" using "ExpectationConfiguration" objects, computed by "RuleBasedProfiler" state diff --git a/great_expectations/rule_based_profiler/data_assistant/volume_data_assistant.py b/great_expectations/rule_based_profiler/data_assistant/volume_data_assistant.py index 62cb25674c26..64686b0013ae 100644 --- a/great_expectations/rule_based_profiler/data_assistant/volume_data_assistant.py +++ b/great_expectations/rule_based_profiler/data_assistant/volume_data_assistant.py @@ -1,7 +1,5 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional -from great_expectations.core.batch import BatchRequestBase -from great_expectations.data_context import BaseDataContext from great_expectations.execution_engine.execution_engine import MetricDomainTypes from great_expectations.rule_based_profiler.data_assistant import DataAssistant from great_expectations.rule_based_profiler.parameter_builder import ( @@ -16,6 +14,7 @@ DataAssistantResult, VolumeDataAssistantResult, ) +from great_expectations.validator.validator import Validator class VolumeDataAssistant(DataAssistant): @@ -31,13 +30,11 @@ class VolumeDataAssistant(DataAssistant): def __init__( self, name: str, - batch_request: Union[BatchRequestBase, dict], - data_context: BaseDataContext = None, + validator: Validator, ): super().__init__( name=name, - batch_request=batch_request, - data_context=data_context, + validator=validator, ) @property diff --git a/great_expectations/rule_based_profiler/domain_builder/categorical_column_domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/categorical_column_domain_builder.py index 06ea8155ffa5..8e820397a272 100644 --- a/great_expectations/rule_based_profiler/domain_builder/categorical_column_domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/categorical_column_domain_builder.py @@ -52,7 +52,7 @@ def __init__( limit_mode: Optional[Union[CardinalityLimitMode, str]] = None, max_unique_values: Optional[Union[str, int]] = None, max_proportion_unique: Optional[Union[str, float]] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """Create column domains where cardinality is within the specified limit. @@ -88,7 +88,7 @@ def __init__( cardinality limit to use when filtering columns. max_proportion_unique: proportion of unique values for a custom cardinality limit to use when filtering columns. - data_context: DataContext associated with this profiler.
+ data_context: BaseDataContext associated with this DomainBuilder """ if exclude_column_names is None: exclude_column_names = [ diff --git a/great_expectations/rule_based_profiler/domain_builder/column_domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/column_domain_builder.py index d486b5c1f0fa..895c3996972a 100644 --- a/great_expectations/rule_based_profiler/domain_builder/column_domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/column_domain_builder.py @@ -39,7 +39,7 @@ def __init__( exclude_semantic_types: Optional[ Union[str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]] ] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ A semantic type is distinguished from the structured column type; @@ -56,7 +56,7 @@ def __init__( to be included exclude_semantic_types: single/multiple type specifications using SemanticDomainTypes (or str equivalents) to be excluded - data_context: DataContext + data_context: BaseDataContext associated with this DomainBuilder Inclusion/Exclusion Logic: (include_column_names|table_columns - exclude_column_names) + (include_semantic_types - exclude_semantic_types) diff --git a/great_expectations/rule_based_profiler/domain_builder/column_pair_domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/column_pair_domain_builder.py index b926075a194c..9e1c3c1e0947 100644 --- a/great_expectations/rule_based_profiler/domain_builder/column_pair_domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/column_pair_domain_builder.py @@ -14,12 +14,12 @@ class ColumnPairDomainBuilder(ColumnDomainBuilder): def __init__( self, include_column_names: Optional[Union[str, Optional[List[str]]]] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: - include_column_names: Explicitly specified exactly two desired columns. - data_context: DataContext associated with this profiler. 
+ include_column_names: Explicitly specified exactly two desired columns + data_context: BaseDataContext associated with this DomainBuilder """ super().__init__( include_column_names=include_column_names, diff --git a/great_expectations/rule_based_profiler/domain_builder/domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/domain_builder.py index b89b9287ca17..95a38635c9e4 100644 --- a/great_expectations/rule_based_profiler/domain_builder/domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/domain_builder.py @@ -27,11 +27,11 @@ class DomainBuilder(Builder, ABC): def __init__( self, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: - data_context: DataContext + data_context: BaseDataContext associated with DomainBuilder """ super().__init__(data_context=data_context) diff --git a/great_expectations/rule_based_profiler/domain_builder/map_metric_column_domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/map_metric_column_domain_builder.py index 36fd2489576e..4c64b851c7e5 100644 --- a/great_expectations/rule_based_profiler/domain_builder/map_metric_column_domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/map_metric_column_domain_builder.py @@ -37,7 +37,7 @@ def __init__( max_unexpected_values: Union[str, int] = 0, max_unexpected_ratio: Optional[Union[str, float]] = None, min_max_unexpected_values_proportion: Union[str, float] = 9.75e-1, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Create column domains using tolerance for inter-Batch proportion of adherence to intra-Batch "unexpected_count" @@ -59,9 +59,9 @@ def __init__( max_unexpected_values: maximum "unexpected_count" value of "map_metric_name" (intra-Batch) max_unexpected_ratio: maximum "unexpected_count" value of "map_metric_name" divided by number of records (intra-Batch); if both "max_unexpected_values" and "max_unexpected_ratio" are specified, then - "max_unexpected_ratio" is used (and "max_unexpected_values" is ignored). + "max_unexpected_ratio" is used (and "max_unexpected_values" is ignored) min_max_unexpected_values_proportion: minimum fraction of Batch objects adhering to "max_unexpected_values" - data_context: DataContext associated with this profiler. + data_context: BaseDataContext associated with this DomainBuilder For example (using default values of "max_unexpected_values" and "min_max_unexpected_values_proportion"): Suppose that "map_metric_name" is "column_values.nonnull" and consider the following three Batches of data: diff --git a/great_expectations/rule_based_profiler/domain_builder/multi_column_domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/multi_column_domain_builder.py index 916ed7458ce8..e6dc78300504 100644 --- a/great_expectations/rule_based_profiler/domain_builder/multi_column_domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/multi_column_domain_builder.py @@ -14,12 +14,12 @@ class MultiColumnDomainBuilder(ColumnDomainBuilder): def __init__( self, include_column_names: Optional[Union[str, Optional[List[str]]]] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: - include_column_names: Explicitly specified desired columns. - data_context: DataContext associated with this profiler. 
+ include_column_names: Explicitly specified desired columns + data_context: BaseDataContext associated with this DomainBuilder """ super().__init__( include_column_names=include_column_names, diff --git a/great_expectations/rule_based_profiler/domain_builder/table_domain_builder.py b/great_expectations/rule_based_profiler/domain_builder/table_domain_builder.py index 01b3901fbd9a..1c807dcb5202 100644 --- a/great_expectations/rule_based_profiler/domain_builder/table_domain_builder.py +++ b/great_expectations/rule_based_profiler/domain_builder/table_domain_builder.py @@ -8,11 +8,11 @@ class TableDomainBuilder(DomainBuilder): def __init__( self, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: - data_context: DataContext + data_context: BaseDataContext associated with this DomainBuilder """ super().__init__(data_context=data_context) diff --git a/great_expectations/rule_based_profiler/expectation_configuration_builder/default_expectation_configuration_builder.py b/great_expectations/rule_based_profiler/expectation_configuration_builder/default_expectation_configuration_builder.py index d84169a0c6c1..ed17e107adf0 100644 --- a/great_expectations/rule_based_profiler/expectation_configuration_builder/default_expectation_configuration_builder.py +++ b/great_expectations/rule_based_profiler/expectation_configuration_builder/default_expectation_configuration_builder.py @@ -70,19 +70,19 @@ def __init__( validation_parameter_builder_configs: Optional[ List[ParameterBuilderConfig] ] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 **kwargs, ): """ Args: expectation_type: the "expectation_type" argument of "ExpectationConfiguration" object to be emitted. - meta: the "meta" argument of "ExpectationConfiguration" object to be emitted. + meta: the "meta" argument of "ExpectationConfiguration" object to be emitted condition: Boolean statement (expressed as string and following specified grammar), which controls whether - or not underlying logic should be executed and thus resulting "ExpectationConfiguration" emitted. + or not underlying logic should be executed and thus resulting "ExpectationConfiguration" emitted validation_parameter_builder_configs: ParameterBuilder configurations, having whose outputs available (as - fully-qualified parameter names) is pre-requisite for present ExpectationConfigurationBuilder instance. - These "ParameterBuilder" configurations help build kwargs needed for this "ExpectationConfigurationBuilder". 
- data_context: DataContext + fully-qualified parameter names) is pre-requisite for present ExpectationConfigurationBuilder instance + These "ParameterBuilder" configurations help build kwargs needed for this "ExpectationConfigurationBuilder" + data_context: BaseDataContext associated with this ExpectationConfigurationBuilder kwargs: additional arguments """ diff --git a/great_expectations/rule_based_profiler/expectation_configuration_builder/expectation_configuration_builder.py b/great_expectations/rule_based_profiler/expectation_configuration_builder/expectation_configuration_builder.py index 25cbf2d58846..7c17de61f386 100644 --- a/great_expectations/rule_based_profiler/expectation_configuration_builder/expectation_configuration_builder.py +++ b/great_expectations/rule_based_profiler/expectation_configuration_builder/expectation_configuration_builder.py @@ -31,7 +31,7 @@ def __init__( validation_parameter_builder_configs: Optional[ List[ParameterBuilderConfig] ] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 **kwargs ): """ @@ -41,8 +41,8 @@ def __init__( expectation_type: the "expectation_type" argument of "ExpectationConfiguration" object to be emitted. validation_parameter_builder_configs: ParameterBuilder configurations, having whose outputs available (as fully-qualified parameter names) is pre-requisite for present ExpectationConfigurationBuilder instance. - These "ParameterBuilder" configurations help build kwargs needed for this "ExpectationConfigurationBuilder". - data_context: DataContext + These "ParameterBuilder" configurations help build kwargs needed for this "ExpectationConfigurationBuilder" + data_context: BaseDataContext associated with this ExpectationConfigurationBuilder kwargs: additional arguments """ @@ -146,7 +146,7 @@ def validation_parameter_builders(self) -> Optional[List[ParameterBuilder]]: def init_rule_expectation_configuration_builders( expectation_configuration_builder_configs: List[dict], - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ) -> List["ExpectationConfigurationBuilder"]: # noqa: F821 expectation_configuration_builder_config: dict return [ @@ -162,7 +162,7 @@ def init_expectation_configuration_builder( expectation_configuration_builder_config: Union[ "ExpectationConfigurationBuilder", dict # noqa: F821 ], - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ) -> "ExpectationConfigurationBuilder": # noqa: F821 if not isinstance(expectation_configuration_builder_config, dict): expectation_configuration_builder_config = ( diff --git a/great_expectations/rule_based_profiler/helpers/util.py b/great_expectations/rule_based_profiler/helpers/util.py index c4214b678cfe..34f2ea007f76 100644 --- a/great_expectations/rule_based_profiler/helpers/util.py +++ b/great_expectations/rule_based_profiler/helpers/util.py @@ -37,7 +37,7 @@ def get_validator( purpose: str, *, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 batch_list: Optional[List[Batch]] = None, batch_request: Optional[Union[str, BatchRequestBase, dict]] = None, domain: Optional[Domain] = None, @@ -92,7 +92,7 @@ def get_validator( def get_batch_ids( - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 batch_list: 
Optional[List[Batch]] = None, batch_request: Optional[Union[str, BatchRequestBase, dict]] = None, domain: Optional[Domain] = None, @@ -126,7 +126,7 @@ def get_batch_ids( def build_batch_request( - batch_request: Optional[Union[str, BatchRequest, RuntimeBatchRequest, dict]] = None, + batch_request: Optional[Union[str, BatchRequestBase, dict]] = None, domain: Optional[Domain] = None, variables: Optional[ParameterContainer] = None, parameters: Optional[Dict[str, ParameterContainer]] = None, @@ -136,11 +136,11 @@ def build_batch_request( # Obtain BatchRequest from "rule state" (i.e., variables and parameters); from instance variable otherwise. effective_batch_request: Optional[ - Union[BatchRequest, RuntimeBatchRequest, dict] + Union[BatchRequestBase, dict] ] = get_parameter_value_and_validate_return_type( domain=domain, parameter_reference=batch_request, - expected_return_type=(BatchRequest, RuntimeBatchRequest, dict), + expected_return_type=(BatchRequestBase, dict), variables=variables, parameters=parameters, ) diff --git a/great_expectations/rule_based_profiler/parameter_builder/mean_unexpected_map_metric_multi_batch_parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/mean_unexpected_map_metric_multi_batch_parameter_builder.py index d86c54e5d6b7..8befeabd6703 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/mean_unexpected_map_metric_multi_batch_parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/mean_unexpected_map_metric_multi_batch_parameter_builder.py @@ -49,7 +49,7 @@ def __init__( List[ParameterBuilderConfig] ] = None, json_serialize: Union[str, bool] = True, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: @@ -66,7 +66,7 @@ def __init__( ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite. These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder". json_serialize: If True (default), convert computed value to JSON prior to saving results. - data_context: DataContext + data_context: BaseDataContext associated with this ParameterBuilder """ super().__init__( name=name, diff --git a/great_expectations/rule_based_profiler/parameter_builder/metric_multi_batch_parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/metric_multi_batch_parameter_builder.py index c5f31a5fe91a..faefa96846fa 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/metric_multi_batch_parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/metric_multi_batch_parameter_builder.py @@ -39,7 +39,7 @@ def __init__( List[ParameterBuilderConfig] ] = None, json_serialize: Union[str, bool] = True, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: @@ -57,7 +57,7 @@ def __init__( ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite. These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder". json_serialize: If True (default), convert computed value to JSON prior to saving results. 
- data_context: DataContext + data_context: BaseDataContext associated with this ParameterBuilder """ super().__init__( name=name, diff --git a/great_expectations/rule_based_profiler/parameter_builder/numeric_metric_range_multi_batch_parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/numeric_metric_range_multi_batch_parameter_builder.py index 6d1e0d2c0373..886378efa1a5 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/numeric_metric_range_multi_batch_parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/numeric_metric_range_multi_batch_parameter_builder.py @@ -79,7 +79,7 @@ def __init__( List[ParameterBuilderConfig] ] = None, json_serialize: Union[str, bool] = True, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: @@ -107,7 +107,7 @@ def __init__( ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite. These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder". json_serialize: If True (default), convert computed value to JSON prior to saving results. - data_context: DataContext + data_context: BaseDataContext associated with this ParameterBuilder """ super().__init__( name=name, @@ -197,8 +197,7 @@ def _build_parameters( Attributes object, containing computed parameter values and parameter computation details metadata. The algorithm operates according to the following steps: - 1. Obtain batch IDs of interest using DataContext and BatchRequest (unless passed explicitly as argument). Note - that this specific BatchRequest was specified as part of configuration for the present ParameterBuilder class. + 1. Obtain batch IDs of interest using BaseDataContext and BatchRequest (unless passed explicitly as argument). 2. Set up metric_domain_kwargs and metric_value_kwargs (using configuration and/or variables and parameters). 3. Instantiate the Validator object corresponding to BatchRequest (with a temporary expectation_suite_name) in order to have access to all Batch objects, on each of which the specified metric_name will be computed. diff --git a/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py index 79b9902443e4..64822b1af1e7 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/parameter_builder.py @@ -120,7 +120,7 @@ def __init__( List[ParameterBuilderConfig] ] = None, json_serialize: Union[str, bool] = True, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ The ParameterBuilder will build ParameterNode objects for a Domain from the Rule. @@ -133,7 +133,7 @@ def __init__( ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite. These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder". json_serialize: If True (default), convert computed value to JSON prior to saving results. 
- data_context: DataContext + data_context: BaseDataContext associated with ParameterBuilder """ super().__init__(data_context=data_context) @@ -667,7 +667,7 @@ def _get_sorted_candidates_and_ratios( def init_rule_parameter_builders( parameter_builder_configs: Optional[List[dict]] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ) -> Optional[List["ParameterBuilder"]]: # noqa: F821 if parameter_builder_configs is None: return None @@ -683,7 +683,7 @@ def init_rule_parameter_builders( def init_parameter_builder( parameter_builder_config: Union["ParameterBuilderConfig", dict], # noqa: F821 - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ) -> "ParameterBuilder": # noqa: F821 if not isinstance(parameter_builder_config, dict): parameter_builder_config = parameter_builder_config.to_dict() diff --git a/great_expectations/rule_based_profiler/parameter_builder/regex_pattern_string_parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/regex_pattern_string_parameter_builder.py index 06d99381ab7d..12415f828b1f 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/regex_pattern_string_parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/regex_pattern_string_parameter_builder.py @@ -59,7 +59,7 @@ def __init__( List[ParameterBuilderConfig] ] = None, json_serialize: Union[str, bool] = True, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Configure this RegexPatternStringParameterBuilder @@ -73,7 +73,7 @@ def __init__( ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite. These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder". json_serialize: If True (default), convert computed value to JSON prior to saving results. - data_context: DataContext + data_context: BaseDataContext associated with this ParameterBuilder """ super().__init__( name=name, diff --git a/great_expectations/rule_based_profiler/parameter_builder/simple_date_format_string_parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/simple_date_format_string_parameter_builder.py index b925be362547..66dd92366853 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/simple_date_format_string_parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/simple_date_format_string_parameter_builder.py @@ -104,7 +104,7 @@ def __init__( List[ParameterBuilderConfig] ] = None, json_serialize: Union[str, bool] = True, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Configure this SimpleDateFormatStringParameterBuilder @@ -120,7 +120,7 @@ def __init__( ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite. These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder". json_serialize: If True (default), convert computed value to JSON prior to saving results. 
- data_context: DataContext + data_context: BaseDataContext associated with this ParameterBuilder """ super().__init__( name=name, diff --git a/great_expectations/rule_based_profiler/parameter_builder/value_set_multi_batch_parameter_builder.py b/great_expectations/rule_based_profiler/parameter_builder/value_set_multi_batch_parameter_builder.py index 1db56584ada5..ba626fc4ccfc 100644 --- a/great_expectations/rule_based_profiler/parameter_builder/value_set_multi_batch_parameter_builder.py +++ b/great_expectations/rule_based_profiler/parameter_builder/value_set_multi_batch_parameter_builder.py @@ -58,7 +58,7 @@ def __init__( List[ParameterBuilderConfig] ] = None, json_serialize: Union[str, bool] = True, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: @@ -71,7 +71,7 @@ def __init__( ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite. These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder". json_serialize: If True (default), convert computed value to JSON prior to saving results. - data_context: DataContext + data_context: BaseDataContext associated with this ParameterBuilder """ super().__init__( name=name, diff --git a/great_expectations/rule_based_profiler/rule_based_profiler.py b/great_expectations/rule_based_profiler/rule_based_profiler.py index b09565f0ff73..b37ef63b7d15 100644 --- a/great_expectations/rule_based_profiler/rule_based_profiler.py +++ b/great_expectations/rule_based_profiler/rule_based_profiler.py @@ -85,7 +85,7 @@ class BaseRuleBasedProfiler(ConfigPeer): def __init__( self, profiler_config: RuleBasedProfilerConfig, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 usage_statistics_handler: Optional[UsageStatisticsHandler] = None, ): """ @@ -97,7 +97,7 @@ def __init__( Args: profiler_config: RuleBasedProfilerConfig -- formal typed object containing configuration - data_context: DataContext object that defines a full runtime environment (data access, etc.) + data_context: BaseDataContext object that defines a full runtime environment (data access, etc.)
""" name: str = profiler_config.name config_version: float = profiler_config.config_version @@ -195,7 +195,7 @@ def _init_rule( @staticmethod def _init_rule_domain_builder( domain_builder_config: dict, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ) -> DomainBuilder: domain_builder: DomainBuilder = instantiate_class_from_config( config=domain_builder_config, @@ -868,7 +868,7 @@ def _get_rules_as_dict(self) -> Dict[str, Rule]: @staticmethod def run_profiler( - data_context: "DataContext", # noqa: F821 + data_context: "BaseDataContext", # noqa: F821 profiler_store: ProfilerStore, name: Optional[str] = None, ge_cloud_id: Optional[str] = None, @@ -904,7 +904,7 @@ def run_profiler( @staticmethod def run_profiler_on_data( - data_context: "DataContext", # noqa: F821 + data_context: "BaseDataContext", # noqa: F821 profiler_store: ProfilerStore, batch_list: Optional[List[Batch]] = None, batch_request: Optional[Union[BatchRequestBase, dict]] = None, @@ -946,7 +946,7 @@ def run_profiler_on_data( @staticmethod def add_profiler( config: RuleBasedProfilerConfig, - data_context: "DataContext", # noqa: F821 + data_context: "BaseDataContext", # noqa: F821 profiler_store: ProfilerStore, ge_cloud_id: Optional[str] = None, ) -> "RuleBasedProfiler": # noqa: F821 @@ -957,7 +957,7 @@ def add_profiler( f'batch_data found in batch_request cannot be saved to ProfilerStore "{profiler_store.store_name}"' ) - # Chetan - 20220204 - DataContext to be removed once it can be decoupled from RBP + # Chetan - 20220204 - BaseDataContext to be removed once it can be decoupled from RBP new_profiler: "RuleBasedProfiler" = instantiate_class_from_config( # noqa: F821 config=config.to_json_dict(), runtime_environment={ @@ -1009,7 +1009,7 @@ def _check_validity_of_batch_requests_in_config( @staticmethod def get_profiler( - data_context: "DataContext", # noqa: F821 + data_context: "BaseDataContext", # noqa: F821 profiler_store: ProfilerStore, name: Optional[str] = None, ge_cloud_id: Optional[str] = None, @@ -1095,7 +1095,7 @@ def list_profilers( def self_check(self, pretty_print: bool = True) -> dict: """ - Necessary to enable integration with `DataContext.test_yaml_config` + Necessary to enable integration with `BaseDataContext.test_yaml_config` Args: pretty_print: flag to turn on verbose output Returns: @@ -1261,7 +1261,7 @@ def __init__( config_version: float, variables: Optional[Dict[str, Any]] = None, rules: Optional[Dict[str, Dict[str, Any]]] = None, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Create a new Profiler using configured rules. @@ -1275,7 +1275,7 @@ def __init__( variables: Any variables to be substituted within the rules rules: A set of dictionaries, each of which contains its own domain_builder, parameter_builders, and expectation_configuration_builders configuration components - data_context: DataContext object that defines a full runtime environment (data access, etc.) + data_context: BaseDataContext object that defines full runtime environment (data access, etc.) 
""" profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig( name=name, diff --git a/great_expectations/rule_based_profiler/types/builder.py b/great_expectations/rule_based_profiler/types/builder.py index 1d28300fad57..5b12a664aaf1 100644 --- a/great_expectations/rule_based_profiler/types/builder.py +++ b/great_expectations/rule_based_profiler/types/builder.py @@ -26,11 +26,11 @@ class Builder(SerializableDictDot): def __init__( self, - data_context: Optional["DataContext"] = None, # noqa: F821 + data_context: Optional["BaseDataContext"] = None, # noqa: F821 ): """ Args: - data_context: DataContext + data_context: BaseDataContext associated with this Builder """ self._batch_list = None self._batch_request = None @@ -60,7 +60,7 @@ def batch_request(self, value: Optional[Union[BatchRequestBase, dict]]) -> None: self._batch_request = value @property - def data_context(self) -> Optional["DataContext"]: # noqa: F821 + def data_context(self) -> Optional["BaseDataContext"]: # noqa: F821 return self._data_context def set_batch_list_or_batch_request( diff --git a/tests/integration/docusaurus/expectations/advanced/multi_batch_rule_based_profiler_example.py b/tests/integration/docusaurus/expectations/advanced/multi_batch_rule_based_profiler_example.py index 2692ebaf5c93..22dc1e9272b6 100644 --- a/tests/integration/docusaurus/expectations/advanced/multi_batch_rule_based_profiler_example.py +++ b/tests/integration/docusaurus/expectations/advanced/multi_batch_rule_based_profiler_example.py @@ -22,12 +22,6 @@ parameter_builders: - name: row_count_range class_name: NumericMetricRangeMultiBatchParameterBuilder - batch_request: - datasource_name: taxi_pandas - data_connector_name: monthly - data_asset_name: my_reports - data_connector_query: - index: "-6:-1" metric_name: table.row_count metric_domain_kwargs: $domain.domain_kwargs false_positive_rate: $variables.false_positive_rate @@ -48,34 +42,15 @@ class_name: ColumnDomainBuilder include_semantic_types: - numeric - # BatchRequest yielding exactly one batch (March, 2019 trip data) - batch_request: - datasource_name: taxi_pandas - data_connector_name: monthly - data_asset_name: my_reports - data_connector_query: - index: -1 parameter_builders: - name: min_range class_name: NumericMetricRangeMultiBatchParameterBuilder - batch_request: - datasource_name: taxi_pandas - data_connector_name: monthly - data_asset_name: my_reports - data_connector_query: - index: "-6:-1" metric_name: column.min metric_domain_kwargs: $domain.domain_kwargs false_positive_rate: $variables.false_positive_rate round_decimals: 2 - name: max_range class_name: NumericMetricRangeMultiBatchParameterBuilder - batch_request: - datasource_name: taxi_pandas - data_connector_name: monthly - data_asset_name: my_reports - data_connector_query: - index: "-6:-1" metric_name: column.max metric_domain_kwargs: $domain.domain_kwargs false_positive_rate: $variables.false_positive_rate @@ -113,7 +88,16 @@ data_context=data_context, ) -rule_based_profiler.run() +batch_request: dict = { + "datasource_name": "taxi_pandas", + "data_connector_name": "monthly", + "data_asset_name": "my_reports", + "data_connector_query": { + "index": "-6:-1", + }, +} + +rule_based_profiler.run(batch_request=batch_request) suite: ExpectationSuite = rule_based_profiler.get_expectation_suite( expectation_suite_name="test_suite_name" ) diff --git a/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py b/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py index 
74bdff4728d1..7c4d03eb3e2f 100644 --- a/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py +++ b/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py @@ -1,5 +1,4 @@ import datetime -import uuid from numbers import Number from typing import Any, Callable, Dict, List, Optional, Tuple, cast from unittest import mock @@ -12,7 +11,6 @@ from ruamel.yaml import YAML from ruamel.yaml.comments import CommentedMap -import great_expectations.exceptions as ge_exceptions from great_expectations import DataContext from great_expectations.core import ExpectationSuite, ExpectationValidationResult from great_expectations.core.batch import BatchRequest @@ -35,6 +33,7 @@ ) from tests.rule_based_profiler.conftest import ATOL, RTOL from tests.rule_based_profiler.parameter_builder.conftest import RANDOM_SEED +from tests.test_utils import get_validator_with_expectation_suite yaml = YAML() @@ -673,30 +672,18 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf ): context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - - # Use all batches, loaded by Validator, for estimating Expectation argument values. batch_request: dict = { "datasource_name": "taxi_pandas", "data_connector_name": "monthly", "data_asset_name": "my_reports", } - validator: Validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 3 @@ -815,20 +802,6 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf ): context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - batch_request: dict validator: Validator @@ -844,9 +817,12 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf "data_asset_name": "my_reports", } - validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 3 @@ -973,6 +949,15 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf }, } + validator: Validator = get_validator_with_expectation_suite( + 
batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", + ) + assert len(validator.batches) == 1 + custom_profiler_config = RuleBasedProfilerConfig( name="expect_column_values_to_be_between", # Convention: use "expectation_type" as profiler name. config_version=1.0, @@ -1009,12 +994,6 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf }, ) - validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, - ) - assert len(validator.batches) == 1 - result = validator.expect_column_values_to_be_between( column=column_name, mostly=1.0, @@ -1078,20 +1057,6 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - batch_request: dict validator: Validator @@ -1152,9 +1117,12 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf "data_asset_name": "my_reports", } - validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 3 @@ -1244,9 +1212,12 @@ def test_bobby_expect_column_values_to_be_between_auto_yes_default_profiler_conf }, } - validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 1 @@ -1483,22 +1454,6 @@ def test_bobster_expect_table_row_count_to_be_between_auto_yes_default_profiler_ bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context ) - result: ExpectationValidationResult - - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - # Use all batches, loaded by Validator, for estimating Expectation argument values. 
batch_request: dict = { "datasource_name": "taxi_pandas", @@ -1506,16 +1461,21 @@ def test_bobster_expect_table_row_count_to_be_between_auto_yes_default_profiler_ "data_asset_name": "my_reports", } - validator: Validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 36 - result = validator.expect_table_row_count_to_be_between( - result_format="SUMMARY", - include_config=True, - auto=True, + result: ExpectationValidationResult = ( + validator.expect_table_row_count_to_be_between( + result_format="SUMMARY", + include_config=True, + auto=True, + ) ) assert result.expectation_config.kwargs["auto"] @@ -1686,29 +1646,18 @@ def test_quentin_expect_column_quantile_values_to_be_between_auto_yes_default_pr custom_profiler_config: RuleBasedProfilerConfig - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - batch_request: dict = { "datasource_name": "taxi_pandas", "data_connector_name": "monthly", "data_asset_name": "my_reports", } - validator: Validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 36 @@ -1870,38 +1819,25 @@ def test_quentin_expect_column_values_to_be_in_set_auto_yes_default_profiler_con ): context: DataContext = quentin_columnar_table_multi_batch_data_context - result: ExpectationValidationResult - custom_profiler_config: RuleBasedProfilerConfig - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - batch_request: dict = { "datasource_name": "taxi_pandas", "data_connector_name": "monthly", "data_asset_name": "my_reports", } - validator: Validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 36 # Use all batches, loaded by Validator, for estimating Expectation argument values. 
- result = validator.expect_column_values_to_be_in_set( + result: ExpectationValidationResult = validator.expect_column_values_to_be_in_set( column="passenger_count", result_format="SUMMARY", include_config=True, @@ -1941,38 +1877,25 @@ def test_quentin_expect_column_min_to_be_between_auto_yes_default_profiler_confi ): context: DataContext = quentin_columnar_table_multi_batch_data_context - result: ExpectationValidationResult - custom_profiler_config: RuleBasedProfilerConfig - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - batch_request: dict = { "datasource_name": "taxi_pandas", "data_connector_name": "monthly", "data_asset_name": "my_reports", } - validator: Validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 36 # Use all batches, loaded by Validator, for estimating Expectation argument values. - result = validator.expect_column_min_to_be_between( + result: ExpectationValidationResult = validator.expect_column_min_to_be_between( column="fare_amount", result_format="SUMMARY", include_config=True, @@ -2011,36 +1934,23 @@ def test_quentin_expect_column_max_to_be_between_auto_yes_default_profiler_confi ): context: DataContext = quentin_columnar_table_multi_batch_data_context - result: ExpectationValidationResult - - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - batch_request: dict = { "datasource_name": "taxi_pandas", "data_connector_name": "monthly", "data_asset_name": "my_reports", } - validator: Validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 36 # Use all batches, loaded by Validator, for estimating Expectation argument values. 
- result = validator.expect_column_max_to_be_between( + result: ExpectationValidationResult = validator.expect_column_max_to_be_between( column="fare_amount", result_format="SUMMARY", include_config=True, @@ -2107,29 +2017,18 @@ def test_quentin_expect_column_unique_value_count_to_be_between_auto_yes_default result: ExpectationValidationResult - suite: ExpectationSuite - - expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}" - try: - # noinspection PyUnusedLocal - suite = context.get_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - except ge_exceptions.DataContextError: - suite = context.create_expectation_suite( - expectation_suite_name=expectation_suite_name - ) - print(f'Created ExpectationSuite "{suite.expectation_suite_name}".') - batch_request: dict = { "datasource_name": "taxi_pandas", "data_connector_name": "monthly", "data_asset_name": "my_reports", } - validator: Validator = context.get_validator( - batch_request=BatchRequest(**batch_request), - expectation_suite_name=expectation_suite_name, + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="profiler", ) assert len(validator.batches) == 36 diff --git a/tests/rule_based_profiler/data_assistant/test_volume_data_assistant.py b/tests/rule_based_profiler/data_assistant/test_volume_data_assistant.py index 78bfb35f3d60..6df38ba272b8 100644 --- a/tests/rule_based_profiler/data_assistant/test_volume_data_assistant.py +++ b/tests/rule_based_profiler/data_assistant/test_volume_data_assistant.py @@ -13,43 +13,18 @@ DataAssistant, VolumeDataAssistant, ) -from great_expectations.rule_based_profiler.helpers.util import ( - convert_variables_to_dict, -) -from great_expectations.rule_based_profiler.rule import Rule -from great_expectations.rule_based_profiler.rule_based_profiler import ( - BaseRuleBasedProfiler, -) -from great_expectations.rule_based_profiler.types import ( - Domain, - build_parameter_container_for_variables, -) +from great_expectations.rule_based_profiler.types import Domain from great_expectations.rule_based_profiler.types.data_assistant_result import ( DataAssistantResult, ) from great_expectations.util import deep_filter_properties_iterable +from great_expectations.validator.validator import Validator from tests.render.test_util import load_notebook_from_path from tests.rule_based_profiler.parameter_builder.conftest import RANDOM_SEED - - -def set_bootstrap_random_seed_variable( - profiler: BaseRuleBasedProfiler, random_seed: int = RANDOM_SEED -) -> None: - variables_dict: dict - - variables_dict = convert_variables_to_dict(variables=profiler.variables) - variables_dict["bootstrap_random_seed"] = random_seed - profiler.variables = build_parameter_container_for_variables( - variables_configs=variables_dict - ) - - rule: Rule - for rule in profiler.rules: - variables_dict = convert_variables_to_dict(variables=rule.variables) - variables_dict["bootstrap_random_seed"] = random_seed - rule.variables = build_parameter_container_for_variables( - variables_configs=variables_dict - ) +from tests.test_utils import ( + get_validator_with_expectation_suite, + set_bootstrap_random_seed_variable, +) def run_volume_data_assistant_result_jupyter_notebook_with_new_cell( @@ -73,13 +48,79 @@ def run_volume_data_assistant_result_jupyter_notebook_with_new_cell( context.create_expectation_suite(expectation_suite_name) notebook_path: str = os.path.join(root_dir, 
f"run_volume_data_assistant.ipynb") notebook_code: str = """ + from typing import Optional, Union + + import uuid + import great_expectations as ge + from great_expectations.data_context import BaseDataContext + from great_expectations.core.batch import BatchRequestBase, materialize_batch_request + from great_expectations.core import ExpectationSuite + from great_expectations.validator.validator import Validator from great_expectations.rule_based_profiler.data_assistant import ( DataAssistant, VolumeDataAssistant, ) from great_expectations.rule_based_profiler.types.data_assistant_result import DataAssistantResult + import great_expectations.exceptions as ge_exceptions + """ + notebook_code += """ + def get_validator_with_expectation_suite( + batch_request: Union[BatchRequestBase, dict], + data_context: BaseDataContext, + expectation_suite: Optional[ExpectationSuite] = None, + expectation_suite_name: Optional[str] = None, + component_name: Optional[str] = None, + ) -> Validator: + suite: ExpectationSuite + + generate_temp_expectation_suite_name: bool + create_expectation_suite: bool + + if expectation_suite is not None and expectation_suite_name is not None: + if expectation_suite.expectation_suite_name != expectation_suite_name: + raise ValueError( + 'Mutually inconsistent "expectation_suite" and "expectation_suite_name" were specified.' + ) + generate_temp_expectation_suite_name = False + create_expectation_suite = False + elif expectation_suite is None and expectation_suite_name is not None: + generate_temp_expectation_suite_name = False + create_expectation_suite = True + elif expectation_suite is not None and expectation_suite_name is None: + generate_temp_expectation_suite_name = False + create_expectation_suite = False + else: + generate_temp_expectation_suite_name = True + create_expectation_suite = True + + if generate_temp_expectation_suite_name: + if not component_name: + component_name = "test" + + expectation_suite_name = f"tmp.{component_name}.suite_{str(uuid.uuid4())[:8]}" + + if create_expectation_suite: + try: + # noinspection PyUnusedLocal + expectation_suite = data_context.get_expectation_suite( + expectation_suite_name=expectation_suite_name + ) + except ge_exceptions.DataContextError: + expectation_suite = data_context.create_expectation_suite( + expectation_suite_name=expectation_suite_name + ) + print(f'Created ExpectationSuite "{expectation_suite.expectation_suite_name}".') + + batch_request = materialize_batch_request(batch_request=batch_request) + validator: Validator = data_context.get_validator( + batch_request=batch_request, + expectation_suite_name=expectation_suite_name, + ) + return validator + """ + notebook_code += """ context = ge.get_context() batch_request: dict = { @@ -88,10 +129,17 @@ def run_volume_data_assistant_result_jupyter_notebook_with_new_cell( "data_asset_name": "my_reports", } - data_assistant: DataAssistant = VolumeDataAssistant( - name="test_volume_data_assistant", + validator: Validator = get_validator_with_expectation_suite( batch_request=batch_request, data_context=context, + expectation_suite_name=None, + expectation_suite=None, + component_name="volume_data_assistant", + ) + + data_assistant: DataAssistant = VolumeDataAssistant( + name="test_volume_data_assistant", + validator=validator, ) expectation_suite_name: str = "test_suite" @@ -132,6 +180,15 @@ def test_get_metrics_and_expectations( "data_asset_name": "my_reports", } + validator: Validator = get_validator_with_expectation_suite( + batch_request=batch_request, + 
+        data_context=context,
+        expectation_suite_name=None,
+        expectation_suite=None,
+        component_name="volume_data_assistant",
+    )
+    assert len(validator.batches) == 36
+
     expected_metrics_by_domain: Dict[Domain, Dict[str, Any]] = {
         Domain(domain_type="table",): {
             "$parameter.table_row_count": {
@@ -2594,8 +2651,7 @@ def test_get_metrics_and_expectations(
     # Utilize a consistent seed to deal with probabilistic nature of this feature.
     data_assistant: DataAssistant = VolumeDataAssistant(
         name="test_volume_data_assistant",
-        batch_request=batch_request,
-        data_context=context,
+        validator=validator,
     )
     set_bootstrap_random_seed_variable(profiler=data_assistant.profiler)
     data_assistant_result: DataAssistantResult = data_assistant.run(
@@ -2634,10 +2690,18 @@ def test_execution_time_within_proper_bounds(
         "data_asset_name": "my_reports",
     }

-    data_assistant: DataAssistant = VolumeDataAssistant(
-        name="test_volume_data_assistant",
+    validator: Validator = get_validator_with_expectation_suite(
         batch_request=batch_request,
         data_context=context,
+        expectation_suite_name=None,
+        expectation_suite=None,
+        component_name="volume_data_assistant",
+    )
+    assert len(validator.batches) == 36
+
+    data_assistant: DataAssistant = VolumeDataAssistant(
+        name="test_volume_data_assistant",
+        validator=validator,
     )
     data_assistant_result: DataAssistantResult = data_assistant.run()
diff --git a/tests/rule_based_profiler/test_rule_based_profiler.py b/tests/rule_based_profiler/test_rule_based_profiler.py
index 655e4a2cb06d..d20321944050 100644
--- a/tests/rule_based_profiler/test_rule_based_profiler.py
+++ b/tests/rule_based_profiler/test_rule_based_profiler.py
@@ -906,7 +906,7 @@ def test_reconcile_profiler_rules_existing_rule_full_rule_override_update(

 @mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run")
-@mock.patch("great_expectations.data_context.data_context.DataContext")
+@mock.patch("great_expectations.data_context.data_context.BaseDataContext")
 def test_run_profiler_without_dynamic_args(
     mock_data_context: mock.MagicMock,
     mock_profiler_run: mock.MagicMock,
@@ -936,7 +936,7 @@ def test_run_profiler_without_dynamic_args(

 @mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run")
-@mock.patch("great_expectations.data_context.data_context.DataContext")
+@mock.patch("great_expectations.data_context.data_context.BaseDataContext")
 def test_run_profiler_with_dynamic_args(
     mock_data_context: mock.MagicMock,
     mock_profiler_run: mock.MagicMock,
@@ -976,7 +976,7 @@ def test_run_profiler_with_dynamic_args(

 @mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run")
-@mock.patch("great_expectations.data_context.data_context.DataContext")
+@mock.patch("great_expectations.data_context.data_context.BaseDataContext")
 def test_run_profiler_on_data_creates_suite_with_dict_arg(
     mock_data_context: mock.MagicMock,
     mock_rule_based_profiler_run: mock.MagicMock,
@@ -1003,7 +1003,7 @@ def test_run_profiler_on_data_creates_suite_with_dict_arg(

 @mock.patch("great_expectations.rule_based_profiler.RuleBasedProfiler.run")
-@mock.patch("great_expectations.data_context.data_context.DataContext")
+@mock.patch("great_expectations.data_context.data_context.BaseDataContext")
 def test_run_profiler_on_data_creates_suite_with_batch_request_arg(
     mock_data_context: mock.MagicMock,
     mock_rule_based_profiler_run: mock.MagicMock,
@@ -1034,7 +1034,7 @@ def test_run_profiler_on_data_creates_suite_with_batch_request_arg(
     assert resulting_batch_request == expected_batch_request

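Editor's note: the hunks above and below retarget every mocked class from DataContext to BaseDataContext, matching the profiler's loosened data_context dependency. A minimal sketch (not part of the patch, illustrative test name) of the pattern these tests rely on, assuming great_expectations is importable:

```python
from unittest import mock

from great_expectations.rule_based_profiler.rule_based_profiler import (
    RuleBasedProfiler,
)


# Patching the class by import path swaps it for a MagicMock for the duration
# of the test; the mock is injected as the first positional argument.
@mock.patch("great_expectations.data_context.data_context.BaseDataContext")
def test_profiler_accepts_mocked_context(mock_data_context: mock.MagicMock) -> None:
    # No real project on disk is needed; the MagicMock stands in for the
    # "data_context" dependency of RuleBasedProfiler.
    profiler = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=mock_data_context,
    )
    assert profiler.name == "my_rbp"
```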
-@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_get_profiler_with_too_many_args_raises_error( mock_data_context: mock.MagicMock, populated_profiler_store: ProfilerStore, @@ -1050,7 +1050,7 @@ def test_get_profiler_with_too_many_args_raises_error( assert "either name or ge_cloud_id" in str(e.value) -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_add_profiler( mock_data_context: mock.MagicMock, profiler_key: ConfigurationIdentifier, @@ -1070,7 +1070,7 @@ def test_add_profiler( ) -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_add_profiler_ge_cloud_mode( mock_data_context: mock.MagicMock, ge_cloud_profiler_id: str, @@ -1092,7 +1092,7 @@ def test_add_profiler_ge_cloud_mode( ) -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_add_profiler_with_batch_request_containing_batch_data_raises_error( mock_data_context: mock.MagicMock, ): @@ -1137,7 +1137,7 @@ def test_add_profiler_with_batch_request_containing_batch_data_raises_error( assert "batch_data found in batch_request" in str(e.value) -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_get_profiler( mock_data_context: mock.MagicMock, populated_profiler_store: ProfilerStore, @@ -1157,7 +1157,7 @@ def test_get_profiler( assert isinstance(profiler, RuleBasedProfiler) -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_get_profiler_non_existent_profiler_raises_error( mock_data_context: mock.MagicMock, empty_profiler_store: ProfilerStore ): @@ -1238,7 +1238,7 @@ def test_list_profilers_in_cloud_mode(mock_profiler_store: mock.MagicMock): assert store.list_keys.called -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") @mock.patch("great_expectations.rule_based_profiler.domain_builder.ColumnDomainBuilder") @mock.patch( "great_expectations.rule_based_profiler.expectation_configuration_builder.DefaultExpectationConfigurationBuilder" @@ -1275,7 +1275,7 @@ def test_add_single_rule( assert len(profiler.rules) == 1 -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") @mock.patch("great_expectations.rule_based_profiler.domain_builder.ColumnDomainBuilder") @mock.patch( "great_expectations.rule_based_profiler.expectation_configuration_builder.DefaultExpectationConfigurationBuilder" @@ -1303,7 +1303,7 @@ def test_add_rule_overwrite_first_rule( assert len(profiler.rules) == 1 -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") @mock.patch("great_expectations.rule_based_profiler.domain_builder.ColumnDomainBuilder") @mock.patch( "great_expectations.rule_based_profiler.expectation_configuration_builder.DefaultExpectationConfigurationBuilder" @@ -1340,7 +1340,7 @@ def test_add_rule_add_second_rule( assert len(profiler.rules) == 2 
-@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_add_rule_bad_rule( mock_data_context: mock.MagicMock, sample_rule_dict: dict, @@ -1361,7 +1361,7 @@ def test_add_rule_bad_rule( assert "'dict' object has no attribute 'name'" in str(e.value) -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_run_with_expectation_suite_arg(mock_data_context: mock.MagicMock): profiler: RuleBasedProfiler = RuleBasedProfiler( name="my_rbp", data_context=mock_data_context, config_version=1.0 @@ -1377,7 +1377,7 @@ def test_run_with_expectation_suite_arg(mock_data_context: mock.MagicMock): assert id(suite) == id(result_suite) -@mock.patch("great_expectations.data_context.data_context.DataContext") +@mock.patch("great_expectations.data_context.data_context.BaseDataContext") def test_run_with_conflicting_expectation_suite_args_raises_error( mock_data_context: mock.MagicMock, ): diff --git a/tests/test_utils.py b/tests/test_utils.py index a4d21b6cfe03..e5369b38d41b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,6 +11,10 @@ import pandas as pd import pytest +import great_expectations.exceptions as ge_exceptions +from great_expectations.core import ExpectationSuite +from great_expectations.core.batch import BatchRequestBase, materialize_batch_request +from great_expectations.data_context import BaseDataContext from great_expectations.data_context.store import ( CheckpointStore, ProfilerStore, @@ -31,8 +35,19 @@ build_store_from_config, instantiate_class_from_config, ) -from great_expectations.datasource.data_connector import InferredAssetSqlDataConnector from great_expectations.execution_engine import SqlAlchemyExecutionEngine +from great_expectations.rule_based_profiler.helpers.util import ( + convert_variables_to_dict, +) +from great_expectations.rule_based_profiler.rule import Rule +from great_expectations.rule_based_profiler.rule_based_profiler import ( + BaseRuleBasedProfiler, +) +from great_expectations.rule_based_profiler.types import ( + build_parameter_container_for_variables, +) +from great_expectations.validator.validator import Validator +from tests.rule_based_profiler.parameter_builder.conftest import RANDOM_SEED logger = logging.getLogger(__name__) @@ -696,3 +711,84 @@ def clean_athena_db(connection_string: str, db_name: str, table_to_keep: str) -> finally: connection.close() engine.dispose() + + +def get_validator_with_expectation_suite( + batch_request: Union[BatchRequestBase, dict], + data_context: BaseDataContext, + expectation_suite: Optional[ExpectationSuite] = None, + expectation_suite_name: Optional[str] = None, + component_name: str = "test", +) -> Validator: + """ + Instantiates and returns "Validator" object using "data_context", "batch_request", and other available information. + Use "expectation_suite" if provided. If not, then if "expectation_suite_name" is specified, then create + "ExpectationSuite" from it. Otherwise, generate temporary "expectation_suite_name" using supplied "component_name". 
+ """ + suite: ExpectationSuite + + generate_temp_expectation_suite_name: bool + create_expectation_suite: bool + + if expectation_suite is not None and expectation_suite_name is not None: + if expectation_suite.expectation_suite_name != expectation_suite_name: + raise ValueError( + 'Mutually inconsistent "expectation_suite" and "expectation_suite_name" were specified.' + ) + generate_temp_expectation_suite_name = False + create_expectation_suite = False + elif expectation_suite is None and expectation_suite_name is not None: + generate_temp_expectation_suite_name = False + create_expectation_suite = True + elif expectation_suite is not None and expectation_suite_name is None: + generate_temp_expectation_suite_name = False + create_expectation_suite = False + else: + generate_temp_expectation_suite_name = True + create_expectation_suite = True + + if generate_temp_expectation_suite_name: + expectation_suite_name = f"tmp.{component_name}.suite_{str(uuid.uuid4())[:8]}" + + if create_expectation_suite: + try: + # noinspection PyUnusedLocal + expectation_suite = data_context.get_expectation_suite( + expectation_suite_name=expectation_suite_name + ) + except ge_exceptions.DataContextError: + expectation_suite = data_context.create_expectation_suite( + expectation_suite_name=expectation_suite_name + ) + print( + f'Created ExpectationSuite "{expectation_suite.expectation_suite_name}".' + ) + + batch_request = materialize_batch_request(batch_request=batch_request) + validator: Validator = data_context.get_validator( + batch_request=batch_request, + expectation_suite_name=expectation_suite_name, + ) + + return validator + + +def set_bootstrap_random_seed_variable( + profiler: BaseRuleBasedProfiler, + random_seed: int = RANDOM_SEED, +) -> None: + variables_dict: dict + + variables_dict = convert_variables_to_dict(variables=profiler.variables) + variables_dict["bootstrap_random_seed"] = random_seed + profiler.variables = build_parameter_container_for_variables( + variables_configs=variables_dict + ) + + rule: Rule + for rule in profiler.rules: + variables_dict = convert_variables_to_dict(variables=rule.variables) + variables_dict["bootstrap_random_seed"] = random_seed + rule.variables = build_parameter_container_for_variables( + variables_configs=variables_dict + )