diff --git a/great_expectations/core/expectation_diagnostics/expectation_diagnostics.py b/great_expectations/core/expectation_diagnostics/expectation_diagnostics.py
index e68aac1303ea..c62d7dfe7e4d 100644
--- a/great_expectations/core/expectation_diagnostics/expectation_diagnostics.py
+++ b/great_expectations/core/expectation_diagnostics/expectation_diagnostics.py
@@ -87,6 +87,7 @@ def generate_checklist(self) -> str:
         """Generates the checklist in CLI-appropriate string format."""
         str_ = self._convert_checks_into_output_message(
             self.description["camel_name"],
+            self.library_metadata.maturity,
             self.maturity_checklist,
         )
         return str_
@@ -327,11 +328,13 @@ def _count_unexpected_test_cases(
 
     @staticmethod
     def _convert_checks_into_output_message(
-        class_name: str, maturity_messages: ExpectationDiagnosticMaturityMessages
+        class_name: str,
+        maturity_level: str,
+        maturity_messages: ExpectationDiagnosticMaturityMessages,
     ) -> str:
         """Converts a list of checks into an output string (potentially nested), with ✔ to indicate checks that passed."""
 
-        output_message = f"Completeness checklist for {class_name}:"
+        output_message = f"Completeness checklist for {class_name} ({maturity_level}):"
 
         checks = (
             maturity_messages.experimental
diff --git a/great_expectations/data_context/data_context/base_data_context.py b/great_expectations/data_context/data_context/base_data_context.py
index 3da5413b676c..15abfddb00df 100644
--- a/great_expectations/data_context/data_context/base_data_context.py
+++ b/great_expectations/data_context/data_context/base_data_context.py
@@ -11,7 +11,7 @@
 import warnings
 import webbrowser
 from collections import OrderedDict
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union, cast
 
 from dateutil.parser import parse
 from ruamel.yaml import YAML
@@ -73,6 +73,7 @@
     DataContextConfig,
     DataContextConfigDefaults,
     DatasourceConfig,
+    GeCloudConfig,
     ProgressBarsConfig,
     anonymizedUsageStatisticsSchema,
     dataContextConfigSchema,
@@ -98,7 +99,6 @@
 from great_expectations.datasource import LegacyDatasource
 from great_expectations.datasource.data_connector.data_connector import DataConnector
 from great_expectations.datasource.new_datasource import BaseDatasource, Datasource
-from great_expectations.exceptions import DataContextError
 from great_expectations.marshmallow__shade import ValidationError
 from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler
 from great_expectations.render.renderer.site_builder import SiteBuilder
@@ -300,7 +300,7 @@ class BaseDataContext(ConfigPeer):
     _data_context = None
 
     @classmethod
-    def validate_config(cls, project_config):
+    def validate_config(cls, project_config: Union[DataContextConfig, Mapping]) -> bool:
         if isinstance(project_config, DataContextConfig):
             return True
         try:
@@ -314,12 +314,12 @@ def validate_config(cls, project_config):
     )
     def __init__(
         self,
-        project_config,
-        context_root_dir=None,
-        runtime_environment=None,
-        ge_cloud_mode=False,
-        ge_cloud_config=None,
-    ):
+        project_config: Union[DataContextConfig, Mapping],
+        context_root_dir: Optional[str] = None,
+        runtime_environment: Optional[dict] = None,
+        ge_cloud_mode: bool = False,
+        ge_cloud_config: Optional[GeCloudConfig] = None,
+    ) -> None:
         """DataContext constructor
 
         Args:
@@ -399,14 +399,16 @@ def __init__(
         self._evaluation_parameter_dependencies = {}
 
     @property
-    def ge_cloud_config(self):
+    def ge_cloud_config(self) -> Optional[GeCloudConfig]:
         return self._ge_cloud_config
 
     @property
-    def ge_cloud_mode(self):
+    def ge_cloud_mode(self) -> bool:
         return self._ge_cloud_mode
 
-    def _build_store_from_config(self, store_name, store_config):
+    def _build_store_from_config(
+        self, store_name: str, store_config: dict
+    ) -> Optional[Store]:
         module_name = "great_expectations.data_context.store"
         # Set expectations_store.store_backend_id to the data_context_id from the project_config if
         # the expectations_store does not yet exist by:
@@ -439,7 +441,7 @@ def _build_store_from_config(self, store_name, store_config):
         self._stores[store_name] = new_store
         return new_store
 
-    def _init_stores(self, store_configs):
+    def _init_stores(self, store_configs: Dict[str, dict]) -> None:
         """Initialize all Stores for this DataContext.
 
         Stores are a good fit for reading/writing objects that:
@@ -466,7 +468,7 @@ def _init_datasources(self, config: DataContextConfig) -> None:
             # caught at the context.get_batch() step. So we just pass here.
             pass
 
-    def _apply_global_config_overrides(self):
+    def _apply_global_config_overrides(self) -> None:
         # check for global usage statistics opt out
         validation_errors = {}
 
@@ -524,8 +526,11 @@ def _apply_global_config_overrides(self):
 
     @classmethod
     def _get_global_config_value(
-        cls, environment_variable=None, conf_file_section=None, conf_file_option=None
-    ):
+        cls,
+        environment_variable: Optional[str] = None,
+        conf_file_section=None,
+        conf_file_option=None,
+    ) -> Optional[str]:
         assert (conf_file_section and conf_file_option) or (
             not conf_file_section and not conf_file_option
         ), "Must pass both 'conf_file_section' and 'conf_file_option' or neither."
@@ -543,7 +548,7 @@ def _get_global_config_value(
         return None
 
     @staticmethod
-    def _check_global_usage_statistics_opt_out():
+    def _check_global_usage_statistics_opt_out() -> bool:
         if os.environ.get("GE_USAGE_STATS", False):
             ge_usage_stats = os.environ.get("GE_USAGE_STATS")
             if ge_usage_stats in BaseDataContext.FALSEY_STRINGS:
@@ -598,7 +603,7 @@ def _construct_data_context_id(self) -> str:
 
     def _initialize_usage_statistics(
         self, usage_statistics_config: AnonymizedUsageStatisticsConfig
-    ):
+    ) -> None:
         """Initialize the usage statistics system."""
         if not usage_statistics_config.enabled:
             logger.info("Usage statistics is disabled; skipping initialization.")
@@ -611,7 +616,7 @@ def _initialize_usage_statistics(
             usage_statistics_url=usage_statistics_config.usage_statistics_url,
         )
 
-    def add_store(self, store_name, store_config):
+    def add_store(self, store_name: str, store_config: dict) -> Optional[Store]:
         """Add a new Store to the DataContext and (for convenience) return the instantiated Store object.
 
         Args:
@@ -626,8 +631,8 @@ def add_store(self, store_name, store_config):
         return self._build_store_from_config(store_name, store_config)
 
     def add_validation_operator(
-        self, validation_operator_name, validation_operator_config
-    ):
+        self, validation_operator_name: str, validation_operator_config: dict
+    ) -> "ValidationOperator":
         """Add a new ValidationOperator to the DataContext and (for convenience) return the instantiated object.
 
         Args:
@@ -662,7 +667,9 @@ def add_validation_operator(
         self.validation_operators[validation_operator_name] = new_validation_operator
         return new_validation_operator
 
-    def _normalize_absolute_or_relative_path(self, path):
+    def _normalize_absolute_or_relative_path(
+        self, path: Optional[str]
+    ) -> Optional[str]:
         if path is None:
             return
         if os.path.isabs(path):
diff --git a/great_expectations/data_context/data_context/data_context.py b/great_expectations/data_context/data_context/data_context.py
index 3bf71366dcfb..a18d7cf6ca8b 100644
--- a/great_expectations/data_context/data_context/data_context.py
+++ b/great_expectations/data_context/data_context/data_context.py
@@ -2,7 +2,7 @@
 import os
 import shutil
 import warnings
-from typing import Optional, Union
+from typing import Dict, Optional, Union
 
 import requests
 from ruamel.yaml import YAML, YAMLError
@@ -76,10 +76,10 @@ class DataContext(BaseDataContext):
     @classmethod
     def create(
         cls,
-        project_root_dir=None,
-        usage_statistics_enabled=True,
-        runtime_environment=None,
-    ):
+        project_root_dir: Optional[str] = None,
+        usage_statistics_enabled: bool = True,
+        runtime_environment: Optional[dict] = None,
+    ) -> "DataContext":
         """
         Build a new great_expectations directory and DataContext object in the provided project_root_dir.
 
@@ -126,7 +126,7 @@ def create(
         return cls(ge_dir, runtime_environment=runtime_environment)
 
     @classmethod
-    def all_uncommitted_directories_exist(cls, ge_dir):
+    def all_uncommitted_directories_exist(cls, ge_dir: str) -> bool:
         """Check if all uncommitted directories exist."""
         uncommitted_dir = os.path.join(ge_dir, cls.GE_UNCOMMITTED_DIR)
         for directory in cls.UNCOMMITTED_DIRECTORIES:
@@ -136,7 +136,7 @@ def all_uncommitted_directories_exist(cls, ge_dir):
         return True
 
     @classmethod
-    def config_variables_yml_exist(cls, ge_dir):
+    def config_variables_yml_exist(cls, ge_dir: str) -> bool:
         """Check if all config_variables.yml exists."""
         path_to_yml = os.path.join(ge_dir, cls.GE_YML)
 
@@ -148,14 +148,16 @@ def config_variables_yml_exist(cls, ge_dir):
         return os.path.isfile(config_var_path)
 
     @classmethod
-    def write_config_variables_template_to_disk(cls, uncommitted_dir):
+    def write_config_variables_template_to_disk(cls, uncommitted_dir: str) -> None:
         os.makedirs(uncommitted_dir, exist_ok=True)
         config_var_file = os.path.join(uncommitted_dir, "config_variables.yml")
         with open(config_var_file, "w") as template:
             template.write(CONFIG_VARIABLES_TEMPLATE)
 
     @classmethod
-    def write_project_template_to_disk(cls, ge_dir, usage_statistics_enabled=True):
+    def write_project_template_to_disk(
+        cls, ge_dir: str, usage_statistics_enabled: bool = True
+    ) -> None:
         file_path = os.path.join(ge_dir, cls.GE_YML)
         with open(file_path, "w") as template:
             if usage_statistics_enabled:
@@ -164,7 +166,7 @@ def write_project_template_to_disk(cls, ge_dir, usage_statistics_enabled=True):
                 template.write(PROJECT_TEMPLATE_USAGE_STATISTICS_DISABLED)
 
     @classmethod
-    def scaffold_directories(cls, base_dir):
+    def scaffold_directories(cls, base_dir: str) -> None:
         """Safely create GE directories for a new project."""
         os.makedirs(base_dir, exist_ok=True)
         with open(os.path.join(base_dir, ".gitignore"), "w") as f:
@@ -200,7 +202,7 @@ def scaffold_directories(cls, base_dir):
             os.makedirs(new_directory_path, exist_ok=True)
 
     @classmethod
-    def scaffold_custom_data_docs(cls, plugins_dir):
+    def scaffold_custom_data_docs(cls, plugins_dir: str) -> None:
         """Copy custom data docs templates"""
         styles_template = file_relative_path(
             __file__,
@@ -219,7 +221,7 @@ def _get_ge_cloud_config_dict(
         ge_cloud_account_id: Optional[str] = None,
         ge_cloud_access_token: Optional[str] = None,
         ge_cloud_organization_id: Optional[str] = None,
-    ):
+    ) -> Dict[str, Optional[str]]:
         ge_cloud_base_url = (
             ge_cloud_base_url
             or super()._get_global_config_value(
@@ -278,7 +280,7 @@ def get_ge_cloud_config(
         ge_cloud_account_id: Optional[str] = None,
         ge_cloud_access_token: Optional[str] = None,
         ge_cloud_organization_id: Optional[str] = None,
-    ):
+    ) -> GeCloudConfig:
         """
         Build a GeCloudConfig object. Config attributes are collected from any combination of args passed in at
         runtime, environment variables, or a global great_expectations.conf file (in order of precedence)
diff --git a/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py b/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py
index 500ebcce5658..41d80c3f5d80 100644
--- a/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py
+++ b/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 from great_expectations.core.expectation_configuration import ExpectationConfiguration
 from great_expectations.execution_engine import ExecutionEngine
@@ -12,6 +12,18 @@
     parse_row_condition_string_pandas_engine,
     substitute_none_for_missing,
 )
+from great_expectations.rule_based_profiler.config.base import (
+    ParameterBuilderConfig,
+    RuleBasedProfilerConfig,
+)
+from great_expectations.rule_based_profiler.types.parameter_container import (
+    DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
+    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY,
+    FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER,
+    FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY,
+    PARAMETER_KEY,
+    VARIABLES_KEY,
+)
 
 
 class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
@@ -83,6 +95,73 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
     success_keys = (
         "min_value",
         "max_value",
+        "auto",
+        "profiler_config",
+    )
+
+    column_unique_values_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
+        module_name="great_expectations.rule_based_profiler.parameter_builder",
+        class_name="NumericMetricRangeMultiBatchParameterBuilder",
+        name="column_unique_values_range_estimator",
+        metric_name="column.distinct_values.count",
+        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
+        metric_value_kwargs=None,
+        enforce_numeric_metric=True,
+        replace_nan_with_zero=True,
+        reduce_scalar_metric=True,
+        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
+        estimator=f"{VARIABLES_KEY}estimator",
+        num_bootstrap_samples=f"{VARIABLES_KEY}num_bootstrap_samples",
+        bootstrap_random_seed=f"{VARIABLES_KEY}bootstrap_random_seed",
+        truncate_values=f"{VARIABLES_KEY}truncate_values",
+        round_decimals=f"{VARIABLES_KEY}round_decimals",
+        evaluation_parameter_builder_configs=None,
+        json_serialize=True,
+    )
+    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
+        column_unique_values_range_estimator_parameter_builder_config,
+    ]
+    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
+        name="expect_column_unique_value_count_to_be_between",  # Convention: use "expectation_type" as profiler name.
+        config_version=1.0,
+        variables={
+            "mostly": 1.0,
+            "strict_min": False,
+            "strict_max": False,
+            "false_positive_rate": 0.05,
+            "estimator": "bootstrap",
+            "num_bootstrap_samples": 9999,
+            "bootstrap_random_seed": None,
+            "truncate_values": {
+                "lower_bound": 0,
+                "upper_bound": None,
+            },
+            "round_decimals": 0,
+        },
+        rules={
+            "default_expect_column_unique_values_to_be_between_rule": {
+                "domain_builder": {
+                    "class_name": "ColumnDomainBuilder",
+                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
+                },
+                "expectation_configuration_builders": [
+                    {
+                        "expectation_type": "expect_column_unique_value_count_to_be_between",
+                        "class_name": "DefaultExpectationConfigurationBuilder",
+                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
+                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
+                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
+                        "min_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
+                        "max_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
+                        "strict_min": f"{VARIABLES_KEY}strict_min",
+                        "strict_max": f"{VARIABLES_KEY}strict_max",
+                        "meta": {
+                            "profiler_details": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
+                        },
+                    },
+                ],
+            },
+        },
     )
 
     # Default values
@@ -94,6 +173,8 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
         "result_format": "BASIC",
         "include_config": True,
         "catch_exceptions": False,
+        "auto": False,
+        "profiler_config": default_profiler_config,
     }
     args_keys = (
         "column",
diff --git a/great_expectations/expectations/core/expect_column_values_to_be_between.py b/great_expectations/expectations/core/expect_column_values_to_be_between.py
index 5fe888064e81..93a3e0cb3737 100644
--- a/great_expectations/expectations/core/expect_column_values_to_be_between.py
+++ b/great_expectations/expectations/core/expect_column_values_to_be_between.py
@@ -108,7 +108,7 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
         "profiler_config",
     )
 
-    min_estimato_parameter_builder_config: ParameterBuilderConfig = (
+    min_estimator_parameter_builder_config: ParameterBuilderConfig = (
         ParameterBuilderConfig(
             module_name="great_expectations.rule_based_profiler.parameter_builder",
             class_name="MetricMultiBatchParameterBuilder",
@@ -123,7 +123,7 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
             json_serialize=True,
         )
     )
-    max_estimato_parameter_builder_config: ParameterBuilderConfig = (
+    max_estimator_parameter_builder_config: ParameterBuilderConfig = (
         ParameterBuilderConfig(
             module_name="great_expectations.rule_based_profiler.parameter_builder",
             class_name="MetricMultiBatchParameterBuilder",
@@ -139,8 +139,8 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
         )
     )
     validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
-        min_estimato_parameter_builder_config,
-        max_estimato_parameter_builder_config,
+        min_estimator_parameter_builder_config,
+        max_estimator_parameter_builder_config,
     ]
     default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
         name="expect_column_values_to_be_between",  # Convention: use "expectation_type" as profiler name.
@@ -163,15 +163,15 @@ class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
                         "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                         "validation_parameter_builder_configs": validation_parameter_builder_configs,
                         "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
-                        "min_value": f"{PARAMETER_KEY}{min_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
-                        "max_value": f"{PARAMETER_KEY}{max_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
+                        "min_value": f"{PARAMETER_KEY}{min_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
+                        "max_value": f"{PARAMETER_KEY}{max_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[-1]",
                         "mostly": f"{VARIABLES_KEY}mostly",
                         "strict_min": f"{VARIABLES_KEY}strict_min",
                         "strict_max": f"{VARIABLES_KEY}strict_max",
                         "meta": {
                             "profiler_details": {
-                                "min_estimator": f"{PARAMETER_KEY}{min_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
-                                "max_estimator": f"{PARAMETER_KEY}{max_estimato_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
+                                "min_estimator": f"{PARAMETER_KEY}{min_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
+                                "max_estimator": f"{PARAMETER_KEY}{max_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                             },
                         },
                     },
diff --git a/great_expectations/expectations/expectation.py b/great_expectations/expectations/expectation.py
index 7318309f7b6e..9b92cd2389b5 100644
--- a/great_expectations/expectations/expectation.py
+++ b/great_expectations/expectations/expectation.py
@@ -8,15 +8,12 @@
 from collections import Counter
 from copy import deepcopy
 from inspect import isabstract
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import pandas as pd
 from dateutil.parser import parse
-from numpy import negative
 
 from great_expectations import __version__ as ge_version
-from great_expectations import execution_engine
-from great_expectations.core import expectation_configuration
 from great_expectations.core.batch import Batch
 from great_expectations.core.expectation_configuration import (
     ExpectationConfiguration,
@@ -58,6 +55,7 @@
 from great_expectations.expectations.registry import (
     _registered_metrics,
     _registered_renderers,
+    get_expectation_impl,
     get_metric_kwargs,
     register_expectation,
     register_renderer,
@@ -75,6 +73,7 @@
     renderedAtomicValueSchema,
 )
 from great_expectations.render.util import num_to_str
+from great_expectations.rule_based_profiler.config.base import RuleBasedProfilerConfig
 from great_expectations.self_check.util import (
     evaluate_json_test_cfe,
     generate_expectation_tests,
@@ -2406,3 +2405,22 @@ def _format_map_output(
         return return_obj
 
     raise ValueError(f"Unknown result_format {result_format['result_format']}.")
+
+
+def get_default_profiler_config_for_expectation_type(
+    expectation_type: str,
+) -> Optional[RuleBasedProfilerConfig]:
+    """Retrieves the default profiler config as defined within a given Expectation.
+
+    Args:
+        expectation_type (str): The name of the Expectation to parse
+
+    Returns:
+        The default profiler config within the target Expectation.
+        If not available, returns None.
+    """
+    expectation_impl = get_expectation_impl(expectation_name=expectation_type)
+    profiler_config: Optional[
+        RuleBasedProfilerConfig
+    ] = expectation_impl.default_kwarg_values.get("profiler_config")
+    return profiler_config
diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py
index 2a62d51c5b6e..c976e80ea703 100644
--- a/great_expectations/self_check/util.py
+++ b/great_expectations/self_check/util.py
@@ -1616,7 +1616,7 @@ def generate_expectation_tests(
                     c, d["data"], d["schemas"]
                 )
             except Exception as e:
-                # Adding these print staments for build_gallery.py's console output
+                # Adding these print statements for build_gallery.py's console output
                 print("\n\n[[ Problem calling get_test_validator_with_data ]]")
                 print(f"expectation_type -> {expectation_type}")
                 print(f"c -> {c}\ne -> {e}")
diff --git a/scripts/check_docstring_coverage.py b/scripts/check_docstring_coverage.py
index df14884368a1..bca72271fe26 100644
--- a/scripts/check_docstring_coverage.py
+++ b/scripts/check_docstring_coverage.py
@@ -7,7 +7,7 @@
 Diagnostics = Dict[str, List[Tuple[ast.FunctionDef, bool]]]
 
 DOCSTRING_ERROR_THRESHOLD: int = (
-    1112  # This number is to be reduced as we document more public functions!
+    1107  # This number is to be reduced as we document more public functions!
 )
 
 
@@ -125,8 +125,9 @@ def review_diagnostics(diagnostics: Diagnostics, changed_files: List[str]) -> No
                 relevant_diagnostics[file].append(func)
             total_funcs += 1
 
+    total_failed: int = total_funcs - total_passed
     print(
-        f"[SUMMARY] {total_passed} of {total_funcs} public functions ({100 * total_passed / total_funcs:.2f}%) have docstrings!"
+        f"[SUMMARY] {total_failed} of {total_funcs} public functions ({100 * total_failed / total_funcs:.2f}%) are missing docstrings!"
    )
 
     if relevant_diagnostics:
@@ -142,7 +143,6 @@ def review_diagnostics(diagnostics: Diagnostics, changed_files: List[str]) -> No
     # Chetan - 20220305 - While this number should be 0, getting the number of style guide violations down takes time
     # and effort. In the meanwhile, we want to set an upper bound on errors to ensure we're not introducing
     # further regressions. As docstrings are added, developers should update this number.
-    total_failed: int = total_funcs - total_passed
     assert (
         total_failed <= DOCSTRING_ERROR_THRESHOLD
     ), f"""A public function without a docstring was introduced; please resolve the matter before merging.
diff --git a/scripts/check_type_hint_coverage.py b/scripts/check_type_hint_coverage.py
index 4a57ed4916dc..a3ff17f424c5 100644
--- a/scripts/check_type_hint_coverage.py
+++ b/scripts/check_type_hint_coverage.py
@@ -3,7 +3,7 @@
 from typing import Dict, List, Optional
 
 TYPE_HINT_ERROR_THRESHOLD: int = (
-    2867  # This number is to be reduced as we annotate more functions!
+    2839  # This number is to be reduced as we annotate more functions!
 )
 
 
diff --git a/tests/expectations/test_expectation_diagnostics.py b/tests/expectations/test_expectation_diagnostics.py
index 3af0491d0070..e05b365b3a90 100644
--- a/tests/expectations/test_expectation_diagnostics.py
+++ b/tests/expectations/test_expectation_diagnostics.py
@@ -140,10 +140,11 @@ def test__convert_checks_into_output_message():
     assert (
         edr._convert_checks_into_output_message(
             class_name="ExpectColumnValuesToEqualThree",
+            maturity_level="EXPERIMENTAL",
             maturity_messages=edr.maturity_checklist,
         )
         == """\
-Completeness checklist for ExpectColumnValuesToEqualThree:
+Completeness checklist for ExpectColumnValuesToEqualThree (EXPERIMENTAL):
 ✔ AAA
    BBB
    CCC
diff --git a/tests/expectations/test_generate_diagnostic_checklist.py b/tests/expectations/test_generate_diagnostic_checklist.py
index a8bc6fd94780..09d1a1e0886b 100644
--- a/tests/expectations/test_generate_diagnostic_checklist.py
+++ b/tests/expectations/test_generate_diagnostic_checklist.py
@@ -34,7 +34,7 @@ def test_print_diagnostic_checklist__second_iteration():
     assert (
         output_message
         == """\
-Completeness checklist for ExpectColumnValuesToEqualThree__SecondIteration:
+Completeness checklist for ExpectColumnValuesToEqualThree__SecondIteration (EXPERIMENTAL):
 ✔ Has a valid library_metadata object
 ✔ Has a docstring, including a one-line short description
    ✔ "Expect values in this column to equal the number three."
@@ -63,7 +63,7 @@ def test_print_diagnostic_checklist__third_iteration():
     assert (
         output_message
         == """\
-Completeness checklist for ExpectColumnValuesToEqualThree__ThirdIteration:
+Completeness checklist for ExpectColumnValuesToEqualThree__ThirdIteration (EXPERIMENTAL):
 ✔ Has a valid library_metadata object
    Has a docstring, including a one-line short description
 ✔ Has at least one positive and negative example case, and all test cases pass
diff --git a/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py b/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py
index 73ba9946c6eb..6ed280149c06 100644
--- a/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py
+++ b/tests/integration/profiling/rule_based_profilers/test_profiler_user_workflows.py
@@ -1,7 +1,7 @@
 import datetime
 import uuid
 from numbers import Number
-from typing import Any, Dict, List, Optional, Tuple, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, cast
 from unittest import mock
 
 import numpy as np
@@ -17,6 +17,9 @@
 from great_expectations.core import ExpectationSuite, ExpectationValidationResult
 from great_expectations.core.batch import BatchRequest
 from great_expectations.datasource import DataConnector, Datasource
+from great_expectations.expectations.expectation import (
+    get_default_profiler_config_for_expectation_type,
+)
 from great_expectations.expectations.registry import get_expectation_impl
 from great_expectations.rule_based_profiler.config.base import (
     RuleBasedProfilerConfig,
@@ -31,10 +34,23 @@
     usage_stats_invalid_messages_exist,
 )
 from tests.rule_based_profiler.conftest import ATOL, RTOL
+from tests.rule_based_profiler.parameter_builder.conftest import RANDOM_SEED
 
 yaml = YAML()
 
 
+@pytest.fixture
+def set_consistent_seed_within_expectation_default_profiler_config() -> Callable:
+    def _set_seed(expectation_type: str):
+        default_profiler: Optional[
+            RuleBasedProfilerConfig
+        ] = get_default_profiler_config_for_expectation_type(expectation_type)
+        assert default_profiler is not None and default_profiler.variables is not None
+        default_profiler.variables["bootstrap_random_seed"] = RANDOM_SEED
+
+    return _set_seed
+
+
 def test_alice_columnar_table_single_batch_batches_are_accessible(
     alice_columnar_table_single_batch_context,
     alice_columnar_table_single_batch,
@@ -2236,8 +2252,6 @@ def test_quentin_expect_column_max_to_be_between_auto_yes_default_profiler_confi
 
     result: ExpectationValidationResult
 
-    custom_profiler_config: RuleBasedProfilerConfig
-
     suite: ExpectationSuite
 
     expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}"
@@ -2317,3 +2331,91 @@ def test_quentin_expect_column_max_to_be_between_auto_yes_default_profiler_confi
         atol=atol,
         err_msg=f"Actual value of {max_value_actual} differs from expected value of {max_value_expected} by more than {atol + rtol * abs(max_value_expected)} tolerance.",
     )
+
+
+@pytest.mark.skipif(
+    version.parse(np.version.version) < version.parse("1.21.0"),
+    reason="requires numpy version 1.21.0 or newer",
+)
+@freeze_time("09/26/2019 13:42:41")
+def test_quentin_expect_column_unique_value_count_to_be_between_auto_yes_default_profiler_config_yes_custom_profiler_config_no(
+    quentin_columnar_table_multi_batch_data_context,
+    set_consistent_seed_within_expectation_default_profiler_config: Callable,
+) -> None:
+    context: DataContext = quentin_columnar_table_multi_batch_data_context
+
+    result: ExpectationValidationResult
+
+    suite: ExpectationSuite
+
+    expectation_suite_name: str = f"tmp.profiler_suite_{str(uuid.uuid4())[:8]}"
+    try:
+        # noinspection PyUnusedLocal
+        suite = context.get_expectation_suite(
+            expectation_suite_name=expectation_suite_name
+        )
+    except ge_exceptions.DataContextError:
+        suite = context.create_expectation_suite(
+            expectation_suite_name=expectation_suite_name
+        )
+        print(f'Created ExpectationSuite "{suite.expectation_suite_name}".')
+
+    batch_request: dict = {
+        "datasource_name": "taxi_pandas",
+        "data_connector_name": "monthly",
+        "data_asset_name": "my_reports",
+    }
+
+    validator: Validator = context.get_validator(
+        batch_request=BatchRequest(**batch_request),
+        expectation_suite_name=expectation_suite_name,
+    )
+    assert len(validator.batches) == 36
+
+    # Utilize a consistent seed to deal with probabilistic nature of this feature
+    set_consistent_seed_within_expectation_default_profiler_config(
+        "expect_column_unique_value_count_to_be_between"
+    )
+
+    test_cases: Tuple[Tuple[str, int, int], ...] = (
+        ("pickup_location_id", 118, 212),
+        ("dropoff_location_id", 190, 236),
+    )
+
+    for column_name, min_value_expected, max_value_expected in test_cases:
+
+        # Use all batches, loaded by Validator, for estimating Expectation argument values.
+        result = validator.expect_column_unique_value_count_to_be_between(
+            column=column_name,
+            result_format="SUMMARY",
+            include_config=True,
+            auto=True,
+        )
+        assert result.success
+
+        key: str
+        value: Any
+        expectation_config_kwargs: dict = {
+            key: value
+            for key, value in result.expectation_config["kwargs"].items()
+            if key
+            not in [
+                "min_value",
+                "max_value",
+            ]
+        }
+        assert expectation_config_kwargs == {
+            "column": column_name,
+            "strict_min": False,
+            "strict_max": False,
+            "result_format": "SUMMARY",
+            "include_config": True,
+            "auto": True,
+            "batch_id": "84000630d1b69a0fe870c94fb26a32bc",
+        }
+
+        min_value_actual: int = result.expectation_config["kwargs"]["min_value"]
+        assert min_value_actual == min_value_expected
+
+        max_value_actual: int = result.expectation_config["kwargs"]["max_value"]
+        assert max_value_actual == max_value_expected
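
Usage sketch (not part of the patch above): a minimal example of the new auto=True flow for expect_column_unique_value_count_to_be_between together with the get_default_profiler_config_for_expectation_type helper added in expectation.py, mirroring the integration test in this diff. The Data Context setup, datasource/data-connector/asset names, suite name, and column name below are assumptions borrowed from that test's fixtures; substitute your own project's values.

from great_expectations.core.batch import BatchRequest
from great_expectations.data_context import DataContext
from great_expectations.expectations.expectation import (
    get_default_profiler_config_for_expectation_type,
)

# Assumes an existing great_expectations/ project directory.
context = DataContext()

# Inspect the default RuleBasedProfilerConfig baked into the Expectation.
default_profiler_config = get_default_profiler_config_for_expectation_type(
    "expect_column_unique_value_count_to_be_between"
)
print(default_profiler_config)

suite_name = "tmp.profiler_suite_demo"  # assumed suite name
context.create_expectation_suite(expectation_suite_name=suite_name)
validator = context.get_validator(
    batch_request=BatchRequest(
        datasource_name="taxi_pandas",  # assumed, as in the test above
        data_connector_name="monthly",  # assumed
        data_asset_name="my_reports",  # assumed
    ),
    expectation_suite_name=suite_name,
)

# With auto=True, min_value/max_value are estimated across the loaded batches by the
# Expectation's default profiler instead of being supplied by the caller.
result = validator.expect_column_unique_value_count_to_be_between(
    column="pickup_location_id",  # assumed column
    auto=True,
    result_format="SUMMARY",
    include_config=True,
)
print(result.expectation_config["kwargs"]["min_value"])
print(result.expectation_config["kwargs"]["max_value"])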