Skip to content

Commit

Permalink
Merge branch 'develop' into bugfix/multi-table-expectation-diagnostics
Browse files: browse the repository at this point in the history
  • Loading branch information
austiezr committed Apr 20, 2022
2 parents afbebaa + 40940b6 commit be3b023
Show file tree
Hide file tree
Showing 12 changed files with 275 additions and 61 deletions.
Expand Up @@ -87,6 +87,7 @@ def generate_checklist(self) -> str:
"""Generates the checklist in CLI-appropriate string format."""
str_ = self._convert_checks_into_output_message(
self.description["camel_name"],
self.library_metadata.maturity,
self.maturity_checklist,
)
return str_
Expand Down Expand Up @@ -327,11 +328,13 @@ def _count_unexpected_test_cases(

@staticmethod
def _convert_checks_into_output_message(
class_name: str, maturity_messages: ExpectationDiagnosticMaturityMessages
class_name: str,
maturity_level: str,
maturity_messages: ExpectationDiagnosticMaturityMessages,
) -> str:
"""Converts a list of checks into an output string (potentially nested), with ✔ to indicate checks that passed."""

output_message = f"Completeness checklist for {class_name}:"
output_message = f"Completeness checklist for {class_name} ({maturity_level}):"

checks = (
maturity_messages.experimental
Expand Down
51 changes: 29 additions & 22 deletions great_expectations/data_context/data_context/base_data_context.py
Expand Up @@ -11,7 +11,7 @@
import warnings
import webbrowser
from collections import OrderedDict
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union, cast

from dateutil.parser import parse
from ruamel.yaml import YAML
Expand Down Expand Up @@ -73,6 +73,7 @@
DataContextConfig,
DataContextConfigDefaults,
DatasourceConfig,
GeCloudConfig,
ProgressBarsConfig,
anonymizedUsageStatisticsSchema,
dataContextConfigSchema,
Expand All @@ -98,7 +99,6 @@
from great_expectations.datasource import LegacyDatasource
from great_expectations.datasource.data_connector.data_connector import DataConnector
from great_expectations.datasource.new_datasource import BaseDatasource, Datasource
from great_expectations.exceptions import DataContextError
from great_expectations.marshmallow__shade import ValidationError
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler
from great_expectations.render.renderer.site_builder import SiteBuilder
Expand Down Expand Up @@ -300,7 +300,7 @@ class BaseDataContext(ConfigPeer):
_data_context = None

@classmethod
def validate_config(cls, project_config):
def validate_config(cls, project_config: Union[DataContextConfig, Mapping]) -> bool:
if isinstance(project_config, DataContextConfig):
return True
try:
Expand All @@ -314,12 +314,12 @@ def validate_config(cls, project_config):
)
def __init__(
self,
project_config,
context_root_dir=None,
runtime_environment=None,
ge_cloud_mode=False,
ge_cloud_config=None,
):
project_config: Union[DataContextConfig, Mapping],
context_root_dir: Optional[str] = None,
runtime_environment: Optional[dict] = None,
ge_cloud_mode: bool = False,
ge_cloud_config: Optional[GeCloudConfig] = None,
) -> None:
"""DataContext constructor
Args:
Expand Down Expand Up @@ -399,14 +399,16 @@ def __init__(
self._evaluation_parameter_dependencies = {}

@property
def ge_cloud_config(self):
def ge_cloud_config(self) -> Optional[GeCloudConfig]:
return self._ge_cloud_config

@property
def ge_cloud_mode(self):
def ge_cloud_mode(self) -> bool:
return self._ge_cloud_mode

def _build_store_from_config(self, store_name, store_config):
def _build_store_from_config(
self, store_name: str, store_config: dict
) -> Optional[Store]:
module_name = "great_expectations.data_context.store"
# Set expectations_store.store_backend_id to the data_context_id from the project_config if
# the expectations_store does not yet exist by:
Expand Down Expand Up @@ -439,7 +441,7 @@ def _build_store_from_config(self, store_name, store_config):
self._stores[store_name] = new_store
return new_store

def _init_stores(self, store_configs):
def _init_stores(self, store_configs: Dict[str, dict]) -> None:
"""Initialize all Stores for this DataContext.
Stores are a good fit for reading/writing objects that:
Expand All @@ -466,7 +468,7 @@ def _init_datasources(self, config: DataContextConfig) -> None:
# caught at the context.get_batch() step. So we just pass here.
pass

def _apply_global_config_overrides(self):
def _apply_global_config_overrides(self) -> None:
# check for global usage statistics opt out
validation_errors = {}

Expand Down Expand Up @@ -524,8 +526,11 @@ def _apply_global_config_overrides(self):

@classmethod
def _get_global_config_value(
cls, environment_variable=None, conf_file_section=None, conf_file_option=None
):
cls,
environment_variable: Optional[str] = None,
conf_file_section=None,
conf_file_option=None,
) -> Optional[str]:
assert (conf_file_section and conf_file_option) or (
not conf_file_section and not conf_file_option
), "Must pass both 'conf_file_section' and 'conf_file_option' or neither."
Expand All @@ -543,7 +548,7 @@ def _get_global_config_value(
return None

@staticmethod
def _check_global_usage_statistics_opt_out():
def _check_global_usage_statistics_opt_out() -> bool:
if os.environ.get("GE_USAGE_STATS", False):
ge_usage_stats = os.environ.get("GE_USAGE_STATS")
if ge_usage_stats in BaseDataContext.FALSEY_STRINGS:
Expand Down Expand Up @@ -598,7 +603,7 @@ def _construct_data_context_id(self) -> str:

def _initialize_usage_statistics(
self, usage_statistics_config: AnonymizedUsageStatisticsConfig
):
) -> None:
"""Initialize the usage statistics system."""
if not usage_statistics_config.enabled:
logger.info("Usage statistics is disabled; skipping initialization.")
Expand All @@ -611,7 +616,7 @@ def _initialize_usage_statistics(
usage_statistics_url=usage_statistics_config.usage_statistics_url,
)

def add_store(self, store_name, store_config):
def add_store(self, store_name: str, store_config: dict) -> Optional[Store]:
"""Add a new Store to the DataContext and (for convenience) return the instantiated Store object.
Args:
Expand All @@ -626,8 +631,8 @@ def add_store(self, store_name, store_config):
return self._build_store_from_config(store_name, store_config)

def add_validation_operator(
self, validation_operator_name, validation_operator_config
):
self, validation_operator_name: str, validation_operator_config: dict
) -> "ValidationOperator":
"""Add a new ValidationOperator to the DataContext and (for convenience) return the instantiated object.
Args:
Expand Down Expand Up @@ -662,7 +667,9 @@ def add_validation_operator(
self.validation_operators[validation_operator_name] = new_validation_operator
return new_validation_operator

def _normalize_absolute_or_relative_path(self, path):
def _normalize_absolute_or_relative_path(
self, path: Optional[str]
) -> Optional[str]:
if path is None:
return
if os.path.isabs(path):
Expand Down
28 changes: 15 additions & 13 deletions great_expectations/data_context/data_context/data_context.py
Expand Up @@ -2,7 +2,7 @@
import os
import shutil
import warnings
from typing import Optional, Union
from typing import Dict, Optional, Union

import requests
from ruamel.yaml import YAML, YAMLError
Expand Down Expand Up @@ -76,10 +76,10 @@ class DataContext(BaseDataContext):
@classmethod
def create(
cls,
project_root_dir=None,
usage_statistics_enabled=True,
runtime_environment=None,
):
project_root_dir: Optional[str] = None,
usage_statistics_enabled: bool = True,
runtime_environment: Optional[dict] = None,
) -> "DataContext":
"""
Build a new great_expectations directory and DataContext object in the provided project_root_dir.
Expand Down Expand Up @@ -126,7 +126,7 @@ def create(
return cls(ge_dir, runtime_environment=runtime_environment)

@classmethod
def all_uncommitted_directories_exist(cls, ge_dir):
def all_uncommitted_directories_exist(cls, ge_dir: str) -> bool:
"""Check if all uncommitted directories exist."""
uncommitted_dir = os.path.join(ge_dir, cls.GE_UNCOMMITTED_DIR)
for directory in cls.UNCOMMITTED_DIRECTORIES:
Expand All @@ -136,7 +136,7 @@ def all_uncommitted_directories_exist(cls, ge_dir):
return True

@classmethod
def config_variables_yml_exist(cls, ge_dir):
def config_variables_yml_exist(cls, ge_dir: str) -> bool:
"""Check if all config_variables.yml exists."""
path_to_yml = os.path.join(ge_dir, cls.GE_YML)

Expand All @@ -148,14 +148,16 @@ def config_variables_yml_exist(cls, ge_dir):
return os.path.isfile(config_var_path)

@classmethod
def write_config_variables_template_to_disk(cls, uncommitted_dir):
def write_config_variables_template_to_disk(cls, uncommitted_dir: str) -> None:
os.makedirs(uncommitted_dir, exist_ok=True)
config_var_file = os.path.join(uncommitted_dir, "config_variables.yml")
with open(config_var_file, "w") as template:
template.write(CONFIG_VARIABLES_TEMPLATE)

@classmethod
def write_project_template_to_disk(cls, ge_dir, usage_statistics_enabled=True):
def write_project_template_to_disk(
cls, ge_dir: str, usage_statistics_enabled: bool = True
) -> None:
file_path = os.path.join(ge_dir, cls.GE_YML)
with open(file_path, "w") as template:
if usage_statistics_enabled:
Expand All @@ -164,7 +166,7 @@ def write_project_template_to_disk(cls, ge_dir, usage_statistics_enabled=True):
template.write(PROJECT_TEMPLATE_USAGE_STATISTICS_DISABLED)

@classmethod
def scaffold_directories(cls, base_dir):
def scaffold_directories(cls, base_dir: str) -> None:
"""Safely create GE directories for a new project."""
os.makedirs(base_dir, exist_ok=True)
with open(os.path.join(base_dir, ".gitignore"), "w") as f:
Expand Down Expand Up @@ -200,7 +202,7 @@ def scaffold_directories(cls, base_dir):
os.makedirs(new_directory_path, exist_ok=True)

@classmethod
def scaffold_custom_data_docs(cls, plugins_dir):
def scaffold_custom_data_docs(cls, plugins_dir: str) -> None:
"""Copy custom data docs templates"""
styles_template = file_relative_path(
__file__,
Expand All @@ -219,7 +221,7 @@ def _get_ge_cloud_config_dict(
ge_cloud_account_id: Optional[str] = None,
ge_cloud_access_token: Optional[str] = None,
ge_cloud_organization_id: Optional[str] = None,
):
) -> Dict[str, Optional[str]]:
ge_cloud_base_url = (
ge_cloud_base_url
or super()._get_global_config_value(
Expand Down Expand Up @@ -278,7 +280,7 @@ def get_ge_cloud_config(
ge_cloud_account_id: Optional[str] = None,
ge_cloud_access_token: Optional[str] = None,
ge_cloud_organization_id: Optional[str] = None,
):
) -> GeCloudConfig:
"""
Build a GeCloudConfig object. Config attributes are collected from any combination of args passed in at
runtime, environment variables, or a global great_expectations.conf file (in order of precedence)
Expand Down
@@ -1,4 +1,4 @@
from typing import Dict, Optional
from typing import Dict, List, Optional

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.execution_engine import ExecutionEngine
Expand All @@ -12,6 +12,18 @@
parse_row_condition_string_pandas_engine,
substitute_none_for_missing,
)
from great_expectations.rule_based_profiler.config.base import (
ParameterBuilderConfig,
RuleBasedProfilerConfig,
)
from great_expectations.rule_based_profiler.types.parameter_container import (
DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY,
FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER,
FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY,
PARAMETER_KEY,
VARIABLES_KEY,
)


class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
Expand Down Expand Up @@ -83,6 +95,73 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
success_keys = (
"min_value",
"max_value",
"auto",
"profiler_config",
)

column_unique_values_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
module_name="great_expectations.rule_based_profiler.parameter_builder",
class_name="NumericMetricRangeMultiBatchParameterBuilder",
name="column_unique_values_range_estimator",
metric_name="column.distinct_values.count",
metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
metric_value_kwargs=None,
enforce_numeric_metric=True,
replace_nan_with_zero=True,
reduce_scalar_metric=True,
false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
estimator=f"{VARIABLES_KEY}estimator",
num_bootstrap_samples=f"{VARIABLES_KEY}num_bootstrap_samples",
bootstrap_random_seed=f"{VARIABLES_KEY}bootstrap_random_seed",
truncate_values=f"{VARIABLES_KEY}truncate_values",
round_decimals=f"{VARIABLES_KEY}round_decimals",
evaluation_parameter_builder_configs=None,
json_serialize=True,
)
validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
column_unique_values_range_estimator_parameter_builder_config,
]
default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
name="expect_column_unique_value_count_to_be_between", # Convention: use "expectation_type" as profiler name.
config_version=1.0,
variables={
"mostly": 1.0,
"strict_min": False,
"strict_max": False,
"false_positive_rate": 0.05,
"estimator": "bootstrap",
"num_bootstrap_samples": 9999,
"bootstrap_random_seed": None,
"truncate_values": {
"lower_bound": 0,
"upper_bound": None,
},
"round_decimals": 0,
},
rules={
"default_expect_column_unique_values_to_be_between_rule": {
"domain_builder": {
"class_name": "ColumnDomainBuilder",
"module_name": "great_expectations.rule_based_profiler.domain_builder",
},
"expectation_configuration_builders": [
{
"expectation_type": "expect_column_unique_value_count_to_be_between",
"class_name": "DefaultExpectationConfigurationBuilder",
"module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
"validation_parameter_builder_configs": validation_parameter_builder_configs,
"column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
"min_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
"max_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
"strict_min": f"{VARIABLES_KEY}strict_min",
"strict_max": f"{VARIABLES_KEY}strict_max",
"meta": {
"profiler_details": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
},
},
],
},
},
)

# Default values
Expand All @@ -94,6 +173,8 @@ class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
"result_format": "BASIC",
"include_config": True,
"catch_exceptions": False,
"auto": False,
"profiler_config": default_profiler_config,
}
args_keys = (
"column",
Expand Down

0 comments on commit be3b023

Please sign in to comment.