[MAINTENANCE] Defining Common Test Fixtures for DataAssistant Testing #4959

Merged
8 changes: 0 additions & 8 deletions great_expectations/exceptions/exceptions.py
@@ -135,14 +135,6 @@ class ProfilerNotFoundError(ProfilerError):
pass


class DataAssistantError(GreatExpectationsError):
pass


class DataAssistantNotFoundError(DataAssistantError):
pass


class InvalidConfigError(DataContextError):
def __init__(self, message):
self.message = message
6 changes: 4 additions & 2 deletions great_expectations/expectations/expectation.py
@@ -10,11 +10,9 @@
from inspect import isabstract
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
from dateutil.parser import parse

from great_expectations import __version__ as ge_version
from great_expectations.core.batch import Batch
from great_expectations.core.expectation_configuration import (
ExpectationConfiguration,
parse_result_format,
@@ -94,6 +92,7 @@
)


# noinspection PyMethodParameters
class MetaExpectation(ABCMeta):
"""MetaExpectation registers Expectations as they are defined, adding them to the Expectation registry.

@@ -103,9 +102,12 @@ class MetaExpectation(ABCMeta):

def __new__(cls, clsname, bases, attrs):
newclass = super().__new__(cls, clsname, bases, attrs)
# noinspection PyUnresolvedReferences
if not newclass.is_abstract():
newclass.expectation_type = camel_to_snake(clsname)
register_expectation(newclass)

# noinspection PyUnresolvedReferences
newclass._register_renderer_functions()
default_kwarg_values = {}
for base in reversed(bases):
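The metaclass above registers every concrete Expectation at class-definition time. A minimal, self-contained sketch of that pattern — all names below are illustrative stand-ins, not the library's actual registry API — would look like this:

```python
import re
from abc import ABCMeta

_REGISTRY: dict = {}  # illustrative stand-in for the Expectation registry


def camel_to_snake(name: str) -> str:
    # Simplified conversion; the library's own helper handles more edge cases.
    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()


class RegisteringMeta(ABCMeta):
    def __new__(cls, clsname, bases, attrs):
        newclass = super().__new__(cls, clsname, bases, attrs)
        # Register only concrete subclasses, mirroring the is_abstract() guard.
        if bases:
            _REGISTRY[camel_to_snake(clsname)] = newclass
        return newclass


class BaseThing(metaclass=RegisteringMeta):
    pass


class MyCustomThing(BaseThing):
    pass


assert "my_custom_thing" in _REGISTRY
```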
@@ -20,7 +20,7 @@
from great_expectations.validator.metric_configuration import MetricConfiguration


class DomainBuilder(Builder, ABC):
class DomainBuilder(ABC, Builder):
"""
A DomainBuilder provides methods to get domains based on one or more batches of data.
"""
@@ -20,7 +20,7 @@
logger.setLevel(logging.INFO)


class ExpectationConfigurationBuilder(Builder, ABC):
class ExpectationConfigurationBuilder(ABC, Builder):
exclude_field_names: Set[str] = Builder.exclude_field_names | {
"validation_parameter_builders",
}
@@ -91,7 +91,7 @@ def to_json_dict(self) -> dict:
return convert_to_json_serializable(data=self.to_dict())


class ParameterBuilder(Builder, ABC):
class ParameterBuilder(ABC, Builder):
"""
A ParameterBuilder implementation provides support for building Expectation Configuration Parameters suitable for
use in other ParameterBuilders or in ConfigurationBuilders as part of profiling.
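The three hunks above make the same one-line change: ABC now precedes Builder in each class's bases. Base order determines Python's C3 method resolution order, so attribute lookups and cooperative super() calls resolve through ABC before Builder. A sketch with illustrative classes — not library code — makes the difference visible:

```python
from abc import ABC


class Builder:
    pass


class AbcFirst(ABC, Builder):      # base order after this PR
    pass


class BuilderFirst(Builder, ABC):  # base order before this PR
    pass


print([c.__name__ for c in AbcFirst.__mro__])
# ['AbcFirst', 'ABC', 'Builder', 'object']
print([c.__name__ for c in BuilderFirst.__mro__])
# ['BuilderFirst', 'Builder', 'ABC', 'object']
```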
239 changes: 128 additions & 111 deletions tests/rule_based_profiler/data_assistant/test_volume_data_assistant.py
@@ -28,111 +28,8 @@
from tests.test_utils import set_bootstrap_random_seed_variable


def run_volume_data_assistant_result_jupyter_notebook_with_new_cell(
context: DataContext, new_cell: str
):
"""
To set this test up we:
- create a suite
- write code (as a string) for creating a VolumeDataAssistantResult
- add a new cell to the notebook that was passed to this method
- write both cells to an ipynb file

We then:
- load the notebook back from disk
- execute the notebook (Note: this will raise various errors like
CellExecutionError if any cell in the notebook fails)
"""

root_dir: str = context.root_directory
expectation_suite_name: str = "test_suite"
context.create_expectation_suite(expectation_suite_name)
notebook_path: str = os.path.join(root_dir, "run_volume_data_assistant.ipynb")
notebook_code: str = """
from typing import Optional, Union

import uuid

import great_expectations as ge
from great_expectations.data_context import BaseDataContext
from great_expectations.validator.validator import Validator
from great_expectations.rule_based_profiler.data_assistant import (
DataAssistant,
VolumeDataAssistant,
)
from great_expectations.rule_based_profiler.types.data_assistant_result import DataAssistantResult
from great_expectations.rule_based_profiler.helpers.util import get_validator_with_expectation_suite
import great_expectations.exceptions as ge_exceptions
"""
notebook_code += """
context = ge.get_context()

batch_request: dict = {
"datasource_name": "taxi_pandas",
"data_connector_name": "monthly",
"data_asset_name": "my_reports",
}

validator: Validator = get_validator_with_expectation_suite(
batch_request=batch_request,
data_context=context,
expectation_suite_name=None,
expectation_suite=None,
component_name="volume_data_assistant",
)

data_assistant: DataAssistant = VolumeDataAssistant(
name="test_volume_data_assistant",
validator=validator,
)

expectation_suite_name: str = "test_suite"
data_assistant_result: DataAssistantResult = data_assistant.run(
expectation_suite_name=expectation_suite_name,
)
"""

nb = nbformat.v4.new_notebook()
nb["cells"] = []
nb["cells"].append(nbformat.v4.new_code_cell(notebook_code))
nb["cells"].append(nbformat.v4.new_code_cell(new_cell))

# Write notebook to path and load it as NotebookNode
with open(notebook_path, "w") as f:
nbformat.write(nb, f)

nb: nbformat.notebooknode.NotebookNode = load_notebook_from_path(
notebook_path=notebook_path
)

# Run notebook
ep: nbconvert.preprocessors.ExecutePreprocessor = (
nbconvert.preprocessors.ExecutePreprocessor(timeout=60, kernel_name="python3")
)
ep.preprocess(nb, {"metadata": {"path": root_dir}})


@freeze_time("09/26/2019 13:42:41")
def test_get_metrics_and_expectations(
quentin_columnar_table_multi_batch_data_context,
):
context: DataContext = quentin_columnar_table_multi_batch_data_context

batch_request: dict = {
"datasource_name": "taxi_pandas",
"data_connector_name": "monthly",
"data_asset_name": "my_reports",
}

validator: Validator = get_validator_with_expectation_suite(
batch_request=batch_request,
data_context=context,
expectation_suite_name=None,
expectation_suite=None,
component_name="volume_data_assistant",
)
assert len(validator.batches) == 36

@pytest.fixture()
def quentin_expected_metrics_by_domain() -> Dict[Domain, Dict[str, Any]]:
expected_metrics_by_domain: Dict[Domain, Dict[str, Any]] = {
Domain(domain_type="table",): {
"$parameter.table_row_count": {
@@ -1841,7 +1738,11 @@ def test_get_metrics_and_expectations(
}
},
}
return expected_metrics_by_domain


@pytest.fixture()
def quentin_expected_expectation_suite() -> ExpectationSuite:
expected_expect_table_row_count_to_be_between_expectation_configuration: ExpectationConfiguration = ExpectationConfiguration(
**{
"expectation_type": "expect_table_row_count_to_be_between",
@@ -2465,6 +2366,11 @@ def test_get_metrics_and_expectations(

expected_expectation_suite.meta = expected_expectation_suite_meta

return expected_expectation_suite


@pytest.fixture()
def quentin_expected_rule_based_profiler_configuration() -> RuleBasedProfilerConfig:
expected_rule_based_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
config_version=1.0,
name="test_volume_data_assistant",
@@ -2591,6 +2497,116 @@ def test_get_metrics_and_expectations(
},
},
)
return expected_rule_based_profiler_config
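
The fixtures above are the core of this PR: expected values that were previously built inline in test_get_metrics_and_expectations are promoted to module-level pytest fixtures, so any test in the module can request them by parameter name instead of duplicating them. A generic sketch of the pattern, with illustrative names and values:

```python
from typing import Any, Dict

import pytest


@pytest.fixture()
def expected_metrics() -> Dict[str, Any]:
    # Build the (potentially very large) expected value once, in one place.
    return {"$parameter.table_row_count": {"value": [10000] * 36}}


def test_metrics(expected_metrics):
    # pytest injects the fixture by matching the parameter name, so several
    # tests can share the same expected value without copy-pasting it.
    actual = {"$parameter.table_row_count": {"value": [10000] * 36}}
    assert actual == expected_metrics
```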


def run_volume_data_assistant_result_jupyter_notebook_with_new_cell(
context: DataContext, new_cell: str
):
"""
To set this test up we:
- create a suite
- write code (as a string) for creating a VolumeDataAssistantResult
- add a new cell to the notebook that was passed to this method
- write both cells to an ipynb file

We then:
- load the notebook back from disk
- execute the notebook (Note: this will raise various errors like
CellExecutionError if any cell in the notebook fails)
"""

root_dir: str = context.root_directory
expectation_suite_name: str = "test_suite"
context.create_expectation_suite(expectation_suite_name)
notebook_path: str = os.path.join(root_dir, "run_volume_data_assistant.ipynb")
notebook_code: str = """
from typing import Optional, Union

import uuid

import great_expectations as ge
from great_expectations.data_context import BaseDataContext
from great_expectations.validator.validator import Validator
from great_expectations.rule_based_profiler.data_assistant import (
DataAssistant,
VolumeDataAssistant,
)
from great_expectations.rule_based_profiler.types.data_assistant_result import DataAssistantResult
from great_expectations.rule_based_profiler.helpers.util import get_validator_with_expectation_suite
import great_expectations.exceptions as ge_exceptions
"""
notebook_code += """
context = ge.get_context()

batch_request: dict = {
"datasource_name": "taxi_pandas",
"data_connector_name": "monthly",
"data_asset_name": "my_reports",
}

validator: Validator = get_validator_with_expectation_suite(
batch_request=batch_request,
data_context=context,
expectation_suite_name=None,
expectation_suite=None,
component_name="volume_data_assistant",
)

data_assistant: DataAssistant = VolumeDataAssistant(
name="test_volume_data_assistant",
validator=validator,
)

expectation_suite_name: str = "test_suite"
data_assistant_result: DataAssistantResult = data_assistant.run(
expectation_suite_name=expectation_suite_name,
)
"""

nb = nbformat.v4.new_notebook()
nb["cells"] = []
nb["cells"].append(nbformat.v4.new_code_cell(notebook_code))
nb["cells"].append(nbformat.v4.new_code_cell(new_cell))

# Write notebook to path and load it as NotebookNode
with open(notebook_path, "w") as f:
nbformat.write(nb, f)

nb: nbformat.notebooknode.NotebookNode = load_notebook_from_path(
notebook_path=notebook_path
)

# Run notebook
ep: nbconvert.preprocessors.ExecutePreprocessor = (
nbconvert.preprocessors.ExecutePreprocessor(timeout=60, kernel_name="python3")
)
ep.preprocess(nb, {"metadata": {"path": root_dir}})
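
A hypothetical caller of this helper — the test name and cell body here are illustrative, though the fixture is the one used below — passes a cell that references names defined by the generated setup cell:

```python
def test_notebook_renders_result(quentin_columnar_table_multi_batch_data_context):
    context: DataContext = quentin_columnar_table_multi_batch_data_context
    # `data_assistant_result` is defined by the setup cell the helper writes;
    # if this expression raises, nbconvert surfaces it as a CellExecutionError.
    new_cell: str = "print(data_assistant_result.metrics_by_domain)"
    run_volume_data_assistant_result_jupyter_notebook_with_new_cell(
        context=context,
        new_cell=new_cell,
    )
```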


@freeze_time("09/26/2019 13:42:41")
def test_get_metrics_and_expectations(
quentin_columnar_table_multi_batch_data_context,
quentin_expected_metrics_by_domain,
quentin_expected_expectation_suite,
quentin_expected_rule_based_profiler_configuration,
):
context: DataContext = quentin_columnar_table_multi_batch_data_context

batch_request: dict = {
"datasource_name": "taxi_pandas",
"data_connector_name": "monthly",
"data_asset_name": "my_reports",
}

validator: Validator = get_validator_with_expectation_suite(
batch_request=batch_request,
data_context=context,
expectation_suite_name=None,
expectation_suite=None,
component_name="volume_data_assistant",
)
assert len(validator.batches) == 36

# Utilize a consistent seed to deal with the probabilistic nature of this feature.
data_assistant: DataAssistant = VolumeDataAssistant(
@@ -2599,27 +2615,28 @@ def test_get_metrics_and_expectations(
)
set_bootstrap_random_seed_variable(profiler=data_assistant.profiler)
data_assistant_result: DataAssistantResult = data_assistant.run(
expectation_suite_name=expectation_suite_name,
expectation_suite_name=quentin_expected_expectation_suite.expectation_suite_name,
)

assert data_assistant_result.metrics_by_domain == expected_metrics_by_domain
assert data_assistant_result.metrics_by_domain == quentin_expected_metrics_by_domain
assert (
data_assistant_result.expectation_suite.expectations
== expected_expectation_configurations
== quentin_expected_expectation_suite.expectations
)

data_assistant_result.expectation_suite.meta.pop("great_expectations_version", None)

assert data_assistant_result.expectation_suite == expected_expectation_suite
assert data_assistant_result.expectation_suite == quentin_expected_expectation_suite

assert (
data_assistant_result.expectation_suite.meta == expected_expectation_suite_meta
data_assistant_result.expectation_suite.meta
== quentin_expected_expectation_suite.meta
)

assert deep_filter_properties_iterable(
properties=data_assistant_result.profiler_config.to_json_dict()
) == deep_filter_properties_iterable(
properties=expected_rule_based_profiler_config.to_json_dict()
properties=quentin_expected_rule_based_profiler_configuration.to_json_dict()
)
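
The final assertion compares the two profiler configs through deep_filter_properties_iterable so that None-valued or empty properties on either side do not cause spurious mismatches. A simplified, dict-only stand-in for that helper — the real one lives in great_expectations and handles arbitrary iterables — shows the idea:

```python
def filter_falsy(d: dict) -> dict:
    # Recursively drop keys whose values are None or empty containers.
    return {
        k: filter_falsy(v) if isinstance(v, dict) else v
        for k, v in d.items()
        if v not in (None, {}, [], "")
    }


a = {"name": "test_volume_data_assistant", "comment": None, "variables": {}}
b = {"name": "test_volume_data_assistant"}
assert filter_falsy(a) == filter_falsy(b)
```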

