Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] V1 Checkpoint #9590

Merged
merged 30 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
44546e2
add new class
cdkini Mar 7, 2024
d93bd88
start writing tests
cdkini Mar 7, 2024
fe25879
start on tests
cdkini Mar 7, 2024
452953f
move bundle to shared place
cdkini Mar 7, 2024
ecbba25
move bundle to shared place again
cdkini Mar 7, 2024
df5b13f
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Mar 8, 2024
21ef021
misc updates
cdkini Mar 8, 2024
0435290
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Mar 15, 2024
97b2106
remove json_encoder for actions
cdkini Mar 15, 2024
e0b9a30
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Mar 15, 2024
b32079b
write basic serialization tests
cdkini Mar 15, 2024
198427a
add test cases
cdkini Mar 15, 2024
a0023fb
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Mar 19, 2024
29ac1a9
add more test cases
cdkini Mar 19, 2024
bfc2855
more progress
cdkini Mar 19, 2024
ffc8ec0
refactor to serialize methods
cdkini Mar 19, 2024
479f1ff
add id test
cdkini Mar 19, 2024
f4aac67
get tests passing
cdkini Mar 19, 2024
4a2bd89
make renderer func private again
cdkini Mar 19, 2024
aa53728
misc cleanup
cdkini Mar 19, 2024
092c9fe
remove comment
cdkini Mar 19, 2024
9f18740
bolster tests
cdkini Mar 19, 2024
10c12c1
mypy
cdkini Mar 19, 2024
b55d802
mypy
cdkini Mar 19, 2024
12f69b9
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Mar 19, 2024
50da4c2
cleanup based on bill's initial review
cdkini Mar 19, 2024
36d1ea9
mypy
cdkini Mar 19, 2024
d047ac4
Merge branch 'develop' of https://github.com/great-expectations/great…
cdkini Mar 19, 2024
e5c072d
misc updates around serialization
cdkini Mar 19, 2024
571f245
update renderer call
cdkini Mar 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
123 changes: 123 additions & 0 deletions great_expectations/checkpoint/v1_checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Dict, List, Union

import great_expectations.exceptions as gx_exceptions
from great_expectations import project_manager
from great_expectations._docs_decorators import public_api
from great_expectations.checkpoint.actions import ValidationAction # noqa: TCH001
from great_expectations.compatibility.pydantic import BaseModel, validator
from great_expectations.core.serdes import _IdentifierBundle
from great_expectations.core.validation_config import ValidationConfig
from great_expectations.render.renderer.renderer import Renderer

if TYPE_CHECKING:
from great_expectations.checkpoint.types.checkpoint_result import CheckpointResult
from great_expectations.data_context.store.validation_config_store import (
ValidationConfigStore,
)


class Checkpoint(BaseModel):
"""
A Checkpoint is the primary means for validating data in a production deployment of Great Expectations.

Checkpoints provide a convenient abstraction for running a number of validations and triggering a set of actions
to be taken after the validation step.

Args:
validations: List of validation configs to be run.
cdkini marked this conversation as resolved.
Show resolved Hide resolved
actions: List of actions to be taken after the validations are run.

"""

name: str
validations: List[ValidationConfig]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not a list of ValidationDefinitions instead of config objects? That's what this shows: https://greatexpectations.atlassian.net/wiki/spaces/SUP/pages/917471267/Validation+Workflows

Or does the ValidationConfig need to be renamed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I think ValidationConfig does need to be renamed. That can be done in a separate PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs to be renamed!

actions: List[ValidationAction]
cdkini marked this conversation as resolved.
Show resolved Hide resolved
id: Union[str, None] = None

class Config:
arbitrary_types_allowed = (
True # Necessary for compatibility with ValidationAction's Marshmallow dep
)
"""
When serialized, the validations field should be encoded as a set of identifiers.
cdkini marked this conversation as resolved.
Show resolved Hide resolved
cdkini marked this conversation as resolved.
Show resolved Hide resolved
These will be used as foreign keys to retrieve the actual objects from the appropriate stores.

Example:
cdkini marked this conversation as resolved.
Show resolved Hide resolved
{
"name": "my_checkpoint",
"validations": [
{
"name": "my_first_validation",
"id": "a758816-64c8-46cb-8f7e-03c12cea1d67"
},
{
"name": "my_second_validation",
"id": "1339js16-64c8-46cb-8f7e-03c12cea1d67"
}
],
"actions": [
{
"name": "my_slack_action",
"slack_webhook": "https://hooks.slack.com/services/ABC123/DEF456/XYZ789",
"notify_on": "all",
"notify_with": ["my_data_docs_site"],
"renderer": {
"class_name": "SlackRenderer"
}
}
]
}
"""
json_encoders = {
ValidationConfig: lambda v: v.serialize(),
Renderer: lambda r: r.serialize(),
# ExpectationSuite: lambda e: e.serialize(),
# BatchConfig: lambda b: b.serialize(),
}

@validator("validations", pre=True)
def _validate_validations(
    cls, validations: list[ValidationConfig] | list[dict]
) -> list[ValidationConfig]:
    """Normalize the raw ``validations`` input before field validation.

    An empty list is rejected. If the first entry is a dict, every entry is
    treated as a serialized name/id bundle and resolved through the
    validation config store; otherwise the input is returned untouched.

    Raises:
        ValueError: If ``validations`` is empty.
    """
    if len(validations) == 0:
        raise ValueError("Checkpoint must contain at least one validation")

    # Only the first element is inspected to decide which form was passed in.
    first_entry = validations[0]
    if not isinstance(first_entry, dict):
        return validations

    store = project_manager.get_validation_config_store()
    bundles = [_IdentifierBundle(**raw) for raw in validations]
    return cls._deserialize_identifier_bundles_to_validation_configs(
        identifier_bundles=bundles,
        store=store,
    )

@classmethod
def _deserialize_identifier_bundles_to_validation_configs(
    cls, identifier_bundles: list[_IdentifierBundle], store: ValidationConfigStore
) -> list[ValidationConfig]:
    """Resolve identifier bundles into the validation configs they reference.

    Args:
        identifier_bundles: Name/id pairs identifying validation configs.
        store: Store the validation configs are looked up in.

    Returns:
        The retrieved validation configs, in the same order as the bundles.

    Raises:
        ValueError: If the store lookup for a bundle raises ``KeyError`` or
            ``InvalidKeyError``; the store error is attached as the cause.
    """
    validations: list[ValidationConfig] = []
    for id_bundle in identifier_bundles:
        key = store.get_key(name=id_bundle.name, id=id_bundle.id)

        try:
            validation_config = store.get(key=key)
        except (KeyError, gx_exceptions.InvalidKeyError) as e:
            # Chain explicitly so the traceback reports the store failure as
            # the direct cause rather than an incidental context.
            raise ValueError(
                f"Unable to retrieve validation config {id_bundle} from store"
            ) from e

        validations.append(validation_config)

    return validations

@public_api
def run(
self,
batch_params: Dict[str, Any] | None = None,
cdkini marked this conversation as resolved.
Show resolved Hide resolved
suite_params: Dict[str, Any] | None = None,
cdkini marked this conversation as resolved.
Show resolved Hide resolved
) -> CheckpointResult:
raise NotImplementedError

@public_api
def save(self) -> None:
    """Placeholder for saving this Checkpoint.

    Not implemented yet: calling this always raises ``NotImplementedError``.
    NOTE(review): presumably this will persist the Checkpoint to a store —
    confirm once implemented.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError
19 changes: 19 additions & 0 deletions great_expectations/core/batch_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# if we move this import into the TYPE_CHECKING block, we need to provide the
# Partitioner class when we update forward refs, so we just import here.
from great_expectations.core.partitioners import Partitioner # noqa: TCH001
from great_expectations.core.serdes import _EncodedValidationData, _IdentifierBundle

if TYPE_CHECKING:
from great_expectations.datasource.fluent.batch_request import (
Expand Down Expand Up @@ -47,3 +48,21 @@ def build_batch_request(

def save(self) -> None:
self.data_asset._save_batch_config(self)

def serialize(self) -> _EncodedValidationData:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would expect a method called serialize on an object to serialize the object but this looks like it serializes the validation definition? Could you update the name?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated to serialize_validation_definition!

asset = self.data_asset
ds = asset.datasource
return _EncodedValidationData(
datasource=_IdentifierBundle(
name=ds.name,
id=ds.id,
),
asset=_IdentifierBundle(
name=asset.name,
id=str(asset.id) if asset.id else None,
),
batch_definition=_IdentifierBundle(
name=self.name,
id=self.id,
),
)
11 changes: 11 additions & 0 deletions great_expectations/core/expectation_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
_deduplicate_evaluation_parameter_dependencies,
)
from great_expectations.core.metric_domain_types import MetricDomainTypes
from great_expectations.core.serdes import _IdentifierBundle
from great_expectations.core.util import (
convert_to_json_serializable,
ensure_json_serializable,
Expand Down Expand Up @@ -1110,6 +1111,16 @@ def render(self) -> None:
)
)

def serialize(self) -> _IdentifierBundle:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved any encoding logic to the actual class being serialized (move from _encode_suite in validation config to serialize as instance method on suite).

I don't love exposing these publicly but how do we get around this? Do we want custom serializer classes to encapsulate this logic?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This only serializes this object when it is used in another object, right? I would call this serialize_identifier_bundle() or just identifier_bundler(). This name should be uniform across any object that needs to do this. I think this is public since whatever object serializes this expects this method to exist. We could formalize it as an internal protocol.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree for suites and validation definitions but renderers and batch definitions have different outputs (_EncodedValidationData and class_name/module_name, respectively).

from great_expectations import project_manager

if not self.id:
expectation_store = project_manager.get_expectations_store()
key = expectation_store.get_key(name=self.name, id=None)
expectation_store.add(key=key, value=self)

return _IdentifierBundle(name=self.name, id=self.id)


_TExpectationSuite = TypeVar("_TExpectationSuite", ExpectationSuite, dict)

Expand Down
16 changes: 16 additions & 0 deletions great_expectations/core/serdes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import Union

from great_expectations.compatibility.pydantic import (
BaseModel,
)


class _IdentifierBundle(BaseModel):
    """Intermediate serialization model: a name/id pair referring to an object."""

    # Name of the referenced object.
    name: str
    # Id of the referenced object; the type permits None.
    id: Union[str, None]


class _EncodedValidationData(BaseModel):
    """Intermediate serialization model grouping three identifier bundles.

    Holds one name/id bundle each for a datasource, an asset, and a batch
    definition.
    """

    # Bundle identifying the datasource.
    datasource: _IdentifierBundle
    # Bundle identifying the asset.
    asset: _IdentifierBundle
    # Bundle identifying the batch definition.
    batch_definition: _IdentifierBundle
Comment on lines +1 to +16
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shared space with intermediate serialization models

55 changes: 14 additions & 41 deletions great_expectations/core/validation_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
expectationSuiteSchema,
)
from great_expectations.core.run_identifier import RunIdentifier
from great_expectations.core.serdes import _EncodedValidationData, _IdentifierBundle
from great_expectations.data_context.cloud_constants import GXCloudRESTResource
from great_expectations.data_context.data_context.context_factory import project_manager
from great_expectations.data_context.types.resource_identifiers import (
Expand All @@ -40,45 +41,6 @@
from great_expectations.datasource.fluent.interfaces import DataAsset, Datasource


class _IdentifierBundle(BaseModel):
    """Name/id pair used to refer to an object in serialized form."""

    # Name of the referenced object.
    name: str
    # Id of the referenced object; the type permits None.
    id: Union[str, None]


class _EncodedValidationData(BaseModel):
    """Group of identifier bundles for a datasource, an asset, and a batch definition."""

    # Bundle identifying the datasource.
    datasource: _IdentifierBundle
    # Bundle identifying the asset.
    asset: _IdentifierBundle
    # Bundle identifying the batch definition.
    batch_definition: _IdentifierBundle


def _encode_suite(suite: ExpectationSuite) -> _IdentifierBundle:
    """Reduce an expectation suite to its name/id bundle.

    A suite whose id is falsy is first added to the expectations store.
    """
    lacks_id = not suite.id
    if lacks_id:
        store = project_manager.get_expectations_store()
        store_key = store.get_key(name=suite.name, id=None)
        store.add(key=store_key, value=suite)

    return _IdentifierBundle(name=suite.name, id=suite.id)


def _encode_data(data: BatchConfig) -> _EncodedValidationData:
    """Encode a batch config as identifier bundles.

    Produces one name/id bundle each for the config's datasource, its data
    asset, and the batch config itself.
    """
    data_asset = data.data_asset
    datasource = data_asset.datasource

    datasource_bundle = _IdentifierBundle(name=datasource.name, id=datasource.id)
    asset_bundle = _IdentifierBundle(
        name=data_asset.name,
        # A truthy asset id is stringified; a falsy one is encoded as None.
        id=str(data_asset.id) if data_asset.id else None,
    )
    batch_definition_bundle = _IdentifierBundle(name=data.name, id=data.id)

    return _EncodedValidationData(
        datasource=datasource_bundle,
        asset=asset_bundle,
        batch_definition=batch_definition_bundle,
    )


class ValidationConfig(BaseModel):
"""
Responsible for running a suite against data and returning a validation result.
Expand Down Expand Up @@ -125,8 +87,8 @@ class Config:
}
"""
json_encoders = {
ExpectationSuite: lambda e: _encode_suite(e),
BatchConfig: lambda b: _encode_data(b),
ExpectationSuite: lambda e: e.serialize(),
BatchConfig: lambda b: b.serialize(),
}

name: str = Field(..., allow_mutation=False)
Expand Down Expand Up @@ -302,3 +264,14 @@ def _get_expectation_suite_and_validation_result_ids(
run_id=run_id,
)
return expectation_suite_identifier, validation_result_id

def serialize(self) -> _IdentifierBundle:
    """Encode this validation config as a name/id bundle.

    If the config has no id, it is first added to the validation config
    store. ``serialize`` is then called on ``data`` and ``suite``.

    Returns:
        An ``_IdentifierBundle`` carrying this config's name and id (not a
        plain dict).
    """
    if not self.id:
        validation_config_store = project_manager.get_validation_config_store()
        key = validation_config_store.get_key(name=self.name, id=None)
        validation_config_store.add(key=key, value=self)

    # Return values are deliberately discarded here.
    # NOTE(review): presumably these calls are made for their side effects
    # (e.g. persisting a suite that has no id yet) — confirm.
    self.data.serialize()
    self.suite.serialize()

    return _IdentifierBundle(name=self.name, id=self.id)