-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
experimental column map expectation checking for vectors
- Loading branch information
1 parent
e3ce2d0
commit e36ab2c
Showing
1 changed file
with
290 additions
and
0 deletions.
There are no files selected for viewing
290 changes: 290 additions & 0 deletions
290
...mental/great_expectations_experimental/expectations/expect_column_values_to_be_vectors.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,290 @@ | ||
import json | ||
import re | ||
|
||
#!!! This giant block of imports should be something simpler, such as: | ||
# from great_expectations.helpers.expectation_creation import * | ||
from great_expectations.execution_engine import ( | ||
PandasExecutionEngine, | ||
SparkDFExecutionEngine, | ||
SqlAlchemyExecutionEngine, | ||
) | ||
from great_expectations.expectations.expectation import ( | ||
ColumnMapExpectation, | ||
Expectation, | ||
ExpectationConfiguration, | ||
) | ||
from great_expectations.expectations.metrics import ( | ||
ColumnMapMetricProvider, | ||
column_condition_partial, | ||
) | ||
from great_expectations.expectations.registry import ( | ||
_registered_expectations, | ||
_registered_metrics, | ||
_registered_renderers, | ||
) | ||
from great_expectations.expectations.util import render_evaluation_parameter_string | ||
from great_expectations.render.renderer.renderer import renderer | ||
from great_expectations.render.types import RenderedStringTemplateContent | ||
from great_expectations.render.util import num_to_str, substitute_none_for_missing | ||
from great_expectations.validator.validator import Validator | ||
|
||
|
||
# This class defines a Metric to support your Expectation | ||
# For most Expectations, the main business logic for calculation will live here. | ||
# To learn about the relationship between Metrics and Expectations, please visit | ||
# https://docs.greatexpectations.io/en/latest/reference/core_concepts.html#expectations-and-metrics. | ||
class ColumnValuesToContainVector(ColumnMapMetricProvider):
    """Metric provider for the ``column_values.is_vector`` row condition.

    A value is treated as a vector when its ``str()`` form is a bracketed,
    comma-separated list of at least two numbers, e.g. ``[1, 4, 5]`` or
    ``[-0.5, 2e3]``. Scalars, arbitrary strings and single-element lists
    such as ``[2]`` are not vectors.
    """

    # This is the id string that will be used to reference your metric.
    # Please see https://docs.greatexpectations.io/en/latest/reference/core_concepts/metrics.html#metrics
    # for information on how to choose an id string for your Metric.
    condition_metric_name = "column_values.is_vector"

    # This method defines the business logic for evaluating your metric when using a PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        """Flag each value of ``column`` that looks like a numeric vector.

        Args:
            column: Object exposing ``.apply`` (presumably a pandas Series --
                confirm against the execution engine).
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The result of ``column.apply`` with one ``bool`` per value:
            ``True`` when the value's string form is a list of two or more
            numbers, ``False`` otherwise.

        Note:
            ``None`` renders as ``"None"`` and therefore evaluates to
            ``False`` here. NOTE(review): null rows are presumably filtered
            out by the framework before this condition runs -- confirm.
        """
        # Optional sign, integer or decimal mantissa, optional exponent:
        # covers what ``str()`` emits for Python ints and finite floats.
        number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"
        # Compiled once per call rather than once per row.
        vector_pattern = re.compile(
            r"\[" + number + r"(?:,\s*" + number + r")+\]"
        )

        def matches_vector(x):
            """Return True when ``str(x)`` is exactly a list of 2+ numbers."""
            # fullmatch (not match) so trailing text such as "[1, 2]junk"
            # is rejected instead of silently accepted.
            return vector_pattern.fullmatch(str(x)) is not None

        return column.apply(matches_vector)
|
||
|
||
# This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine | ||
# @column_condition_partial(engine=SqlAlchemyExecutionEngine) | ||
# def _sqlalchemy(cls, column, _dialect, **kwargs): | ||
# return column.in_([3]) | ||
|
||
# This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine | ||
# @column_condition_partial(engine=SparkDFExecutionEngine) | ||
# def _spark(cls, column, **kwargs): | ||
# return column.isin([3]) | ||
|
||
|
||
# This class defines the Expectation itself | ||
# The main business logic for calculation lives here. | ||
class ExpectColumnValuesToBeVector(ColumnMapExpectation):
    """Expect column values to be vectors.

    The bundled examples treat multi-element integer lists as vectors and
    report bare scalars, plain strings and single-element lists as
    unexpected values.
    """

    # Gallery examples; per the template these are also executed as unit
    # tests for the Expectation. One shared data set, four test cases.
    examples = [
        {
            "data": {
                # Vectors mixed with bare integers and one null.
                "mostly_vectors_and_numbers": [
                    [1, 4, 5],
                    [2, 4, 6],
                    [3, 9, 7],
                    [2, 2, 2],
                    [6, 7, 9],
                    6,
                    [9, 4],
                    5,
                    0,
                    None,
                ],
                # Vectors mixed with plain strings and one null.
                "mostly_vectors_and_strings": [
                    [1, 4, 5],
                    [2, 4, 6],
                    [3, 9, 7],
                    [2, 2, 2],
                    [6, 7, 9],
                    "some",
                    [9, 4],
                    "extra",
                    "bits",
                    None,
                ],
                # Vectors mixed with single-element lists and one null.
                "mostly_vectors_and_scalars": [
                    [1, 4, 5],
                    [2, 4, 6],
                    [3, 9, 7],
                    [2, 2, 2],
                    [6, 7, 9],
                    [2],
                    [9, 4],
                    [1],
                    [9],
                    None,
                ],
                # Every row is a three-element vector.
                "all_valid_vectors": [
                    [2, 3, 4],
                    [9, 5, 4],
                    [0, 0, 2],
                    [9, 1, 4],
                    [8, 7, 8],
                    [2, 6, 0],
                    [1, 2, 9],
                    [8, 7, 4],
                    [2, 3, 6],
                    [6, 7, 2],
                ],
            },
            "tests": [
                {
                    "title": "vectors_and_integers",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "mostly_vectors_and_numbers",
                        "mostly": 0.6,
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [5, 7, 8],
                        "unexpected_list": [6, 5, 0],
                    },
                },
                {
                    "title": "vectors_and_strings",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "mostly_vectors_and_strings",
                        "mostly": 0.6,
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [5, 7, 8],
                        "unexpected_list": ["some", "extra", "bits"],
                    },
                },
                {
                    "title": "vectors_and_scalars",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "mostly_vectors_and_scalars",
                        "mostly": 0.6,
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [5, 7, 8],
                        "unexpected_list": [[2], [1], [9]],
                    },
                },
                {
                    "title": "valid_vectors",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column": "all_valid_vectors", "mostly": 1},
                    "out": {
                        "success": True,
                        "unexpected_index_list": [],
                        "unexpected_list": [],
                    },
                },
            ],
        }
    ]

    # Metadata displayed in the public gallery.
    library_metadata = {
        "maturity": "experimental",  # "experimental", "beta", or "production"
        "tags": ["experimental", "datatypes", "column map expectation"],
        "contributors": ["@manyshapes"],
        "package": "experimental_expectations",
        "requirements": [],
    }

    # Id string of the Metric backing this Expectation; it matches the
    # `condition_metric_name` declared on the Metric class in this file.
    map_metric = "column_values.is_vector"

    # Parameter names that can affect whether the Expectation evaluates to True or False.
    # Please see https://docs.greatexpectations.io/en/latest/reference/core_concepts/expectations/expectations.html#expectation-concepts-domain-and-success-keys
    # for more information about domain and success keys, and other arguments to Expectations
    success_keys = ("mostly",)

    # Default values for parameters; none are declared for this Expectation.
    default_kwarg_values = {}
|
||
# This method defines a question Renderer | ||
# For more info on Renderers, see | ||
# https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_docs/how_to_create_renderers_for_custom_expectations.html | ||
#!!! This example renderer should render RenderedStringTemplateContent, not just a string | ||
|
||
|
||
# @classmethod | ||
# @renderer(renderer_type="renderer.question") | ||
# def _question_renderer( | ||
# cls, configuration, result=None, language=None, runtime_configuration=None | ||
# ): | ||
# column = configuration.kwargs.get("column") | ||
# mostly = configuration.kwargs.get("mostly") | ||
|
||
# return f'Do at least {mostly * 100}% of values in column "{column}" equal 3?' | ||
|
||
# This method defines an answer Renderer | ||
#!!! This example renderer should render RenderedStringTemplateContent, not just a string | ||
# @classmethod | ||
# @renderer(renderer_type="renderer.answer") | ||
# def _answer_renderer( | ||
# cls, configuration=None, result=None, language=None, runtime_configuration=None | ||
# ): | ||
# column = result.expectation_config.kwargs.get("column") | ||
# mostly = result.expectation_config.kwargs.get("mostly") | ||
# regex = result.expectation_config.kwargs.get("regex") | ||
# if result.success: | ||
# return f'At least {mostly * 100}% of values in column "{column}" equal 3.' | ||
# else: | ||
# return f'Less than {mostly * 100}% of values in column "{column}" equal 3.' | ||
|
||
# This method defines a prescriptive Renderer | ||
# @classmethod | ||
# @renderer(renderer_type="renderer.prescriptive") | ||
# @render_evaluation_parameter_string | ||
# def _prescriptive_renderer( | ||
# cls, | ||
# configuration=None, | ||
# result=None, | ||
# language=None, | ||
# runtime_configuration=None, | ||
# **kwargs, | ||
# ): | ||
#!!! This example renderer should be shorter | ||
# runtime_configuration = runtime_configuration or {} | ||
# include_column_name = runtime_configuration.get("include_column_name", True) | ||
# include_column_name = ( | ||
# include_column_name if include_column_name is not None else True | ||
# ) | ||
# styling = runtime_configuration.get("styling") | ||
# params = substitute_none_for_missing( | ||
# configuration.kwargs, | ||
# ["column", "regex", "mostly", "row_condition", "condition_parser"], | ||
# ) | ||
|
||
# template_str = "values must be equal to 3" | ||
# if params["mostly"] is not None: | ||
# params["mostly_pct"] = num_to_str( | ||
# params["mostly"] * 100, precision=15, no_scientific=True | ||
# ) | ||
# # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".") | ||
# template_str += ", at least $mostly_pct % of the time." | ||
# else: | ||
# template_str += "." | ||
|
||
# if include_column_name: | ||
# template_str = "$column " + template_str | ||
|
||
# if params["row_condition"] is not None: | ||
# ( | ||
# conditional_template_str, | ||
# conditional_params, | ||
# ) = parse_row_condition_string_pandas_engine(params["row_condition"]) | ||
# template_str = conditional_template_str + ", then " + template_str | ||
# params.update(conditional_params) | ||
|
||
# return [ | ||
# RenderedStringTemplateContent( | ||
# **{ | ||
# "content_block_type": "string_template", | ||
# "string_template": { | ||
# "template": template_str, | ||
# "params": params, | ||
# "styling": styling, | ||
# }, | ||
# } | ||
# ) | ||
# ] | ||
|
||
if __name__ == "__main__":
    # Script entry point: run the Expectation's diagnostics and pretty-print
    # the resulting report as indented JSON.
    expectation = ExpectColumnValuesToBeVector()
    report_text = json.dumps(expectation.run_diagnostics(), indent=2)
    print(report_text)
|