Skip to content

Commit

Permalink
experimental column map expectation checking for vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
manyshapes committed Jul 9, 2021
1 parent e3ce2d0 commit e36ab2c
Showing 1 changed file with 290 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
import json
import re

#!!! This giant block of imports should be something simpler, such as:
# from great_expectations.helpers.expectation_creation import *
from great_expectations.execution_engine import (
PandasExecutionEngine,
SparkDFExecutionEngine,
SqlAlchemyExecutionEngine,
)
from great_expectations.expectations.expectation import (
ColumnMapExpectation,
Expectation,
ExpectationConfiguration,
)
from great_expectations.expectations.metrics import (
ColumnMapMetricProvider,
column_condition_partial,
)
from great_expectations.expectations.registry import (
_registered_expectations,
_registered_metrics,
_registered_renderers,
)
from great_expectations.expectations.util import render_evaluation_parameter_string
from great_expectations.render.renderer.renderer import renderer
from great_expectations.render.types import RenderedStringTemplateContent
from great_expectations.render.util import num_to_str, substitute_none_for_missing
from great_expectations.validator.validator import Validator


# This class defines a Metric to support your Expectation
# For most Expectations, the main business logic for calculation will live here.
# To learn about the relationship between Metrics and Expectations, please visit
# https://docs.greatexpectations.io/en/latest/reference/core_concepts.html#expectations-and-metrics.
class ColumnValuesToContainVector(ColumnMapMetricProvider):
    """Column-map metric that flags which values look like a vector of integers.

    A value passes when the *whole* of its ``str()`` form is a bracketed,
    comma-separated list of at least two runs of digits, e.g. ``[1, 4, 5]``
    or ``[9,4]``. Single-element lists such as ``[2]``, bare scalars, and
    arbitrary strings do not pass. Because the pattern only accepts digit
    runs, negative numbers and decimals (``[-1, 2]``, ``[1.5, 2.0]``) are
    rejected as well.
    """

    # This is the id string that will be used to reference your metric.
    # Please see https://docs.greatexpectations.io/en/latest/reference/core_concepts/metrics.html#metrics
    # for information on how to choose an id string for your Metric.
    condition_metric_name = "column_values.is_vector"

    # This method defines the business logic for evaluating your metric when using a PandasExecutionEngine
    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        """Return a per-value boolean mask: True where the value is a vector.

        Args:
            column: The column values to test; must provide ``.apply``
                (presumably a pandas Series -- confirm against the
                ``column_condition_partial`` contract).
            **kwargs: Accepted for decorator compatibility; unused here.

        Returns:
            The result of ``column.apply`` with one boolean per value.
        """
        # Compile once per call instead of re-creating the pattern per row.
        vector_pattern = re.compile(r"\[\d+\,\s*\d+(,\s*\d+)*]")

        def matches_vector(x):
            """Check that str(x) is exactly a list of two or more digit runs.

            ``None`` stringifies to ``"None"`` and therefore yields False.
            """
            # fullmatch (not match) so trailing text after the closing
            # bracket, e.g. "[1, 2] oops", is not accepted as a vector.
            return vector_pattern.fullmatch(str(x)) is not None

        return column.apply(matches_vector)


# This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
# @column_condition_partial(engine=SqlAlchemyExecutionEngine)
# def _sqlalchemy(cls, column, _dialect, **kwargs):
# return column.in_([3])

# This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine
# @column_condition_partial(engine=SparkDFExecutionEngine)
# def _spark(cls, column, **kwargs):
# return column.isin([3])


# This class defines the Expectation itself
# The main business logic for calculation lives here.
class ExpectColumnValuesToBeVector(ColumnMapExpectation):
    """Expect column values to be vectors"""

    # Gallery examples; these are also executed as unit tests for the
    # Expectation. Each data column holds ten rows. The three "mostly_*"
    # columns each carry three non-vector rows at indices 5, 7 and 8 plus a
    # trailing None.
    examples = [
        {
            "data": {
                # Vectors interleaved with bare integers.
                "mostly_vectors_and_numbers": [
                    [1, 4, 5],
                    [2, 4, 6],
                    [3, 9, 7],
                    [2, 2, 2],
                    [6, 7, 9],
                    6,
                    [9, 4],
                    5,
                    0,
                    None,
                ],
                # Vectors interleaved with plain strings.
                "mostly_vectors_and_strings": [
                    [1, 4, 5],
                    [2, 4, 6],
                    [3, 9, 7],
                    [2, 2, 2],
                    [6, 7, 9],
                    "some",
                    [9, 4],
                    "extra",
                    "bits",
                    None,
                ],
                # Vectors interleaved with single-element lists, which are
                # expected to be reported as unexpected values.
                "mostly_vectors_and_scalars": [
                    [1, 4, 5],
                    [2, 4, 6],
                    [3, 9, 7],
                    [2, 2, 2],
                    [6, 7, 9],
                    [2],
                    [9, 4],
                    [1],
                    [9],
                    None,
                ],
                # Every row is a three-element vector.
                "all_valid_vectors": [
                    [2, 3, 4],
                    [9, 5, 4],
                    [0, 0, 2],
                    [9, 1, 4],
                    [8, 7, 8],
                    [2, 6, 0],
                    [1, 2, 9],
                    [8, 7, 4],
                    [2, 3, 6],
                    [6, 7, 2],
                ],
            },
            "tests": [
                {
                    "title": "vectors_and_integers",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "mostly_vectors_and_numbers",
                        "mostly": 0.6,
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [5, 7, 8],
                        "unexpected_list": [6, 5, 0],
                    },
                },
                {
                    "title": "vectors_and_strings",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "mostly_vectors_and_strings",
                        "mostly": 0.6,
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [5, 7, 8],
                        "unexpected_list": ["some", "extra", "bits"],
                    },
                },
                {
                    "title": "vectors_and_scalars",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "mostly_vectors_and_scalars",
                        "mostly": 0.6,
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [5, 7, 8],
                        "unexpected_list": [[2], [1], [9]],
                    },
                },
                {
                    "title": "valid_vectors",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {"column": "all_valid_vectors", "mostly": 1},
                    "out": {
                        "success": True,
                        "unexpected_index_list": [],
                        "unexpected_list": [],
                    },
                },
            ],
        }
    ]

    # Metadata displayed in the public gallery.
    # "maturity" is one of "experimental", "beta", or "production".
    library_metadata = {
        "maturity": "experimental",
        "tags": ["experimental", "datatypes", "column map expectation"],
        "contributors": ["@manyshapes"],
        "package": "experimental_expectations",
        "requirements": [],
    }

    # Id string of the Metric backing this Expectation; it is the same string
    # as the `condition_metric_name` of the Metric class defined in this file.
    map_metric = "column_values.is_vector"

    # Parameter names that can affect whether the Expectation evaluates to
    # True or False. See
    # https://docs.greatexpectations.io/en/latest/reference/core_concepts/expectations/expectations.html#expectation-concepts-domain-and-success-keys
    # for more about domain and success keys, and other Expectation arguments.
    success_keys = ("mostly",)

    # Default values for parameters that should have them (none are set here).
    default_kwarg_values = {}

# This method defines a question Renderer
# For more info on Renderers, see
# https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_docs/how_to_create_renderers_for_custom_expectations.html
#!!! This example renderer should render RenderedStringTemplateContent, not just a string


# @classmethod
# @renderer(renderer_type="renderer.question")
# def _question_renderer(
# cls, configuration, result=None, language=None, runtime_configuration=None
# ):
# column = configuration.kwargs.get("column")
# mostly = configuration.kwargs.get("mostly")

# return f'Do at least {mostly * 100}% of values in column "{column}" equal 3?'

# This method defines an answer Renderer
#!!! This example renderer should render RenderedStringTemplateContent, not just a string
# @classmethod
# @renderer(renderer_type="renderer.answer")
# def _answer_renderer(
# cls, configuration=None, result=None, language=None, runtime_configuration=None
# ):
# column = result.expectation_config.kwargs.get("column")
# mostly = result.expectation_config.kwargs.get("mostly")
# regex = result.expectation_config.kwargs.get("regex")
# if result.success:
# return f'At least {mostly * 100}% of values in column "{column}" equal 3.'
# else:
# return f'Less than {mostly * 100}% of values in column "{column}" equal 3.'

# This method defines a prescriptive Renderer
# @classmethod
# @renderer(renderer_type="renderer.prescriptive")
# @render_evaluation_parameter_string
# def _prescriptive_renderer(
# cls,
# configuration=None,
# result=None,
# language=None,
# runtime_configuration=None,
# **kwargs,
# ):
#!!! This example renderer should be shorter
# runtime_configuration = runtime_configuration or {}
# include_column_name = runtime_configuration.get("include_column_name", True)
# include_column_name = (
# include_column_name if include_column_name is not None else True
# )
# styling = runtime_configuration.get("styling")
# params = substitute_none_for_missing(
# configuration.kwargs,
# ["column", "regex", "mostly", "row_condition", "condition_parser"],
# )

# template_str = "values must be equal to 3"
# if params["mostly"] is not None:
# params["mostly_pct"] = num_to_str(
# params["mostly"] * 100, precision=15, no_scientific=True
# )
# # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
# template_str += ", at least $mostly_pct % of the time."
# else:
# template_str += "."

# if include_column_name:
# template_str = "$column " + template_str

# if params["row_condition"] is not None:
# (
# conditional_template_str,
# conditional_params,
# ) = parse_row_condition_string_pandas_engine(params["row_condition"])
# template_str = conditional_template_str + ", then " + template_str
# params.update(conditional_params)

# return [
# RenderedStringTemplateContent(
# **{
# "content_block_type": "string_template",
# "string_template": {
# "template": template_str,
# "params": params,
# "styling": styling,
# },
# }
# )
# ]

if __name__ == "__main__":
    # When run as a script, execute the Expectation's self-diagnostics and
    # pretty-print the resulting report as JSON.
    report = ExpectColumnValuesToBeVector().run_diagnostics()
    print(json.dumps(report, indent=2))

0 comments on commit e36ab2c

Please sign in to comment.