From 0d11698f3c75c8e83e21fd838fc27e9073544c89 Mon Sep 17 00:00:00 2001 From: James Campbell Date: Fri, 12 Apr 2019 10:14:31 -0400 Subject: [PATCH 1/3] Add support for parse_strings_as_datetimes to expect_column_values_to_be_in_set (#422) --- great_expectations/dataset/dataset.py | 5 ++++- great_expectations/dataset/pandas_dataset.py | 7 ++++++- .../dataset/sqlalchemy_dataset.py | 9 +++++++-- ...xpect_column_values_to_be_in_type_list.json | 14 ++++++-------- .../expect_column_values_to_be_of_type.json | 18 ++++++++---------- tests/test_utils.py | 6 +++++- 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/great_expectations/dataset/dataset.py b/great_expectations/dataset/dataset.py index a746209eb767..a161f752786c 100644 --- a/great_expectations/dataset/dataset.py +++ b/great_expectations/dataset/dataset.py @@ -515,6 +515,7 @@ def expect_column_values_to_be_in_set(self, column, value_set, mostly=None, + parse_strings_as_datetimes=None, result_format=None, include_config=False, catch_exceptions=None, meta=None ): """Expect each column value to be in a given set. @@ -552,6 +553,8 @@ def expect_column_values_to_be_in_set(self, mostly (None or a float between 0 and 1): \ Return `"success": True` if at least mostly percent of values match the expectation. \ For more detail, see :ref:`mostly`. + parse_strings_as_datetimes (boolean or None) : If True values provided in value_set will be parsed as \ + datetimes before making comparisons. Other Parameters: result_format (str or None): \ @@ -590,7 +593,7 @@ def expect_column_values_to_not_be_in_set(self, :: # my_df.my_col = [1,2,2,3,3,3] - >>> my_df.expect_column_values_to_be_in_set( + >>> my_df.expect_column_values_to_not_be_in_set( "my_col", [1,2] ) diff --git a/great_expectations/dataset/pandas_dataset.py b/great_expectations/dataset/pandas_dataset.py index 3a7c0f43e3c4..a6cd83c339e9 100644 --- a/great_expectations/dataset/pandas_dataset.py +++ b/great_expectations/dataset/pandas_dataset.py @@ -551,8 +551,13 @@ def expect_column_values_to_be_in_type_list(self, column, type_list, @MetaPandasDataset.column_map_expectation def expect_column_values_to_be_in_set(self, column, value_set, mostly=None, + parse_strings_as_datetimes=None, result_format=None, include_config=False, catch_exceptions=None, meta=None): - return column.map(lambda x: x in value_set) + if parse_strings_as_datetimes: + parsed_value_set = [parse(value) if isinstance(value, string_types) else value for value in value_set] + else: + parsed_value_set = value_set + return column.map(lambda x: x in parsed_value_set) @DocInherit @MetaPandasDataset.column_map_expectation diff --git a/great_expectations/dataset/sqlalchemy_dataset.py b/great_expectations/dataset/sqlalchemy_dataset.py index 811719a4b186..1a4d270480fb 100644 --- a/great_expectations/dataset/sqlalchemy_dataset.py +++ b/great_expectations/dataset/sqlalchemy_dataset.py @@ -95,7 +95,7 @@ def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs if "unexpected_count" not in count_results or count_results["unexpected_count"] is None: count_results["unexpected_count"] = 0 - # Retrieve unexpected values + # Retrieve unexpected values unexpected_query_results = self.engine.execute( sa.select([sa.column(column)]).select_from(self._table).where( sa.and_(sa.not_(expected_condition), @@ -468,9 +468,14 @@ def expect_column_values_to_be_in_set(self, column, value_set, mostly=None, + parse_strings_as_datetimes=None, result_format=None, include_config=False, catch_exceptions=None, meta=None ): - return sa.column(column).in_(tuple(value_set)) + if parse_strings_as_datetimes: + parsed_value_set = [parse(value) if isinstance(value, string_types) else value for value in value_set] + else: + parsed_value_set = value_set + return sa.column(column).in_(tuple(parsed_value_set)) @DocInherit @MetaSqlAlchemyDataset.column_map_expectation diff --git a/tests/column_map_expectations/expect_column_values_to_be_in_type_list.json b/tests/column_map_expectations/expect_column_values_to_be_in_type_list.json index dd070fbd7611..7a6a4e248ec3 100644 --- a/tests/column_map_expectations/expect_column_values_to_be_in_type_list.json +++ b/tests/column_map_expectations/expect_column_values_to_be_in_type_list.json @@ -13,16 +13,14 @@ }, "schemas": { "pandas" : { - "x": "np.int_", - "y": "np.float_", + "x": "int", + "y": "float", "z": "str", - "n": "null", + "n": "str", "b": "bool", - "s": ["string", "int"], - "s1": ["string", "np.int64", "int"] - }, - "sql": {}, - "spark": {} + "s": "str", + "s1": "str" + } }, "tests": [ { diff --git a/tests/column_map_expectations/expect_column_values_to_be_of_type.json b/tests/column_map_expectations/expect_column_values_to_be_of_type.json index 026ec3ad77be..f039081f3d09 100644 --- a/tests/column_map_expectations/expect_column_values_to_be_of_type.json +++ b/tests/column_map_expectations/expect_column_values_to_be_of_type.json @@ -13,16 +13,14 @@ }, "schemas": { "pandas" : { - "c1": "int", - "c2": "float", - "c3": "str", - "c4": "null", - "c5": "bool", - "c6": "np.float64", - "c7": "np.int64" - }, - "sql": {}, - "spark": {} + "x": "int", + "y": "float", + "z": "str", + "n": "str", + "b": "bool", + "s": "float64", + "s1": "int64" + } }, "tests": [ { diff --git a/tests/test_utils.py b/tests/test_utils.py index ec8addf93fd5..4c3c9abb5a9c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -72,7 +72,11 @@ def get_dataset(dataset_type, data, schemas=None, autoinspect_func=autoinspect.c """ if dataset_type == 'PandasDataset': - return PandasDataset(data, autoinspect_func=autoinspect_func) + df = pd.DataFrame(data) + if schemas and "pandas" in schemas: + pandas_schema = {key:np.dtype(value) for (key, value) in schemas["pandas"].items()} + df = df.astype(pandas_schema) + return PandasDataset(df, autoinspect_func=autoinspect_func) elif dataset_type == 'SqlAlchemyDataset': # Create a new database From c858ffbcd9db37441b487fd54d67912829ebe95e Mon Sep 17 00:00:00 2001 From: James Campbell Date: Fri, 12 Apr 2019 10:33:22 -0400 Subject: [PATCH 2/3] Datetime tests --- .../expect_column_values_to_be_in_set.json | 42 ++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/tests/column_map_expectations/expect_column_values_to_be_in_set.json b/tests/column_map_expectations/expect_column_values_to_be_in_set.json index 05b5bea8377b..aafa60585592 100644 --- a/tests/column_map_expectations/expect_column_values_to_be_in_set.json +++ b/tests/column_map_expectations/expect_column_values_to_be_in_set.json @@ -135,5 +135,45 @@ "success": true } }] + }, + { + "data" : { + "dates" : ["2018-01-01", "2018-01-02", "2018-01-02 00:34:01"] + }, + "schemas": { + "sqlite": { + "dates": "datetime" + }, + "pandas": { + "dates": "datetime64" + } + }, + "tests" : [ + { + "title": "Basic positive test case, datetime set", + "exact_match_out": false, + "in": { + "column": "dates", + "value_set": ["2018-01-01", "2018-01-02", "2018-01-02 00:34:01"], + "parse_strings_as_datetimes": true + }, + "out": { + "success": true + } + }, + { + "title": "Basic negative test case, datetime set", + "exact_match_out": false, + "in": { + "column": "dates", + "value_set": ["2018-01-02", "2018-01-02 00:34:01"], + "parse_strings_as_datetimes": true + }, + "out": { + "success": false, + "unexpected_index_list": [0], + "unexpected_list": ["2018-01-01 00:00:00"] + } + }] }] -} +} \ No newline at end of file From 4586e752a36da41cb7efef31c482944032315493 Mon Sep 17 00:00:00 2001 From: James Campbell Date: Fri, 12 Apr 2019 11:54:49 -0400 Subject: [PATCH 3/3] Suppress in_set tests for sqlalchemy --- .../expect_column_values_to_be_in_set.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/column_map_expectations/expect_column_values_to_be_in_set.json b/tests/column_map_expectations/expect_column_values_to_be_in_set.json index aafa60585592..4e41ac831a4e 100644 --- a/tests/column_map_expectations/expect_column_values_to_be_in_set.json +++ b/tests/column_map_expectations/expect_column_values_to_be_in_set.json @@ -159,7 +159,8 @@ }, "out": { "success": true - } + }, + "suppress_test_for": "SQLAlchemy" }, { "title": "Basic negative test case, datetime set", @@ -173,7 +174,8 @@ "success": false, "unexpected_index_list": [0], "unexpected_list": ["2018-01-01 00:00:00"] - } + }, + "suppress_test_for": "SQLAlchemy" }] }] } \ No newline at end of file