Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for parse_strings_as_datetimes to expect_column_values_to_be_in_set #423

Merged
merged 3 commits into from
Apr 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 4 additions & 1 deletion great_expectations/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,7 @@ def expect_column_values_to_be_in_set(self,
column,
value_set,
mostly=None,
parse_strings_as_datetimes=None,
result_format=None, include_config=False, catch_exceptions=None, meta=None
):
"""Expect each column value to be in a given set.
Expand Down Expand Up @@ -552,6 +553,8 @@ def expect_column_values_to_be_in_set(self,
mostly (None or a float between 0 and 1): \
Return `"success": True` if at least mostly percent of values match the expectation. \
For more detail, see :ref:`mostly`.
parse_strings_as_datetimes (boolean or None) : If True values provided in value_set will be parsed as \
datetimes before making comparisons.

Other Parameters:
result_format (str or None): \
Expand Down Expand Up @@ -590,7 +593,7 @@ def expect_column_values_to_not_be_in_set(self,
::

# my_df.my_col = [1,2,2,3,3,3]
>>> my_df.expect_column_values_to_be_in_set(
>>> my_df.expect_column_values_to_not_be_in_set(
"my_col",
[1,2]
)
Expand Down
7 changes: 6 additions & 1 deletion great_expectations/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,8 +551,13 @@ def expect_column_values_to_be_in_type_list(self, column, type_list,
@MetaPandasDataset.column_map_expectation
def expect_column_values_to_be_in_set(self, column, value_set,
                                      mostly=None,
                                      parse_strings_as_datetimes=None,
                                      result_format=None, include_config=False, catch_exceptions=None, meta=None):
    """Map expectation: True for each value of `column` that is a member of `value_set`.

    Args:
        column: pandas Series of values to check (supplied by the column_map_expectation decorator).
        value_set: iterable of allowed values.
        mostly: see :ref:`mostly`; handled by the decorator, not in this body.
        parse_strings_as_datetimes: if truthy, string entries in `value_set` are parsed
            into datetimes (via dateutil's `parse`) before membership comparison, so a
            datetime column can be matched against ISO-formatted strings.

    Returns:
        A boolean Series marking, element-wise, membership in the (possibly parsed) value set.
    """
    if parse_strings_as_datetimes:
        # Only parse string entries; leave non-string values (e.g. already-datetime
        # objects, numbers) untouched so mixed value sets still work.
        parsed_value_set = [parse(value) if isinstance(value, string_types) else value
                            for value in value_set]
    else:
        parsed_value_set = value_set
    return column.map(lambda x: x in parsed_value_set)

@DocInherit
@MetaPandasDataset.column_map_expectation
Expand Down
9 changes: 7 additions & 2 deletions great_expectations/dataset/sqlalchemy_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs
if "unexpected_count" not in count_results or count_results["unexpected_count"] is None:
count_results["unexpected_count"] = 0

# Retrieve unexpected values
# Retrieve unexpected values
unexpected_query_results = self.engine.execute(
sa.select([sa.column(column)]).select_from(self._table).where(
sa.and_(sa.not_(expected_condition),
def expect_column_values_to_be_in_set(self,
                                      column,
                                      value_set,
                                      mostly=None,
                                      parse_strings_as_datetimes=None,
                                      result_format=None, include_config=False, catch_exceptions=None, meta=None
                                      ):
    """Build the SQLAlchemy condition `column IN value_set` for the map expectation.

    Args:
        column: name of the column to test (string; wrapped with `sa.column`).
        value_set: iterable of allowed values.
        mostly: see :ref:`mostly`; evaluated by the column_map_expectation decorator.
        parse_strings_as_datetimes: if truthy, string entries in `value_set` are parsed
            into datetimes (via dateutil's `parse`) before building the IN clause, so a
            datetime column can be compared against ISO-formatted strings.

    Returns:
        A SQLAlchemy boolean expression; the decorator uses it to count unexpected rows.
    """
    if parse_strings_as_datetimes:
        # Only parse string entries; non-string members pass through unchanged.
        parsed_value_set = [parse(value) if isinstance(value, string_types) else value
                            for value in value_set]
    else:
        parsed_value_set = value_set
    return sa.column(column).in_(tuple(parsed_value_set))

@DocInherit
@MetaSqlAlchemyDataset.column_map_expectation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,5 +135,47 @@
"success": true
}
}]
},
{
"data" : {
"dates" : ["2018-01-01", "2018-01-02", "2018-01-02 00:34:01"]
},
"schemas": {
"sqlite": {
"dates": "datetime"
},
"pandas": {
"dates": "datetime64"
}
},
"tests" : [
{
"title": "Basic positive test case, datetime set",
"exact_match_out": false,
"in": {
"column": "dates",
"value_set": ["2018-01-01", "2018-01-02", "2018-01-02 00:34:01"],
"parse_strings_as_datetimes": true
},
"out": {
"success": true
},
"suppress_test_for": "SQLAlchemy"
},
{
"title": "Basic negative test case, datetime set",
"exact_match_out": false,
"in": {
"column": "dates",
"value_set": ["2018-01-02", "2018-01-02 00:34:01"],
"parse_strings_as_datetimes": true
},
"out": {
"success": false,
"unexpected_index_list": [0],
"unexpected_list": ["2018-01-01 00:00:00"]
},
"suppress_test_for": "SQLAlchemy"
}]
}]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,14 @@
},
"schemas": {
"pandas" : {
"x": "np.int_",
"y": "np.float_",
"x": "int",
"y": "float",
"z": "str",
"n": "null",
"n": "str",
"b": "bool",
"s": ["string", "int"],
"s1": ["string", "np.int64", "int"]
},
"sql": {},
"spark": {}
"s": "str",
"s1": "str"
}
},
"tests": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,14 @@
},
"schemas": {
"pandas" : {
"c1": "int",
"c2": "float",
"c3": "str",
"c4": "null",
"c5": "bool",
"c6": "np.float64",
"c7": "np.int64"
},
"sql": {},
"spark": {}
"x": "int",
"y": "float",
"z": "str",
"n": "str",
"b": "bool",
"s": "float64",
"s1": "int64"
}
},
"tests": [
{
Expand Down
6 changes: 5 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ def get_dataset(dataset_type, data, schemas=None, autoinspect_func=autoinspect.c

"""
if dataset_type == 'PandasDataset':
return PandasDataset(data, autoinspect_func=autoinspect_func)
df = pd.DataFrame(data)
if schemas and "pandas" in schemas:
pandas_schema = {key:np.dtype(value) for (key, value) in schemas["pandas"].items()}
df = df.astype(pandas_schema)
return PandasDataset(df, autoinspect_func=autoinspect_func)
elif dataset_type == 'SqlAlchemyDataset':
# Create a new database

Expand Down