Add support for parse_strings_as_datetimes to expect_column_values_to_be_in_set (#422)
great-expectations · Apr 12, 2019 · 0d11698 · 0d11698
1 parent 103b962
commit 0d11698
Show file tree

Hide file tree

Showing 6 changed files with 36 additions and 23 deletions.
diff --git a/great_expectations/dataset/dataset.py b/great_expectations/dataset/dataset.py
@@ -515,6 +515,7 @@ def expect_column_values_to_be_in_set(self,
                                           column,
                                           value_set,
                                           mostly=None,
+                                          parse_strings_as_datetimes=None,
                                           result_format=None, include_config=False, catch_exceptions=None, meta=None
                                           ):
         """Expect each column value to be in a given set.
@@ -552,6 +553,8 @@ def expect_column_values_to_be_in_set(self,
             mostly (None or a float between 0 and 1): \
                 Return `"success": True` if at least mostly percent of values match the expectation. \
                 For more detail, see :ref:`mostly`.
+            parse_strings_as_datetimes (boolean or None): If True, values provided in value_set will be parsed as \
+                datetimes before making comparisons.
 
         Other Parameters:
             result_format (str or None): \
@@ -590,7 +593,7 @@ def expect_column_values_to_not_be_in_set(self,
         ::
 
             # my_df.my_col = [1,2,2,3,3,3]
-            >>> my_df.expect_column_values_to_be_in_set(
+            >>> my_df.expect_column_values_to_not_be_in_set(
                 "my_col",
                 [1,2]
             )

diff --git a/great_expectations/dataset/pandas_dataset.py b/great_expectations/dataset/pandas_dataset.py
@@ -551,8 +551,13 @@ def expect_column_values_to_be_in_type_list(self, column, type_list,
     @MetaPandasDataset.column_map_expectation
     def expect_column_values_to_be_in_set(self, column, value_set,
                                           mostly=None,
+                                          parse_strings_as_datetimes=None,
                                           result_format=None, include_config=False, catch_exceptions=None, meta=None):
-        return column.map(lambda x: x in value_set)
+        if parse_strings_as_datetimes:
+            parsed_value_set = [parse(value) if isinstance(value, string_types) else value for value in value_set]
+        else:
+            parsed_value_set = value_set
+        return column.map(lambda x: x in parsed_value_set)
 
     @DocInherit
     @MetaPandasDataset.column_map_expectation

diff --git a/great_expectations/dataset/sqlalchemy_dataset.py b/great_expectations/dataset/sqlalchemy_dataset.py
@@ -95,7 +95,7 @@ def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs
             if "unexpected_count" not in count_results or count_results["unexpected_count"] is None:
                 count_results["unexpected_count"] = 0
 
-            # Retrieve unexpected  values
+            # Retrieve unexpected values
             unexpected_query_results = self.engine.execute(
                 sa.select([sa.column(column)]).select_from(self._table).where(
                     sa.and_(sa.not_(expected_condition),
@@ -468,9 +468,14 @@ def expect_column_values_to_be_in_set(self,
                                           column,
                                           value_set,
                                           mostly=None,
+                                          parse_strings_as_datetimes=None,
                                           result_format=None, include_config=False, catch_exceptions=None, meta=None
                                           ):
-        return sa.column(column).in_(tuple(value_set))
+        if parse_strings_as_datetimes:
+            parsed_value_set = [parse(value) if isinstance(value, string_types) else value for value in value_set]
+        else:
+            parsed_value_set = value_set
+        return sa.column(column).in_(tuple(parsed_value_set))
 
     @DocInherit
     @MetaSqlAlchemyDataset.column_map_expectation

diff --git a/tests/column_map_expectations/expect_column_values_to_be_in_type_list.json b/tests/column_map_expectations/expect_column_values_to_be_in_type_list.json
@@ -13,16 +13,14 @@
       },
       "schemas": {
         "pandas" : {
-          "x": "np.int_",
-          "y": "np.float_",
+          "x": "int",
+          "y": "float",
           "z": "str",
-          "n": "null",
+          "n": "str",
           "b": "bool",
-          "s": ["string", "int"],
-          "s1": ["string", "np.int64", "int"]
-        },
-        "sql": {},
-        "spark": {}
+          "s": "str",
+          "s1": "str"
+        }
       },
       "tests": [
         {

diff --git a/tests/column_map_expectations/expect_column_values_to_be_of_type.json b/tests/column_map_expectations/expect_column_values_to_be_of_type.json
@@ -13,16 +13,14 @@
       },
       "schemas": {
         "pandas" : {
-          "c1": "int",
-          "c2": "float",
-          "c3": "str",
-          "c4": "null",
-          "c5": "bool",
-          "c6": "np.float64",
-          "c7": "np.int64"
-        },
-        "sql": {},
-        "spark": {}
+          "x": "int",
+          "y": "float",
+          "z": "str",
+          "n": "str",
+          "b": "bool",
+          "s": "float64",
+          "s1": "int64"
+        }
       },
       "tests": [
         {

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -72,7 +72,11 @@ def get_dataset(dataset_type, data, schemas=None, autoinspect_func=autoinspect.c
 
     """
     if dataset_type == 'PandasDataset':
-        return PandasDataset(data, autoinspect_func=autoinspect_func)
+        df = pd.DataFrame(data)
+        if schemas and "pandas" in schemas:
+            pandas_schema = {key:np.dtype(value) for (key, value) in schemas["pandas"].items()}
+            df = df.astype(pandas_schema)
+        return PandasDataset(df, autoinspect_func=autoinspect_func)
     elif dataset_type == 'SqlAlchemyDataset':
         # Create a new database