Skip to content

Commit

Permalink
Merge 47df52f into 88d5605
Browse files Browse the repository at this point in the history
  • Loading branch information
jcampbell committed Jan 17, 2019
2 parents 88d5605 + 47df52f commit 1c69717
Show file tree
Hide file tree
Showing 8 changed files with 249 additions and 18 deletions.
71 changes: 57 additions & 14 deletions great_expectations/dataset/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def column_map_expectation(cls, func):
Args:
func (function): \
The function implementing a row-wise expectation. The function should take a column of data and \
return an equally-long column of boolean values corresponding to whether the truthiness of the \
return an equally-long column of boolean values corresponding to the truthiness of the \
underlying expectation.
Notes:
Expand Down Expand Up @@ -1032,18 +1032,22 @@ def _format_column_map_output(self,
return return_obj

# Try to return the most common values, if possible.
try:
partial_unexpected_counts = [
{'value': key, 'count': value}
for key, value
in sorted(
Counter(unexpected_list).most_common(
result_format['partial_unexpected_count']),
key=lambda x: (-x[1], x[0]))
]
except TypeError:
partial_unexpected_counts = [
'partial_exception_counts requires a hashable type']
# If we have a dict, we probably had a dataframe; punt
if isinstance(unexpected_list, list):
try:
partial_unexpected_counts = [
{'value': key, 'count': value}
for key, value
in sorted(
Counter(unexpected_list).most_common(
result_format['partial_unexpected_count']),
key=lambda x: (-x[1], x[0]))
]
except TypeError:
partial_unexpected_counts = [
'partial_exception_counts requires a hashable type']
else:
partial_unexpected_counts = ['partial_unexpected_counts requires a flattened type']

return_obj['result'].update(
{
Expand Down Expand Up @@ -3320,7 +3324,46 @@ def expect_column_pair_values_to_be_in_set(self,
value_pairs_set (list of tuples): All the valid pairs to be matched
Keyword Args:
ignore_row_if (str): "both_values_are_missing", "either_value_is_missing", "neither
ignore_row_if (str): "both_values_are_missing", "either_value_is_missing", "never"
Other Parameters:
result_format (str or None): \
Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
For more detail, see :ref:`result_format <result_format>`.
include_config (boolean): \
If True, then include the expectation config as part of the result object. \
For more detail, see :ref:`include_config`.
catch_exceptions (boolean or None): \
If True, then catch exceptions and include them as part of the result object. \
For more detail, see :ref:`catch_exceptions`.
meta (dict or None): \
A JSON-serializable dictionary (nesting allowed) that will be included in the output without modification. \
For more detail, see :ref:`meta`.
Returns:
A JSON-serializable expectation result object.
Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
"""
raise NotImplementedError

### Multicolumn pairs ###

def expect_multicolumn_values_to_be_unique(self,
column_list,
ignore_row_if="all_values_are_missing",
result_format=None, include_config=False, catch_exceptions=None, meta=None
):
"""
Expect the values for each row to be unique across the columns listed.
Args:
column_list (tuple or list): The column names to evaluate
Keyword Args:
ignore_row_if (str): "all_values_are_missing", "any_value_is_missing", "never"
Other Parameters:
result_format (str or None): \
Expand Down
67 changes: 67 additions & 0 deletions great_expectations/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,62 @@ def inner_wrapper(self, column_A, column_B, mostly=None, ignore_row_if="both_val
inner_wrapper.__doc__ = func.__doc__
return inner_wrapper

@classmethod
def multicolumn_map_expectation(cls, func):
    """Constructs an expectation using multicolumn-map semantics.

    The multicolumn_map_expectation decorator handles boilerplate issues surrounding the common pattern of
    evaluating truthiness of some condition on a per row basis across a set of columns.

    Args:
        func (function): \
            The function implementing the expectation. It receives the dataframe restricted to the
            selected columns (with skipped rows removed) and must return a boolean Series.

    Returns:
        function: the wrapped expectation, with standard result formatting applied.
    """
    if PY3:
        argspec = inspect.getfullargspec(func)[0][1:]
    else:
        argspec = inspect.getargspec(func)[0][1:]

    @cls.expectation(argspec)
    @wraps(func)
    def inner_wrapper(self, column_list, mostly=None, ignore_row_if="all_values_are_missing",
                      result_format=None, *args, **kwargs):

        if result_format is None:
            result_format = self.default_expectation_args["result_format"]

        test_df = self[column_list]

        # Decide which rows are excluded from evaluation before applying func.
        if ignore_row_if == "all_values_are_missing":
            boolean_mapped_skip_values = test_df.isnull().all(axis=1)
        elif ignore_row_if == "any_value_is_missing":
            boolean_mapped_skip_values = test_df.isnull().any(axis=1)
        elif ignore_row_if == "never":
            # Align to test_df's index so boolean masking works for non-default indexes too.
            boolean_mapped_skip_values = pd.Series(
                [False] * len(test_df), index=test_df.index)
        else:
            # Format the message into the string; previously a tuple was passed as a
            # second exception argument, producing an unformatted error message.
            raise ValueError(
                "Unknown value of ignore_row_if: %s" % (ignore_row_if,))

        boolean_mapped_success_values = func(
            self, test_df[boolean_mapped_skip_values == False], *args, **kwargs)
        success_count = boolean_mapped_success_values.sum()
        nonnull_count = (~boolean_mapped_skip_values).sum()
        element_count = len(test_df)

        # Rows that were evaluated (not skipped) and failed the expectation.
        unexpected_list = test_df[(boolean_mapped_skip_values == False) & (boolean_mapped_success_values == False)]
        unexpected_index_list = list(unexpected_list.index)

        success, percent_success = self._calc_map_expectation_success(
            success_count, nonnull_count, mostly)

        return_obj = self._format_column_map_output(
            result_format, success,
            element_count, nonnull_count,
            unexpected_list, unexpected_index_list
        )

        return return_obj

    inner_wrapper.__name__ = func.__name__
    inner_wrapper.__doc__ = func.__doc__
    return inner_wrapper


@classmethod
def column_aggregate_expectation(cls, func):
"""Constructs an expectation using column-aggregate semantics.
Expand Down Expand Up @@ -1513,3 +1569,14 @@ def expect_column_pair_values_to_be_in_set(self,
results.append((a, b) in value_pairs_set)

return pd.Series(results, temp_df.index)

@DocInherit
@MetaPandasDataset.multicolumn_map_expectation
def expect_multicolumn_values_to_be_unique(self,
                                           column_list,
                                           ignore_row_if="all_values_are_missing",
                                           result_format=None, include_config=False, catch_exceptions=None, meta=None
                                           ):
    # A row is unique across the listed columns iff its count of distinct values
    # equals the number of columns. Do not dropna here, since the decorator has
    # already dealt with na rows separately.
    distinct_per_row = column_list.nunique(dropna=False, axis=1)
    return distinct_per_row >= len(column_list.columns)
8 changes: 6 additions & 2 deletions great_expectations/dataset/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ def recursively_convert_to_json_serializable(test_obj):
elif isinstance(test_obj, dict):
new_dict = {}
for key in test_obj:
new_dict[key] = recursively_convert_to_json_serializable(
# A pandas index can be numeric, and a dict key can be numeric, but a json key must be a string
new_dict[str(key)] = recursively_convert_to_json_serializable(
test_obj[key])

return new_dict
Expand All @@ -159,7 +160,7 @@ def recursively_convert_to_json_serializable(test_obj):

# Note: This clause has to come after checking for np.ndarray or we get:
# `ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()`
elif test_obj == None:
elif test_obj is None:
# No problem to encode json
return test_obj

Expand All @@ -178,6 +179,9 @@ def recursively_convert_to_json_serializable(test_obj):
# Note: Use np.floating to avoid FutureWarning from numpy
return float(round(test_obj, sys.float_info.dig))

elif isinstance(test_obj, pd.DataFrame):
return recursively_convert_to_json_serializable(test_obj.to_dict())

# elif np.issubdtype(type(test_obj), np.complexfloating):
# Note: Use np.complexfloating to avoid Future Warning from numpy
# Complex numbers consist of two floating point numbers
Expand Down
4 changes: 2 additions & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ xlrd>=1.1.0
pyarrow==0.11.0
sphinxcontrib-napoleon>=0.6.1
pypandoc>=1.4
pytest>=3.2.5
pytest-cov>=2.5
pytest>=4.1.1
pytest-cov>=2.6.1
coveralls>=1.3
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"expectation_type" : "expect_multicolumn_values_to_be_unique",
"datasets" : [{
"data" : {
"w" : [2, 3, 4, 5, 6, 7, 8, 9, 10, null],
"x" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"y" : [2, 3, 4, 5, 6, 7, 8, 9, 10, null],
"z" : [1, 2, 3, 4, 5, null, null, null, null, null],
"a" : [1, 1, 1, 1, 1, 2, 2, 2, 2, null],
"b" : [1, 2, 1, 2, 1, 2, 1, 2, 1, null]
},
"tests" : [{
"title": "Basic test; ignore if all are missing",
"exact_match_out" : false,
"in": {
"column_list": ["w", "x"],
"ignore_row_if": "all_values_are_missing"
},
"out": {
"unexpected_list": {"w": {}, "x": {}},
"unexpected_index_list": [],
"success": true
}
},{
"title": "Basic test; ignore if any are missing",
"exact_match_out" : false,
"in": {
"column_list": ["w", "x"],
"ignore_row_if": "any_value_is_missing"
},
"out": {
"unexpected_list": {"w": {}, "x": {}},
"unexpected_index_list": [],
"success": true
}
},{
"title": "Unexpected Values",
"exact_match_out" : false,
"in": {
"column_list": ["a", "b"]
},
"out": {
"unexpected_list": {"a": {"0": 1.0, "2": 1.0, "4": 1.0, "5": 2.0, "7": 2.0},
"b": {"0": 1.0, "2": 1.0, "4": 1.0, "5": 2.0, "7": 2.0}},
"unexpected_index_list": [0,2,4,5,7],
"success": false
}
}]
}]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
###
###
#
# This file should not be modified. To adjust test cases, edit the related json file(s).
#
###
###


import os
import json
import glob
import warnings

from tests.test_utils import get_dataset, candidate_test_is_on_temporary_notimplemented_list, evaluate_json_test

contexts = ['PandasDataset', 'SqlAlchemyDataset']


def pytest_generate_tests(metafunc):
    """Parametrize ``test_case`` with every (context, dataset, test) combination
    found in the JSON expectation-definition files next to this module.

    Skips (with a warning) any expectation that is on the temporary
    not-implemented list for a given context.
    """
    # Load all the JSON files in the directory
    dir_path = os.path.dirname(os.path.realpath(__file__))
    test_configuration_files = glob.glob(dir_path+'/*.json')

    parametrized_tests = []
    ids = []
    for c in contexts:
        for filename in test_configuration_files:
            # Use a context manager so the file handle is closed promptly;
            # the original leaked one open handle per configuration file.
            with open(filename) as file:
                test_configuration = json.load(file)

            if candidate_test_is_on_temporary_notimplemented_list(c, test_configuration["expectation_type"]):
                warnings.warn("Skipping generation of tests for expectation " + test_configuration["expectation_type"] +
                              " and context " + c)
            else:
                for d in test_configuration['datasets']:
                    my_dataset = get_dataset(c, d["data"])

                    for test in d["tests"]:
                        parametrized_tests.append({
                            "expectation_type": test_configuration["expectation_type"],
                            "dataset": my_dataset,
                            "test": test,
                        })

                        ids.append(
                            c+":"+test_configuration["expectation_type"]+":"+test["title"])

    metafunc.parametrize(
        "test_case",
        parametrized_tests,
        ids=ids
    )


def test_case_runner(test_case):
    """Run a single parametrized expectation test case against its dataset."""
    dataset = test_case["dataset"]

    # Note: this should never be done in practice, but we are wiping
    # expectations to reuse datasets during testing.
    dataset._initialize_expectations()

    evaluate_json_test(
        dataset,
        test_case["expectation_type"],
        test_case["test"]
    )
1 change: 1 addition & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def candidate_test_is_on_temporary_notimplemented_list(context, expectation_type
"expect_column_pair_values_to_be_equal",
"expect_column_pair_values_A_to_be_greater_than_B",
"expect_column_pair_values_to_be_in_set",
"expect_multicolumn_values_to_be_unique"
]
return False

Expand Down

0 comments on commit 1c69717

Please sign in to comment.