Skip to content

Commit

Permalink
Merge f103be9 into 5852b5d
Browse files Browse the repository at this point in the history
  • Loading branch information
jcampbell committed Aug 16, 2019
2 parents 5852b5d + f103be9 commit c0a798a
Show file tree
Hide file tree
Showing 12 changed files with 373 additions and 253 deletions.
26 changes: 25 additions & 1 deletion great_expectations/data_asset/data_asset.py
Expand Up @@ -959,7 +959,31 @@ def validate(self,
warnings.warn(
"WARNING: No great_expectations version found in configuration object.")

for expectation in expectation_suite['expectations']:


###
# This is an early example of what will become part of the ValidationOperator
# This operator would be dataset-semantic aware
# Adding now to simply ensure we can be slightly better at ordering our expectation evaluation
###

# Group expectations by column
columns = {}

for expectation in expectation_suite["expectations"]:
if "column" in expectation["kwargs"]:
column = expectation["kwargs"]["column"]
else:
column = "_nocolumn"
if column not in columns:
columns[column] = []
columns[column].append(expectation)

expectations_to_evaluate = []
for col in columns:
expectations_to_evaluate.extend(columns[col])

for expectation in expectations_to_evaluate:

try:
expectation_method = getattr(
Expand Down
24 changes: 14 additions & 10 deletions great_expectations/dataset/sparkdf_dataset.py
Expand Up @@ -407,8 +407,13 @@ def expect_column_values_to_be_in_set(
if parse_strings_as_datetimes:
column = self._apply_dateutil_parse(column)
value_set = [parse(value) if isinstance(value, string_types) else value for value in value_set]
success_udf = udf(lambda x: x in value_set)
return column.withColumn('__success', success_udf(column[0]))
if None in value_set:
# spark isin returns None when any value is compared to None
logger.error("expect_column_values_to_be_in_set cannot support a None in the value_set in spark")
raise ValueError(
"expect_column_values_to_be_in_set cannot support a None in the value_set in spark")
return column.withColumn('__success', column[0].isin(value_set))


@DocInherit
@MetaSparkDFDataset.column_map_expectation
Expand All @@ -422,8 +427,11 @@ def expect_column_values_to_not_be_in_set(
catch_exceptions=None,
meta=None,
):
success_udf = udf(lambda x: x not in value_set)
return column.withColumn('__success', success_udf(column[0]))
if None in value_set:
# spark isin returns None when any value is compared to None
logger.error("expect_column_values_to_not_be_in_set cannot support a None in the value_set in spark")
raise ValueError("expect_column_values_to_not_be_in_set cannot support a None in the value_set in spark")
return column.withColumn('__success', ~column[0].isin(value_set))

@DocInherit
@MetaSparkDFDataset.column_map_expectation
Expand Down Expand Up @@ -652,9 +660,7 @@ def expect_column_values_to_match_regex(
catch_exceptions=None,
meta=None,
):
# not sure about casting to string here
success_udf = udf(lambda x: re.findall(regex, str(x)) != [])
return column.withColumn('__success', success_udf(column[0]))
return column.withColumn('__success', column[0].rlike(regex))

@DocInherit
@MetaSparkDFDataset.column_map_expectation
Expand All @@ -668,6 +674,4 @@ def expect_column_values_to_not_match_regex(
catch_exceptions=None,
meta=None,
):
# not sure about casting to string here
success_udf = udf(lambda x: re.findall(regex, str(x)) == [])
return column.withColumn('__success', success_udf(column[0]))
return column.withColumn('__success', ~column[0].rlike(regex))
2 changes: 1 addition & 1 deletion great_expectations/profile/basic_dataset_profiler.py
Expand Up @@ -62,7 +62,7 @@ def _get_column_cardinality(cls, df, column):
pct_unique = df.expect_column_proportion_of_unique_values_to_be_between(
column, None, None)['result']['observed_value']
except KeyError: # if observed_value value is not set
logger.exception("Failed to get cardinality of column {0:s} - continuing...".format(column))
logger.error("Failed to get cardinality of column {0:s} - continuing...".format(column))

if num_unique is None or num_unique == 0 or pct_unique is None:
cardinality = "none"
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Expand Up @@ -6,7 +6,7 @@

try:
import pypandoc
long_description = pypandoc.convert('README.md', 'rst')
long_description = pypandoc.convert_file('README.md', 'rst')
except (IOError, ImportError):
long_description = 'Always know what to expect from your data. (See https://github.com/great-expectations/great_expectations for full description).'

Expand All @@ -23,7 +23,6 @@
'spark': ['pyspark>=2.3.2'],
'sqlalchemy': ['sqlalchemy>=1.2'],
'airflow': ['apache-airflow[s3]>=1.9.0', 'boto3>=1.7.3']

},
'packages': find_packages(exclude=['docs', 'tests', 'examples']),
'entry_points': {
Expand Down
7 changes: 6 additions & 1 deletion tests/test_cli.py
Expand Up @@ -5,7 +5,6 @@
from click.testing import CliRunner
import great_expectations.version
from great_expectations.cli import cli
import tempfile
import pytest
import json
import os
Expand All @@ -22,6 +21,7 @@
except ImportError:
import mock

from six import PY2

from great_expectations.cli.init import scaffold_directories_and_notebooks

Expand Down Expand Up @@ -135,6 +135,11 @@ def test_validate_basic_operation():
with open('./tests/test_sets/expected_cli_results_default.json', 'r') as f:
expected_cli_results = json.load(f)

# In PY2 the result order is not guaranteed and may differ. Order doesn't matter, so sort before comparing in that case.
if PY2:
json_result["results"] = sorted(json_result["results"])
expected_cli_results["results"] = sorted(expected_cli_results["results"])

assert json_result == expected_cli_results


Expand Down
124 changes: 84 additions & 40 deletions tests/test_data_asset.py
Expand Up @@ -9,6 +9,7 @@
import great_expectations as ge

import unittest
from six import PY2


def test_data_asset_name_inheritance(dataset):
Expand Down Expand Up @@ -800,9 +801,7 @@ def test_find_expectations(self):
}]
)

self.assertEqual(
my_df.find_expectations("expect_column_to_exist"),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -818,7 +817,11 @@ def test_find_expectations(self):
"column": "z"
}
}]
)

if PY2:
self.assertEqual(sorted(my_df.find_expectations("expect_column_to_exist")), sorted(exp1))
else:
self.assertEqual(my_df.find_expectations("expect_column_to_exist"), exp1)

with self.assertRaises(Exception) as context:
my_df.find_expectations(
Expand All @@ -829,9 +832,7 @@ def test_find_expectations(self):
# print 'Conflicting column names in remove_expectation' in context.exception
# self.assertTrue('Conflicting column names in remove_expectation:' in context.exception)

self.assertEqual(
my_df.find_expectations(column="x"),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -848,7 +849,11 @@ def test_find_expectations(self):
"column": "x"
}
}]
)

if PY2:
self.assertEqual(sorted(my_df.find_expectations(column="x")), sorted(exp1))
else:
self.assertEqual(my_df.find_expectations(column="x"), exp1)

def test_remove_expectation(self):
my_df = ge.dataset.PandasDataset({
Expand Down Expand Up @@ -909,10 +914,7 @@ def test_remove_expectation(self):
# FIXME: Python 3 doesn't like this. It would be nice to use assertRaisesRegex, but that's not available in python 2.7
# self.assertTrue('Multiple expectations matched arguments. No expectations removed.' in context.exception)

self.assertEqual(
my_df.remove_expectation(
"expect_column_to_exist", remove_multiple_matches=True, dry_run=True),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -928,7 +930,17 @@ def test_remove_expectation(self):
"column": "z"
}
}]
)

if PY2:
self.assertEqual(
sorted(my_df.remove_expectation("expect_column_to_exist", remove_multiple_matches=True, dry_run=True)),
sorted(exp1)
)
else:
self.assertEqual(
my_df.remove_expectation("expect_column_to_exist", remove_multiple_matches=True, dry_run=True),
exp1
)

with self.assertRaises(Exception) as context:
my_df.remove_expectation("expect_column_to_exist", "x", {
Expand All @@ -939,10 +951,7 @@ def test_remove_expectation(self):
# print 'Conflicting column names in remove_expectation' in context.exception
# self.assertTrue('Conflicting column names in remove_expectation:' in context.exception)

self.assertEqual(
my_df.remove_expectation(
column="x", remove_multiple_matches=True, dry_run=True),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -959,7 +968,17 @@ def test_remove_expectation(self):
"column": "x"
}
}]
)

if PY2:
self.assertEqual(
sorted(my_df.remove_expectation(column="x", remove_multiple_matches=True, dry_run=True)),
sorted(exp1)
)
else:
self.assertEqual(
my_df.remove_expectation(column="x", remove_multiple_matches=True, dry_run=True),
exp1
)

self.assertEqual(
len(my_df._expectation_suite.expectations),
Expand Down Expand Up @@ -1029,52 +1048,68 @@ def test_discard_failing_expectations(self):
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'A'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'B', 'value_set': [5, 6, 7, 8]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'C', 'value_set': ['a', 'b', 'c', 'd']}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'D', 'value_set': ['e', 'f', 'g', 'h']}}
]

sub1 = df[:3]

sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
# In PY2 the result order is not guaranteed, so compare sorted lists
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[1:2]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[:-1]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[-1:]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[['A', 'D']]
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'A'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'D', 'value_set': ['e', 'f', 'g', 'h']}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[['A']]
exp1 = [
Expand All @@ -1084,39 +1119,48 @@ def test_discard_failing_expectations(self):
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df.iloc[:3, 1:4]
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'B', 'value_set': [5, 6, 7, 8]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'C', 'value_set': ['a', 'b', 'c', 'd']}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'D', 'value_set': ['e', 'f', 'g', 'h']}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df.loc[0:, 'A':'B']
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'A'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'B', 'value_set': [5, 6, 7, 8]}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

def test_test_expectation_function(self):
D = ge.dataset.PandasDataset({
Expand Down

0 comments on commit c0a798a

Please sign in to comment.