Skip to content

Commit

Permalink
Merge f103be9 into 5852b5d
Browse files Browse the repository at this point in the history
  • Loading branch information
jcampbell committed Aug 16, 2019
2 parents 5852b5d + f103be9 commit c0a798a
Show file tree
Hide file tree
Showing 12 changed files with 373 additions and 253 deletions.
26 changes: 25 additions & 1 deletion great_expectations/data_asset/data_asset.py
Expand Up @@ -959,7 +959,31 @@ def validate(self,
warnings.warn(
"WARNING: No great_expectations version found in configuration object.")

for expectation in expectation_suite['expectations']:


###
# This is an early example of what will become part of the ValidationOperator
# This operator would be dataset-semantic aware
# Adding now to simply ensure we can be slightly better at ordering our expectation evaluation
###

# Group expectations by column
columns = {}

for expectation in expectation_suite["expectations"]:
if "column" in expectation["kwargs"]:
column = expectation["kwargs"]["column"]
else:
column = "_nocolumn"
if column not in columns:
columns[column] = []
columns[column].append(expectation)

expectations_to_evaluate = []
for col in columns:
expectations_to_evaluate.extend(columns[col])

for expectation in expectations_to_evaluate:

try:
expectation_method = getattr(
Expand Down
24 changes: 14 additions & 10 deletions great_expectations/dataset/sparkdf_dataset.py
Expand Up @@ -407,8 +407,13 @@ def expect_column_values_to_be_in_set(
if parse_strings_as_datetimes:
column = self._apply_dateutil_parse(column)
value_set = [parse(value) if isinstance(value, string_types) else value for value in value_set]
success_udf = udf(lambda x: x in value_set)
return column.withColumn('__success', success_udf(column[0]))
if None in value_set:
# spark isin returns None when any value is compared to None
logger.error("expect_column_values_to_be_in_set cannot support a None in the value_set in spark")
raise ValueError(
"expect_column_values_to_be_in_set cannot support a None in the value_set in spark")
return column.withColumn('__success', column[0].isin(value_set))


@DocInherit
@MetaSparkDFDataset.column_map_expectation
Expand All @@ -422,8 +427,11 @@ def expect_column_values_to_not_be_in_set(
catch_exceptions=None,
meta=None,
):
success_udf = udf(lambda x: x not in value_set)
return column.withColumn('__success', success_udf(column[0]))
if None in value_set:
# spark isin returns None when any value is compared to None
logger.error("expect_column_values_to_not_be_in_set cannot support a None in the value_set in spark")
raise ValueError("expect_column_values_to_not_be_in_set cannot support a None in the value_set in spark")
return column.withColumn('__success', ~column[0].isin(value_set))

@DocInherit
@MetaSparkDFDataset.column_map_expectation
Expand Down Expand Up @@ -652,9 +660,7 @@ def expect_column_values_to_match_regex(
catch_exceptions=None,
meta=None,
):
# not sure about casting to string here
success_udf = udf(lambda x: re.findall(regex, str(x)) != [])
return column.withColumn('__success', success_udf(column[0]))
return column.withColumn('__success', column[0].rlike(regex))

@DocInherit
@MetaSparkDFDataset.column_map_expectation
Expand All @@ -668,6 +674,4 @@ def expect_column_values_to_not_match_regex(
catch_exceptions=None,
meta=None,
):
# not sure about casting to string here
success_udf = udf(lambda x: re.findall(regex, str(x)) == [])
return column.withColumn('__success', success_udf(column[0]))
return column.withColumn('__success', ~column[0].rlike(regex))
2 changes: 1 addition & 1 deletion great_expectations/profile/basic_dataset_profiler.py
Expand Up @@ -62,7 +62,7 @@ def _get_column_cardinality(cls, df, column):
pct_unique = df.expect_column_proportion_of_unique_values_to_be_between(
column, None, None)['result']['observed_value']
except KeyError: # if observed_value value is not set
logger.exception("Failed to get cardinality of column {0:s} - continuing...".format(column))
logger.error("Failed to get cardinality of column {0:s} - continuing...".format(column))

if num_unique is None or num_unique == 0 or pct_unique is None:
cardinality = "none"
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Expand Up @@ -6,7 +6,7 @@

try:
import pypandoc
long_description = pypandoc.convert('README.md', 'rst')
long_description = pypandoc.convert_file('README.md', 'rst')
except (IOError, ImportError):
long_description = 'Always know what to expect from your data. (See https://github.com/great-expectations/great_expectations for full description).'

Expand All @@ -23,7 +23,6 @@
'spark': ['pyspark>=2.3.2'],
'sqlalchemy': ['sqlalchemy>=1.2'],
'airflow': ['apache-airflow[s3]>=1.9.0', 'boto3>=1.7.3']

},
'packages': find_packages(exclude=['docs', 'tests', 'examples']),
'entry_points': {
Expand Down
7 changes: 6 additions & 1 deletion tests/test_cli.py
Expand Up @@ -5,7 +5,6 @@
from click.testing import CliRunner
import great_expectations.version
from great_expectations.cli import cli
import tempfile
import pytest
import json
import os
Expand All @@ -22,6 +21,7 @@
except ImportError:
import mock

from six import PY2

from great_expectations.cli.init import scaffold_directories_and_notebooks

Expand Down Expand Up @@ -135,6 +135,11 @@ def test_validate_basic_operation():
with open('./tests/test_sets/expected_cli_results_default.json', 'r') as f:
expected_cli_results = json.load(f)

# In PY2 the result order is not guaranteed and may differ. Order doesn't matter, so sort before comparing in that case.
if PY2:
json_result["results"] = sorted(json_result["results"])
expected_cli_results["results"] = sorted(expected_cli_results["results"])

assert json_result == expected_cli_results


Expand Down
124 changes: 84 additions & 40 deletions tests/test_data_asset.py
Expand Up @@ -9,6 +9,7 @@
import great_expectations as ge

import unittest
from six import PY2


def test_data_asset_name_inheritance(dataset):
Expand Down Expand Up @@ -800,9 +801,7 @@ def test_find_expectations(self):
}]
)

self.assertEqual(
my_df.find_expectations("expect_column_to_exist"),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -818,7 +817,11 @@ def test_find_expectations(self):
"column": "z"
}
}]
)

if PY2:
self.assertEqual(sorted(my_df.find_expectations("expect_column_to_exist")), sorted(exp1))
else:
self.assertEqual(my_df.find_expectations("expect_column_to_exist"), exp1)

with self.assertRaises(Exception) as context:
my_df.find_expectations(
Expand All @@ -829,9 +832,7 @@ def test_find_expectations(self):
# print 'Conflicting column names in remove_expectation' in context.exception
# self.assertTrue('Conflicting column names in remove_expectation:' in context.exception)

self.assertEqual(
my_df.find_expectations(column="x"),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -848,7 +849,11 @@ def test_find_expectations(self):
"column": "x"
}
}]
)

if PY2:
self.assertEqual(sorted(my_df.find_expectations(column="x")), sorted(exp1))
else:
self.assertEqual(my_df.find_expectations(column="x"), exp1)

def test_remove_expectation(self):
my_df = ge.dataset.PandasDataset({
Expand Down Expand Up @@ -909,10 +914,7 @@ def test_remove_expectation(self):
# FIXME: Python 3 doesn't like this. It would be nice to use assertRaisesRegex, but that's not available in python 2.7
# self.assertTrue('Multiple expectations matched arguments. No expectations removed.' in context.exception)

self.assertEqual(
my_df.remove_expectation(
"expect_column_to_exist", remove_multiple_matches=True, dry_run=True),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -928,7 +930,17 @@ def test_remove_expectation(self):
"column": "z"
}
}]
)

if PY2:
self.assertEqual(
sorted(my_df.remove_expectation("expect_column_to_exist", remove_multiple_matches=True, dry_run=True)),
sorted(exp1)
)
else:
self.assertEqual(
my_df.remove_expectation("expect_column_to_exist", remove_multiple_matches=True, dry_run=True),
exp1
)

with self.assertRaises(Exception) as context:
my_df.remove_expectation("expect_column_to_exist", "x", {
Expand All @@ -939,10 +951,7 @@ def test_remove_expectation(self):
# print 'Conflicting column names in remove_expectation' in context.exception
# self.assertTrue('Conflicting column names in remove_expectation:' in context.exception)

self.assertEqual(
my_df.remove_expectation(
column="x", remove_multiple_matches=True, dry_run=True),
[{
exp1 = [{
"expectation_type": "expect_column_to_exist",
"kwargs": {
"column": "x"
Expand All @@ -959,7 +968,17 @@ def test_remove_expectation(self):
"column": "x"
}
}]
)

if PY2:
self.assertEqual(
sorted(my_df.remove_expectation(column="x", remove_multiple_matches=True, dry_run=True)),
sorted(exp1)
)
else:
self.assertEqual(
my_df.remove_expectation(column="x", remove_multiple_matches=True, dry_run=True),
exp1
)

self.assertEqual(
len(my_df._expectation_suite.expectations),
Expand Down Expand Up @@ -1029,52 +1048,68 @@ def test_discard_failing_expectations(self):
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'A'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'B', 'value_set': [5, 6, 7, 8]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'C', 'value_set': ['a', 'b', 'c', 'd']}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'D', 'value_set': ['e', 'f', 'g', 'h']}}
]

sub1 = df[:3]

sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
# In PY2 the result order is not guaranteed, so compare sorted lists
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[1:2]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[:-1]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[-1:]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[['A', 'D']]
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'A'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'D', 'value_set': ['e', 'f', 'g', 'h']}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df[['A']]
exp1 = [
Expand All @@ -1084,39 +1119,48 @@ def test_discard_failing_expectations(self):
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df.iloc[:3, 1:4]
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'B', 'value_set': [5, 6, 7, 8]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'C'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'C', 'value_set': ['a', 'b', 'c', 'd']}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'D'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'D', 'value_set': ['e', 'f', 'g', 'h']}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

sub1 = df.loc[0:, 'A':'B']
exp1 = [
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'A'}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'A', 'value_set': [1, 2, 3, 4]}},
{'expectation_type': 'expect_column_to_exist',
'kwargs': {'column': 'B'}},
{'expectation_type': 'expect_column_values_to_be_in_set',
'kwargs': {'column': 'B', 'value_set': [5, 6, 7, 8]}}
]
sub1.discard_failing_expectations()
self.assertEqual(sub1.find_expectations(), exp1)
if PY2:
self.assertEqual(sorted(sub1.find_expectations()), sorted(exp1))
else:
self.assertEqual(sub1.find_expectations(), exp1)

def test_test_expectation_function(self):
D = ge.dataset.PandasDataset({
Expand Down

0 comments on commit c0a798a

Please sign in to comment.