Skip to content

Commit

Permalink
Merge 47df52f into 88d5605
Browse files Browse the repository at this point in the history
  • Loading branch information
jcampbell committed Jan 17, 2019
2 parents 88d5605 + 47df52f commit 1c69717
Show file tree
Hide file tree
Showing 8 changed files with 249 additions and 18 deletions.
71 changes: 57 additions & 14 deletions great_expectations/dataset/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def column_map_expectation(cls, func):
Args:
func (function): \
The function implementing a row-wise expectation. The function should take a column of data and \
return an equally-long column of boolean values corresponding to whether the truthiness of the \
return an equally-long column of boolean values corresponding to the truthiness of the \
underlying expectation.
Notes:
Expand Down Expand Up @@ -1032,18 +1032,22 @@ def _format_column_map_output(self,
return return_obj

# Try to return the most common values, if possible.
try:
partial_unexpected_counts = [
{'value': key, 'count': value}
for key, value
in sorted(
Counter(unexpected_list).most_common(
result_format['partial_unexpected_count']),
key=lambda x: (-x[1], x[0]))
]
except TypeError:
partial_unexpected_counts = [
'partial_exception_counts requires a hashable type']
# If we have a dict, we probably had a dataframe; punt
if isinstance(unexpected_list, list):
try:
partial_unexpected_counts = [
{'value': key, 'count': value}
for key, value
in sorted(
Counter(unexpected_list).most_common(
result_format['partial_unexpected_count']),
key=lambda x: (-x[1], x[0]))
]
except TypeError:
partial_unexpected_counts = [
'partial_exception_counts requires a hashable type']
else:
partial_unexpected_counts = ['partial_unexpected_counts requires a flattened type']

return_obj['result'].update(
{
Expand Down Expand Up @@ -3320,7 +3324,46 @@ def expect_column_pair_values_to_be_in_set(self,
value_pairs_set (list of tuples): All the valid pairs to be matched
Keyword Args:
ignore_row_if (str): "both_values_are_missing", "either_value_is_missing", "neither
ignore_row_if (str): "both_values_are_missing", "either_value_is_missing", "never"
Other Parameters:
result_format (str or None): \
Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
For more detail, see :ref:`result_format <result_format>`.
include_config (boolean): \
If True, then include the expectation config as part of the result object. \
For more detail, see :ref:`include_config`.
catch_exceptions (boolean or None): \
If True, then catch exceptions and include them as part of the result object. \
For more detail, see :ref:`catch_exceptions`.
meta (dict or None): \
A JSON-serializable dictionary (nesting allowed) that will be included in the output without modification. \
For more detail, see :ref:`meta`.
Returns:
A JSON-serializable expectation result object.
Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
"""
raise NotImplementedError

### Multicolumn pairs ###

def expect_multicolumn_values_to_be_unique(self,
column_list,
ignore_row_if="all_values_are_missing",
result_format=None, include_config=False, catch_exceptions=None, meta=None
):
"""
Expect the values for each row to be unique across the columns listed.
Args:
column_list (tuple or list): The column names to evaluate
Keyword Args:
ignore_row_if (str): "all_values_are_missing", "any_value_is_missing", "never"
Other Parameters:
result_format (str or None): \
Expand Down
67 changes: 67 additions & 0 deletions great_expectations/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,62 @@ def inner_wrapper(self, column_A, column_B, mostly=None, ignore_row_if="both_val
inner_wrapper.__doc__ = func.__doc__
return inner_wrapper

@classmethod
def multicolumn_map_expectation(cls, func):
    """Constructs an expectation using multicolumn-map semantics.

    The multicolumn_map_expectation decorator handles boilerplate issues surrounding the common pattern of
    evaluating truthiness of some condition on a per row basis across a set of columns.

    Args:
        func (function): \
            The function implementing the expectation. It receives the dataframe restricted to the
            selected columns (with skipped rows removed) and must return a boolean Series.

    Returns:
        function: the wrapped expectation, with standard result formatting applied.
    """
    if PY3:
        argspec = inspect.getfullargspec(func)[0][1:]
    else:
        argspec = inspect.getargspec(func)[0][1:]

    @cls.expectation(argspec)
    @wraps(func)
    def inner_wrapper(self, column_list, mostly=None, ignore_row_if="all_values_are_missing",
                      result_format=None, *args, **kwargs):

        if result_format is None:
            result_format = self.default_expectation_args["result_format"]

        test_df = self[column_list]

        # Decide which rows are excluded from evaluation before applying func.
        if ignore_row_if == "all_values_are_missing":
            boolean_mapped_skip_values = test_df.isnull().all(axis=1)
        elif ignore_row_if == "any_value_is_missing":
            boolean_mapped_skip_values = test_df.isnull().any(axis=1)
        elif ignore_row_if == "never":
            # Align to test_df's index so boolean masking works for non-default indexes too.
            boolean_mapped_skip_values = pd.Series(
                [False] * len(test_df), index=test_df.index)
        else:
            # Format the message into the string; previously a tuple was passed as a
            # second exception argument, producing an unformatted error message.
            raise ValueError(
                "Unknown value of ignore_row_if: %s" % (ignore_row_if,))

        boolean_mapped_success_values = func(
            self, test_df[boolean_mapped_skip_values == False], *args, **kwargs)
        success_count = boolean_mapped_success_values.sum()
        nonnull_count = (~boolean_mapped_skip_values).sum()
        element_count = len(test_df)

        # Rows that were evaluated (not skipped) and failed the expectation.
        unexpected_list = test_df[(boolean_mapped_skip_values == False) & (boolean_mapped_success_values == False)]
        unexpected_index_list = list(unexpected_list.index)

        success, percent_success = self._calc_map_expectation_success(
            success_count, nonnull_count, mostly)

        return_obj = self._format_column_map_output(
            result_format, success,
            element_count, nonnull_count,
            unexpected_list, unexpected_index_list
        )

        return return_obj

    inner_wrapper.__name__ = func.__name__
    inner_wrapper.__doc__ = func.__doc__
    return inner_wrapper


@classmethod
def column_aggregate_expectation(cls, func):
"""Constructs an expectation using column-aggregate semantics.
Expand Down Expand Up @@ -1513,3 +1569,14 @@ def expect_column_pair_values_to_be_in_set(self,
results.append((a, b) in value_pairs_set)

return pd.Series(results, temp_df.index)

@DocInherit
@MetaPandasDataset.multicolumn_map_expectation
def expect_multicolumn_values_to_be_unique(self,
                                           column_list,
                                           ignore_row_if="all_values_are_missing",
                                           result_format=None, include_config=False, catch_exceptions=None, meta=None
                                           ):
    # A row is unique across the listed columns iff its count of distinct values
    # equals the number of columns. Do not dropna here, since the decorator has
    # already dealt with na rows separately.
    distinct_per_row = column_list.nunique(dropna=False, axis=1)
    return distinct_per_row >= len(column_list.columns)
8 changes: 6 additions & 2 deletions great_expectations/dataset/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ def recursively_convert_to_json_serializable(test_obj):
elif isinstance(test_obj, dict):
new_dict = {}
for key in test_obj:
new_dict[key] = recursively_convert_to_json_serializable(
# A pandas index can be numeric, and a dict key can be numeric, but a json key must be a string
new_dict[str(key)] = recursively_convert_to_json_serializable(
test_obj[key])

return new_dict
Expand All @@ -159,7 +160,7 @@ def recursively_convert_to_json_serializable(test_obj):

# Note: This clause has to come after checking for np.ndarray or we get:
# `ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()`
elif test_obj == None:
elif test_obj is None:
# No problem to encode json
return test_obj

Expand All @@ -178,6 +179,9 @@ def recursively_convert_to_json_serializable(test_obj):
# Note: Use np.floating to avoid FutureWarning from numpy
return float(round(test_obj, sys.float_info.dig))

elif isinstance(test_obj, pd.DataFrame):
return recursively_convert_to_json_serializable(test_obj.to_dict())

# elif np.issubdtype(type(test_obj), np.complexfloating):
# Note: Use np.complexfloating to avoid Future Warning from numpy
# Complex numbers consist of two floating point numbers
Expand Down
4 changes: 2 additions & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ xlrd>=1.1.0
pyarrow==0.11.0
sphinxcontrib-napoleon>=0.6.1
pypandoc>=1.4
pytest>=3.2.5
pytest-cov>=2.5
pytest>=4.1.1
pytest-cov>=2.6.1
coveralls>=1.3
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"expectation_type" : "expect_multicolumn_values_to_be_unique",
"datasets" : [{
"data" : {
"w" : [2, 3, 4, 5, 6, 7, 8, 9, 10, null],
"x" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"y" : [2, 3, 4, 5, 6, 7, 8, 9, 10, null],
"z" : [1, 2, 3, 4, 5, null, null, null, null, null],
"a" : [1, 1, 1, 1, 1, 2, 2, 2, 2, null],
"b" : [1, 2, 1, 2, 1, 2, 1, 2, 1, null]
},
"tests" : [{
"title": "Basic test; ignore if all are missing",
"exact_match_out" : false,
"in": {
"column_list": ["w", "x"],
"ignore_row_if": "all_values_are_missing"
},
"out": {
"unexpected_list": {"w": {}, "x": {}},
"unexpected_index_list": [],
"success": true
}
},{
"title": "Basic test; ignore if any are missing",
"exact_match_out" : false,
"in": {
"column_list": ["w", "x"],
"ignore_row_if": "any_value_is_missing"
},
"out": {
"unexpected_list": {"w": {}, "x": {}},
"unexpected_index_list": [],
"success": true
}
},{
"title": "Unexpected Values",
"exact_match_out" : false,
"in": {
"column_list": ["a", "b"]
},
"out": {
"unexpected_list": {"a": {"0": 1.0, "2": 1.0, "4": 1.0, "5": 2.0, "7": 2.0},
"b": {"0": 1.0, "2": 1.0, "4": 1.0, "5": 2.0, "7": 2.0}},
"unexpected_index_list": [0,2,4,5,7],
"success": false
}
}]
}]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
###
###
#
# This file should not be modified. To adjust test cases, edit the related json file(s).
#
###
###


import os
import json
import glob
import warnings

from tests.test_utils import get_dataset, candidate_test_is_on_temporary_notimplemented_list, evaluate_json_test

contexts = ['PandasDataset', 'SqlAlchemyDataset']


def pytest_generate_tests(metafunc):
    """Parametrize ``test_case`` with every (context, dataset, test) combination
    found in the JSON expectation-definition files next to this module.

    Skips (with a warning) any expectation that is on the temporary
    not-implemented list for a given context.
    """
    # Load all the JSON files in the directory
    dir_path = os.path.dirname(os.path.realpath(__file__))
    test_configuration_files = glob.glob(dir_path+'/*.json')

    parametrized_tests = []
    ids = []
    for c in contexts:
        for filename in test_configuration_files:
            # Use a context manager so the file handle is closed promptly;
            # the original leaked one open handle per configuration file.
            with open(filename) as file:
                test_configuration = json.load(file)

            if candidate_test_is_on_temporary_notimplemented_list(c, test_configuration["expectation_type"]):
                warnings.warn("Skipping generation of tests for expectation " + test_configuration["expectation_type"] +
                              " and context " + c)
            else:
                for d in test_configuration['datasets']:
                    my_dataset = get_dataset(c, d["data"])

                    for test in d["tests"]:
                        parametrized_tests.append({
                            "expectation_type": test_configuration["expectation_type"],
                            "dataset": my_dataset,
                            "test": test,
                        })

                        ids.append(
                            c+":"+test_configuration["expectation_type"]+":"+test["title"])

    metafunc.parametrize(
        "test_case",
        parametrized_tests,
        ids=ids
    )


def test_case_runner(test_case):
    """Run a single parametrized expectation test case against its dataset."""
    dataset = test_case["dataset"]

    # Note: this should never be done in practice, but we are wiping
    # expectations to reuse datasets during testing.
    dataset._initialize_expectations()

    evaluate_json_test(
        dataset,
        test_case["expectation_type"],
        test_case["test"]
    )
1 change: 1 addition & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def candidate_test_is_on_temporary_notimplemented_list(context, expectation_type
"expect_column_pair_values_to_be_equal",
"expect_column_pair_values_A_to_be_greater_than_B",
"expect_column_pair_values_to_be_in_set",
"expect_multicolumn_values_to_be_unique"
]
return False

Expand Down

0 comments on commit 1c69717

Please sign in to comment.