Skip to content

Commit

Permalink
Merge pull request #770 from elsander/669-column-expectations
Browse files Browse the repository at this point in the history
Add expect_table_column_count_to_equal and _to_be_between
  • Loading branch information
jcampbell committed Oct 9, 2019
2 parents 7ee7554 + 7df63c1 commit 589026f
Show file tree
Hide file tree
Showing 7 changed files with 378 additions and 39 deletions.
194 changes: 157 additions & 37 deletions great_expectations/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ class Dataset(MetaDataset):
'get_column_unique_count',
'get_column_value_counts',
'get_row_count',
'get_column_count',
'get_table_columns',
'get_column_count_in_range',
]
Expand All @@ -190,6 +191,10 @@ def get_row_count(self):
"""Returns: int, table row count"""
raise NotImplementedError

def get_column_count(self):
    """Return the table's column count as an int.

    This base implementation always raises ``NotImplementedError``.
    """
    raise NotImplementedError()

def get_table_columns(self):
    """Return the table's column names as a list of str.

    This base implementation always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
Expand Down Expand Up @@ -486,13 +491,155 @@ def expect_table_columns_to_match_ordered_list(
}
}

# noinspection PyUnusedLocal
@DocInherit
@DataAsset.expectation(['min_value', 'max_value'])
def expect_table_column_count_to_be_between(
    self,
    min_value=None, max_value=None,
    result_format=None, include_config=False, catch_exceptions=None,
    meta=None,
):
    """Expect the number of columns to be between two values.

    expect_table_column_count_to_be_between is a :func:`expectation \
    <great_expectations.data_asset.data_asset.DataAsset.expectation>`, not a
    ``column_map_expectation`` or ``column_aggregate_expectation``.

    Keyword Args:
        min_value (int or None): \
            The minimum number of columns, inclusive.
        max_value (int or None): \
            The maximum number of columns, inclusive.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        A JSON-serializable expectation result object.

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Raises:
        ValueError: If min_value or max_value is given and is not an integer-valued number.

    Notes:
        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound, and the number of acceptable columns \
          has no minimum.
        * If max_value is None, then min_value is treated as a lower bound, and the number of acceptable columns \
          has no maximum.

    See Also:
        expect_table_column_count_to_equal
    """
    for bound in (min_value, max_value):
        if bound is None:
            continue
        try:
            is_whole_number = float(bound).is_integer()
        except (TypeError, ValueError):
            # float() raises TypeError (not ValueError) for lists, dicts, etc.;
            # treat both failures as "not an integer" so callers always get
            # the same ValueError.
            is_whole_number = False
        if not is_whole_number:
            raise ValueError("min_value and max_value must be integers")

    # Both bounds may be None; in that case every column count is accepted.
    column_count = self.get_column_count()

    above_min = min_value is None or column_count >= min_value
    below_max = max_value is None or column_count <= max_value

    return {
        'success': above_min and below_max,
        'result': {
            'observed_value': column_count
        }
    }

# noinspection PyUnusedLocal
@DocInherit
@DataAsset.expectation(['value'])
def expect_table_column_count_to_equal(
    self,
    value,
    result_format=None, include_config=False, catch_exceptions=None,
    meta=None
):
    """Expect the number of columns to equal a value.

    expect_table_column_count_to_equal is a :func:`expectation \
    <great_expectations.data_asset.data_asset.DataAsset.expectation>`, not a
    ``column_map_expectation`` or ``column_aggregate_expectation``.

    Args:
        value (int): \
            The expected number of columns.

    Other Parameters:
        result_format (string or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        A JSON-serializable expectation result object.

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Raises:
        ValueError: If value is not an integer-valued number.

    See Also:
        expect_table_column_count_to_be_between
    """
    try:
        is_whole_number = float(value).is_integer()
    except (TypeError, ValueError):
        # float() raises TypeError (not ValueError) for None, lists, etc.;
        # treat both failures as "not an integer" so callers always get
        # the same ValueError.
        is_whole_number = False
    if not is_whole_number:
        raise ValueError("value must be an integer")

    column_count = self.get_column_count()

    return {
        'success': column_count == value,
        'result': {
            'observed_value': column_count
        }
    }

# noinspection PyUnusedLocal
@DocInherit
@DataAsset.expectation(['min_value', 'max_value'])
def expect_table_row_count_to_be_between(
self,
min_value=None, max_value=None,
strict_min=False, strict_max=False,
result_format=None, include_config=False, catch_exceptions=None,
meta=None,
):
Expand All @@ -504,13 +651,9 @@ def expect_table_row_count_to_be_between(
Keyword Args:
min_value (int or None): \
The minimum number of rows, inclusive unless strict_min=True.
The minimum number of rows, inclusive.
max_value (int or None): \
The maximum number of rows, inclusive unless strict_max=True.
strict_min (boolean):
If True, the table row count must be strictly larger than min_value.
strict_max (boolean):
If True, the table row count be strictly smaller than max_value.
The maximum number of rows, inclusive.
Other Parameters:
result_format (str or None): \
Expand All @@ -533,7 +676,7 @@ def expect_table_row_count_to_be_between(
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
Notes:
* min_value and max_value are both inclusive unless strict_min or strict_max are set to True.
* min_value and max_value are both inclusive.
* If min_value is None, then max_value is treated as an upper bound, and the number of acceptable rows has \
no minimum.
* If max_value is None, then min_value is treated as a lower bound, and the number of acceptable rows has \
Expand All @@ -559,18 +702,12 @@ def expect_table_row_count_to_be_between(
row_count = self.get_row_count()

if min_value is not None:
if strict_min:
above_min = row_count > min_value
else:
above_min = row_count >= min_value
above_min = row_count >= min_value
else:
above_min = True

if max_value is not None:
if strict_max:
below_max = row_count < max_value
else:
below_max = row_count <= max_value
below_max = row_count <= max_value
else:
below_max = True

Expand Down Expand Up @@ -1257,8 +1394,6 @@ def expect_column_value_lengths_to_be_between(
column,
min_value=None,
max_value=None,
strict_min=False,
strict_max=False,
mostly=None,
result_format=None, include_config=False, catch_exceptions=None, meta=None
):
Expand All @@ -1281,10 +1416,6 @@ def expect_column_value_lengths_to_be_between(
mostly (None or a float between 0 and 1): \
Return `"success": True` if at least mostly fraction of values match the expectation. \
For more detail, see :ref:`mostly`.
strict_min (boolean):
If True, value lengths must be strictly larger than min_value, default=False
strict_max (boolean):
If True, value lengths must be strictly smaller than max_value, default=False
Other Parameters:
result_format (str or None): \
Expand All @@ -1307,7 +1438,7 @@ def expect_column_value_lengths_to_be_between(
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
Notes:
* min_value and max_value are both inclusive unless strict_min or strict_max are set to True.
* min_value and max_value are both inclusive.
* If min_value is None, then max_value is treated as an upper bound, and the number of acceptable rows has \
no minimum.
* If max_value is None, then min_value is treated as a lower bound, and the number of acceptable rows has \
Expand Down Expand Up @@ -2617,7 +2748,6 @@ def expect_column_unique_value_count_to_be_between(
self,
column,
min_value=None, max_value=None,
strict_min=False, strict_max=False,
result_format=None, include_config=False, catch_exceptions=None,
meta=None,
):
Expand All @@ -2633,10 +2763,6 @@ def expect_column_unique_value_count_to_be_between(
The minimum number of unique values allowed.
max_value (int or None): \
The maximum number of unique values allowed.
strict_min (boolean):
If True, the number of unique values must be strictly larger than min_value, default=False
strict_max (boolean):
If True, the number of unique values must be strictly smaller than max_value, default=False
Other Parameters:
result_format (str or None): \
Expand Down Expand Up @@ -2666,7 +2792,7 @@ def expect_column_unique_value_count_to_be_between(
"observed_value": (int) The number of unique values in the column
}
* min_value and max_value are both inclusive unless strict_min or strict_max are set to True.
* min_value and max_value are both inclusive.
* If min_value is None, then max_value is treated as an upper bound
* If max_value is None, then min_value is treated as a lower bound
Expand All @@ -2686,18 +2812,12 @@ def expect_column_unique_value_count_to_be_between(
}

if min_value is not None:
if strict_min:
above_min = unique_value_count > min_value
else:
above_min = unique_value_count >= min_value
above_min = unique_value_count >= min_value
else:
above_min = True

if max_value is not None:
if strict_max:
below_max = unique_value_count < max_value
else:
below_max = unique_value_count <= max_value
below_max = unique_value_count <= max_value
else:
below_max = True

Expand Down
5 changes: 3 additions & 2 deletions great_expectations/dataset/pandas_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,9 @@ def __init__(self, *args, **kwargs):
def get_row_count(self):
    """Return the row count, taken from the first entry of ``self.shape``."""
    dims = self.shape
    return dims[0]

def get_column_count(self):
    """Return the column count, taken from the second entry of ``self.shape``."""
    dims = self.shape
    return dims[1]

def get_table_columns(self):
    """Return the entries of ``self.columns`` as a new list."""
    return [column_name for column_name in self.columns]

Expand Down Expand Up @@ -1009,8 +1012,6 @@ def expect_column_values_to_be_decreasing(self, column, strictly=None, parse_str
def expect_column_value_lengths_to_be_between(self, column,
min_value=None,
max_value=None,
strict_min=False,
strict_max=False,
mostly=None,
result_format=None, include_config=False, catch_exceptions=None, meta=None):

Expand Down
3 changes: 3 additions & 0 deletions great_expectations/dataset/sparkdf_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,9 @@ def head(self, n=5):
def get_row_count(self):
    """Return the row count reported by ``self.spark_df.count()``."""
    frame = self.spark_df
    return frame.count()

def get_column_count(self):
    """Return the column count as the length of ``self.spark_df.columns``."""
    column_names = self.spark_df.columns
    return len(column_names)

def get_table_columns(self):
    """Return ``self.spark_df.columns`` unchanged (no copy is made here)."""
    frame = self.spark_df
    return frame.columns

Expand Down
3 changes: 3 additions & 0 deletions great_expectations/dataset/sqlalchemy_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,9 @@ def get_row_count(self):
self._table)
return self.engine.execute(count_query).scalar()

def get_column_count(self):
    """Return the column count as the length of ``self.columns``."""
    column_entries = self.columns
    return len(column_entries)

def get_table_columns(self):
    """Return the ``'name'`` entry of each item in ``self.columns``, in order."""
    names = []
    for column_entry in self.columns:
        names.append(column_entry['name'])
    return names

Expand Down
2 changes: 2 additions & 0 deletions great_expectations/jupyter_ux/expectation_explorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(self):
'expect_column_unique_value_count_to_be_between': ['min_value', 'max_value'],
'expect_column_value_lengths_to_be_between': ['min_value', 'max_value', 'mostly'],
'expect_table_row_count_to_be_between': ['min_value', 'max_value'],
'expect_table_column_count_to_be_between': ['min_value', 'max_value'],
'expect_column_proportion_of_unique_values_to_be_between': ['min_value', 'max_value'],
'expect_column_median_to_be_between': ['min_value', 'max_value'],
'expect_column_mean_to_be_between': ['min_value', 'max_value'],
Expand All @@ -48,6 +49,7 @@ def __init__(self):
'expect_column_to_exist': ['column_index'],
'expect_column_value_lengths_to_equal': ['value', 'mostly'],
'expect_table_row_count_to_equal': ['value'],
'expect_table_column_count_to_equal': ['value'],
'expect_column_values_to_match_strftime_format': ['strftime_format', 'mostly'],
'expect_column_values_to_be_between': ['min_max_type', 'parse_strings_as_datetimes', 'output_strftime_format', 'min_value', 'max_value', 'mostly', 'allow_cross_type_comparisons'],
'expect_column_max_to_be_between': ['min_max_type', 'parse_strings_as_datetimes', 'output_strftime_format', 'min_value', 'max_value'],
Expand Down

0 comments on commit 589026f

Please sign in to comment.