From b79dab5d577c2d7cb30e44ba8597634faf0f93ff Mon Sep 17 00:00:00 2001 From: Abe Date: Wed, 31 Jul 2019 11:14:12 -0600 Subject: [PATCH 1/6] Fix first bug --- great_expectations/render/renderer/other_section_renderer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/great_expectations/render/renderer/other_section_renderer.py b/great_expectations/render/renderer/other_section_renderer.py index 2f45caf974d9..db1b6b5eb6c4 100644 --- a/great_expectations/render/renderer/other_section_renderer.py +++ b/great_expectations/render/renderer/other_section_renderer.py @@ -258,7 +258,10 @@ def _get_column_types(cls, evrs): for evr in type_evrs: column = evr["expectation_config"]["kwargs"]["column"] if evr["expectation_config"]["expectation_type"] == "expect_column_values_to_be_in_type_list": - expected_types = set(evr["expectation_config"]["kwargs"]["type_list"]) + if evr["expectation_config"]["kwargs"]["type_list"] == None: + expected_types = {} + else: + expected_types = set(evr["expectation_config"]["kwargs"]["type_list"]) else: # assuming expect_column_values_to_be_of_type expected_types = set([evr["expectation_config"]["kwargs"]["type_"]]) From 5ed0ac79ce2c2bd0ec735f39151f7e667a5d9e49 Mon Sep 17 00:00:00 2001 From: Abe Date: Wed, 31 Jul 2019 11:21:32 -0600 Subject: [PATCH 2/6] Add slightly better error messaging --- great_expectations/data_context/data_context.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index d62650e1137b..e47d937c4e5f 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -1538,7 +1538,12 @@ def render_full_static_site(self): data_asset_name = expectation_suite['data_asset_name'] expectation_suite_name = expectation_suite['expectation_suite_name'] - model = PrescriptivePageRenderer.render(expectation_suite) + try: + model = PrescriptivePageRenderer.render(expectation_suite) + except Exception as e: + print("Ran into an error in ", expectation_suite_filepath) + raise(e) + out_filepath = self.get_validation_doc_filepath( data_asset_name, expectation_suite_name From 07bb05df93c4a79e0a83b9984f34abf15e10052f Mon Sep 17 00:00:00 2001 From: Tal Gluck Date: Wed, 31 Jul 2019 13:48:50 -0400 Subject: [PATCH 3/6] added a fix to deal with TYPE_LIST = None --- .../bullet_list_content_block.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/great_expectations/render/renderer/content_block/bullet_list_content_block.py b/great_expectations/render/renderer/content_block/bullet_list_content_block.py index ff841deb78b6..aec8fb6777c6 100644 --- a/great_expectations/render/renderer/content_block/bullet_list_content_block.py +++ b/great_expectations/render/renderer/content_block/bullet_list_content_block.py @@ -441,29 +441,35 @@ def expect_column_values_to_be_in_type_list(cls, expectation, styling=None, incl expectation["kwargs"], ["column", "type_list", "mostly"], ) + if params["type_list"] is not None: + for i, v in enumerate(params["type_list"]): + params["v__"+str(i)] = v + values_string = " ".join( + ["$v__"+str(i) for i, v in enumerate(params["type_list"])] + ) - for i, v in enumerate(params["type_list"]): - params["v__"+str(i)] = v - values_string = " ".join( - ["$v__"+str(i) for i, v in enumerate(params["type_list"])] - ) - - if params["mostly"] is not None: - params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,) + if params["mostly"] is not None: + params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,) - if include_column_name: - # NOTE: Localization will be tricky for this template_str. - template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." + if include_column_name: + # NOTE: Localization will be tricky for this template_str. + template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." + else: + # NOTE: Localization will be tricky for this template_str. + template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." else: - # NOTE: Localization will be tricky for this template_str. - template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." + if include_column_name: + # NOTE: Localization will be tricky for this template_str. + template_str = "$column value types must belong to this set: "+values_string+"." + else: + # NOTE: Localization will be tricky for this template_str. + template_str = "value types must belong to this set: "+values_string+"." else: if include_column_name: - # NOTE: Localization will be tricky for this template_str. - template_str = "$column value types must belong to this set: "+values_string+"." + # NOTE: Localization will be tricky for this template_str. + template_str = "$column value types must belong to a set which has not yet been defined" else: - # NOTE: Localization will be tricky for this template_str. - template_str = "value types must belong to this set: "+values_string+"." + template_str = "value types must belong to a set which has not yet been defined" return [{ "template": template_str, @@ -471,6 +477,7 @@ def expect_column_values_to_be_in_type_list(cls, expectation, styling=None, incl "styling": styling, }] + @classmethod def expect_column_values_to_be_in_set(cls, expectation, styling=None, include_column_name=True): params = substitute_none_for_missing( From 7d0f0c21e82e2c2022e500a0c81f11a452a68ffe Mon Sep 17 00:00:00 2001 From: Tal Gluck Date: Thu, 1 Aug 2019 10:50:28 -0400 Subject: [PATCH 4/6] Add support for datetime types to basic_dataset_profiler and bullet_list_renderer Moved data types from dataset to basic_dataset_profiler Added a smoke test for bullet_list_renderer against existing test definitions --- great_expectations/dataset/dataset.py | 6 - .../profile/basic_dataset_profiler.py | 31 +++- .../bullet_list_content_block.py | 4 + .../render/renderer/other_section_renderer.py | 13 +- .../test_render_BulletListContentBlock.py | 153 ++++++++++-------- 5 files changed, 122 insertions(+), 85 deletions(-) diff --git a/great_expectations/dataset/dataset.py b/great_expectations/dataset/dataset.py index 54634e3362cf..f5a790d7e8fa 100644 --- a/great_expectations/dataset/dataset.py +++ b/great_expectations/dataset/dataset.py @@ -149,12 +149,6 @@ class Dataset(MetaDataset): # That way, multiple backends can implement the same data_asset_type _data_asset_type = "Dataset" - INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"]) - FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"]) - STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"]) - BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"]) - - # getter functions with hashable arguments - can be cached hashable_getters = [ 'get_column_min', diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index d386963eeb57..5c33e0cb6d0d 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -15,23 +15,31 @@ class BasicDatasetProfiler(DatasetProfiler): Based on the column's type it provides a description of the column by computing a number of statistics, such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate. """ + INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"]) + FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"]) + STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"]) + BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"]) + DATETIME_TYPE_NAMES = set(["DATETIME", "DATE", "TIMESTAMP", "DateType", "TimestampType", "datetime64", "Timestamp"]) @classmethod def _get_column_type(cls, df, column): # list of types is used to support pandas and sqlalchemy try: - if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.INT_TYPE_NAMES)))["success"]: + if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.INT_TYPE_NAMES)))["success"]: type_ = "int" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.FLOAT_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.FLOAT_TYPE_NAMES)))["success"]: type_ = "float" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.STRING_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.STRING_TYPE_NAMES)))["success"]: type_ = "string" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.BOOLEAN_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.BOOLEAN_TYPE_NAMES)))["success"]: type_ = "bool" - + + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.DATETIME_TYPE_NAMES)))["success"]: + type_ = "datetime" + else: df.expect_column_values_to_be_in_type_list(column, type_list=None) type_ = "unknown" @@ -168,6 +176,19 @@ def _profile(cls, dataset): else: # print(column, type_, cardinality) pass + + elif type_ == "datetime": + df.expect_column_min_to_be_between(column, min_value=None, max_value=None) + + df.expect_column_max_to_be_between(column, min_value=None, max_value=None) + + df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None, + threshold=None, result_format='COMPLETE') + + if cardinality in ["one", "two", "very few", "few"]: + df.expect_column_distinct_values_to_be_in_set(column, value_set=None, result_format="SUMMARY") + + else: if cardinality == "unique": diff --git a/great_expectations/render/renderer/content_block/bullet_list_content_block.py b/great_expectations/render/renderer/content_block/bullet_list_content_block.py index aec8fb6777c6..1499c7eb1059 100644 --- a/great_expectations/render/renderer/content_block/bullet_list_content_block.py +++ b/great_expectations/render/renderer/content_block/bullet_list_content_block.py @@ -58,6 +58,10 @@ def sparkline(cls, weights): """ mn, mx = min(weights), max(weights) extent = mx - mn + + if extent == 0: + extent = 1 + sparkline = ''.join(cls.bar[min([cls.barcount - 1, int((n - mn) / extent * cls.barcount)])] for n in weights) diff --git a/great_expectations/render/renderer/other_section_renderer.py b/great_expectations/render/renderer/other_section_renderer.py index db1b6b5eb6c4..655f6ab29355 100644 --- a/great_expectations/render/renderer/other_section_renderer.py +++ b/great_expectations/render/renderer/other_section_renderer.py @@ -9,8 +9,7 @@ TableContentBlockRenderer, PrescriptiveBulletListContentBlockRenderer ) -from great_expectations.dataset.dataset import Dataset - +from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler class DescriptiveOverviewSectionRenderer(Renderer): @@ -265,12 +264,16 @@ def _get_column_types(cls, evrs): else: # assuming expect_column_values_to_be_of_type expected_types = set([evr["expectation_config"]["kwargs"]["type_"]]) - if Dataset.INT_TYPE_NAMES.issubset(expected_types): + if BasicDatasetProfiler.INT_TYPE_NAMES.issubset(expected_types): column_types[column] = "int" - elif Dataset.FLOAT_TYPE_NAMES.issubset(expected_types): + elif BasicDatasetProfiler.FLOAT_TYPE_NAMES.issubset(expected_types): column_types[column] = "float" - elif Dataset.STRING_TYPE_NAMES.issubset(expected_types): + elif BasicDatasetProfiler.STRING_TYPE_NAMES.issubset(expected_types): column_types[column] = "string" + elif BasicDatasetProfiler.DATETIME_TYPE_NAMES.issubset(expected_types): + column_types[column] = "datetime" + elif BasicDatasetProfiler.BOOLEAN_TYPE_NAMES.issubset(expected_types): + column_types[column] = "bool" else: warnings.warn("The expected type list is not a subset of any of the profiler type sets: {0:s}".format(str(expected_types))) column_types[column] = "--" diff --git a/tests/render/test_render_BulletListContentBlock.py b/tests/render/test_render_BulletListContentBlock.py index 305d8ee10789..7c90440ccaf5 100644 --- a/tests/render/test_render_BulletListContentBlock.py +++ b/tests/render/test_render_BulletListContentBlock.py @@ -7,6 +7,7 @@ import glob import json +import pytest from string import Template as pTemplate @@ -27,72 +28,86 @@ def test_substitute_none_for_missing(): # Commenting out the test below. It is helpful during development, but is not a high confidence acceptance test. -# def test_all_expectations_using_test_definitions(): -# # Fetch test_definitions for all expectations. -# # Note: as of 6/20/2019, coverage is good, but not 100% -# test_files = glob.glob( -# "tests/test_definitions/*/expect*.json" -# ) -# -# all_true = True -# failure_count, total_count = 0, 0 -# types = [] -# # Loop over all test_files, datasets, and tests: -# for filename in test_files: -# test_definitions = json.load(open(filename)) -# types.append(test_definitions["expectation_type"]) -# -# for dataset in test_definitions["datasets"]: -# -# for test in dataset["tests"]: -# # Construct an expectation from the test. -# if type(test["in"]) == dict: -# fake_expectation = { -# "expectation_type": test_definitions["expectation_type"], -# "kwargs": test["in"], -# } -# else: -# # This would be a good place to put a kwarg-to-arg converter -# continue -# -# try: -# # Attempt to render it -# render_result = PrescriptiveBulletListContentBlockRenderer.render( -# fake_expectation) -# # print(fake_expectation) -# -# # Assert that the rendered result matches the intended format -# # Note: THIS DOES NOT TEST CONTENT AT ALL. -# # Abe 6/22/2019: For the moment, I think it's fine to not test content. -# # I'm on the fence about the right end state for testing renderers at this level. -# # Spot checks, perhaps? -# assert render_result != None -# assert type(render_result) == list -# for el in render_result: -# assert set(el.keys()) == { -# 'template', 'params'} -# -# # Assert that the template is renderable, with all the right arguments, etc. -# pTemplate(el["template"]).substitute( -# el["params"] -# ) -# -# except AssertionError: -# # If the assertions fail, then print the expectation to allow debugging. -# # Do NOT trap other errors, so that developers can debug using the full traceback. -# print(fake_expectation) -# all_true = False -# failure_count += 1 -# -# except Exception as e: -# print(fake_expectation) -# raise(e) -# -# total_count += 1 -# -# # print(len(types)) -# # print(len(set(types))) -# print(total_count-failure_count, "of", total_count, -# "suceeded (", 1-failure_count*1./total_count, ")") -# -# # assert all_true +@pytest.mark.smoketest +def test_all_expectations_using_test_definitions(): + # Fetch test_definitions for all expectations. + # Note: as of 6/20/2019, coverage is good, but not 100% + test_files = glob.glob( + "tests/test_definitions/*/expect*.json" + ) + + all_true = True + failure_count, total_count = 0, 0 + types = [] + # Loop over all test_files, datasets, and tests: + + test_results = {} + for filename in test_files: + test_definitions = json.load(open(filename)) + types.append(test_definitions["expectation_type"]) + + test_results[test_definitions["expectation_type"]] = [] + + for dataset in test_definitions["datasets"]: + + for test in dataset["tests"]: + # Construct an expectation from the test. + if type(test["in"]) == dict: + fake_expectation = { + "expectation_type": test_definitions["expectation_type"], + "kwargs": test["in"], + } + else: + # This would be a good place to put a kwarg-to-arg converter + continue + + try: + # Attempt to render it + render_result = PrescriptiveBulletListContentBlockRenderer.render( + [fake_expectation]) + # print(fake_expectation) + # Assert that the rendered result matches the intended format + # Note: THIS DOES NOT TEST CONTENT AT ALL. + # Abe 6/22/2019: For the moment, I think it's fine to not test content. + # I'm on the fence about the right end state for testing renderers at this level. + # Spot checks, perhaps? + assert isinstance(render_result, dict) + assert "content_block_type" in render_result + assert render_result["content_block_type"] in render_result + assert isinstance(render_result[render_result["content_block_type"]], list ) + + # TODO: Assert that the template is renderable, with all the right arguments, etc. + # rendered_template = pTemplate(el["template"]).substitute(el["params"]) + + test_results[test_definitions["expectation_type"]].append({ + test["title"]:render_result, + # "rendered_template":rendered_template + }) + + except Exception: + print(test['title']) + raise + + except AssertionError: + raise + # # If the assertions fail, then print the expectation to allow debugging. + # # Do NOT trap other errors, so that developers can debug using the full traceback. + # print(fake_expectation) + # all_true = False + # failure_count += 1 + + # except Exception as e: + # print(fake_expectation) + # raise(e) + + # total_count += 1 + + # print(len(types)) + # print(len(set(types))) + # print(total_count-failure_count, "of", total_count, + # "succeeded (", 1-failure_count*1./total_count, ")") + # TODO: accommodate case where multiple datasets exist within one expectation test definition + with open('./tests/render/output/test_render_bullet_list_content_block.json', 'w') as f: + json.dump(test_results, f, indent=2) + + # assert all_true From efd08e16bb67d11ee6ccf9d7fd44a111fa3d3947 Mon Sep 17 00:00:00 2001 From: Tal Gluck Date: Thu, 1 Aug 2019 10:58:46 -0400 Subject: [PATCH 5/6] Pulled out some OBE comments --- .../test_render_BulletListContentBlock.py | 73 +++++-------------- 1 file changed, 19 insertions(+), 54 deletions(-) diff --git a/tests/render/test_render_BulletListContentBlock.py b/tests/render/test_render_BulletListContentBlock.py index 7c90440ccaf5..38b885f95ba0 100644 --- a/tests/render/test_render_BulletListContentBlock.py +++ b/tests/render/test_render_BulletListContentBlock.py @@ -25,13 +25,8 @@ def test_substitute_none_for_missing(): assert my_kwargs == {"a": 1, "b": 2}, \ "substitute_none_for_missing should not change input kwargs in place." - -# Commenting out the test below. It is helpful during development, but is not a high confidence acceptance test. - @pytest.mark.smoketest def test_all_expectations_using_test_definitions(): - # Fetch test_definitions for all expectations. - # Note: as of 6/20/2019, coverage is good, but not 100% test_files = glob.glob( "tests/test_definitions/*/expect*.json" ) @@ -39,8 +34,8 @@ def test_all_expectations_using_test_definitions(): all_true = True failure_count, total_count = 0, 0 types = [] - # Loop over all test_files, datasets, and tests: + # Loop over all test_files, datasets, and tests: test_results = {} for filename in test_files: test_definitions = json.load(open(filename)) @@ -61,53 +56,23 @@ def test_all_expectations_using_test_definitions(): # This would be a good place to put a kwarg-to-arg converter continue - try: - # Attempt to render it - render_result = PrescriptiveBulletListContentBlockRenderer.render( - [fake_expectation]) - # print(fake_expectation) - # Assert that the rendered result matches the intended format - # Note: THIS DOES NOT TEST CONTENT AT ALL. - # Abe 6/22/2019: For the moment, I think it's fine to not test content. - # I'm on the fence about the right end state for testing renderers at this level. - # Spot checks, perhaps? - assert isinstance(render_result, dict) - assert "content_block_type" in render_result - assert render_result["content_block_type"] in render_result - assert isinstance(render_result[render_result["content_block_type"]], list ) - - # TODO: Assert that the template is renderable, with all the right arguments, etc. - # rendered_template = pTemplate(el["template"]).substitute(el["params"]) - - test_results[test_definitions["expectation_type"]].append({ - test["title"]:render_result, - # "rendered_template":rendered_template - }) - - except Exception: - print(test['title']) - raise - - except AssertionError: - raise - # # If the assertions fail, then print the expectation to allow debugging. - # # Do NOT trap other errors, so that developers can debug using the full traceback. - # print(fake_expectation) - # all_true = False - # failure_count += 1 - - # except Exception as e: - # print(fake_expectation) - # raise(e) - - # total_count += 1 - - # print(len(types)) - # print(len(set(types))) - # print(total_count-failure_count, "of", total_count, - # "succeeded (", 1-failure_count*1./total_count, ")") + # Attempt to render it + render_result = PrescriptiveBulletListContentBlockRenderer.render( + [fake_expectation]) + + assert isinstance(render_result, dict) + assert "content_block_type" in render_result + assert render_result["content_block_type"] in render_result + assert isinstance(render_result[render_result["content_block_type"]], list ) + + # TODO: Assert that the template is renderable, with all the right arguments, etc. + # rendered_template = pTemplate(el["template"]).substitute(el["params"]) + + test_results[test_definitions["expectation_type"]].append({ + test["title"]:render_result, + # "rendered_template":rendered_template + }) + # TODO: accommodate case where multiple datasets exist within one expectation test definition with open('./tests/render/output/test_render_bullet_list_content_block.json', 'w') as f: - json.dump(test_results, f, indent=2) - - # assert all_true + json.dump(test_results, f, indent=2) \ No newline at end of file From fcbd89febf9c9a02ae6fa25b40c6062ae5de3a1a Mon Sep 17 00:00:00 2001 From: Tal Gluck Date: Thu, 1 Aug 2019 13:39:07 -0400 Subject: [PATCH 6/6] Removed unicode json write for Python 2 functionality; and disabled KL divergence for date data types --- great_expectations/profile/basic_dataset_profiler.py | 5 +++-- tests/render/test_render_BulletListContentBlock.py | 7 ++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index 5c33e0cb6d0d..aac666be0f3b 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -182,8 +182,9 @@ def _profile(cls, dataset): df.expect_column_max_to_be_between(column, min_value=None, max_value=None) - df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None, - threshold=None, result_format='COMPLETE') + # Re-add once kl_divergence has been modified to support datetimes + # df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None, + # threshold=None, result_format='COMPLETE') if cardinality in ["one", "two", "very few", "few"]: df.expect_column_distinct_values_to_be_in_set(column, value_set=None, result_format="SUMMARY") diff --git a/tests/render/test_render_BulletListContentBlock.py b/tests/render/test_render_BulletListContentBlock.py index 38b885f95ba0..3998cb2c66fb 100644 --- a/tests/render/test_render_BulletListContentBlock.py +++ b/tests/render/test_render_BulletListContentBlock.py @@ -9,7 +9,7 @@ import json import pytest from string import Template as pTemplate - +from six import PY2 def test_substitute_none_for_missing(): assert substitute_none_for_missing( @@ -74,5 +74,10 @@ def test_all_expectations_using_test_definitions(): }) # TODO: accommodate case where multiple datasets exist within one expectation test definition + + # We encountered unicode coding errors on Python 2, but since this is just a smoke test, review the smoke test results in python 3. + if PY2: + return + with open('./tests/render/output/test_render_bullet_list_content_block.json', 'w') as f: json.dump(test_results, f, indent=2) \ No newline at end of file