diff --git a/great_expectations/data_context/data_context.py b/great_expectations/data_context/data_context.py index d62650e1137b..e47d937c4e5f 100644 --- a/great_expectations/data_context/data_context.py +++ b/great_expectations/data_context/data_context.py @@ -1538,7 +1538,12 @@ def render_full_static_site(self): data_asset_name = expectation_suite['data_asset_name'] expectation_suite_name = expectation_suite['expectation_suite_name'] - model = PrescriptivePageRenderer.render(expectation_suite) + try: + model = PrescriptivePageRenderer.render(expectation_suite) + except Exception as e: + print("Ran into an error in ", expectation_suite_filepath) + raise(e) + out_filepath = self.get_validation_doc_filepath( data_asset_name, expectation_suite_name diff --git a/great_expectations/dataset/dataset.py b/great_expectations/dataset/dataset.py index 9b2d59e17851..76fece984025 100644 --- a/great_expectations/dataset/dataset.py +++ b/great_expectations/dataset/dataset.py @@ -149,12 +149,6 @@ class Dataset(MetaDataset): # That way, multiple backends can implement the same data_asset_type _data_asset_type = "Dataset" - INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"]) - FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"]) - STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"]) - BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"]) - - # getter functions with hashable arguments - can be cached hashable_getters = [ 'get_column_min', diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index 7cdf8df79424..830ba684ef71 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -15,23 +15,31 @@ class BasicDatasetProfiler(DatasetProfiler): Based on the column's type it provides a description of the column by computing a number of statistics, such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate. """ + INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"]) + FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"]) + STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"]) + BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"]) + DATETIME_TYPE_NAMES = set(["DATETIME", "DATE", "TIMESTAMP", "DateType", "TimestampType", "datetime64", "Timestamp"]) @classmethod def _get_column_type(cls, df, column): # list of types is used to support pandas and sqlalchemy try: - if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.INT_TYPE_NAMES)))["success"]: + if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.INT_TYPE_NAMES)))["success"]: type_ = "int" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.FLOAT_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.FLOAT_TYPE_NAMES)))["success"]: type_ = "float" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.STRING_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.STRING_TYPE_NAMES)))["success"]: type_ = "string" - elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.BOOLEAN_TYPE_NAMES)))["success"]: + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.BOOLEAN_TYPE_NAMES)))["success"]: type_ = "bool" - + + elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.DATETIME_TYPE_NAMES)))["success"]: + type_ = "datetime" + else: df.expect_column_values_to_be_in_type_list(column, type_list=None) type_ = "unknown" @@ -165,6 +173,20 @@ def _profile(cls, dataset): else: # print(column, type_, cardinality) pass + + elif type_ == "datetime": + df.expect_column_min_to_be_between(column, min_value=None, max_value=None) + + df.expect_column_max_to_be_between(column, min_value=None, max_value=None) + + # Re-add once kl_divergence has been modified to support datetimes + # df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None, + # threshold=None, result_format='COMPLETE') + + if cardinality in ["one", "two", "very few", "few"]: + df.expect_column_distinct_values_to_be_in_set(column, value_set=None, result_format="SUMMARY") + + else: if cardinality == "unique": diff --git a/great_expectations/render/renderer/content_block/bullet_list_content_block.py b/great_expectations/render/renderer/content_block/bullet_list_content_block.py index ff841deb78b6..1499c7eb1059 100644 --- a/great_expectations/render/renderer/content_block/bullet_list_content_block.py +++ b/great_expectations/render/renderer/content_block/bullet_list_content_block.py @@ -58,6 +58,10 @@ def sparkline(cls, weights): """ mn, mx = min(weights), max(weights) extent = mx - mn + + if extent == 0: + extent = 1 + sparkline = ''.join(cls.bar[min([cls.barcount - 1, int((n - mn) / extent * cls.barcount)])] for n in weights) @@ -441,29 +445,35 @@ def expect_column_values_to_be_in_type_list(cls, expectation, styling=None, incl expectation["kwargs"], ["column", "type_list", "mostly"], ) + if params["type_list"] is not None: + for i, v in enumerate(params["type_list"]): + params["v__"+str(i)] = v + values_string = " ".join( + ["$v__"+str(i) for i, v in enumerate(params["type_list"])] + ) - for i, v in enumerate(params["type_list"]): - params["v__"+str(i)] = v - values_string = " ".join( - ["$v__"+str(i) for i, v in enumerate(params["type_list"])] - ) - - if params["mostly"] is not None: - params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,) + if params["mostly"] is not None: + params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,) - if include_column_name: - # NOTE: Localization will be tricky for this template_str. - template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." + if include_column_name: + # NOTE: Localization will be tricky for this template_str. + template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." + else: + # NOTE: Localization will be tricky for this template_str. + template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." else: - # NOTE: Localization will be tricky for this template_str. - template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time." + if include_column_name: + # NOTE: Localization will be tricky for this template_str. + template_str = "$column value types must belong to this set: "+values_string+"." + else: + # NOTE: Localization will be tricky for this template_str. + template_str = "value types must belong to this set: "+values_string+"." else: if include_column_name: - # NOTE: Localization will be tricky for this template_str. - template_str = "$column value types must belong to this set: "+values_string+"." + # NOTE: Localization will be tricky for this template_str. + template_str = "$column value types must belong to a set which has not yet been defined" else: - # NOTE: Localization will be tricky for this template_str. - template_str = "value types must belong to this set: "+values_string+"." + template_str = "value types must belong to a set which has not yet been defined" return [{ "template": template_str, @@ -471,6 +481,7 @@ def expect_column_values_to_be_in_type_list(cls, expectation, styling=None, incl "styling": styling, }] + @classmethod def expect_column_values_to_be_in_set(cls, expectation, styling=None, include_column_name=True): params = substitute_none_for_missing( diff --git a/great_expectations/render/renderer/other_section_renderer.py b/great_expectations/render/renderer/other_section_renderer.py index 2f45caf974d9..655f6ab29355 100644 --- a/great_expectations/render/renderer/other_section_renderer.py +++ b/great_expectations/render/renderer/other_section_renderer.py @@ -9,8 +9,7 @@ TableContentBlockRenderer, PrescriptiveBulletListContentBlockRenderer ) -from great_expectations.dataset.dataset import Dataset - +from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler class DescriptiveOverviewSectionRenderer(Renderer): @@ -258,16 +257,23 @@ def _get_column_types(cls, evrs): for evr in type_evrs: column = evr["expectation_config"]["kwargs"]["column"] if evr["expectation_config"]["expectation_type"] == "expect_column_values_to_be_in_type_list": - expected_types = set(evr["expectation_config"]["kwargs"]["type_list"]) + if evr["expectation_config"]["kwargs"]["type_list"] == None: + expected_types = {} + else: + expected_types = set(evr["expectation_config"]["kwargs"]["type_list"]) else: # assuming expect_column_values_to_be_of_type expected_types = set([evr["expectation_config"]["kwargs"]["type_"]]) - if Dataset.INT_TYPE_NAMES.issubset(expected_types): + if BasicDatasetProfiler.INT_TYPE_NAMES.issubset(expected_types): column_types[column] = "int" - elif Dataset.FLOAT_TYPE_NAMES.issubset(expected_types): + elif BasicDatasetProfiler.FLOAT_TYPE_NAMES.issubset(expected_types): column_types[column] = "float" - elif Dataset.STRING_TYPE_NAMES.issubset(expected_types): + elif BasicDatasetProfiler.STRING_TYPE_NAMES.issubset(expected_types): column_types[column] = "string" + elif BasicDatasetProfiler.DATETIME_TYPE_NAMES.issubset(expected_types): + column_types[column] = "datetime" + elif BasicDatasetProfiler.BOOLEAN_TYPE_NAMES.issubset(expected_types): + column_types[column] = "bool" else: warnings.warn("The expected type list is not a subset of any of the profiler type sets: {0:s}".format(str(expected_types))) column_types[column] = "--" diff --git a/tests/render/test_render_BulletListContentBlock.py b/tests/render/test_render_BulletListContentBlock.py index 305d8ee10789..3998cb2c66fb 100644 --- a/tests/render/test_render_BulletListContentBlock.py +++ b/tests/render/test_render_BulletListContentBlock.py @@ -7,8 +7,9 @@ import glob import json +import pytest from string import Template as pTemplate - +from six import PY2 def test_substitute_none_for_missing(): assert substitute_none_for_missing( @@ -24,75 +25,59 @@ def test_substitute_none_for_missing(): assert my_kwargs == {"a": 1, "b": 2}, \ "substitute_none_for_missing should not change input kwargs in place." +@pytest.mark.smoketest +def test_all_expectations_using_test_definitions(): + test_files = glob.glob( + "tests/test_definitions/*/expect*.json" + ) + + all_true = True + failure_count, total_count = 0, 0 + types = [] + + # Loop over all test_files, datasets, and tests: + test_results = {} + for filename in test_files: + test_definitions = json.load(open(filename)) + types.append(test_definitions["expectation_type"]) + + test_results[test_definitions["expectation_type"]] = [] + + for dataset in test_definitions["datasets"]: + + for test in dataset["tests"]: + # Construct an expectation from the test. + if type(test["in"]) == dict: + fake_expectation = { + "expectation_type": test_definitions["expectation_type"], + "kwargs": test["in"], + } + else: + # This would be a good place to put a kwarg-to-arg converter + continue + + # Attempt to render it + render_result = PrescriptiveBulletListContentBlockRenderer.render( + [fake_expectation]) + + assert isinstance(render_result, dict) + assert "content_block_type" in render_result + assert render_result["content_block_type"] in render_result + assert isinstance(render_result[render_result["content_block_type"]], list ) -# Commenting out the test below. It is helpful during development, but is not a high confidence acceptance test. + # TODO: Assert that the template is renderable, with all the right arguments, etc. + # rendered_template = pTemplate(el["template"]).substitute(el["params"]) -# def test_all_expectations_using_test_definitions(): -# # Fetch test_definitions for all expectations. -# # Note: as of 6/20/2019, coverage is good, but not 100% -# test_files = glob.glob( -# "tests/test_definitions/*/expect*.json" -# ) -# -# all_true = True -# failure_count, total_count = 0, 0 -# types = [] -# # Loop over all test_files, datasets, and tests: -# for filename in test_files: -# test_definitions = json.load(open(filename)) -# types.append(test_definitions["expectation_type"]) -# -# for dataset in test_definitions["datasets"]: -# -# for test in dataset["tests"]: -# # Construct an expectation from the test. -# if type(test["in"]) == dict: -# fake_expectation = { -# "expectation_type": test_definitions["expectation_type"], -# "kwargs": test["in"], -# } -# else: -# # This would be a good place to put a kwarg-to-arg converter -# continue -# -# try: -# # Attempt to render it -# render_result = PrescriptiveBulletListContentBlockRenderer.render( -# fake_expectation) -# # print(fake_expectation) -# -# # Assert that the rendered result matches the intended format -# # Note: THIS DOES NOT TEST CONTENT AT ALL. -# # Abe 6/22/2019: For the moment, I think it's fine to not test content. -# # I'm on the fence about the right end state for testing renderers at this level. -# # Spot checks, perhaps? -# assert render_result != None -# assert type(render_result) == list -# for el in render_result: -# assert set(el.keys()) == { -# 'template', 'params'} -# -# # Assert that the template is renderable, with all the right arguments, etc. -# pTemplate(el["template"]).substitute( -# el["params"] -# ) -# -# except AssertionError: -# # If the assertions fail, then print the expectation to allow debugging. -# # Do NOT trap other errors, so that developers can debug using the full traceback. -# print(fake_expectation) -# all_true = False -# failure_count += 1 -# -# except Exception as e: -# print(fake_expectation) -# raise(e) -# -# total_count += 1 -# -# # print(len(types)) -# # print(len(set(types))) -# print(total_count-failure_count, "of", total_count, -# "suceeded (", 1-failure_count*1./total_count, ")") -# -# # assert all_true + test_results[test_definitions["expectation_type"]].append({ + test["title"]:render_result, + # "rendered_template":rendered_template + }) + + # TODO: accommodate case where multiple datasets exist within one expectation test definition + + # We encountered unicode coding errors on Python 2, but since this is just a smoke test, review the smoke test results in python 3. + if PY2: + return + + with open('./tests/render/output/test_render_bullet_list_content_block.json', 'w') as f: + json.dump(test_results, f, indent=2) \ No newline at end of file