Skip to content

Commit

Permalink
Merge fcbd89f into c6422f0
Browse files Browse the repository at this point in the history
  • Loading branch information
talagluck committed Aug 1, 2019
2 parents c6422f0 + fcbd89f commit 32abef4
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 106 deletions.
7 changes: 6 additions & 1 deletion great_expectations/data_context/data_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -1538,7 +1538,12 @@ def render_full_static_site(self):

data_asset_name = expectation_suite['data_asset_name']
expectation_suite_name = expectation_suite['expectation_suite_name']
model = PrescriptivePageRenderer.render(expectation_suite)
try:
model = PrescriptivePageRenderer.render(expectation_suite)
except Exception as e:
print("Ran into an error in ", expectation_suite_filepath)
raise(e)

out_filepath = self.get_validation_doc_filepath(
data_asset_name,
expectation_suite_name
Expand Down
6 changes: 0 additions & 6 deletions great_expectations/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,6 @@ class Dataset(MetaDataset):
# That way, multiple backends can implement the same data_asset_type
_data_asset_type = "Dataset"

INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"])
FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"])
STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"])
BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"])


# getter functions with hashable arguments - can be cached
hashable_getters = [
'get_column_min',
Expand Down
32 changes: 27 additions & 5 deletions great_expectations/profile/basic_dataset_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,31 @@ class BasicDatasetProfiler(DatasetProfiler):
Based on the column's type it provides a description of the column by computing a number of statistics,
such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate.
"""
INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"])
FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"])
STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"])
BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"])
DATETIME_TYPE_NAMES = set(["DATETIME", "DATE", "TIMESTAMP", "DateType", "TimestampType", "datetime64", "Timestamp"])

@classmethod
def _get_column_type(cls, df, column):
# list of types is used to support pandas and sqlalchemy
try:
if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.INT_TYPE_NAMES)))["success"]:
if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.INT_TYPE_NAMES)))["success"]:
type_ = "int"

elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.FLOAT_TYPE_NAMES)))["success"]:
elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.FLOAT_TYPE_NAMES)))["success"]:
type_ = "float"

elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.STRING_TYPE_NAMES)))["success"]:
elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.STRING_TYPE_NAMES)))["success"]:
type_ = "string"

elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.BOOLEAN_TYPE_NAMES)))["success"]:
elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.BOOLEAN_TYPE_NAMES)))["success"]:
type_ = "bool"


elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.DATETIME_TYPE_NAMES)))["success"]:
type_ = "datetime"

else:
df.expect_column_values_to_be_in_type_list(column, type_list=None)
type_ = "unknown"
Expand Down Expand Up @@ -168,6 +176,20 @@ def _profile(cls, dataset):
else:
# print(column, type_, cardinality)
pass

elif type_ == "datetime":
df.expect_column_min_to_be_between(column, min_value=None, max_value=None)

df.expect_column_max_to_be_between(column, min_value=None, max_value=None)

# Re-add once kl_divergence has been modified to support datetimes
# df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None,
# threshold=None, result_format='COMPLETE')

if cardinality in ["one", "two", "very few", "few"]:
df.expect_column_distinct_values_to_be_in_set(column, value_set=None, result_format="SUMMARY")



else:
if cardinality == "unique":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ def sparkline(cls, weights):
"""
mn, mx = min(weights), max(weights)
extent = mx - mn

if extent == 0:
extent = 1

sparkline = ''.join(cls.bar[min([cls.barcount - 1,
int((n - mn) / extent * cls.barcount)])]
for n in weights)
Expand Down Expand Up @@ -441,36 +445,43 @@ def expect_column_values_to_be_in_type_list(cls, expectation, styling=None, incl
expectation["kwargs"],
["column", "type_list", "mostly"],
)
if params["type_list"] is not None:
for i, v in enumerate(params["type_list"]):
params["v__"+str(i)] = v
values_string = " ".join(
["$v__"+str(i) for i, v in enumerate(params["type_list"])]
)

for i, v in enumerate(params["type_list"]):
params["v__"+str(i)] = v
values_string = " ".join(
["$v__"+str(i) for i, v in enumerate(params["type_list"])]
)

if params["mostly"] is not None:
params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,)
if params["mostly"] is not None:
params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,)

if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: "+values_string+"."
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: "+values_string+"."
else:
if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: "+values_string+"."
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to a set which has not yet been defined"
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: "+values_string+"."
template_str = "value types must belong to a set which has not yet been defined"

return [{
"template": template_str,
"params": params,
"styling": styling,
}]


@classmethod
def expect_column_values_to_be_in_set(cls, expectation, styling=None, include_column_name=True):
params = substitute_none_for_missing(
Expand Down
18 changes: 12 additions & 6 deletions great_expectations/render/renderer/other_section_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
TableContentBlockRenderer,
PrescriptiveBulletListContentBlockRenderer
)
from great_expectations.dataset.dataset import Dataset

from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

class DescriptiveOverviewSectionRenderer(Renderer):

Expand Down Expand Up @@ -258,16 +257,23 @@ def _get_column_types(cls, evrs):
for evr in type_evrs:
column = evr["expectation_config"]["kwargs"]["column"]
if evr["expectation_config"]["expectation_type"] == "expect_column_values_to_be_in_type_list":
expected_types = set(evr["expectation_config"]["kwargs"]["type_list"])
if evr["expectation_config"]["kwargs"]["type_list"] == None:
expected_types = {}
else:
expected_types = set(evr["expectation_config"]["kwargs"]["type_list"])
else: # assuming expect_column_values_to_be_of_type
expected_types = set([evr["expectation_config"]["kwargs"]["type_"]])

if Dataset.INT_TYPE_NAMES.issubset(expected_types):
if BasicDatasetProfiler.INT_TYPE_NAMES.issubset(expected_types):
column_types[column] = "int"
elif Dataset.FLOAT_TYPE_NAMES.issubset(expected_types):
elif BasicDatasetProfiler.FLOAT_TYPE_NAMES.issubset(expected_types):
column_types[column] = "float"
elif Dataset.STRING_TYPE_NAMES.issubset(expected_types):
elif BasicDatasetProfiler.STRING_TYPE_NAMES.issubset(expected_types):
column_types[column] = "string"
elif BasicDatasetProfiler.DATETIME_TYPE_NAMES.issubset(expected_types):
column_types[column] = "datetime"
elif BasicDatasetProfiler.BOOLEAN_TYPE_NAMES.issubset(expected_types):
column_types[column] = "bool"
else:
warnings.warn("The expected type list is not a subset of any of the profiler type sets: {0:s}".format(str(expected_types)))
column_types[column] = "--"
Expand Down
127 changes: 56 additions & 71 deletions tests/render/test_render_BulletListContentBlock.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

import glob
import json
import pytest
from string import Template as pTemplate

from six import PY2

def test_substitute_none_for_missing():
assert substitute_none_for_missing(
Expand All @@ -24,75 +25,59 @@ def test_substitute_none_for_missing():
assert my_kwargs == {"a": 1, "b": 2}, \
"substitute_none_for_missing should not change input kwargs in place."

@pytest.mark.smoketest
def test_all_expectations_using_test_definitions():
    """Smoke-test the bullet-list renderer against every expectation test definition.

    For each ``tests/test_definitions/*/expect*.json`` file, build a fake
    expectation config from each test case's ``in`` kwargs, render it with
    ``PrescriptiveBulletListContentBlockRenderer``, and assert the rendered
    result has the expected content-block shape. All rendered output is
    collected and written to a JSON file so reviewers can inspect the
    rendered content by hand (content itself is NOT asserted here).
    """
    test_files = glob.glob(
        "tests/test_definitions/*/expect*.json"
    )

    types = []

    # Map of expectation_type -> list of {test title: render result}.
    test_results = {}
    for filename in test_files:
        # Use a context manager so the file handle is closed deterministically
        # (the original used json.load(open(filename)), leaking the handle).
        with open(filename) as fp:
            test_definitions = json.load(fp)
        types.append(test_definitions["expectation_type"])

        test_results[test_definitions["expectation_type"]] = []

        for dataset in test_definitions["datasets"]:

            for test in dataset["tests"]:
                # Construct an expectation from the test.
                if isinstance(test["in"], dict):
                    fake_expectation = {
                        "expectation_type": test_definitions["expectation_type"],
                        "kwargs": test["in"],
                    }
                else:
                    # This would be a good place to put a kwarg-to-arg converter
                    continue

                # Attempt to render it
                render_result = PrescriptiveBulletListContentBlockRenderer.render(
                    [fake_expectation])

                # Assert the rendered result matches the intended format:
                # a dict whose "content_block_type" names a key holding a list.
                assert isinstance(render_result, dict)
                assert "content_block_type" in render_result
                assert render_result["content_block_type"] in render_result
                assert isinstance(render_result[render_result["content_block_type"]], list)

                # Commenting out the test below. It is helpful during development, but is not a high confidence acceptance test.
                # TODO: Assert that the template is renderable, with all the right arguments, etc.
                # rendered_template = pTemplate(el["template"]).substitute(el["params"])

                test_results[test_definitions["expectation_type"]].append({
                    test["title"]: render_result,
                    # "rendered_template": rendered_template
                })

    # TODO: accommodate case where multiple datasets exist within one expectation test definition

    # We encountered unicode coding errors on Python 2, but since this is just a smoke test,
    # review the smoke test results in python 3.
    if PY2:
        return

    with open('./tests/render/output/test_render_bullet_list_content_block.json', 'w') as f:
        json.dump(test_results, f, indent=2)

0 comments on commit 32abef4

Please sign in to comment.