Skip to content

Commit

Permalink
Merge fcbd89f into c6422f0
Browse files Browse the repository at this point in the history
  • Loading branch information
talagluck committed Aug 1, 2019
2 parents c6422f0 + fcbd89f commit 32abef4
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 106 deletions.
7 changes: 6 additions & 1 deletion great_expectations/data_context/data_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -1538,7 +1538,12 @@ def render_full_static_site(self):

data_asset_name = expectation_suite['data_asset_name']
expectation_suite_name = expectation_suite['expectation_suite_name']
model = PrescriptivePageRenderer.render(expectation_suite)
try:
model = PrescriptivePageRenderer.render(expectation_suite)
except Exception as e:
print("Ran into an error in ", expectation_suite_filepath)
raise(e)

out_filepath = self.get_validation_doc_filepath(
data_asset_name,
expectation_suite_name
Expand Down
6 changes: 0 additions & 6 deletions great_expectations/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,6 @@ class Dataset(MetaDataset):
# That way, multiple backends can implement the same data_asset_type
_data_asset_type = "Dataset"

INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"])
FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"])
STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"])
BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"])


# getter functions with hashable arguments - can be cached
hashable_getters = [
'get_column_min',
Expand Down
32 changes: 27 additions & 5 deletions great_expectations/profile/basic_dataset_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,31 @@ class BasicDatasetProfiler(DatasetProfiler):
Based on the column's type it provides a description of the column by computing a number of statistics,
such as min, max, mean and median, for numeric columns, and distribution of values, when appropriate.
"""
INT_TYPE_NAMES = set(["INTEGER", "int", "INT", "TINYINT", "BYTEINT", "SMALLINT", "BIGINT", "IntegerType", "LongType", "DECIMAL"])
FLOAT_TYPE_NAMES = set(["FLOAT", "FLOAT4", "FLOAT8", "DOUBLE_PRECISION", "NUMERIC", "FloatType", "DoubleType", "float"])
STRING_TYPE_NAMES = set(["CHAR", "VARCHAR", "TEXT", "StringType", "string", "str"])
BOOLEAN_TYPE_NAMES = set(["BOOLEAN", "BOOL", "bool", "BooleanType"])
DATETIME_TYPE_NAMES = set(["DATETIME", "DATE", "TIMESTAMP", "DateType", "TimestampType", "datetime64", "Timestamp"])

@classmethod
def _get_column_type(cls, df, column):
# list of types is used to support pandas and sqlalchemy
try:
if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.INT_TYPE_NAMES)))["success"]:
if df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.INT_TYPE_NAMES)))["success"]:
type_ = "int"

elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.FLOAT_TYPE_NAMES)))["success"]:
elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.FLOAT_TYPE_NAMES)))["success"]:
type_ = "float"

elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.STRING_TYPE_NAMES)))["success"]:
elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.STRING_TYPE_NAMES)))["success"]:
type_ = "string"

elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(Dataset.BOOLEAN_TYPE_NAMES)))["success"]:
elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.BOOLEAN_TYPE_NAMES)))["success"]:
type_ = "bool"


elif df.expect_column_values_to_be_in_type_list(column, type_list=sorted(list(cls.DATETIME_TYPE_NAMES)))["success"]:
type_ = "datetime"

else:
df.expect_column_values_to_be_in_type_list(column, type_list=None)
type_ = "unknown"
Expand Down Expand Up @@ -168,6 +176,20 @@ def _profile(cls, dataset):
else:
# print(column, type_, cardinality)
pass

elif type_ == "datetime":
df.expect_column_min_to_be_between(column, min_value=None, max_value=None)

df.expect_column_max_to_be_between(column, min_value=None, max_value=None)

# Re-add once kl_divergence has been modified to support datetimes
# df.expect_column_kl_divergence_to_be_less_than(column, partition_object=None,
# threshold=None, result_format='COMPLETE')

if cardinality in ["one", "two", "very few", "few"]:
df.expect_column_distinct_values_to_be_in_set(column, value_set=None, result_format="SUMMARY")



else:
if cardinality == "unique":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ def sparkline(cls, weights):
"""
mn, mx = min(weights), max(weights)
extent = mx - mn

if extent == 0:
extent = 1

sparkline = ''.join(cls.bar[min([cls.barcount - 1,
int((n - mn) / extent * cls.barcount)])]
for n in weights)
Expand Down Expand Up @@ -441,36 +445,43 @@ def expect_column_values_to_be_in_type_list(cls, expectation, styling=None, incl
expectation["kwargs"],
["column", "type_list", "mostly"],
)
if params["type_list"] is not None:
for i, v in enumerate(params["type_list"]):
params["v__"+str(i)] = v
values_string = " ".join(
["$v__"+str(i) for i, v in enumerate(params["type_list"])]
)

for i, v in enumerate(params["type_list"]):
params["v__"+str(i)] = v
values_string = " ".join(
["$v__"+str(i) for i, v in enumerate(params["type_list"])]
)

if params["mostly"] is not None:
params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,)
if params["mostly"] is not None:
params["mostly_pct"] = "%.1f" % (params["mostly"] * 100,)

if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: " + values_string + ", at least $mostly_pct % of the time."
if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: "+values_string+"."
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: "+values_string+"."
else:
if include_column_name:
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to this set: "+values_string+"."
# NOTE: Localization will be tricky for this template_str.
template_str = "$column value types must belong to a set which has not yet been defined"
else:
# NOTE: Localization will be tricky for this template_str.
template_str = "value types must belong to this set: "+values_string+"."
template_str = "value types must belong to a set which has not yet been defined"

return [{
"template": template_str,
"params": params,
"styling": styling,
}]


@classmethod
def expect_column_values_to_be_in_set(cls, expectation, styling=None, include_column_name=True):
params = substitute_none_for_missing(
Expand Down
18 changes: 12 additions & 6 deletions great_expectations/render/renderer/other_section_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
TableContentBlockRenderer,
PrescriptiveBulletListContentBlockRenderer
)
from great_expectations.dataset.dataset import Dataset

from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

class DescriptiveOverviewSectionRenderer(Renderer):

Expand Down Expand Up @@ -258,16 +257,23 @@ def _get_column_types(cls, evrs):
for evr in type_evrs:
column = evr["expectation_config"]["kwargs"]["column"]
if evr["expectation_config"]["expectation_type"] == "expect_column_values_to_be_in_type_list":
expected_types = set(evr["expectation_config"]["kwargs"]["type_list"])
if evr["expectation_config"]["kwargs"]["type_list"] == None:
expected_types = {}
else:
expected_types = set(evr["expectation_config"]["kwargs"]["type_list"])
else: # assuming expect_column_values_to_be_of_type
expected_types = set([evr["expectation_config"]["kwargs"]["type_"]])

if Dataset.INT_TYPE_NAMES.issubset(expected_types):
if BasicDatasetProfiler.INT_TYPE_NAMES.issubset(expected_types):
column_types[column] = "int"
elif Dataset.FLOAT_TYPE_NAMES.issubset(expected_types):
elif BasicDatasetProfiler.FLOAT_TYPE_NAMES.issubset(expected_types):
column_types[column] = "float"
elif Dataset.STRING_TYPE_NAMES.issubset(expected_types):
elif BasicDatasetProfiler.STRING_TYPE_NAMES.issubset(expected_types):
column_types[column] = "string"
elif BasicDatasetProfiler.DATETIME_TYPE_NAMES.issubset(expected_types):
column_types[column] = "datetime"
elif BasicDatasetProfiler.BOOLEAN_TYPE_NAMES.issubset(expected_types):
column_types[column] = "bool"
else:
warnings.warn("The expected type list is not a subset of any of the profiler type sets: {0:s}".format(str(expected_types)))
column_types[column] = "--"
Expand Down
127 changes: 56 additions & 71 deletions tests/render/test_render_BulletListContentBlock.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

import glob
import json
import pytest
from string import Template as pTemplate

from six import PY2

def test_substitute_none_for_missing():
assert substitute_none_for_missing(
Expand All @@ -24,75 +25,59 @@ def test_substitute_none_for_missing():
assert my_kwargs == {"a": 1, "b": 2}, \
"substitute_none_for_missing should not change input kwargs in place."

@pytest.mark.smoketest
def test_all_expectations_using_test_definitions():
    """Smoke-test the bullet-list renderer against every expectation test definition.

    For each ``tests/test_definitions/*/expect*.json`` file, build a fake
    expectation config from each test case's ``in`` kwargs, render it with
    ``PrescriptiveBulletListContentBlockRenderer``, and assert the rendered
    result has the expected content-block shape. All rendered output is
    collected and written to a JSON file so reviewers can inspect the
    rendered content by hand (content itself is NOT asserted here).
    """
    test_files = glob.glob(
        "tests/test_definitions/*/expect*.json"
    )

    types = []

    # Map of expectation_type -> list of {test title: render result}.
    test_results = {}
    for filename in test_files:
        # Use a context manager so the file handle is closed deterministically
        # (the original used json.load(open(filename)), leaking the handle).
        with open(filename) as fp:
            test_definitions = json.load(fp)
        types.append(test_definitions["expectation_type"])

        test_results[test_definitions["expectation_type"]] = []

        for dataset in test_definitions["datasets"]:

            for test in dataset["tests"]:
                # Construct an expectation from the test.
                if isinstance(test["in"], dict):
                    fake_expectation = {
                        "expectation_type": test_definitions["expectation_type"],
                        "kwargs": test["in"],
                    }
                else:
                    # This would be a good place to put a kwarg-to-arg converter
                    continue

                # Attempt to render it
                render_result = PrescriptiveBulletListContentBlockRenderer.render(
                    [fake_expectation])

                # Assert the rendered result matches the intended format:
                # a dict whose "content_block_type" names a key holding a list.
                assert isinstance(render_result, dict)
                assert "content_block_type" in render_result
                assert render_result["content_block_type"] in render_result
                assert isinstance(render_result[render_result["content_block_type"]], list)

                # Commenting out the test below. It is helpful during development, but is not a high confidence acceptance test.
                # TODO: Assert that the template is renderable, with all the right arguments, etc.
                # rendered_template = pTemplate(el["template"]).substitute(el["params"])

                test_results[test_definitions["expectation_type"]].append({
                    test["title"]: render_result,
                    # "rendered_template": rendered_template
                })

    # TODO: accommodate case where multiple datasets exist within one expectation test definition

    # We encountered unicode coding errors on Python 2, but since this is just a smoke test,
    # review the smoke test results in python 3.
    if PY2:
        return

    with open('./tests/render/output/test_render_bullet_list_content_block.json', 'w') as f:
        json.dump(test_results, f, indent=2)

0 comments on commit 32abef4

Please sign in to comment.