
Commit

Merge pull request #505 from great-expectations/dev_eug_profiling_20190618

Basic dataset profiler implementation + changes that it depends on
jcampbell committed Jun 21, 2019
2 parents 072e7cf + a770b55 commit acb2963
Showing 17 changed files with 499 additions and 230 deletions.
2 changes: 1 addition & 1 deletion great_expectations/cli/cli.py
@@ -195,7 +195,7 @@ def render(render_object):
help='Maximum number of named data assets to profile.')
@click.option('--profile_all_data_assets', '-A', is_flag=True, default=False,
help='Profile ALL data assets within the target data source. If True, this will override --max_data_assets.')
@click.option('--target_directory', '-d', default="./",
@click.option('--target_directory', '-d', default="./great_expectations",
help='The root of a project directory containing a great_expectations/ config.')
def profile(datasource_name, max_data_assets, profile_all_data_assets, target_directory):
"""Profile a great expectations object.
10 changes: 8 additions & 2 deletions great_expectations/cli/datasource.py
@@ -203,9 +203,15 @@ def add_datasource(context):
"""

msg_sqlalchemy_go_to_notebook = """
To create expectations for your SQL queries start Jupyter and open notebook
great_expectations/notebooks/using_great_expectations_with_sql.ipynb -
To create expectations for your SQL data assets start Jupyter and open the notebook
great_expectations/notebooks/create_expectations_sql.ipynb.
it will walk you through configuring the database connection and next steps.
To launch with jupyter notebooks:
<blue>jupyter notebook great_expectations/notebooks/create_expectations_sql.ipynb</blue>
To launch with jupyter lab:
<blue>jupyter lab great_expectations/notebooks/create_expectations_sql.ipynb</blue>
"""

msg_spark_go_to_notebook = """
1 change: 0 additions & 1 deletion great_expectations/data_asset/data_asset.py
@@ -1136,7 +1136,6 @@ def _format_map_output(
}
)


if result_format['result_format'] == 'SUMMARY':
return return_obj

3 changes: 3 additions & 0 deletions great_expectations/data_asset/util.py
@@ -177,6 +177,9 @@ def recursively_convert_to_json_serializable(test_obj):
# Note: Use np.floating to avoid FutureWarning from numpy
return float(round(test_obj, sys.float_info.dig))

elif isinstance(test_obj, pd.Series):
return recursively_convert_to_json_serializable(test_obj.to_dict())

elif isinstance(test_obj, pd.DataFrame):
return recursively_convert_to_json_serializable(test_obj.to_dict(orient='records'))

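Why the new Series branch matters in this PR: the value_counts detail added to the distinct-values expectations below puts a pd.Series into expectation results, and it has to survive JSON serialization. A minimal standalone sketch of the two pandas branches, with a toy converter standing in for the full recursively_convert_to_json_serializable:

import pandas as pd

def to_json_serializable(obj):
    # Toy converter: only the dict/list plumbing plus the two pandas
    # branches from this hunk; the real function handles many more types.
    if isinstance(obj, pd.Series):
        # New in this commit: Series -> {index: value} dict, then recurse.
        return to_json_serializable(obj.to_dict())
    if isinstance(obj, pd.DataFrame):
        return to_json_serializable(obj.to_dict(orient="records"))
    if isinstance(obj, dict):
        return {str(k): to_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [to_json_serializable(v) for v in obj]
    return obj

counts = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(to_json_serializable(counts))  # {'a': 1, 'b': 2, 'c': 3}-style dict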
105 changes: 101 additions & 4 deletions great_expectations/dataset/dataset.py
@@ -1647,6 +1647,95 @@ def expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
"""
raise NotImplementedError

@DocInherit
@MetaDataset.column_aggregate_expectation
def expect_column_distinct_values_to_be_in_set(self,
column,
value_set,
parse_strings_as_datetimes=None,
result_format=None, include_config=False, catch_exceptions=None, meta=None):
"""Expect the set of distinct column values to be contained by a given set.
The success value for this expectation will match that of expect_column_values_to_be_in_set, but this is an aggregate expectation
and so will provide aggregate semantics including an observed value.
For example:
::
# my_df.my_col = [1,2,2,3,3,3]
>>> my_df.expect_column_distinct_values_to_be_in_set(
"my_col",
[2, 3, 4]
)
{
"success": false
"result": {
"observed_value": [1,2,3],
"details": {
"value_counts": {
"1": 1,
"2": 2,
"3": 3
}
}
}
}
expect_column_distinct_values_to_be_in_set is a :func:`column_aggregate_expectation <great_expectations.data_asset.dataset.Dataset.column_aggregate_expectation>`.
Args:
column (str): \
The column name.
value_set (set-like): \
A set of objects used for comparison.
Keyword Args:
parse_strings_as_datetimes (boolean or None) : If True values provided in value_set will be parsed as \
datetimes before making comparisons.
Other Parameters:
result_format (str or None): \
Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
For more detail, see :ref:`result_format <result_format>`.
include_config (boolean): \
If True, then include the expectation config as part of the result object. \
For more detail, see :ref:`include_config`.
catch_exceptions (boolean or None): \
If True, then catch exceptions and include them as part of the result object. \
For more detail, see :ref:`catch_exceptions`.
meta (dict or None): \
A JSON-serializable dictionary (nesting allowed) that will be included in the output without modification. \
For more detail, see :ref:`meta`.
Returns:
A JSON-serializable expectation result object.
Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
See Also:
expect_column_distinct_values_to_contain_set
"""
if parse_strings_as_datetimes:
parsed_value_set = self._parse_value_set(value_set)
else:
parsed_value_set = value_set

observed_value_counts = self.get_column_value_counts(column)
expected_value_set = set(parsed_value_set)
observed_value_set = set(observed_value_counts.index)

return {
"success": observed_value_set.issubset(expected_value_set),
"result": {
"observed_value": sorted(list(observed_value_set)),
"details": {
"value_counts": observed_value_counts
}
}
}

@DocInherit
@MetaDataset.column_aggregate_expectation
def expect_column_distinct_values_to_equal_set(self,
@@ -1714,13 +1803,17 @@ def expect_column_distinct_values_to_equal_set(self,
else:
parsed_value_set = value_set

observed_value_counts = self.get_column_value_counts(column)
expected_value_set = set(parsed_value_set)
observed_value_set = set(self.get_column_value_counts(column).index)
observed_value_set = set(observed_value_counts.index)

return {
"success": observed_value_set == expected_value_set,
"result": {
"observed_value": sorted(list(observed_value_set))
"observed_value": sorted(list(observed_value_set)),
"details": {
"value_counts": observed_value_counts
}
}
}

@@ -1792,13 +1885,17 @@ def expect_column_distinct_values_to_contain_set(self,
else:
parsed_value_set = value_set

observed_value_counts = self.get_column_value_counts(column)
expected_value_set = set(parsed_value_set)
observed_value_set = set(self.get_column_value_counts(column).index)
observed_value_set = set(observed_value_counts.index)

return {
"success": observed_value_set.issuperset(expected_value_set),
"result": {
"observed_value": sorted(list(observed_value_set))
"observed_value": sorted(list(observed_value_set)),
"details": {
"value_counts": observed_value_counts
}
}
}

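A hedged usage sketch of the new aggregate expectation: ge.from_pandas wraps a plain DataFrame, and the value_counts detail that all three distinct-values expectations now emit appears in the richer result formats (e.g. COMPLETE). The column mirrors the docstring example above:

import great_expectations as ge
import pandas as pd

my_df = ge.from_pandas(pd.DataFrame({"my_col": [1, 2, 2, 3, 3, 3]}))

result = my_df.expect_column_distinct_values_to_be_in_set(
    "my_col", [2, 3, 4], result_format="COMPLETE")

# 1 occurs in the column but not in the value set, so the expectation fails,
# while the observed distinct values and their counts are still reported.
print(result["success"])                            # False
print(result["result"]["observed_value"])           # [1, 2, 3]
print(result["result"]["details"]["value_counts"])  # counts for 1, 2 and 3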
8 changes: 7 additions & 1 deletion great_expectations/dataset/pandas_dataset.py
@@ -451,7 +451,13 @@ def expect_column_values_to_be_in_type_list(self, column, type_list,
# Build one type list with each specified type list from type_map
target_type_list = list()
for type_ in type_list:
target_type_list += type_map[type_]
try:
target_type_list += type_map[type_]
except KeyError:
logger.warning("Unrecognized type: %s" % type_)

if len(target_type_list) == 0:
raise ValueError("No recognized pandas types in type_list")

return column.map(lambda x: isinstance(x, tuple(target_type_list)))

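The change above makes type lookup lenient per entry but strict overall: an unknown type name is logged and skipped, and only an entirely unrecognized type_list raises. A minimal sketch of that pattern, with a toy type_map standing in for the module's real pandas type map:

import logging

logger = logging.getLogger(__name__)

type_map = {"int": [int], "float": [float], "str": [str]}  # toy stand-in

def resolve_target_types(type_list):
    target_type_list = list()
    for type_ in type_list:
        try:
            target_type_list += type_map[type_]
        except KeyError:
            # Skip unknown names instead of failing the whole expectation.
            logger.warning("Unrecognized type: %s" % type_)
    if len(target_type_list) == 0:
        raise ValueError("No recognized pandas types in type_list")
    return tuple(target_type_list)

print(resolve_target_types(["int", "bogus"]))  # warns on "bogus", returns (<class 'int'>,)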
2 changes: 1 addition & 1 deletion great_expectations/datasource/sqlalchemy_source.py
@@ -43,7 +43,7 @@ def __init__(self, datasource, name="default"):

def _get_iterator(self, data_asset_name, **kwargs):
if self._queries_path:
if data_asset_name in [path for path in os.walk(self._queries_path) if str(path).endswith(".sql")]:
if data_asset_name in [path[:-4] for path in os.listdir(self._queries_path) if str(path).endswith(".sql")]:
with open(os.path.join(self._queries_path, data_asset_name) + ".sql", "r") as data:
return iter([{
"query": data.read(),
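This fixes a real bug: os.walk() yields (dirpath, dirnames, filenames) tuples, so the old membership test could never match a data asset name, and the filenames it was meant to compare still carried their ".sql" extension. os.listdir() returns bare filenames, and path[:-4] strips the suffix. A small sketch of the corrected lookup, with hypothetical helper names:

import os

def available_query_assets(queries_path):
    # One data asset per .sql file, named without its extension,
    # e.g. "user_counts.sql" -> "user_counts".
    return [fname[:-4] for fname in os.listdir(queries_path)
            if fname.endswith(".sql")]

def load_query(queries_path, data_asset_name):
    if data_asset_name in available_query_assets(queries_path):
        with open(os.path.join(queries_path, data_asset_name) + ".sql", "r") as data:
            return data.read()
    raise ValueError("No query file for data asset: %s" % data_asset_name)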
@@ -45,8 +45,8 @@
"metadata": {},
"outputs": [],
"source": [
"# context = ge.data_context.DataContext('../../', expectation_explorer=True)\n",
"context = ge.data_context.DataContext('../../', expectation_explorer=False)"
"# context = ge.data_context.DataContext('../', expectation_explorer=True)\n",
"context = ge.data_context.DataContext('../', expectation_explorer=False)"
]
},
{
@@ -121,7 +121,7 @@
"metadata": {},
"outputs": [],
"source": [
"great_expectations.jupyter_ux.get_available_data_asset_names(context, data_source_name=data_source_name)"
"great_expectations.jupyter_ux.list_available_data_asset_names(context, data_source_name=data_source_name)"
]
},
{