Skip to content

Commit

Permalink
Add summary output for profiler
Browse files Browse the repository at this point in the history
  • Loading branch information
abegong committed Jun 12, 2019
1 parent b463b7c commit 63bf508
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
32 changes: 29 additions & 3 deletions great_expectations/data_context/data_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,8 +659,9 @@ def profile_datasource(self, datasource_name, profiler_name="PseudoPandasProfili
#!!! Abe 2019/06/11: This seems brittle. I don't understand why this object is packaged this way.
#!!! Note: need to review this to make sure the names are properly qualified.
data_asset_name_list = list(data_asset_names[0]["available_data_asset_names"])
# logger.info("Found %d named data assets" % (len(data_asset_name_list)))
print("Found %d named data assets" % (len(data_asset_name_list)))
total_data_assets = len(data_asset_name_list)
# logger.info("Found %d named data assets" % (total_data_assets))
print("Found %d named data assets" % (total_data_assets))

if max_data_assets == None or max_data_assets >= len(data_asset_name_list):
# logger.info("Profiling all %d." % (len(data_asset_name_list)))
Expand All @@ -671,6 +672,8 @@ def profile_datasource(self, datasource_name, profiler_name="PseudoPandasProfili
data_asset_name_list.sort()
data_asset_name_list = data_asset_name_list[:max_data_assets]

total_columns, total_expectations, total_rows, skipped_data_assets = 0, 0, 0, 0
total_start_time = datetime.datetime.now()
for name in data_asset_name_list:
try:
start_time = datetime.datetime.now()
Expand All @@ -681,18 +684,41 @@ def profile_datasource(self, datasource_name, profiler_name="PseudoPandasProfili
#Note: This logic is specific to DatasetProfilers, which profile a single batch. Multi-batch profilers will have more to unpack.
expectations_config, validation_result = PseudoPandasProfiler.profile(batch)

row_count = batch.shape[0]
total_rows += row_count
new_column_count = len(set([exp["kwargs"]["column"] for exp in expectations_config["expectations"] if "column" in exp["kwargs"]]))
total_columns += new_column_count
new_expectation_count = len(expectations_config["expectations"])
total_expectations += new_expectation_count

self.save_expectations(expectations_config)#, name)
# self.save_validation_result(validation_result, name)

duration = (datetime.datetime.now() - start_time).total_seconds()

print("\tProfiled %d rows from %s (%.3f sec)" % (batch.shape[0], name, duration))
print("\tProfiled %d rows from %s (%.3f sec)" % (row_count, name, duration))

#!!! FIXME: THIS IS WAY TOO GENERAL. As soon as BatchKwargsError is fully implemented, we'll want to switch to that.
except:
#!!! FIXME: This error message could be a lot more helpful than it is
# print("\tWARNING: Unable to load %s. Skipping profiling." % (name))
print("\tWARNING: Something went wrong when profiling %s. (Perhaps a loading error?) Skipping." % (name))
skipped_data_assets += 1

total_duration = (datetime.datetime.now() - total_start_time).total_seconds()
print("""
Profiled %d of %d named data assets, with %d total rows and %d columns in %.2f sec.
%d data assets were skipped.
Generated, evaluated, and stored %d candidate Expectations.
Note: You will need to review and revise Expectations before using them in production.""" % (
len(data_asset_name_list),
total_data_assets,
total_rows,
total_columns,
total_duration,
skipped_data_assets,
total_expectations,
))



Expand Down
2 changes: 2 additions & 0 deletions tests/test_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def test_context_profiler(empty_data_context, filesystem_csv_2):

assert len(profiled_expectations["expectations"]) > 0

# assert False


# FIXME: This test needs a different home.
# def test_validate_on_a_context_loaded_batch(empty_data_context, filesystem_csv_2):
Expand Down

0 comments on commit 63bf508

Please sign in to comment.