From 0aa1209aa0f71cea5f3036d07e8393cdce9d5e99 Mon Sep 17 00:00:00 2001 From: Michael Davis <36012613+micdavis@users.noreply.github.com> Date: Thu, 18 May 2023 12:12:21 -0400 Subject: [PATCH] [FEATURE] DataProfilerColumnDomainBuilder (#7920) --- .../data_profiler_table_column_list.py | 23 +++++++- ...data_profiler_structured_data_assistant.py | 4 ++ .../data_profiler_column_domain_builder.py | 30 +++++++--- .../tests/conftest.py | 4 ++ ...data_profiler_structured_data_assistant.py | 58 ++++--------------- ...est_data_profiler_column_domain_builder.py | 52 +++++++++++++---- 6 files changed, 104 insertions(+), 67 deletions(-) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py index cf4141332787..a8dd822731f7 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py @@ -16,7 +16,11 @@ class DataProfilerTableColumnList(DataProfilerProfileMetricProvider): metric_name = "data_profiler.table_column_list" - value_keys = ("profile_path",) + value_keys = ( + "profile_path", + "profile_report_filtering_key", + "profile_report_accepted_filtering_values", + ) @metric_value(engine=PandasExecutionEngine) def _pandas( @@ -27,6 +31,12 @@ def _pandas( metrics, runtime_configuration, ): + profile_report_filtering_key = metric_value_kwargs[ + "profile_report_filtering_key" + ] + profile_report_accepted_filtering_values = metric_value_kwargs[ + "profile_report_accepted_filtering_values" + ] profile_report_column_data_stats: dict = metrics[ "data_profiler.table_column_infos" ] @@ -37,7 +47,16 @@ def _pandas( column_names=profile_report_column_names, batch_columns_list=metrics["table.columns"], ) - return profile_report_column_names + profile_report_filtered_column_names: list = [] + for col in profile_report_column_names: + if ( + metrics["data_profiler.table_column_infos"][col][ + profile_report_filtering_key + ] + in profile_report_accepted_filtering_values + ): + profile_report_filtered_column_names.append(col) + return profile_report_filtered_column_names @classmethod def _get_evaluation_dependencies( diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py index 187f1bb63e6c..1761bf405f07 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py @@ -176,6 +176,8 @@ def _build_numeric_rule() -> Rule: "strict_min": False, "strict_max": False, "profile_path": "default_profiler_path", + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": ["int", "float", "string"], } parameter_builders: List[ParameterBuilder] = [ @@ -287,6 +289,8 @@ def _build_float_rule() -> Rule: "strict_min": False, "strict_max": False, "profile_path": "default_profiler_path", + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": ["float"], } parameter_builders: List[ParameterBuilder] = [ diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py index 6695e598d2fc..2b330d510f5a 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py @@ -33,7 +33,6 @@ class DataProfilerColumnDomainBuilder(ColumnDomainBuilder): def __init__( self, - profile_path: str = f"{VARIABLES_KEY}profile_path", include_column_names: Optional[Union[str, Optional[List[str]]]] = None, exclude_column_names: Optional[Union[str, Optional[List[str]]]] = None, include_column_name_suffixes: Optional[Union[str, Iterable, List[str]]] = None, @@ -75,12 +74,6 @@ def __init__( data_context=data_context, ) - self._profile_path = profile_path - - @property - def profile_path(self) -> str: - return self._profile_path - def _get_domains( self, rule_name: str, @@ -110,18 +103,39 @@ def _get_domains( # Obtain profile_path from "rule state" (i.e., variables and parameters); from instance variable otherwise. profile_path: str = get_parameter_value_and_validate_return_type( domain=None, - parameter_reference=self.profile_path, + parameter_reference=f"{VARIABLES_KEY}profile_path", expected_return_type=str, variables=variables, parameters=None, ) + profile_report_filtering_key: str = ( + get_parameter_value_and_validate_return_type( + domain=None, + parameter_reference=f"{VARIABLES_KEY}profile_report_filtering_key", + expected_return_type=str, + variables=variables, + parameters=None, + ) + ) + + profile_report_accepted_filtering_values: str = get_parameter_value_and_validate_return_type( + domain=None, + parameter_reference=f"{VARIABLES_KEY}profile_report_accepted_filtering_values", + expected_return_type=list, + variables=variables, + parameters=None, + ) + + # get metrics and profile path from variables and then pass them into here profile_report_column_names: List[str] = validator.get_metric( # type: ignore[assignment] # could be None metric=MetricConfiguration( metric_name="data_profiler.table_column_list", metric_domain_kwargs={}, metric_value_kwargs={ "profile_path": profile_path, + "profile_report_filtering_key": profile_report_filtering_key, + "profile_report_accepted_filtering_values": profile_report_accepted_filtering_values, }, ) ) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py index 73cb910c897f..4d73dc964f71 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py @@ -41,15 +41,19 @@ def report(self, report_options: dict = None) -> dict: "data_stats": [ { "column_name": "vendor_id", + "data_type": "int", }, { "column_name": "passenger_count", + "data_type": "int", }, { "column_name": "total_amount", + "data_type": "float", }, { "column_name": "congestion_surcharge", + "data_type": "float", }, ], } diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py index c45cd2f48dc8..00a7b12408f8 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py @@ -2,7 +2,6 @@ import os import unittest -from pathlib import Path from typing import TYPE_CHECKING, Dict, List, Optional, cast from unittest import mock @@ -50,26 +49,6 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena "data_asset_name": "my_reports", "data_connector_query": {"index": -1}, } - exclude_column_names = [ - "vendor_id", - "pickup_datetime", - "dropoff_datetime", - "passenger_count", - # "trip_distance", - "rate_code_id", - "store_and_fwd_flag", - "pickup_location_id", - "dropoff_location_id", - "payment_type", - # "fare_amount", - # "extra", - # "mta_tax", - # "tip_amount", - # "tolls_amount", - # "improvement_surcharge", - # "total_amount", - "congestion_surcharge", - ] data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run( batch_request=batch_request, @@ -79,15 +58,18 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena "data_profiler_files", "profile.pkl", ), + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": ["int", "float", "string"], }, float_rule={ - "profile_path": Path( + "profile_path": os.path.join( # noqa: PTH118 test_root_path, "data_profiler_files", "profile.pkl", ), + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": ["float"], }, - exclude_column_names=exclude_column_names, estimation="flag_outliers", ) @@ -109,36 +91,16 @@ def bobby_profile_data_profiler_structured_data_assistant_result( "data_connector_query": {"index": -1}, } - exclude_column_names = [ - "vendor_id", - "pickup_datetime", - "dropoff_datetime", - "passenger_count", - # "trip_distance", - "rate_code_id", - "store_and_fwd_flag", - "pickup_location_id", - "dropoff_location_id", - "payment_type", - # "fare_amount", - # "extra", - # "mta_tax", - # "tip_amount", - # "tolls_amount", - # "improvement_surcharge", - # "total_amount", - "congestion_surcharge", - ] - data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run( batch_request=batch_request, - exclude_column_names=exclude_column_names, numeric_rule={ "profile_path": os.path.join( # noqa: PTH118 test_root_path, "data_profiler_files", "profile.pkl", ), + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": ["int", "float", "string"], }, float_rule={ "profile_path": os.path.join( # noqa: PTH118 @@ -146,6 +108,8 @@ def bobby_profile_data_profiler_structured_data_assistant_result( "data_profiler_files", "profile.pkl", ), + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": ["float"], }, estimation="flag_outliers", ) @@ -236,7 +200,9 @@ def test_profile_data_profiler_structured_data_assistant_metrics_count( bobby_profile_data_profiler_structured_data_assistant_result.metrics_by_domain.items() ): num_metrics += len(parameter_values_for_fully_qualified_parameter_names) - assert num_metrics == 32 + assert ( + num_metrics == 50 + ) # 2 * ((numeric_rule: 6 int + 9 float + 1 string) + (float_rule: 9 float)) @pytest.mark.integration diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py index e51d5f300187..f5655b85148d 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py @@ -19,7 +19,6 @@ from great_expectations.data_context import FileDataContext from great_expectations.rule_based_profiler.domain_builder import DomainBuilder from great_expectations.rule_based_profiler.parameter_container import ( - VARIABLES_KEY, ParameterContainer, build_parameter_container_for_variables, ) @@ -55,9 +54,16 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value( ) variables_configs: dict = { - "estimator": "quantiles", - "false_positive_rate": 1.0e-2, - "mostly": 1.0, + "strict_min": False, + "strict_max": False, + "profile_path": profile_path, + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": [ + "int", + "float", + "string", + "datetime", + ], } variables: ParameterContainer = build_parameter_container_for_variables( variables_configs=variables_configs @@ -71,7 +77,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value( } domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( - profile_path=profile_path, data_context=data_context, ) domains: List[Domain] = domain_builder.get_domains( @@ -325,6 +330,13 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_default_refere "estimator": "quantiles", "false_positive_rate": 1.0e-2, "mostly": 1.0, + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": [ + "int", + "float", + "string", + "datetime", + ], } variables: ParameterContainer = build_parameter_container_for_variables( variables_configs=variables_configs @@ -587,10 +599,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference( ) variables_configs: dict = { - "my_profile_path": profile_path, + "profile_path": profile_path, "estimator": "quantiles", "false_positive_rate": 1.0e-2, "mostly": 1.0, + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": [ + "int", + "float", + "string", + "datetime", + ], } variables: ParameterContainer = build_parameter_container_for_variables( variables_configs=variables_configs @@ -604,7 +623,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference( } domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( - profile_path=f"{VARIABLES_KEY}my_profile_path", data_context=data_context, ) domains: List[Domain] = domain_builder.get_domains( @@ -854,10 +872,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with ) variables_configs: dict = { - "my_profile_path": profile_path, + "profile_path": profile_path, "estimator": "quantiles", "false_positive_rate": 1.0e-2, "mostly": 1.0, + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": [ + "int", + "float", + "string", + "datetime", + ], } variables: ParameterContainer = build_parameter_container_for_variables( variables_configs=variables_configs @@ -871,7 +896,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with } domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( - profile_path=f"{VARIABLES_KEY}my_profile_path", exclude_column_names=[ "store_and_fwd_flag", "congestion_surcharge", @@ -1057,10 +1081,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with ) variables_configs: dict = { - "my_profile_path": profile_path, + "profile_path": profile_path, "estimator": "quantiles", "false_positive_rate": 1.0e-2, "mostly": 1.0, + "profile_report_filtering_key": "data_type", + "profile_report_accepted_filtering_values": [ + "int", + "float", + "string", + "datetime", + ], } variables: ParameterContainer = build_parameter_container_for_variables( variables_configs=variables_configs @@ -1074,7 +1105,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with } domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( - profile_path=f"{VARIABLES_KEY}my_profile_path", data_context=data_context, ) with mock.patch(