great-expectations · alexsherstinsky · May 18, 2023 · May 18, 2023 · May 18, 2023
diff --git a/...ataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py b/...ataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py
@@ -16,7 +16,11 @@
 class DataProfilerTableColumnList(DataProfilerProfileMetricProvider):
     metric_name = "data_profiler.table_column_list"
 
-    value_keys = ("profile_path",)
+    value_keys = (
+        "profile_path",
+        "profile_report_filtering_key",
+        "profile_report_accepted_filtering_values",
+    )
 
     @metric_value(engine=PandasExecutionEngine)
     def _pandas(
@@ -27,6 +31,12 @@ def _pandas(
         metrics,
         runtime_configuration,
     ):
+        profile_report_filtering_key = metric_value_kwargs[
+            "profile_report_filtering_key"
+        ]
+        profile_report_accepted_filtering_values = metric_value_kwargs[
+            "profile_report_accepted_filtering_values"
+        ]
         profile_report_column_data_stats: dict = metrics[
             "data_profiler.table_column_infos"
         ]
@@ -37,7 +47,16 @@ def _pandas(
             column_names=profile_report_column_names,
             batch_columns_list=metrics["table.columns"],
         )
-        return profile_report_column_names
+        profile_report_filtered_column_names: list = []
+        for col in profile_report_column_names:
+            if (
+                metrics["data_profiler.table_column_infos"][col][
+                    profile_report_filtering_key
+                ]
+                in profile_report_accepted_filtering_values
+            ):
+                profile_report_filtered_column_names.append(col)
+        return profile_report_filtered_column_names
 
     @classmethod
     def _get_evaluation_dependencies(

diff --git a/...xpectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py b/...xpectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py
@@ -176,6 +176,8 @@ def _build_numeric_rule() -> Rule:
             "strict_min": False,
             "strict_max": False,
             "profile_path": "default_profiler_path",
+            "profile_report_filtering_key": "data_type",
+            "profile_report_accepted_filtering_values": ["int", "float", "string"],
         }
 
         parameter_builders: List[ParameterBuilder] = [
@@ -287,6 +289,8 @@ def _build_float_rule() -> Rule:
             "strict_min": False,
             "strict_max": False,
             "profile_path": "default_profiler_path",
+            "profile_report_filtering_key": "data_type",
+            "profile_report_accepted_filtering_values": ["float"],
         }
 
         parameter_builders: List[ParameterBuilder] = [

diff --git a/...er_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py b/...er_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py
@@ -33,7 +33,6 @@ class DataProfilerColumnDomainBuilder(ColumnDomainBuilder):
 
     def __init__(
         self,
-        profile_path: str = f"{VARIABLES_KEY}profile_path",
         include_column_names: Optional[Union[str, Optional[List[str]]]] = None,
         exclude_column_names: Optional[Union[str, Optional[List[str]]]] = None,
         include_column_name_suffixes: Optional[Union[str, Iterable, List[str]]] = None,
@@ -75,12 +74,6 @@ def __init__(
             data_context=data_context,
         )
 
-        self._profile_path = profile_path
-
-    @property
-    def profile_path(self) -> str:
-        return self._profile_path
-
     def _get_domains(
         self,
         rule_name: str,
@@ -110,18 +103,39 @@ def _get_domains(
         # Obtain profile_path from "rule state" (i.e., variables and parameters); from instance variable otherwise.
         profile_path: str = get_parameter_value_and_validate_return_type(
             domain=None,
-            parameter_reference=self.profile_path,
+            parameter_reference=f"{VARIABLES_KEY}profile_path",
             expected_return_type=str,
             variables=variables,
             parameters=None,
         )
 
+        profile_report_filtering_key: str = (
+            get_parameter_value_and_validate_return_type(
+                domain=None,
+                parameter_reference=f"{VARIABLES_KEY}profile_report_filtering_key",
+                expected_return_type=str,
+                variables=variables,
+                parameters=None,
+            )
+        )
+
+        profile_report_accepted_filtering_values: str = get_parameter_value_and_validate_return_type(
+            domain=None,
+            parameter_reference=f"{VARIABLES_KEY}profile_report_accepted_filtering_values",
+            expected_return_type=list,
+            variables=variables,
+            parameters=None,
+        )
+
+        # get metrics and profile path from variables and then pass them into here
         profile_report_column_names: List[str] = validator.get_metric(  # type: ignore[assignment] # could be None
             metric=MetricConfiguration(
                 metric_name="data_profiler.table_column_list",
                 metric_domain_kwargs={},
                 metric_value_kwargs={
                     "profile_path": profile_path,
+                    "profile_report_filtering_key": profile_report_filtering_key,
+                    "profile_report_accepted_filtering_values": profile_report_accepted_filtering_values,
                 },
             )
         )

diff --git a/...pitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py b/...pitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py
@@ -41,15 +41,19 @@ def report(self, report_options: dict = None) -> dict:
             "data_stats": [
                 {
                     "column_name": "vendor_id",
+                    "data_type": "int",
                 },
                 {
                     "column_name": "passenger_count",
+                    "data_type": "int",
                 },
                 {
                     "column_name": "total_amount",
+                    "data_type": "float",
                 },
                 {
                     "column_name": "congestion_surcharge",
+                    "data_type": "float",
                 },
             ],
         }

diff --git a/.../tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py b/.../tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py
@@ -2,7 +2,6 @@
 
 import os
 import unittest
-from pathlib import Path
 from typing import TYPE_CHECKING, Dict, List, Optional, cast
 from unittest import mock
 
@@ -50,26 +49,6 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
         "data_asset_name": "my_reports",
         "data_connector_query": {"index": -1},
     }
-    exclude_column_names = [
-        "vendor_id",
-        "pickup_datetime",
-        "dropoff_datetime",
-        "passenger_count",
-        # "trip_distance",
-        "rate_code_id",
-        "store_and_fwd_flag",
-        "pickup_location_id",
-        "dropoff_location_id",
-        "payment_type",
-        # "fare_amount",
-        # "extra",
-        # "mta_tax",
-        # "tip_amount",
-        # "tolls_amount",
-        # "improvement_surcharge",
-        # "total_amount",
-        "congestion_surcharge",
-    ]
 
     data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run(
         batch_request=batch_request,
@@ -79,15 +58,18 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
                 "data_profiler_files",
                 "profile.pkl",
             ),
+            "profile_report_filtering_key": "data_type",
+            "profile_report_accepted_filtering_values": ["int", "float", "string"],
         },
         float_rule={
-            "profile_path": Path(
+            "profile_path": os.path.join(  # noqa: PTH118
                 test_root_path,
                 "data_profiler_files",
                 "profile.pkl",
             ),
+            "profile_report_filtering_key": "data_type",
+            "profile_report_accepted_filtering_values": ["float"],
         },
-        exclude_column_names=exclude_column_names,
         estimation="flag_outliers",
     )
 
@@ -109,43 +91,25 @@ def bobby_profile_data_profiler_structured_data_assistant_result(
         "data_connector_query": {"index": -1},
     }
 
-    exclude_column_names = [
-        "vendor_id",
-        "pickup_datetime",
-        "dropoff_datetime",
-        "passenger_count",
-        # "trip_distance",
-        "rate_code_id",
-        "store_and_fwd_flag",
-        "pickup_location_id",
-        "dropoff_location_id",
-        "payment_type",
-        # "fare_amount",
-        # "extra",
-        # "mta_tax",
-        # "tip_amount",
-        # "tolls_amount",
-        # "improvement_surcharge",
-        # "total_amount",
-        "congestion_surcharge",
-    ]
-
     data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run(
         batch_request=batch_request,
-        exclude_column_names=exclude_column_names,
         numeric_rule={
             "profile_path": os.path.join(  # noqa: PTH118
                 test_root_path,
                 "data_profiler_files",
                 "profile.pkl",
             ),
+            "profile_report_filtering_key": "data_type",
+            "profile_report_accepted_filtering_values": ["int", "float", "string"],
         },
         float_rule={
             "profile_path": os.path.join(  # noqa: PTH118
                 test_root_path,
                 "data_profiler_files",
                 "profile.pkl",
             ),
+            "profile_report_filtering_key": "data_type",
+            "profile_report_accepted_filtering_values": ["float"],
         },
         estimation="flag_outliers",
     )
@@ -236,7 +200,9 @@ def test_profile_data_profiler_structured_data_assistant_metrics_count(
         bobby_profile_data_profiler_structured_data_assistant_result.metrics_by_domain.items()
     ):
         num_metrics += len(parameter_values_for_fully_qualified_parameter_names)
-    assert num_metrics == 32
+    assert (
+        num_metrics == 50
+    )  # 2 * ((numeric_rule: 6 int + 9 float + 1 string) + (float_rule: 9 float))
 
 
 @pytest.mark.integration

diff --git a/...ions/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py b/...ions/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py
@@ -19,7 +19,6 @@
 from great_expectations.data_context import FileDataContext
 from great_expectations.rule_based_profiler.domain_builder import DomainBuilder
 from great_expectations.rule_based_profiler.parameter_container import (
-    VARIABLES_KEY,
     ParameterContainer,
     build_parameter_container_for_variables,
 )
@@ -55,9 +54,16 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value(
     )
 
     variables_configs: dict = {
-        "estimator": "quantiles",
-        "false_positive_rate": 1.0e-2,
-        "mostly": 1.0,
+        "strict_min": False,
+        "strict_max": False,
+        "profile_path": profile_path,
+        "profile_report_filtering_key": "data_type",
+        "profile_report_accepted_filtering_values": [
+            "int",
+            "float",
+            "string",
+            "datetime",
+        ],
     }
     variables: ParameterContainer = build_parameter_container_for_variables(
         variables_configs=variables_configs
@@ -71,7 +77,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value(
     }
 
     domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
-        profile_path=profile_path,
         data_context=data_context,
     )
     domains: List[Domain] = domain_builder.get_domains(
@@ -325,6 +330,13 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_default_refere
         "estimator": "quantiles",
         "false_positive_rate": 1.0e-2,
         "mostly": 1.0,
+        "profile_report_filtering_key": "data_type",
+        "profile_report_accepted_filtering_values": [
+            "int",
+            "float",
+            "string",
+            "datetime",
+        ],
     }
     variables: ParameterContainer = build_parameter_container_for_variables(
         variables_configs=variables_configs
@@ -587,10 +599,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference(
     )
 
     variables_configs: dict = {
-        "my_profile_path": profile_path,
+        "profile_path": profile_path,
         "estimator": "quantiles",
         "false_positive_rate": 1.0e-2,
         "mostly": 1.0,
+        "profile_report_filtering_key": "data_type",
+        "profile_report_accepted_filtering_values": [
+            "int",
+            "float",
+            "string",
+            "datetime",
+        ],
     }
     variables: ParameterContainer = build_parameter_container_for_variables(
         variables_configs=variables_configs
@@ -604,7 +623,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference(
     }
 
     domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
-        profile_path=f"{VARIABLES_KEY}my_profile_path",
         data_context=data_context,
     )
     domains: List[Domain] = domain_builder.get_domains(
@@ -854,10 +872,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
     )
 
     variables_configs: dict = {
-        "my_profile_path": profile_path,
+        "profile_path": profile_path,
         "estimator": "quantiles",
         "false_positive_rate": 1.0e-2,
         "mostly": 1.0,
+        "profile_report_filtering_key": "data_type",
+        "profile_report_accepted_filtering_values": [
+            "int",
+            "float",
+            "string",
+            "datetime",
+        ],
     }
     variables: ParameterContainer = build_parameter_container_for_variables(
         variables_configs=variables_configs
@@ -871,7 +896,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
     }
 
     domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
-        profile_path=f"{VARIABLES_KEY}my_profile_path",
         exclude_column_names=[
             "store_and_fwd_flag",
             "congestion_surcharge",
@@ -1057,10 +1081,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
     )
 
     variables_configs: dict = {
-        "my_profile_path": profile_path,
+        "profile_path": profile_path,
         "estimator": "quantiles",
         "false_positive_rate": 1.0e-2,
         "mostly": 1.0,
+        "profile_report_filtering_key": "data_type",
+        "profile_report_accepted_filtering_values": [
+            "int",
+            "float",
+            "string",
+            "datetime",
+        ],
     }
     variables: ParameterContainer = build_parameter_container_for_variables(
         variables_configs=variables_configs
@@ -1074,7 +1105,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
     }
 
     domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
-        profile_path=f"{VARIABLES_KEY}my_profile_path",
         data_context=data_context,
     )
     with mock.patch(