Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] DataProfilerColumnDomainBuilder #7920

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
class DataProfilerTableColumnList(DataProfilerProfileMetricProvider):
metric_name = "data_profiler.table_column_list"

value_keys = ("profile_path",)
value_keys = (
"profile_path",
"profile_report_filtering_key",
"profile_report_accepted_filtering_values",
)

@metric_value(engine=PandasExecutionEngine)
def _pandas(
Expand All @@ -27,6 +31,12 @@ def _pandas(
metrics,
runtime_configuration,
):
profile_report_filtering_key = metric_value_kwargs[
"profile_report_filtering_key"
]
profile_report_accepted_filtering_values = metric_value_kwargs[
"profile_report_accepted_filtering_values"
]
profile_report_column_data_stats: dict = metrics[
"data_profiler.table_column_infos"
]
Expand All @@ -37,7 +47,16 @@ def _pandas(
column_names=profile_report_column_names,
batch_columns_list=metrics["table.columns"],
)
return profile_report_column_names
profile_report_filtered_column_names: list = []
for col in profile_report_column_names:
if (
metrics["data_profiler.table_column_infos"][col][
profile_report_filtering_key
]
in profile_report_accepted_filtering_values
):
profile_report_filtered_column_names.append(col)
return profile_report_filtered_column_names

@classmethod
def _get_evaluation_dependencies(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ def _build_numeric_rule() -> Rule:
"strict_min": False,
"strict_max": False,
"profile_path": "default_profiler_path",
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["int", "float", "string"],
}

parameter_builders: List[ParameterBuilder] = [
Expand Down Expand Up @@ -287,6 +289,8 @@ def _build_float_rule() -> Rule:
"strict_min": False,
"strict_max": False,
"profile_path": "default_profiler_path",
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["float"],
}

parameter_builders: List[ParameterBuilder] = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class DataProfilerColumnDomainBuilder(ColumnDomainBuilder):

def __init__(
self,
profile_path: str = f"{VARIABLES_KEY}profile_path",
include_column_names: Optional[Union[str, Optional[List[str]]]] = None,
exclude_column_names: Optional[Union[str, Optional[List[str]]]] = None,
include_column_name_suffixes: Optional[Union[str, Iterable, List[str]]] = None,
Expand Down Expand Up @@ -75,12 +74,6 @@ def __init__(
data_context=data_context,
)

self._profile_path = profile_path

@property
def profile_path(self) -> str:
return self._profile_path

def _get_domains(
self,
rule_name: str,
Expand Down Expand Up @@ -110,18 +103,39 @@ def _get_domains(
# Obtain profile_path from "rule state" (i.e., variables and parameters); from instance variable otherwise.
profile_path: str = get_parameter_value_and_validate_return_type(
domain=None,
parameter_reference=self.profile_path,
parameter_reference=f"{VARIABLES_KEY}profile_path",
expected_return_type=str,
variables=variables,
parameters=None,
)

profile_report_filtering_key: str = (
get_parameter_value_and_validate_return_type(
domain=None,
parameter_reference=f"{VARIABLES_KEY}profile_report_filtering_key",
expected_return_type=str,
variables=variables,
parameters=None,
)
)

profile_report_accepted_filtering_values: str = get_parameter_value_and_validate_return_type(
domain=None,
parameter_reference=f"{VARIABLES_KEY}profile_report_accepted_filtering_values",
expected_return_type=list,
variables=variables,
parameters=None,
)

# get metrics and profile path from variables and then pass them into here
profile_report_column_names: List[str] = validator.get_metric( # type: ignore[assignment] # could be None
metric=MetricConfiguration(
metric_name="data_profiler.table_column_list",
metric_domain_kwargs={},
metric_value_kwargs={
"profile_path": profile_path,
"profile_report_filtering_key": profile_report_filtering_key,
"profile_report_accepted_filtering_values": profile_report_accepted_filtering_values,
},
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,19 @@ def report(self, report_options: dict = None) -> dict:
"data_stats": [
{
"column_name": "vendor_id",
"data_type": "int",
},
{
"column_name": "passenger_count",
"data_type": "int",
},
{
"column_name": "total_amount",
"data_type": "float",
},
{
"column_name": "congestion_surcharge",
"data_type": "float",
},
],
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import os
import unittest
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, cast
from unittest import mock

Expand Down Expand Up @@ -50,26 +49,6 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
"data_asset_name": "my_reports",
"data_connector_query": {"index": -1},
}
exclude_column_names = [
"vendor_id",
"pickup_datetime",
"dropoff_datetime",
"passenger_count",
# "trip_distance",
"rate_code_id",
"store_and_fwd_flag",
"pickup_location_id",
"dropoff_location_id",
"payment_type",
# "fare_amount",
# "extra",
# "mta_tax",
# "tip_amount",
# "tolls_amount",
# "improvement_surcharge",
# "total_amount",
"congestion_surcharge",
]

data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run(
batch_request=batch_request,
Expand All @@ -79,15 +58,18 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["int", "float", "string"],
},
float_rule={
"profile_path": Path(
"profile_path": os.path.join( # noqa: PTH118
test_root_path,
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["float"],
},
exclude_column_names=exclude_column_names,
estimation="flag_outliers",
)

Expand All @@ -109,43 +91,25 @@ def bobby_profile_data_profiler_structured_data_assistant_result(
"data_connector_query": {"index": -1},
}

exclude_column_names = [
"vendor_id",
"pickup_datetime",
"dropoff_datetime",
"passenger_count",
# "trip_distance",
"rate_code_id",
"store_and_fwd_flag",
"pickup_location_id",
"dropoff_location_id",
"payment_type",
# "fare_amount",
# "extra",
# "mta_tax",
# "tip_amount",
# "tolls_amount",
# "improvement_surcharge",
# "total_amount",
"congestion_surcharge",
]

data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run(
batch_request=batch_request,
exclude_column_names=exclude_column_names,
numeric_rule={
"profile_path": os.path.join( # noqa: PTH118
test_root_path,
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["int", "float", "string"],
},
float_rule={
"profile_path": os.path.join( # noqa: PTH118
test_root_path,
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["float"],
},
estimation="flag_outliers",
)
Expand Down Expand Up @@ -236,7 +200,9 @@ def test_profile_data_profiler_structured_data_assistant_metrics_count(
bobby_profile_data_profiler_structured_data_assistant_result.metrics_by_domain.items()
):
num_metrics += len(parameter_values_for_fully_qualified_parameter_names)
assert num_metrics == 32
assert (
num_metrics == 50
) # 2 * ((numeric_rule: 6 int + 9 float + 1 string) + (float_rule: 9 float))


@pytest.mark.integration
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from great_expectations.data_context import FileDataContext
from great_expectations.rule_based_profiler.domain_builder import DomainBuilder
from great_expectations.rule_based_profiler.parameter_container import (
VARIABLES_KEY,
ParameterContainer,
build_parameter_container_for_variables,
)
Expand Down Expand Up @@ -55,9 +54,16 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value(
)

variables_configs: dict = {
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"strict_min": False,
"strict_max": False,
"profile_path": profile_path,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -71,7 +77,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value(
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=profile_path,
data_context=data_context,
)
domains: List[Domain] = domain_builder.get_domains(
Expand Down Expand Up @@ -325,6 +330,13 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_default_refere
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand Down Expand Up @@ -587,10 +599,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference(
)

variables_configs: dict = {
"my_profile_path": profile_path,
"profile_path": profile_path,
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -604,7 +623,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference(
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=f"{VARIABLES_KEY}my_profile_path",
data_context=data_context,
)
domains: List[Domain] = domain_builder.get_domains(
Expand Down Expand Up @@ -854,10 +872,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
)

variables_configs: dict = {
"my_profile_path": profile_path,
"profile_path": profile_path,
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -871,7 +896,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=f"{VARIABLES_KEY}my_profile_path",
exclude_column_names=[
"store_and_fwd_flag",
"congestion_surcharge",
Expand Down Expand Up @@ -1057,10 +1081,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
)

variables_configs: dict = {
"my_profile_path": profile_path,
"profile_path": profile_path,
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -1074,7 +1105,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=f"{VARIABLES_KEY}my_profile_path",
data_context=data_context,
)
with mock.patch(
Expand Down