Skip to content

Commit

Permalink
Merge develop into feature/DX-469/DX-441/alexsherstinsky/link/docusaurus_documents_should_use_python_code_script_tags_referring_to_integration_tests-2023_05_17-25
Browse files Browse the repository at this point in the history
  • Loading branch information
github-actions[bot] committed May 18, 2023
2 parents 72b6ab8 + 0aa1209 commit 1c919a5
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 67 deletions.
Expand Up @@ -16,7 +16,11 @@
class DataProfilerTableColumnList(DataProfilerProfileMetricProvider):
metric_name = "data_profiler.table_column_list"

value_keys = ("profile_path",)
value_keys = (
"profile_path",
"profile_report_filtering_key",
"profile_report_accepted_filtering_values",
)

@metric_value(engine=PandasExecutionEngine)
def _pandas(
Expand All @@ -27,6 +31,12 @@ def _pandas(
metrics,
runtime_configuration,
):
profile_report_filtering_key = metric_value_kwargs[
"profile_report_filtering_key"
]
profile_report_accepted_filtering_values = metric_value_kwargs[
"profile_report_accepted_filtering_values"
]
profile_report_column_data_stats: dict = metrics[
"data_profiler.table_column_infos"
]
Expand All @@ -37,7 +47,16 @@ def _pandas(
column_names=profile_report_column_names,
batch_columns_list=metrics["table.columns"],
)
return profile_report_column_names
profile_report_filtered_column_names: list = []
for col in profile_report_column_names:
if (
metrics["data_profiler.table_column_infos"][col][
profile_report_filtering_key
]
in profile_report_accepted_filtering_values
):
profile_report_filtered_column_names.append(col)
return profile_report_filtered_column_names

@classmethod
def _get_evaluation_dependencies(
Expand Down
Expand Up @@ -176,6 +176,8 @@ def _build_numeric_rule() -> Rule:
"strict_min": False,
"strict_max": False,
"profile_path": "default_profiler_path",
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["int", "float", "string"],
}

parameter_builders: List[ParameterBuilder] = [
Expand Down Expand Up @@ -287,6 +289,8 @@ def _build_float_rule() -> Rule:
"strict_min": False,
"strict_max": False,
"profile_path": "default_profiler_path",
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["float"],
}

parameter_builders: List[ParameterBuilder] = [
Expand Down
Expand Up @@ -33,7 +33,6 @@ class DataProfilerColumnDomainBuilder(ColumnDomainBuilder):

def __init__(
self,
profile_path: str = f"{VARIABLES_KEY}profile_path",
include_column_names: Optional[Union[str, Optional[List[str]]]] = None,
exclude_column_names: Optional[Union[str, Optional[List[str]]]] = None,
include_column_name_suffixes: Optional[Union[str, Iterable, List[str]]] = None,
Expand Down Expand Up @@ -75,12 +74,6 @@ def __init__(
data_context=data_context,
)

self._profile_path = profile_path

@property
def profile_path(self) -> str:
return self._profile_path

def _get_domains(
self,
rule_name: str,
Expand Down Expand Up @@ -110,18 +103,39 @@ def _get_domains(
# Obtain profile_path from "rule state" (i.e., variables and parameters); from instance variable otherwise.
profile_path: str = get_parameter_value_and_validate_return_type(
domain=None,
parameter_reference=self.profile_path,
parameter_reference=f"{VARIABLES_KEY}profile_path",
expected_return_type=str,
variables=variables,
parameters=None,
)

profile_report_filtering_key: str = (
get_parameter_value_and_validate_return_type(
domain=None,
parameter_reference=f"{VARIABLES_KEY}profile_report_filtering_key",
expected_return_type=str,
variables=variables,
parameters=None,
)
)

profile_report_accepted_filtering_values: str = get_parameter_value_and_validate_return_type(
domain=None,
parameter_reference=f"{VARIABLES_KEY}profile_report_accepted_filtering_values",
expected_return_type=list,
variables=variables,
parameters=None,
)

# get metrics and profile path from variables and then pass them into here
profile_report_column_names: List[str] = validator.get_metric( # type: ignore[assignment] # could be None
metric=MetricConfiguration(
metric_name="data_profiler.table_column_list",
metric_domain_kwargs={},
metric_value_kwargs={
"profile_path": profile_path,
"profile_report_filtering_key": profile_report_filtering_key,
"profile_report_accepted_filtering_values": profile_report_accepted_filtering_values,
},
)
)
Expand Down
Expand Up @@ -41,15 +41,19 @@ def report(self, report_options: dict = None) -> dict:
"data_stats": [
{
"column_name": "vendor_id",
"data_type": "int",
},
{
"column_name": "passenger_count",
"data_type": "int",
},
{
"column_name": "total_amount",
"data_type": "float",
},
{
"column_name": "congestion_surcharge",
"data_type": "float",
},
],
}
Expand Down
Expand Up @@ -2,7 +2,6 @@

import os
import unittest
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, cast
from unittest import mock

Expand Down Expand Up @@ -50,26 +49,6 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
"data_asset_name": "my_reports",
"data_connector_query": {"index": -1},
}
exclude_column_names = [
"vendor_id",
"pickup_datetime",
"dropoff_datetime",
"passenger_count",
# "trip_distance",
"rate_code_id",
"store_and_fwd_flag",
"pickup_location_id",
"dropoff_location_id",
"payment_type",
# "fare_amount",
# "extra",
# "mta_tax",
# "tip_amount",
# "tolls_amount",
# "improvement_surcharge",
# "total_amount",
"congestion_surcharge",
]

data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run(
batch_request=batch_request,
Expand All @@ -79,15 +58,18 @@ def bobby_profile_data_profiler_structured_data_assistant_result_usage_stats_ena
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["int", "float", "string"],
},
float_rule={
"profile_path": Path(
"profile_path": os.path.join( # noqa: PTH118
test_root_path,
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["float"],
},
exclude_column_names=exclude_column_names,
estimation="flag_outliers",
)

Expand All @@ -109,43 +91,25 @@ def bobby_profile_data_profiler_structured_data_assistant_result(
"data_connector_query": {"index": -1},
}

exclude_column_names = [
"vendor_id",
"pickup_datetime",
"dropoff_datetime",
"passenger_count",
# "trip_distance",
"rate_code_id",
"store_and_fwd_flag",
"pickup_location_id",
"dropoff_location_id",
"payment_type",
# "fare_amount",
# "extra",
# "mta_tax",
# "tip_amount",
# "tolls_amount",
# "improvement_surcharge",
# "total_amount",
"congestion_surcharge",
]

data_assistant_result: DataAssistantResult = context.assistants.data_profiler.run(
batch_request=batch_request,
exclude_column_names=exclude_column_names,
numeric_rule={
"profile_path": os.path.join( # noqa: PTH118
test_root_path,
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["int", "float", "string"],
},
float_rule={
"profile_path": os.path.join( # noqa: PTH118
test_root_path,
"data_profiler_files",
"profile.pkl",
),
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": ["float"],
},
estimation="flag_outliers",
)
Expand Down Expand Up @@ -236,7 +200,9 @@ def test_profile_data_profiler_structured_data_assistant_metrics_count(
bobby_profile_data_profiler_structured_data_assistant_result.metrics_by_domain.items()
):
num_metrics += len(parameter_values_for_fully_qualified_parameter_names)
assert num_metrics == 32
assert (
num_metrics == 50
) # 2 * ((numeric_rule: 6 int + 9 float + 1 string) + (float_rule: 9 float))


@pytest.mark.integration
Expand Down
Expand Up @@ -19,7 +19,6 @@
from great_expectations.data_context import FileDataContext
from great_expectations.rule_based_profiler.domain_builder import DomainBuilder
from great_expectations.rule_based_profiler.parameter_container import (
VARIABLES_KEY,
ParameterContainer,
build_parameter_container_for_variables,
)
Expand Down Expand Up @@ -55,9 +54,16 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value(
)

variables_configs: dict = {
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"strict_min": False,
"strict_max": False,
"profile_path": profile_path,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -71,7 +77,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value(
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=profile_path,
data_context=data_context,
)
domains: List[Domain] = domain_builder.get_domains(
Expand Down Expand Up @@ -325,6 +330,13 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_default_refere
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand Down Expand Up @@ -587,10 +599,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference(
)

variables_configs: dict = {
"my_profile_path": profile_path,
"profile_path": profile_path,
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -604,7 +623,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference(
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=f"{VARIABLES_KEY}my_profile_path",
data_context=data_context,
)
domains: List[Domain] = domain_builder.get_domains(
Expand Down Expand Up @@ -854,10 +872,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
)

variables_configs: dict = {
"my_profile_path": profile_path,
"profile_path": profile_path,
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -871,7 +896,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=f"{VARIABLES_KEY}my_profile_path",
exclude_column_names=[
"store_and_fwd_flag",
"congestion_surcharge",
Expand Down Expand Up @@ -1057,10 +1081,17 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
)

variables_configs: dict = {
"my_profile_path": profile_path,
"profile_path": profile_path,
"estimator": "quantiles",
"false_positive_rate": 1.0e-2,
"mostly": 1.0,
"profile_report_filtering_key": "data_type",
"profile_report_accepted_filtering_values": [
"int",
"float",
"string",
"datetime",
],
}
variables: ParameterContainer = build_parameter_container_for_variables(
variables_configs=variables_configs
Expand All @@ -1074,7 +1105,6 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with
}

domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder(
profile_path=f"{VARIABLES_KEY}my_profile_path",
data_context=data_context,
)
with mock.patch(
Expand Down

0 comments on commit 1c919a5

Please sign in to comment.