Skip to content

Commit

Permalink
integer semantic type detection utility
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex Sherstinsky committed Apr 29, 2022
1 parent 3ecbf42 commit e68b640
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,12 @@ def _get_domains(
)

column_name: str
semantic_types_by_column_name: Dict[str, SemanticDomainTypes] = dict(
zip(
effective_column_names,
[
self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
column_name
]
for column_name in effective_column_names
],
)
)
semantic_types_by_column_name: Dict[str, SemanticDomainTypes] = {
column_name: self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
column_name
]
for column_name in effective_column_names
}

domains: List[Domain] = [
Domain(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,17 +72,12 @@ def _get_domains(
)

column_name: str
semantic_types_by_column_name: Dict[str, SemanticDomainTypes] = dict(
zip(
effective_column_names,
[
self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
column_name
]
for column_name in effective_column_names
],
)
)
semantic_types_by_column_name: Dict[str, SemanticDomainTypes] = {
column_name: self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
column_name
]
for column_name in effective_column_names
}

domains: List[Domain] = [
Domain(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def cardinality_within_limit(self, metric_value: float) -> bool:
of unique values.
Returns:
boolean of whether the cardinality is within the configured limit
Boolean of whether the cardinality is within the configured limit
"""
self._validate_metric_value(metric_value=metric_value)
if isinstance(self._limit_mode, AbsoluteCardinalityLimit):
Expand Down
36 changes: 36 additions & 0 deletions great_expectations/rule_based_profiler/helpers/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,42 @@ def convert_variables_to_dict(
return variables_as_dict


def integer_semantic_domain_type(domain: Domain) -> bool:
    """
    Determine whether the inferred semantic type(s) recorded on a "Domain" denote "integer" values.

    This method examines the "INFERRED_SEMANTIC_TYPE_KEY" entry of the "details" of the "Domain" argument to check
    whether or not every underlying "SemanticDomainTypes" enum value is an "integer".  Because an explicitly designated
    "SemanticDomainTypes.INTEGER" type is unavailable, "SemanticDomainTypes.LOGIC", "SemanticDomainTypes.BINARY", and
    "SemanticDomainTypes.IDENTIFIER" are interpreted as taking on "integer" values.

    Note: In certain settings, this method should be used as pre-filter to
    "NumericMetricRangeMultiBatchParameterBuilder._get_round_decimals_using_heuristics()".
    Note: Inability to assess underlying "SemanticDomainTypes" details of "Domain" object produces "False" return value.

    Args:
        domain: "Domain" object to inspect for underlying "SemanticDomainTypes" details

    Returns:
        Boolean value indicating whether or not specified "Domain" is inferred to denote "integer" values
    """
    details = domain.details
    if not details:
        # No details at all means nothing can be assessed; honor the documented "False" contract instead of raising.
        return False

    # Mapping of column name to inferred semantic type; absent (None) when no inference was recorded.
    inferred_semantic_domain_type: Dict[str, SemanticDomainTypes] = details.get(
        INFERRED_SEMANTIC_TYPE_KEY
    )
    if not inferred_semantic_domain_type:
        # Return a genuine boolean: "x and all(...)" would leak the falsy operand (None or {}) to the caller.
        return False

    # Tuple (not set) keeps the original equality-based membership semantics.
    integer_like_semantic_domain_types = (
        SemanticDomainTypes.LOGIC,
        SemanticDomainTypes.BINARY,
        SemanticDomainTypes.IDENTIFIER,
    )

    semantic_domain_type: SemanticDomainTypes
    return all(
        semantic_domain_type in integer_like_semantic_domain_types
        for semantic_domain_type in inferred_semantic_domain_type.values()
    )


def compute_quantiles(
metric_values: np.ndarray,
false_positive_rate: np.float64,
Expand Down
45 changes: 38 additions & 7 deletions tests/rule_based_profiler/domain_builder/test_domain.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import pytest

from great_expectations.rule_based_profiler.helpers.util import (
integer_semantic_domain_type,
)
from great_expectations.rule_based_profiler.types import (
INFERRED_SEMANTIC_TYPE_KEY,
Domain,
Expand Down Expand Up @@ -66,7 +69,7 @@ def test_semantic_domain_serialization():
"estimator": "categorical",
"cardinality": "low",
INFERRED_SEMANTIC_TYPE_KEY: {
"passenger_count": SemanticDomainTypes.NUMERIC.value,
"passenger_count": SemanticDomainTypes.NUMERIC,
},
},
)
Expand Down Expand Up @@ -98,7 +101,7 @@ def test_semantic_domain_comparisons():
domain_kwargs={"column": "VendorID"},
details={
INFERRED_SEMANTIC_TYPE_KEY: {
"VendorID": SemanticDomainTypes.NUMERIC.value,
"VendorID": SemanticDomainTypes.NUMERIC,
},
},
)
Expand All @@ -108,7 +111,7 @@ def test_semantic_domain_comparisons():
domain_kwargs={"column": "passenger_count"},
details={
INFERRED_SEMANTIC_TYPE_KEY: {
"passenger_count": SemanticDomainTypes.NUMERIC.value,
"passenger_count": SemanticDomainTypes.NUMERIC,
},
},
)
Expand All @@ -118,7 +121,7 @@ def test_semantic_domain_comparisons():
domain_kwargs={"column": "passenger_count"},
details={
INFERRED_SEMANTIC_TYPE_KEY: {
"passenger_count": SemanticDomainTypes.NUMERIC.value,
"passenger_count": SemanticDomainTypes.NUMERIC,
},
},
)
Expand All @@ -132,7 +135,7 @@ def test_semantic_domain_comparisons():
domain_kwargs={"column": "VendorID"},
details={
INFERRED_SEMANTIC_TYPE_KEY: {
"VendorID": SemanticDomainTypes.NUMERIC.value,
"VendorID": SemanticDomainTypes.NUMERIC,
},
},
)
Expand All @@ -142,7 +145,7 @@ def test_semantic_domain_comparisons():
domain_kwargs={"column": "passenger_count"},
details={
INFERRED_SEMANTIC_TYPE_KEY: {
"passenger_count": SemanticDomainTypes.NUMERIC.value,
"passenger_count": SemanticDomainTypes.NUMERIC,
},
},
)
Expand All @@ -152,7 +155,7 @@ def test_semantic_domain_comparisons():
domain_kwargs={"column": "passenger_count"},
details={
INFERRED_SEMANTIC_TYPE_KEY: {
"passenger_count": SemanticDomainTypes.NUMERIC.value,
"passenger_count": SemanticDomainTypes.NUMERIC,
},
},
)
Expand Down Expand Up @@ -201,3 +204,31 @@ def test_semantic_domain_comparisons():
"'unknown_semantic_type_as_string' is not a valid SemanticDomainTypes"
in str(excinfo.value)
)


def test_integer_semantic_domain_type():
    """Verify which inferred semantic types "integer_semantic_domain_type()" classifies as "integer"."""
    domain: Domain

    # "NUMERIC" is not one of the integer-like types, so the check must fail.
    # The inferred-type key matches the "column" in "domain_kwargs", as in the other fixtures of this module.
    domain = Domain(
        rule_name="my_rule",
        domain_type="column",
        domain_kwargs={"column": "passenger_count"},
        details={
            INFERRED_SEMANTIC_TYPE_KEY: {
                "passenger_count": SemanticDomainTypes.NUMERIC,
            },
        },
    )
    assert not integer_semantic_domain_type(domain=domain)

    # Each of the three types that the utility interprets as "integer" must be accepted.
    semantic_domain_type: SemanticDomainTypes
    for semantic_domain_type in [
        SemanticDomainTypes.LOGIC,
        SemanticDomainTypes.BINARY,
        SemanticDomainTypes.IDENTIFIER,
    ]:
        domain = Domain(
            rule_name="my_rule",
            domain_type="column",
            domain_kwargs={"column": "VendorID"},
            details={
                INFERRED_SEMANTIC_TYPE_KEY: {
                    "VendorID": semantic_domain_type,
                },
            },
        )
        assert integer_semantic_domain_type(domain=domain)

0 comments on commit e68b640

Please sign in to comment.