[HACKATHON] add expect_column_values_to_be_valid_us_state_or_territor…

…y_abbreviation (#4653) * add expect_column_values_to_be_valid_us_state_or_territory_abbreviation * adding requirements and json * adding line break Co-authored-by: Austin Ziech Robinson <44794138+austiezr@users.noreply.github.com>
great-expectations · Apr 4, 2022 · 18d28b5 · 18d28b5
1 parent abc1f2c
commit 18d28b5
Showing 1 changed file with 163 additions and 0 deletions.
diff --git a/...ental/expectations/expect_column_values_to_be_valid_us_state_or_territory_abbreviation.py b/...ental/expectations/expect_column_values_to_be_valid_us_state_or_territory_abbreviation.py
@@ -0,0 +1,163 @@
+import json
+from typing import Optional
+
+import us
+
+from great_expectations.core.expectation_configuration import ExpectationConfiguration
+from great_expectations.exceptions import InvalidExpectationConfigurationError
+from great_expectations.execution_engine import PandasExecutionEngine
+from great_expectations.expectations.expectation import ColumnMapExpectation
+from great_expectations.expectations.metrics import (
+    ColumnMapMetricProvider,
+    column_condition_partial,
+)
+
+
+def is_valid_state_or_territory_abbreviation(state: str, dc_statehood: bool):
+    list_of_state_and_territory_abbrs = [
+        x.abbr for x in us.states.STATES_AND_TERRITORIES
+    ]
+    if dc_statehood == True:
+        list_of_state_and_territory_abbrs.append("DC")
+    else:
+        pass
+    if len(state) != 2:
+        return False
+    elif type(state) != str:
+        return False
+    elif state in list_of_state_and_territory_abbrs:
+        return True
+    else:
+        return False
+
+
+# This class defines a Metric to support your Expectation.
+# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
+class ColumnValuesToBeValidUSStateOrTerritoryAbbreviation(ColumnMapMetricProvider):
+
+    # This is the id string that will be used to reference your metric.
+    condition_metric_name = "column_values.valid_state_or_territory_abbreviation"
+
+    # This method implements the core logic for the PandasExecutionEngine
+    @column_condition_partial(engine=PandasExecutionEngine)
+    def _pandas(cls, column, dc_statehood=True, **kwargs):
+        return column.apply(
+            lambda x: is_valid_state_or_territory_abbreviation(x, dc_statehood)
+        )
+
+    # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
+    # @column_condition_partial(engine=SqlAlchemyExecutionEngine)
+    # def _sqlalchemy(cls, column, _dialect, **kwargs):
+    #     raise NotImplementedError
+
+    # This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine
+    # @column_condition_partial(engine=SparkDFExecutionEngine)
+    # def _spark(cls, column, **kwargs):
+    #     raise NotImplementedError
+
+
+# This class defines the Expectation itself
+class ExpectColumnValuesToBeValidUSStateOrTerritoryAbbreviation(ColumnMapExpectation):
+    """Expect values in this column to be valid state or territory abbreviations.
+    See https://pypi.org/project/us/ for more information.
+    DC statehood is a perennial issue in data science, and the owners of the us repo addressed it differently than we have: https://github.com/unitedstates/python-us/issues/50
+    dc_statehood defaults to True, though can be overriden by end users
+    """
+
+    # These examples will be shown in the public gallery.
+    # They will also be executed as unit tests for your Expectation.
+    examples = [
+        {
+            "data": {
+                "valid_state_or_territory_abbreviation": [
+                    "KS",
+                    "AS",
+                    "GU",
+                    "PR",
+                    "VI",
+                    "MP",
+                ],
+                "invalid_state_or_territory_abbreviation": [
+                    "",
+                    "1234",
+                    "WVV",
+                    "AA",
+                    "WX",
+                    "2V",
+                ],
+            },
+            "tests": [
+                {
+                    "title": "basic_positive_test",
+                    "exact_match_out": False,
+                    "include_in_gallery": True,
+                    "in": {"column": "valid_state_or_territory_abbreviation"},
+                    "out": {"success": True},
+                },
+                {
+                    "title": "basic_negative_test",
+                    "exact_match_out": False,
+                    "include_in_gallery": True,
+                    "in": {"column": "invalid_state_or_territory_abbreviation"},
+                    "out": {"success": False},
+                },
+            ],
+        }
+    ]
+
+    # This is the id string of the Metric used by this Expectation.
+    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
+    map_metric = "column_values.valid_state_or_territory_abbreviation"
+
+    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
+    success_keys = ("mostly",)
+
+    # This dictionary contains default values for any parameters that should have default values
+    default_kwarg_values = {}
+
+    def validate_configuration(
+        self, configuration: Optional[ExpectationConfiguration]
+    ) -> None:
+        """
+        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
+        necessary configuration arguments have been provided for the validation of the expectation.
+
+        Args:
+            configuration (OPTIONAL[ExpectationConfiguration]): \
+                An optional Expectation Configuration entry that will be used to configure the expectation
+        Returns:
+            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
+        """
+
+        super().validate_configuration(configuration)
+        if configuration is None:
+            configuration = self.configuration
+
+        # # Check other things in configuration.kwargs and raise Exceptions if needed
+        # try:
+        #     assert (
+        #         ...
+        #     ), "message"
+        #     assert (
+        #         ...
+        #     ), "message"
+        # except AssertionError as e:
+        #     raise InvalidExpectationConfigurationError(str(e))
+
+    # This object contains metadata for display in the public Gallery
+    library_metadata = {
+        "maturity": "experimental",  # "experimental", "beta", or "production"
+        "tags": [
+            "hackathon",
+            "typed-entities",
+        ],  # Tags for this Expectation in the Gallery
+        "contributors": [  # Github handles for all contributors to this Expectation.
+            "@luismdiaz01",
+            "@derekma73",  # Don't forget to add your github handle here!
+        ],
+        "requirements": ["us"],
+    }
+
+
+if __name__ == "__main__":
+    ExpectColumnValuesToBeValidUSStateOrTerritoryAbbreviation().print_diagnostic_checklist()