This notebook creates a Great Expectations Expectation Suite and validates a sample dataset against it.

In [None]:
# Step 1 - Install required software

!pip install -r requirements.txt

In [None]:
# Step 2 - Imports

from pathlib import Path

import great_expectations as gx
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.data_context import FileDataContext

In [None]:
# Step 3 - Initiate a Filesystem Data Context
# The context is created in a "filecontext" folder under the current working
# directory, so the notebook does not depend on one user's absolute home path.
# NOTE(review): assumes the notebook is launched from the repository root (the
# same assumption Step 1 makes for requirements.txt) - adjust the path if not.

path_to_empty_folder = str(Path.cwd() / "filecontext")
context = FileDataContext.create(project_root_dir=path_to_empty_folder)

In [None]:
# Step 4 - Connect to the data and obtain a Validator
# The default pandas data source reads the sample CSV directly from its URL.

sample_csv_url = "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv"
validator = context.sources.pandas_default.read_csv(sample_csv_url)

In [None]:
# Step 5 - Declare the Expectations and save them to the context
#
# IMPORTANT:
# Every Expectation is validated against the data source when it is created,
# and that result (success or failure) is used to decide whether the
# Expectation is added to the Expectation Suite.
# Here the 'rate_code_id' column is expected to hold only values in the set {1},
# but the data source also contains {2,3,4,5,99} for this column, so the
# validation fails and the Expectation would not be added to the suite.
# The way around this is to save the Expectation Suite with
# discard_failed_expectations=False, which keeps failed Expectations too.

column_list = [
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "rate_code_id",
    "store_and_fwd_flag",
    "pickup_location_id",
    "dropoff_location_id",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
]

# Table-level check: the columns must appear exactly in this order.
validator.expect_table_columns_to_match_ordered_list(column_list=column_list)

# Column-level checks.
validator.expect_column_values_to_be_in_set(
    column="rate_code_id",
    value_set={1},
    mostly=1.0,
)
validator.expect_column_values_to_not_be_null(
    column="vendor_id",
    mostly=0.95,
)

# Persist the suite, keeping Expectations that failed against the sample data.
validator.save_expectation_suite(discard_failed_expectations=False)

In [None]:
# Step 6 - Register a checkpoint in the context
# The checkpoint is added (or updated if it already exists) under the name
# "my_checkpoint" and is tied to the Validator created in Step 4.

checkpoint = context.add_or_update_checkpoint(name="my_checkpoint", validator=validator)

In [None]:
# Step 7 - Retrieve the checkpoint from the context and run it
# The checkpoint is looked up by name; the run result is kept in `result`
# for the next step.

checkpoint = context.get_checkpoint(name="my_checkpoint")
result = checkpoint.run()

In [None]:
# Step 8 - Show the validation results as HTML
# Passes the checkpoint result from Step 7 to the context's result viewer.

context.view_validation_result(result=result)