In [None]:
import os

import great_expectations as gx

# Obtain the Great Expectations data context. Every store, data docs site,
# datasource, expectation suite and checkpoint in the cells below is
# registered on this object.
context = gx.get_context()

In [None]:
# GCP project passed to the GCS store backends below; None when the
# GCP_PROJECT_NAME environment variable is not set.
project_name = os.environ.get("GCP_PROJECT_NAME")

In [None]:
# GCS-backed metadata stores are optional. To enable them, export the
# GCS_METADATA_STORES_BUCKET_NAME environment variable with the name of your bucket,
# or assign the bucket name to metadata_stores_bucket_name directly.
# When the value is empty or unset, the cells below skip the GCS setup and print a notice.
metadata_stores_bucket_name = os.environ.get("GCS_METADATA_STORES_BUCKET_NAME")

In [None]:
# Register GCS-backed expectation and validation stores when a bucket is configured.
if not metadata_stores_bucket_name:
    print(
        "No bucket name provided for metadata stores, reverting to local file based storage."
    )
else:
    expectations_store_name = "gcs_expectations_store"
    validations_store_name = "gcs_validations_store"

    # Each entry: (store name, store class, object prefix inside the bucket).
    # Both stores share the same bucket and project and differ only in these values.
    for store_name, store_class, store_prefix in (
        (expectations_store_name, "ExpectationsStore", "expectations"),
        (validations_store_name, "ValidationsStore", "validations"),
    ):
        context.add_store(
            store_name,
            {
                "class_name": store_class,
                "store_backend": {
                    "class_name": "TupleGCSStoreBackend",
                    "project": project_name,
                    "bucket": metadata_stores_bucket_name,
                    "prefix": store_prefix,
                },
            },
        )

    # Point the context at the newly added stores.
    context.expectations_store_name = expectations_store_name
    context.validations_store_name = validations_store_name

In [None]:
# Add a data docs site stored in the metadata bucket when one is configured.
if not metadata_stores_bucket_name:
    print(
        "No bucket name provided for data docs site stores, reverting to local file based storage."
    )
else:
    new_site_name = "gcs_site"

    # Store backend for the site: same bucket and project as the metadata
    # stores, under the "data_docs" prefix.
    gcs_site_backend = {
        "class_name": "TupleGCSStoreBackend",
        "project": project_name,
        "bucket": metadata_stores_bucket_name,
        "prefix": "data_docs",
    }
    new_site_config = {
        "class_name": "SiteBuilder",
        "store_backend": gcs_site_backend,
        "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
    }

    context.add_data_docs_site(site_name=new_site_name, site_config=new_site_config)

In [None]:
# Always add a filesystem-backed data docs site at a fixed, known location.
# A known path makes the docs easy to host from inside the containers; without
# this site the default is to write to a temp directory.

context.add_data_docs_site(
    "local_site_for_hosting",
    {
        "class_name": "SiteBuilder",
        "store_backend": {
            "class_name": "TupleFilesystemStoreBackend",
            "base_directory": "/gx/gx_stores/data_docs",
        },
        "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
    },
)

In [None]:
# Name under which the datasource is registered on the context. The value is
# the name this cell has always registered ("gcs_datasource"); it now lives in
# one variable that is actually passed to the call, instead of an unused
# variable holding a different string.
datasource_name = "gcs_datasource"

# Bucket that holds the CSV files to validate.
bucket_name_with_data_to_validate = "taxi_reference_data"

# add_or_update_* (rather than add_*) matches the add_or_update_* calls used
# for the expectation suite and checkpoint further down, and lets this cell be
# re-run without failing because the datasource name is already taken.
datasource = context.sources.add_or_update_pandas_gcs(
    name=datasource_name, bucket_or_name=bucket_name_with_data_to_validate
)

In [None]:
# Object prefix handed to the asset as gcs_prefix.
gcs_prefix = "data/"

# File-name pattern for the asset; it exposes the named groups `year` and `month`.
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"

data_asset = datasource.add_csv_asset(
    name="csv_taxi_gcs_asset",
    gcs_prefix=gcs_prefix,
    batching_regex=batching_regex,
)

In [None]:
# Print the asset's batch_request_options for reference before a batch
# request is built from them in the next cell.
print("data_asset.batch_request_options:", data_asset.batch_request_options)

In [None]:
# Build a batch request restricted to month "02"; no year is specified.
# Presumably "month" corresponds to the `month` group of the asset's batching regex — confirm.
batch_request = data_asset.build_batch_request(options={"month": "02"})

In [None]:
# Resolve the batch request into a list of batches and report how many there are.
batches = data_asset.get_batch_list_from_batch_request(batch_request)
print("len(batches):", len(batches))

In [None]:
# Print the batch_spec of every batch, numbered from 1 for readability.
for idx, batch in enumerate(batches):
    print(f"batch.batch_spec {idx + 1}:", batch.batch_spec)

In [None]:
# Name of the expectation suite used by the validator below.
expectation_suite_name = "test_gcs_suite"
# add_or_update_*: going by the method name, the suite is created or, if it
# already exists, updated, so this cell can be re-run.
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

In [None]:
# Get a validator for the batch request and the expectation suite named above.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
# Last expression of the cell, so the result of head() is shown as the cell output.
validator.head()

In [None]:
# Print the columns of the dataframe behind the validator's active batch,
# so the column names used in the expectations below can be checked.
print("columns:", validator.active_batch.data.dataframe.columns)

In [None]:
# Expect pickup_datetime to contain no nulls.
validator.expect_column_values_to_not_be_null(column="pickup_datetime")
# Expect passenger_count to lie within a range; auto=True presumably lets
# Great Expectations derive the bounds from the data — TODO confirm.
validator.expect_column_values_to_be_between(column="passenger_count", auto=True)

In [None]:
# Save the validator's expectation suite. discard_failed_expectations=False is
# passed so that, going by the flag name, expectations that failed are kept.
validator.save_expectation_suite(discard_failed_expectations=False)

In [None]:
# Create (or update, per the method name) a checkpoint named
# "my_quickstart_checkpoint" from the validator configured above.
checkpoint = context.add_or_update_checkpoint(
    name="my_quickstart_checkpoint",
    validator=validator,
)

In [None]:
# Run the checkpoint and keep the result for inspection in the next cell.
checkpoint_result = checkpoint.run()

In [None]:
# Last expression of the cell: displays the success attribute of the checkpoint result.
checkpoint_result.success