In [29]:
import great_expectations as gx
import os

context = gx.get_context()

This notebook requires the setup of two environment variables:
- `AZURE_STORAGE_ACCOUNT_URL`: Which refers to the Storage Account that contains your data. More information can be found in the (Azure Documentation)[https://learn.microsoft.com/en-us/azure/storage/common/storage-account-overview]
- `AZURE_CREDENTIAL`: which contains the credential string.

In [46]:
datasource_name = "pandas_abs_example"
azure_options = {
    "account_url": "${AZURE_STORAGE_ACCOUNT_URL}",
    "credential": "${AZURE_CREDENTIAL}",
}
datasource = context.sources.add_pandas_abs(
    name=datasource_name, azure_options=azure_options
)

DataContextError: Can not write the fluent datasource pandas_abs_example because a datasource of that name already exists in the data context.

In [32]:
assert datasource_name in context.datasources

In [33]:
asset_name = "my_taxi_data_asset"
abs_container = "superconductive-public"
abs_name_starts_with = "data/taxi_yellow_tripdata_samples/"
batching_regex = r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv"

In [34]:
data_asset = datasource.add_csv_asset(
    name=asset_name,
    batching_regex=batching_regex,
    abs_container=abs_container,
    abs_name_starts_with=abs_name_starts_with,
)


In [None]:
print("data_asset.batch_request_options:", data_asset.batch_request_options)

In [35]:
batch_request = data_asset.build_batch_request({"year": "2019", "month": "03"})

In [36]:
batches = data_asset.get_batch_list_from_batch_request(batch_request)
print("len(batches):", len(batches))

len(batches): 1


In [38]:
expectation_suite_name = "my_expectation_suite"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

{
  "data_asset_type": null,
  "ge_cloud_id": null,
  "expectations": [],
  "meta": {
    "great_expectations_version": "0.17.1+20.ge5c83f641.dirty"
  },
  "expectation_suite_name": "my_expectation_suite"
}

In [39]:
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-03-01 18:43:31,2019-03-01 18:48:42,2,1.1,1,N,143,238,1,6.0,3.5,0.5,1.55,0.0,0.3,11.85,2.5
1,2,2019-03-20 16:30:11,2019-03-20 16:44:49,1,2.63,1,N,231,88,1,12.5,1.0,0.5,3.36,0.0,0.3,20.16,2.5
2,1,2019-03-07 19:01:51,2019-03-07 19:04:36,1,0.3,1,N,237,237,1,4.0,3.5,0.5,1.66,0.0,0.3,9.96,2.5
3,1,2019-03-02 16:33:01,2019-03-02 16:37:41,1,0.9,1,N,42,41,1,5.5,0.0,0.5,0.0,0.0,0.3,6.3,0.0
4,2,2019-03-28 14:10:47,2019-03-28 14:49:37,1,15.68,3,N,231,1,1,63.0,0.0,0.0,14.76,10.5,0.3,88.56,0.0


In [40]:
print("columns:", validator.active_batch.data.dataframe.columns)

columns: Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'rate_code_id', 'store_and_fwd_flag',
       'pickup_location_id', 'dropoff_location_id', 'payment_type',
       'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'congestion_surcharge'],
      dtype='object')


In [41]:
validator.expect_column_values_to_not_be_null("pickup_datetime")
validator.expect_column_values_to_be_between("passenger_count", auto=True)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]




Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "meta": {},
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "max_value": 6,
      "min_value": 0,
      "mostly": 1.0,
      "strict_max": false,
      "strict_min": false,
      "column": "passenger_count"
    },
    "meta": {
      "auto_generated_at": "20230628T212654.060408Z",
      "great_expectations_version": "0.17.1+20.ge5c83f641.dirty"
    }
  }
}

In [42]:
validator.save_expectation_suite(discard_failed_expectations=False)

In [43]:
checkpoint = gx.checkpoint.SimpleCheckpoint(
    name="my_quickstart_checkpoint",
    data_context=context,
    validator=validator,
)

In [44]:
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]

In [45]:
checkpoint_result.success

True