# Great Expectations

In [1]:
import great_expectations as gx
import pandas as pd

from great_expectations.data_context import FileDataContext

# Build the file-backed GX data context rooted at the current working directory.
project_root = "."
context = FileDataContext.create(project_root_dir=project_root)

# Printing the context dumps every GX configuration entry of the project.
print(context)

{
  "anonymous_usage_statistics": {
    "explicit_url": false,
    "explicit_id": true,
    "data_context_id": "3625801e-5e9b-4dd0-a419-679a2e56a524",
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "enabled": true
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_variables_file_path": "uncommitted/config_variables.yml",
  "config_version": 3.0,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "uncommitted/data_docs/local_site"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "fluent_datasources": {
    "dunghc-diabetes": {
      "type": "postgres",
     

## Load another file as a data source

In [31]:
# Up to now we worked with a single file. To handle a folder (or a database with
# several tables) we need a few more terms: a data source contains data assets,
# and each data asset is divided into one or more batches.

# Register the folder as a data source.
# add_or_update_* keeps this cell re-runnable: the plain add_* variant fails when
# a data source named "my_ds_7" already exists in the project, which would force
# a new name on every re-run.
# NOTE(review): updating presumably replaces the stored definition (including any
# assets attached to it); the asset is re-added a few cells further down — confirm.
context.sources.add_or_update_pandas_filesystem(
    name="my_ds_7", base_directory="../data/diabetes-kafka"
)

PandasFilesystemDatasource(type='pandas_filesystem', name='my_ds_7', id=None, assets=[], base_directory=PosixPath('../data/diabetes-kafka'), data_context_root_directory=None)

In [32]:
# Look up the data source registered under the name "my_ds_7" on the context.
my_ds = context.datasources["my_ds_7"]

# Reference example only (intentionally disabled): the same workflow for a parquet
# asset, where the batching regex contains a wildcard and can match several files.
# my_batching_regex = "yellow_tripdata_2022-.*.parquet"

# # Create the data asset (as one or more files from our data source)
# my_asset = my_ds.add_parquet_asset(
#     name="my_tripdata_data_asset", batching_regex=my_batching_regex
# )

# # Define a Batch Request to include all batches in the available data set
# my_batch_request = my_asset.build_batch_request()
# batches = my_asset.get_batch_list_from_batch_request(my_batch_request)

In [33]:
# Pattern selecting which files of the data source belong to this asset.
# It is a regular expression, so the dot is escaped to match a literal "."
# (an unescaped "." would match any character, e.g. "diabetes_newXcsv").
my_batching_regex = r"diabetes_new\.csv"

# Create the CSV data asset (one or more files from our data source).
my_asset = my_ds.add_csv_asset(
    name="diabetes_data_asset", batching_regex=my_batching_regex
)

# Build a batch request without filters, i.e. covering every batch of the asset,
# then resolve it into the concrete list of batches.
my_batch_request = my_asset.build_batch_request()
batches = my_asset.get_batch_list_from_batch_request(my_batch_request)

In [34]:
# Inspect each resolved batch by printing its batch spec, one per line.
for spec in (item.batch_spec for item in batches):
    print(spec)

{'path': '../data/diabetes-kafka/diabetes_new.csv', 'reader_method': 'read_csv', 'reader_options': {}}


## Validate using our default expectation suite

In [35]:
# Pair the batch request with the "my_expectation_suite" suite through a validator.
# NOTE(review): the suite is presumably created earlier in the notebook — confirm.
# To validate against a dedicated suite instead, create it first, e.g. with
# context.add_or_update_expectation_suite("my_asset_expectation_suite").
validator_options = dict(
    batch_request=my_batch_request,
    expectation_suite_name="my_expectation_suite",
)
asset_validator = context.get_validator(**validator_options)

# Preview the first rows of the data held by the validator.
asset_validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,Outcome
0,17,161,97,25,730,13.082414,0.702647,70,1
1,9,169,50,13,539,8.51654,1.503838,69,1
2,2,22,67,85,707,66.115909,0.669432,23,1
3,2,188,82,87,155,36.95747,0.372881,66,1
4,8,58,71,23,264,47.094574,0.133443,49,1


In [36]:
# Same approach as for a single file: validation is driven by a checkpoint.
# Create the checkpoint (or overwrite the one stored under the same name).
checkpoint_name = "diabetes_asset_checkpoint_2"
checkpoint = context.add_or_update_checkpoint(
    name=checkpoint_name, validator=asset_validator
)

# Run the checkpoint and keep the validation outcome.
checkpoint_result = checkpoint.run()

# Take a quick look at the validation result.
context.view_validation_result(checkpoint_result)

Calculating Metrics: 0it [00:00, ?it/s]