In [523]:


import great_expectations as ge
from great_expectations import DataContext

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.exceptions import DataContextError


In [524]:
# Obtain the Great Expectations Data Context; every later cell that stores
# suites, datasources, or checkpoints goes through this object.
context: DataContext = ge.get_context()

In [525]:
# Suite to edit. Picking a different name here leaves any previously saved
# suite untouched.
expectation_suite_name = "metadata"
try:
    suite = context.get_expectation_suite(expectation_suite_name=expectation_suite_name)
except DataContextError:
    # Lookup failed (presumably no suite is stored under this name yet),
    # so create a new, empty suite instead.
    suite = context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f'Created ExpectationSuite "{suite.expectation_suite_name}".')
else:
    print(f'Loaded ExpectationSuite "{suite.expectation_suite_name}" containing {len(suite.expectations)} expectations.')

Loaded ExpectationSuite "metadata" containing 6 expectations.


In [526]:
# Longitude values must lie between -180 and 180 (inclusive bounds as configured).
longitude_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs=dict(
        column="longitude",
        min_value=-180,
        max_value=180,
        # Request the COMPLETE result format for this expectation.
        result_format={"result_format": "COMPLETE"},
    ),
    # The optional `meta` argument is left unset.
)
# Last expression of the cell, so the stored configuration is displayed.
suite.add_expectation(expectation_configuration=longitude_expectation)

{"kwargs": {"column": "longitude", "min_value": -180, "max_value": 180, "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_between", "meta": {}}

In [527]:
# Latitude values must lie between -90 and 90.
latitude_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs=dict(
        column="latitude",
        min_value=-90,
        max_value=90,
        # Request the COMPLETE result format for this expectation.
        result_format={"result_format": "COMPLETE"},
    ),
    # The optional `meta` argument is left unset.
)
# Last expression of the cell, so the stored configuration is displayed.
suite.add_expectation(expectation_configuration=latitude_expectation)

{"kwargs": {"column": "latitude", "min_value": -90, "max_value": 90, "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_between", "meta": {}}

In [528]:
# Collection times must be formatted as hours:minutes:seconds ("%H:%M:%S").
collection_time_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_match_strftime_format",
    kwargs=dict(
        column="collection_time",
        strftime_format="%H:%M:%S",
        # Request the COMPLETE result format for this expectation.
        result_format={"result_format": "COMPLETE"},
    ),
    # The optional `meta` argument is left unset.
)
# Last expression of the cell, so the stored configuration is displayed.
suite.add_expectation(expectation_configuration=collection_time_expectation)

{"kwargs": {"column": "collection_time", "strftime_format": "%H:%M:%S", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_match_strftime_format", "meta": {}}

In [529]:


# Values in the "material" column must be of type "str".
material_type_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_of_type",
    kwargs=dict(
        column="material",
        type_="str",
        # Request the COMPLETE result format for this expectation.
        result_format={"result_format": "COMPLETE"},
    ),
    # The optional `meta` argument is left unset.
)
# Last expression of the cell, so the stored configuration is displayed.
suite.add_expectation(expectation_configuration=material_type_expectation)

{"kwargs": {"column": "material", "type_": "str", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_of_type", "meta": {}}

In [530]:
# Values in the "country" column must be of type "str".
country_type_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_of_type",
    kwargs=dict(
        column="country",
        type_="str",
        # Request the COMPLETE result format for this expectation.
        result_format={"result_format": "COMPLETE"},
    ),
    # The optional `meta` argument is left unset.
)
# Last expression of the cell, so the stored configuration is displayed.
suite.add_expectation(expectation_configuration=country_type_expectation)

{"kwargs": {"column": "country", "type_": "str", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_of_type", "meta": {}}

In [531]:
# Values in the "feature" column must be of type "str".
feature_type_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_of_type",
    kwargs=dict(
        column="feature",
        type_="str",
        # Request the COMPLETE result format for this expectation.
        result_format={"result_format": "COMPLETE"},
    ),
    # The optional `meta` argument is left unset.
)
# Last expression of the cell, so the stored configuration is displayed.
suite.add_expectation(expectation_configuration=feature_type_expectation)

{"kwargs": {"column": "feature", "type_": "str", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_of_type", "meta": {}}

In [532]:
# Save the in-memory suite (including the expectations added in the cells
# above) through the Data Context under `expectation_suite_name`.
#print(context.get_expectation_suite(expectation_suite_name=expectation_suite_name))
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)

# Disabled: build and open Data Docs restricted to this suite.
# NOTE(review): ExpectationSuiteIdentifier is not among the imports visible in
# this notebook — it would need to be imported before re-enabling these lines.
#suite_identifier = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite_name)
#context.build_data_docs(resource_identifiers=[suite_identifier])
#context.open_data_docs(resource_identifier=suite_identifier)

'/Users/dgg32/Documents/mgrast_ge/great_expectations/expectations/metadata.json'

In [533]:
# Datasource definition: one datasource using the PandasExecutionEngine and
# exposing two data connectors (file-system inferred assets and runtime batches).
datasource_name = "metadata_datasource"

# NOTE(review): presumably resolved relative to the Great Expectations project
# directory rather than the notebook's working directory — confirm.
data_folder = "../tsv"

datasource_config = dict(
    name=datasource_name,
    class_name="Datasource",
    module_name="great_expectations.datasource",
    execution_engine=dict(
        module_name="great_expectations.execution_engine",
        class_name="PandasExecutionEngine",
    ),
    data_connectors=dict(
        default_inferred_data_connector_name=dict(
            class_name="InferredAssetFilesystemDataConnector",
            base_directory=data_folder,
            # The whole matched name "(.*)" is captured as the data asset name.
            default_regex=dict(group_names=["data_asset_name"], pattern="(.*)"),
        ),
        default_runtime_data_connector_name=dict(
            class_name="RuntimeDataConnector",
            batch_identifiers=["default_identifier_name"],
        ),
    ),
)
# Register the datasource with the Data Context using the config built above.
context.add_datasource(**datasource_config)

# Last expression of the cell: display the datasources the context now knows.
context.list_datasources()

[{'module_name': 'great_expectations.datasource', 'data_connectors': {'default_inferred_data_connector_name': {'module_name': 'great_expectations.datasource.data_connector', 'base_directory': '../tsv', 'class_name': 'InferredAssetFilesystemDataConnector', 'default_regex': {'group_names': ['data_asset_name'], 'pattern': '(.*)'}}, 'default_runtime_data_connector_name': {'batch_identifiers': ['default_identifier_name'], 'module_name': 'great_expectations.datasource.data_connector', 'class_name': 'RuntimeDataConnector'}}, 'class_name': 'Datasource', 'execution_engine': {'module_name': 'great_expectations.execution_engine', 'class_name': 'PandasExecutionEngine'}, 'name': 'metadata_datasource'}]

In [534]:
# Name under which the checkpoint is registered and later run.
my_checkpoint_name = "metadata" # This was populated from your CLI command.
# Data asset the checkpoint validates; used as `data_asset_name` in the batch
# request (presumably the file name inside the datasource's base directory).
which_file_to_check = "metadata.tsv"

In [535]:
# Actions attached to the checkpoint's validation, in the order listed:
# store the validation result, refresh Data Docs, and notify Slack on failure.
action_list = [
    {
        "name": "store_validation_result",
        "action": {"class_name": "StoreValidationResultAction"},
    },
    {
        "name": "update_data_docs",
        "action": {"class_name": "UpdateDataDocsAction"},
    },
    {
        "name": "send_slack_notification_on_validation_result",
        "action": {
            "class_name": "SlackNotificationAction",
            # Placeholder in ${...} form; the webhook URL itself is not hard-coded here.
            "slack_webhook": "${validation_notification_slack_webhook}",
            "notify_on": "failure",
            "renderer": {
                "module_name": "great_expectations.render.renderer.slack_renderer",
                "class_name": "SlackRenderer",
            },
        },
    },
]


In [536]:



# Checkpoint definition: validate one data asset from the inferred file-system
# connector against the expectation suite, then run the actions in `action_list`.
checkpoint_config = dict(
    name=my_checkpoint_name,
    config_version=1,
    class_name="SimpleCheckpoint",
    run_name_template="%Y%m%d-%H%M%S-check",
    validations=[
        dict(
            batch_request=dict(
                datasource_name=datasource_name,
                data_connector_name="default_inferred_data_connector_name",
                data_asset_name=which_file_to_check,
                # Passed through to the reader: read_csv with a tab separator.
                batch_spec_passthrough=dict(
                    reader_method="read_csv",
                    reader_options=dict(sep="\t"),
                ),
            ),
            expectation_suite_name=expectation_suite_name,
            action_list=action_list,
        )
    ],
)
# Last expression of the cell: register the checkpoint and display the result.
context.add_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "metadata",
  "profilers": [],
  "run_name_template": "%Y%m%d-%H%M%S-check",
  "runtime_configuration": {},
  "validations": [
    {
      "batch_request": {
        "datasource_name": "metadata_datasource",
        "data_connector_name": "default_inferred_data_connector_name",
        "data_asset_name": "metadata.tsv",
        "batch_spec_passthrough": {
          "reader_method": "read_csv

In [537]:
#print(context.get_available_data_asset_names())

In [538]:
# Run the checkpoint registered under `my_checkpoint_name`; the returned result
# is the cell's last expression and is therefore displayed in full.
context.run_checkpoint(checkpoint_name=my_checkpoint_name)

Calculating Metrics:   0%|          | 0/33 [00:00<?, ?it/s]

{
  "run_id": {
    "run_name": "20220604-185805-check",
    "run_time": "2022-06-04T18:58:05.557761+00:00"
  },
  "run_results": {
    "ValidationResultIdentifier::metadata/20220604-185805-check/20220604T185805.557761Z/1783362dcd1dbf06044288ecf6a2a7eb": {
      "validation_result": {
        "results": [
          {
            "exception_info": {
              "raised_exception": false,
              "exception_traceback": null,
              "exception_message": null
            },
            "meta": {},
            "success": true,
            "result": {
              "element_count": 49,
              "unexpected_count": 0,
              "unexpected_percent": 0.0,
              "partial_unexpected_list": [],
              "missing_count": 0,
              "missing_percent": 0.0,
              "unexpected_percent_total": 0.0,
              "unexpected_percent_nonmissing": 0.0,
              "partial_unexpected_index_list": null,
              "partial_unexpected_counts": []
     

In [539]:
# Open the project's Data Docs.
context.open_data_docs()

In [540]:
!great_expectations checkpoint run metadata

Using v3 (Batch Request) API
Calculating Metrics: 100%|█████████████████████| 33/33 [00:00<00:00, 466.81it/s]
Validation failed!

Suite Name                                   Status     Expectations met
- metadata                                   ✖ Failed   5 of 6 (83.33 %)


In [541]:
# Load the tab-separated metadata file directly with ge.read_csv so that rows
# flagged by validation can be looked up by position afterwards.
# NOTE(review): this path is relative to the notebook's working directory,
# unlike `data_folder` ("../tsv") used in the datasource config — confirm both
# point at the same file.
ge_df = ge.read_csv('tsv/metadata.tsv', sep='\t')

In [542]:
# Validate the loaded frame against the in-memory suite, requesting the
# COMPLETE result format (the report loop reads `unexpected_index_list`).
result = ge_df.validate(suite, result_format="COMPLETE")

In [543]:
# Print every failed expectation together with the rows that violated it.
for failed in (r for r in result.results if not r["success"]):
    config = failed.expectation_config
    # Table-level expectations have no "column" kwarg; fall back to a label
    # instead of raising KeyError.
    column = config["kwargs"].get("column", "<table>")
    expectation_type = config["expectation_type"]
    print(f"{column} {expectation_type} failed")

    # Only some results carry row indices; skip the row dump when
    # "unexpected_index_list" is absent (e.g. the expectation does not report
    # per-row indices, or it raised before producing a result).
    unexpected_rows = (failed.result or {}).get("unexpected_index_list")
    if unexpected_rows is not None:
        print(ge_df.iloc[unexpected_rows])
    print("\n")

latitude expect_column_values_to_be_between failed
       sample   latitude  longitude       location   country collection_time  \
48  mgs192740 -151.82739  -11.41924  Pacific Ocean  Kiribati        10:00:00   

   material     feature metagenome_id  
48    water  coral reef  mgm4466596.3  


