In [1]:


import great_expectations as ge
from great_expectations import DataContext

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.exceptions import DataContextError


In [2]:
# Obtain the Great Expectations Data Context. The cells below register the
# datasource, expectation suite and checkpoint through this handle.
context_metadata: DataContext = ge.get_context()

In [3]:
# Register a pandas-backed datasource whose data connector infers one data
# asset per .tsv file found in `data_folder`.
datasource_name = "metadata_datasource"

# Folder holding the .tsv files; handed to the data connector as base_directory.
data_folder = "../tsv"

datasource_config = {
    "name": datasource_name,
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        "default_inferred_data_connector_name": {
            "class_name": "InferredAssetFilesystemDataConnector",
            "base_directory": data_folder,
            "default_regex": {
                # The single capture group becomes the data asset name.
                "group_names": ["data_asset_name"],
                # Raw string: in a normal literal "\." is an invalid escape
                # sequence, which makes Python emit a warning for this line.
                "pattern": r"(.*\.tsv)",
            },
        }
    },
}
context_metadata.add_datasource(**datasource_config)

# Show the registered datasources as the cell output.
context_metadata.list_datasources()

  "default_regex": {"group_names": ["data_asset_name"], "pattern": "(.*\.tsv)"},


[{'class_name': 'Datasource',
  'execution_engine': {'class_name': 'PandasExecutionEngine',
   'module_name': 'great_expectations.execution_engine'},
  'module_name': 'great_expectations.datasource',
  'data_connectors': {'default_inferred_data_connector_name': {'class_name': 'InferredAssetFilesystemDataConnector',
    'base_directory': '../tsv',
    'module_name': 'great_expectations.datasource.data_connector',
    'default_regex': {'group_names': ['data_asset_name'],
     'pattern': '(.*\\.tsv)'}}},
  'name': 'metadata_datasource'}]

In [4]:
# Reuse the suite when the context already knows it; otherwise create it.
# Choosing a different name here does not remove the previously saved suite.
expectation_suite_name = "metadata_suite"
try:
    suite = context_metadata.get_expectation_suite(expectation_suite_name=expectation_suite_name)
except DataContextError:
    # Lookup failed, so start a fresh suite under that name.
    suite = context_metadata.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f'Created ExpectationSuite "{suite.expectation_suite_name}".')
else:
    print(f'Loaded ExpectationSuite "{suite.expectation_suite_name}" containing {len(suite.expectations)} expectations.')

Loaded ExpectationSuite "metadata_suite" containing 7 expectations.


In [5]:
# Expectation: every `longitude` value lies between -180 and 180.
longitude_kwargs = {
    "column": "longitude",
    "min_value": -180,
    "max_value": 180,
    "result_format": {"result_format": "COMPLETE"},
}
longitude_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs=longitude_kwargs,
)
suite.add_expectation(expectation_configuration=longitude_expectation)

{"kwargs": {"column": "longitude", "min_value": -180, "max_value": 180, "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_between", "meta": {}}

In [6]:
# Expectation: every `latitude` value lies between -90 and 90.
latitude_kwargs = {
    "column": "latitude",
    "min_value": -90,
    "max_value": 90,
    "result_format": {"result_format": "COMPLETE"},
}
latitude_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs=latitude_kwargs,
)
suite.add_expectation(expectation_configuration=latitude_expectation)

{"kwargs": {"column": "latitude", "min_value": -90, "max_value": 90, "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_between", "meta": {}}

In [7]:
# Expectation: `collection_date` values match the strftime format %Y-%m-%d.
collection_date_kwargs = {
    "column": "collection_date",
    "strftime_format": "%Y-%m-%d",
    "result_format": {"result_format": "COMPLETE"},
}
collection_date_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_match_strftime_format",
    kwargs=collection_date_kwargs,
)
suite.add_expectation(expectation_configuration=collection_date_expectation)

{"kwargs": {"column": "collection_date", "strftime_format": "%Y-%m-%d", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_match_strftime_format", "meta": {}}

In [8]:
# Expectation: `collection_time` values match the strftime format %H:%M:%S.
collection_time_kwargs = {
    "column": "collection_time",
    "strftime_format": "%H:%M:%S",
    "result_format": {"result_format": "COMPLETE"},
}
collection_time_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_match_strftime_format",
    kwargs=collection_time_kwargs,
)
suite.add_expectation(expectation_configuration=collection_time_expectation)

{"kwargs": {"column": "collection_time", "strftime_format": "%H:%M:%S", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_match_strftime_format", "meta": {}}

In [9]:


# Expectation: values in the `material` column are of type "str".
material_type_kwargs = {
    "column": "material",
    "type_": "str",
    "result_format": {"result_format": "COMPLETE"},
}
material_type_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_of_type",
    kwargs=material_type_kwargs,
)
suite.add_expectation(expectation_configuration=material_type_expectation)

{"kwargs": {"column": "material", "type_": "str", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_of_type", "meta": {}}

In [10]:
# Expectation: values in the `country` column are of type "str".
country_type_kwargs = {
    "column": "country",
    "type_": "str",
    "result_format": {"result_format": "COMPLETE"},
}
country_type_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_of_type",
    kwargs=country_type_kwargs,
)
suite.add_expectation(expectation_configuration=country_type_expectation)

{"kwargs": {"column": "country", "type_": "str", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_of_type", "meta": {}}

In [11]:
# Expectation: values in the `feature` column are of type "str".
feature_type_kwargs = {
    "column": "feature",
    "type_": "str",
    "result_format": {"result_format": "COMPLETE"},
}
feature_type_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_of_type",
    kwargs=feature_type_kwargs,
)
suite.add_expectation(expectation_configuration=feature_type_expectation)

{"kwargs": {"column": "feature", "type_": "str", "result_format": {"result_format": "COMPLETE"}}, "expectation_type": "expect_column_values_to_be_of_type", "meta": {}}

In [12]:
# Save the suite through the context; the call's return value (shown as the
# cell output) is the path of the JSON file that was written.
# Optional debug print of the stored suite (disabled):
#print(context_metadata.get_expectation_suite(expectation_suite_name=expectation_suite_name))
context_metadata.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)

# Optional (disabled): build and open Data Docs for this suite only.
# NOTE(review): enabling these lines also requires importing
# ExpectationSuiteIdentifier, which this notebook does not import.
#suite_identifier = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite_name)
#context_metadata.build_data_docs(resource_identifiers=[suite_identifier])
#context_metadata.open_data_docs(resource_identifier=suite_identifier)

'c:\\Users\\dgg32\\Documents\\mgrast_ge\\great_expectations\\expectations/metadata_suite.json'

In [13]:
# Name under which the checkpoint is registered and later run.
my_checkpoint_name = "metadata_checkpoint"
# Data asset name (the file name as listed by the data connector) to validate.
which_file_to_check = "metadata.tsv"

In [14]:
# Actions attached to the checkpoint's validation, listed in the order given below.

# 1. Store the validation result.
store_result_action = {
    "name": "store_validation_result",
    "action": {"class_name": "StoreValidationResultAction"},
}

# 2. Update Data Docs.
update_docs_action = {
    "name": "update_data_docs",
    "action": {"class_name": "UpdateDataDocsAction"},
}

# 3. Notify Slack, configured with notify_on "failure".
# NOTE(review): the webhook is a ${...} placeholder, presumably resolved from
# the project's config variables -- confirm it is defined there.
slack_action = {
    "name": "send_slack_notification_on_validation_result",
    "action": {
        "class_name": "SlackNotificationAction",
        "slack_webhook": "${validation_notification_slack_webhook}",
        "notify_on": "failure",
        "renderer": {
            "module_name": "great_expectations.render.renderer.slack_renderer",
            "class_name": "SlackRenderer",
        },
    },
}

action_list = [store_result_action, update_docs_action, slack_action]


In [15]:



# Batch request selecting the file to validate. The passthrough options ask
# for the file to be read with read_csv using a tab separator.
batch_request = {
    "datasource_name": datasource_name,
    "data_connector_name": "default_inferred_data_connector_name",
    "data_asset_name": which_file_to_check,
    "batch_spec_passthrough": {
        "reader_method": "read_csv",
        "reader_options": {"sep": "\t"},
    },
}

# One validation: the batch above checked against the suite, with the actions
# defined in `action_list`.
validation = {
    "batch_request": batch_request,
    "expectation_suite_name": expectation_suite_name,
    "action_list": action_list,
}

checkpoint_config = {
    "name": my_checkpoint_name,
    "config_version": 1,
    "class_name": "SimpleCheckpoint",
    "run_name_template": "%Y%m%d-%H%M%S-check",
    "validations": [validation],
}
context_metadata.add_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "metadata_checkpoint",
  "profilers": [],
  "run_name_template": "%Y%m%d-%H%M%S-check",
  "runtime_configuration": {},
  "validations": [
    {
      "batch_request": {
        "datasource_name": "metadata_datasource",
        "data_connector_name": "default_inferred_data_connector_name",
        "data_asset_name": "metadata.tsv",
        "batch_spec_passthrough": {
          "reader_method"

In [16]:
# Print the data asset names available per datasource and data connector;
# the asset named in `which_file_to_check` should be among them.
print(context_metadata.get_available_data_asset_names())

{'metadata_datasource': {'default_inferred_data_connector_name': ['metadata.tsv', 'seq_stats.tsv', 'test_seq_stats.tsv', 'test_metadata.tsv']}}


In [17]:
# Run the registered checkpoint by name; the returned result is shown as the
# cell output.
context_metadata.run_checkpoint(checkpoint_name=my_checkpoint_name)

Calculating Metrics:   0%|          | 0/38 [00:00<?, ?it/s]

Request to Slack webhook returned error 404: no_service


{
  "run_id": {
    "run_time": "2022-06-05T20:39:25.169659+00:00",
    "run_name": "20220605-203925-check"
  },
  "run_results": {
    "ValidationResultIdentifier::metadata_suite/20220605-203925-check/20220605T203925.169659Z/1783362dcd1dbf06044288ecf6a2a7eb": {
      "validation_result": {
        "results": [
          {
            "result": {
              "element_count": 49,
              "unexpected_count": 0,
              "unexpected_percent": 0.0,
              "partial_unexpected_list": [],
              "missing_count": 0,
              "missing_percent": 0.0,
              "unexpected_percent_total": 0.0,
              "unexpected_percent_nonmissing": 0.0,
              "partial_unexpected_index_list": null,
              "partial_unexpected_counts": []
            },
            "success": true,
            "exception_info": {
              "raised_exception": false,
              "exception_traceback": null,
              "exception_message": null
            },
        

In [18]:
# Open the Data Docs for this context.
context_metadata.open_data_docs()

Alternatively, run the checkpoint from the command line (CLI):

In [23]:
# Shell escape: run the checkpoint through the great_expectations CLI.
# The checkpoint name is hard-coded here; keep it in sync with my_checkpoint_name.
!great_expectations checkpoint run metadata_checkpoint

Using v3 (Batch Request) API
Validation failed!

Suite Name                                   Status     Expectations met
- metadata_suite                             ✖ Failed   6 of 7 (85.71 %)



Calculating Metrics:   0%|          | 0/38 [00:00<?, ?it/s]
Calculating Metrics:   0%|          | 0/38 [00:00<?, ?it/s]
Calculating Metrics:   5%|▌         | 2/38 [00:00<?, ?it/s]
Calculating Metrics:   8%|▊         | 3/38 [00:00<00:00, 158.06it/s]
Calculating Metrics:  45%|████▍     | 17/38 [00:00<00:00, 354.88it/s]
Calculating Metrics: 100%|██████████| 38/38 [00:00<00:00, 488.46it/s]
Calculating Metrics: 100%|██████████| 38/38 [00:00<00:00, 418.70it/s]
Calculating Metrics: 100%|██████████| 38/38 [00:00<00:00, 414.02it/s]
Request to Slack webhook returned error 404: no_service


In [24]:
# Open the Data Docs again, e.g. to look at the most recent checkpoint run.
context_metadata.open_data_docs()

In [22]:
# Validate the TSV directly as a Great Expectations pandas dataset and, for
# every failed expectation, print the rows that violated it.
ge_df = ge.read_csv('tsv/metadata.tsv', sep='\t')

# COMPLETE is requested so that failing results can carry the row indices of
# the unexpected values.
result = ge_df.validate(suite, result_format="COMPLETE")

for r in result.results:
    if r["success"]:
        continue

    # Not every expectation is bound to a column, so look it up defensively.
    column = r.expectation_config["kwargs"].get("column", "<no column>")
    expectation_type = r.expectation_config["expectation_type"]
    print (f"{column} {expectation_type} failed")

    # Row indices are only reported for row-level checks; a failed result
    # without them must not abort the report with a KeyError.
    unexpected_index_list = (r.result or {}).get("unexpected_index_list")
    if unexpected_index_list:
        print (ge_df.iloc[unexpected_index_list])
    else:
        print ("(no row-level details available for this expectation)")
    print ("\n")

latitude expect_column_values_to_be_between failed
       sample   latitude  longitude       location   country collection_date  \
44  mgs192740 -151.82739  -11.41924  Pacific Ocean  Kiribati      2009-03-30   

   collection_time material     feature metagenome_id  
44        10:00:00    water  coral reef  mgm4466596.3  


