In [28]:
import great_expectations as ge

In [29]:
context_metadata = ge.data_context.DataContext()

In [30]:
suite = context_metadata.create_expectation_suite(
    'check_meta_data',
    overwrite_existing=True
)

In [31]:
context_metadata.add_datasource("tsv", class_name='PandasDatasource')

<great_expectations.datasource.pandas_datasource.PandasDatasource at 0x29096207c40>

In [32]:
batch_kwargs = {
    'path': 'tsv/metadata.tsv',
    'datasource': 'tsv',
    'data_asset_name': 'metadata',
    'reader_method': 'read_csv',
    'reader_options': {
        'sep': '\t'
    }
}
batch = context_metadata.get_batch(batch_kwargs, suite)

In [33]:
batch.head()

Unnamed: 0,sample,latitude,longitude,location,country,collection_time,material,feature,metagenome_id
0,mgs561368,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713898.3
1,mgs561365,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713905.3
2,mgs561362,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713900.3
3,mgs561356,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713906.3
4,mgs561380,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713901.3


In [34]:
batch.expect_column_values_to_be_between('latitude', min_value=-90, max_value=90, result_format={'result_format': 'COMPLETE'})

{
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1,
    "unexpected_percent": 2.0408163265306123,
    "unexpected_percent_total": 2.0408163265306123,
    "unexpected_percent_nonmissing": 2.0408163265306123,
    "partial_unexpected_list": [
      -151.82739
    ],
    "partial_unexpected_index_list": [
      48
    ],
    "partial_unexpected_counts": [
      {
        "value": -151.82739,
        "count": 1
      }
    ],
    "unexpected_list": [
      -151.82739
    ],
    "unexpected_index_list": [
      48
    ]
  },
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [35]:
batch.expect_column_values_to_be_between('longitude', min_value=-180, max_value=180, result_format={'result_format': 'COMPLETE'})

{
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [36]:
batch.expect_column_values_to_match_strftime_format('collection_time', "%H:%M:%S", result_format={'result_format': 'COMPLETE'})

{
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [37]:
batch.expect_column_values_to_be_of_type('material', 'str', result_format={'result_format': 'COMPLETE'})
batch.expect_column_values_to_be_of_type('country', 'str', result_format={'result_format': 'COMPLETE'})
batch.expect_column_values_to_be_of_type('feature', 'str', result_format={'result_format': 'COMPLETE'})

{
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [38]:
metadata_expectation_suite = batch.get_expectation_suite(discard_failed_expectations=False)

In [39]:
batch.save_expectation_suite(discard_failed_expectations=False)

In [40]:
validation_operator_name = 'metadata_validation_operator'
validation_operator_config = {
    'class_name': 'ActionListValidationOperator',
    'action_list': [{'name': 'store_validation_result', 'action': {'class_name': 'StoreValidationResultAction'}}, {'name': 'update_data_docs', 'action': {'class_name': 'UpdateDataDocsAction'}}, {'name': 'send_slack_notification_on_validation_result', 
    'action': {'class_name': 'SlackNotificationAction', 'slack_webhook': '${validation_notification_slack_webhook}', 'notify_on': 'failure', 'renderer': {'module_name': 'great_expectations.render.renderer.slack_renderer', 'class_name': 'SlackRenderer'}}}]
}
context_metadata.add_validation_operator(validation_operator_name, validation_operator_config)

<great_expectations.validation_operators.validation_operators.ActionListValidationOperator at 0x29096dc09a0>

In [41]:

result = context_metadata.run_validation_operator('metadata_validation_operator', assets_to_validate=[batch])

In [42]:

temp_results = list(result.run_results.values())[0]["validation_result"]["results"]
seq_df = ge.read_csv('tsv/metadata.tsv', sep='\t')

for i in range(len(temp_results)):
    r = temp_results[i]
    if r["success"] == False:
        column = r.expectation_config["kwargs"]["column"]
        expectation_type = r.expectation_config["expectation_type"]
        print (f"{column} {expectation_type} failed")
        print (seq_df.iloc[r.result["partial_unexpected_index_list"]])
        #print ("\n")

latitude expect_column_values_to_be_between failed
       sample   latitude  longitude       location   country collection_time  \
48  mgs192740 -151.82739  -11.41924  Pacific Ocean  Kiribati        10:00:00   

   material     feature metagenome_id  
48    water  coral reef  mgm4466596.3  


In [43]:
context_metadata.open_data_docs()

Use checkpoint to check new data

In [44]:
%%writefile great_expectations/checkpoints/test_meta_data.yml

validation_operator_name: metadata_validation_operator
batches:
  - batch_kwargs:
      path: tsv/test_metadata.tsv
      datasource: tsv
      data_asset_name: test_metadata
      reader_method: read_csv
      reader_options:
        'sep': '\t'
    expectation_suite_names:
      - check_meta_data

Overwriting great_expectations/checkpoints/test_meta_data.yml


In [45]:
context_metadata.run_checkpoint(checkpoint_name="test_meta_data")

Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.
  df = reader_fn(path, **reader_options)


{
  "run_id": {
    "run_name": "20220601T134558.220337Z",
    "run_time": "2022-06-01T13:45:58.220337+00:00"
  },
  "evaluation_parameters": null,
  "run_results": {
    "ValidationResultIdentifier::check_meta_data/20220601T134558.220337Z/20220601T134558.220337Z/9adb7e25bd109ca97e05b45644292b8d": {
      "validation_result": {
        "statistics": {
          "evaluated_expectations": 6,
          "successful_expectations": 5,
          "unsuccessful_expectations": 1,
          "success_percent": 83.33333333333334
        },
        "evaluation_parameters": {},
        "results": [
          {
            "expectation_config": {
              "expectation_type": "expect_column_values_to_be_between",
              "kwargs": {
                "column": "latitude",
                "max_value": 90,
                "min_value": -90,
                "result_format": {
                  "result_format": "SUMMARY"
                }
              },
              "meta": {}
            },
   

Run checkpoint with CLI

In [46]:
!great_expectations checkpoint run test_meta_data

Using v3 (Batch Request) API
Validation failed!

Suite Name                                   Status     Expectations met
- check_meta_data                            ✖ Failed   5 of 6 (83.33 %)


Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.
Could not find Validation Operator "metadata_validation_operator" when running Checkpoint "test_meta_data". Using default action_list_operator.


Or we can demand the details of the test results, so we can see which sample fails which test

In [47]:
ge_df = ge.read_csv('tsv/test_metadata.tsv', sep='\t')

In [48]:
result = ge_df.validate(metadata_expectation_suite, result_format="COMPLETE")

In [49]:

for i in range(len(result.results)):
    r = result.results[i]
    if r["success"] == False:
        column = r.expectation_config["kwargs"]["column"]
        expectation_type = r.expectation_config["expectation_type"]
        print (f"{column} {expectation_type} failed")
        print (ge_df.iloc[r.result["unexpected_index_list"]])
        print ("\n")

latitude expect_column_values_to_be_between failed
       sample    latitude  longitude   location country collection_time  \
26  mgs879310  141.579113  38.872848  Nogimachi   Japan        00:00:00   

       material                     feature metagenome_id  
26  waste water  wastewater treatment plant  mgm4968247.3  




In [50]:
for i in range(len(result.results)):
    r = result.results[i]
    if r["success"] == False:
        column = r.expectation_config["kwargs"]["column"]
        expectation_type = r.expectation_config["expectation_type"]
        print (f"{column} {expectation_type} failed")
        print (ge_df.iloc[r.result["unexpected_index_list"]]["metagenome_id"])
        print ("\n")

latitude expect_column_values_to_be_between failed
26    mgm4968247.3
Name: metagenome_id, dtype: object


