In [71]:
import great_expectations as ge
import pandas as pd

In [72]:
context_metadata = ge.data_context.DataContext()

In [73]:
suite = context_metadata.create_expectation_suite(
    'check_meta_data',
    overwrite_existing=True
)

In [74]:
context_metadata.add_datasource("tsv", class_name='PandasDatasource')

<great_expectations.datasource.pandas_datasource.PandasDatasource at 0x7fbaa20123a0>

In [75]:
batch_kwargs = {
    'path': 'tsv/metadata.tsv',
    'datasource': 'tsv',
    'data_asset_name': 'metadata',
    'reader_method': 'read_csv',
    'reader_options': {
        'sep': '\t'
    }
}
batch = context_metadata.get_batch(batch_kwargs, suite)

In [76]:
batch.head()

Unnamed: 0,sample,latitude,longitude,location,country,collection_time,material,feature,metagenome_id
0,mgs561368,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713898.3
1,mgs561365,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713905.3
2,mgs561362,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713900.3
3,mgs561356,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713906.3
4,mgs561380,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713901.3


In [77]:
batch.expect_column_values_to_be_between('latitude', min_value=-90, max_value=90, result_format={'result_format': 'COMPLETE'})

{
  "success": false,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1,
    "unexpected_percent": 2.0408163265306123,
    "unexpected_percent_total": 2.0408163265306123,
    "unexpected_percent_nonmissing": 2.0408163265306123,
    "partial_unexpected_list": [
      -151.82739
    ],
    "partial_unexpected_index_list": [
      48
    ],
    "partial_unexpected_counts": [
      {
        "value": -151.82739,
        "count": 1
      }
    ],
    "unexpected_list": [
      -151.82739
    ],
    "unexpected_index_list": [
      48
    ]
  }
}

In [78]:
batch.expect_column_values_to_be_between('longitude', min_value=-180, max_value=180, result_format={'result_format': 'COMPLETE'})

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  }
}

In [79]:
batch.expect_column_values_to_match_strftime_format('collection_time', "%H:%M:%S", result_format={'result_format': 'COMPLETE'})

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  }
}

In [80]:
batch.expect_column_values_to_be_of_type('material', 'str', result_format={'result_format': 'COMPLETE'})
batch.expect_column_values_to_be_of_type('country', 'str', result_format={'result_format': 'COMPLETE'})
batch.expect_column_values_to_be_of_type('feature', 'str', result_format={'result_format': 'COMPLETE'})

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  }
}

In [81]:
metadata_expectation_suite = batch.get_expectation_suite(discard_failed_expectations=False)

In [82]:
batch.save_expectation_suite(discard_failed_expectations=False)

In [83]:
validation_operator_name = 'metadata_validation_operator'
validation_operator_config = {
    'class_name': 'ActionListValidationOperator',
    'action_list': [{'name': 'store_validation_result', 'action': {'class_name': 'StoreValidationResultAction'}}, {'name': 'update_data_docs', 'action': {'class_name': 'UpdateDataDocsAction'}}]
}
context_metadata.add_validation_operator(validation_operator_name, validation_operator_config)

<great_expectations.validation_operators.validation_operators.ActionListValidationOperator at 0x7fbaa22cf6a0>

In [84]:
#context_metadata.get_validator()
result = context_metadata.run_validation_operator('metadata_validation_operator', assets_to_validate=[batch])

  return concat(self.root_render_func(self.new_context(vars)))


In [85]:
#ge_df.iloc[result.results[0].result["unexpected_index_list"]]

temp_results = list(result.run_results.values())[0]["validation_result"]["results"]
seq_df = pd.read_csv('tsv/metadata.tsv', sep='\t')


for i in range(len(temp_results)):
    r = temp_results[i]
    if r["success"] == False:
        column = r.expectation_config["kwargs"]["column"]
        expectation_type = r.expectation_config["expectation_type"]
        print (f"{column} {expectation_type} failed")
        print (seq_df.iloc[r.result["partial_unexpected_index_list"]])
        #print ("\n")

latitude expect_column_values_to_be_between failed
       sample   latitude  longitude       location   country collection_time  \
48  mgs192740 -151.82739  -11.41924  Pacific Ocean  Kiribati        10:00:00   

   material     feature metagenome_id  
48    water  coral reef  mgm4466596.3  


In [86]:
context_metadata.open_data_docs()

Use checkpoint to check new data

In [87]:
%%writefile great_expectations/checkpoints/test_meta_data.yml

validation_operator_name: metadata_validation_operator
batches:
  - batch_kwargs:
      path: tsv/test_metadata.tsv
      datasource: tsv
      data_asset_name: test_metadata
      reader_method: read_csv
      reader_options:
        'sep': '\t'
    expectation_suite_names:
      - check_meta_data

Overwriting great_expectations/checkpoints/test_meta_data.yml


In [88]:
context_metadata.run_checkpoint(checkpoint_name="test_meta_data")

Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.
  df = reader_fn(path, **reader_options)
  return concat(self.root_render_func(self.new_context(vars)))


{
  "success": false,
  "run_id": {
    "run_name": "20220601T063253.177303Z",
    "run_time": "2022-06-01T06:32:53.177303+00:00"
  },
  "evaluation_parameters": null,
  "run_results": {
    "ValidationResultIdentifier::check_meta_data/20220601T063253.177303Z/20220601T063253.177303Z/9adb7e25bd109ca97e05b45644292b8d": {
      "validation_result": {
        "success": false,
        "meta": {
          "great_expectations_version": "0.15.7",
          "expectation_suite_name": "check_meta_data",
          "run_id": {
            "run_name": "20220601T063253.177303Z",
            "run_time": "2022-06-01T06:32:53.177303+00:00"
          },
          "batch_kwargs": {
            "path": "tsv/test_metadata.tsv",
            "datasource": "tsv",
            "data_asset_name": "test_metadata",
            "reader_method": "read_csv",
            "reader_options": {
              "sep": "\\t"
            }
          },
          "batch_markers": {
            "ge_load_time": "20220601T063253.1

In [89]:
!great_expectations checkpoint run test_meta_data

Using v3 (Batch Request) API[0m
Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.
Could not find Validation Operator "metadata_validation_operator" when running Checkpoint "test_meta_data". Using default action_list_operator.
Validation failed![0m

Suite Name                                   Status     Expectations met[0m
- check_meta_data                            [31m✖ Failed[0m   5 of 6 (83.33 %)[0m
[0m

Or we can demand the details of the test results

In [90]:
test_meta_df = pd.read_csv('tsv/test_metadata.tsv', sep='\t')
ge_df = ge.from_pandas(test_meta_df)

In [91]:
result = ge_df.validate(metadata_expectation_suite, result_format="COMPLETE")

In [92]:
#ge_df.iloc[result.results[0].result["unexpected_index_list"]]

for i in range(len(result.results)):
    r = result.results[i]
    if r["success"] == False:
        column = r.expectation_config["kwargs"]["column"]
        expectation_type = r.expectation_config["expectation_type"]
        print (f"{column} {expectation_type} failed")
        print (ge_df.iloc[r.result["unexpected_index_list"]])
        print ("\n")

latitude expect_column_values_to_be_between failed
       sample    latitude  longitude   location country collection_time  \
26  mgs879310  141.579113  38.872848  Nogimachi   Japan        00:00:00   

       material                     feature metagenome_id  
26  waste water  wastewater treatment plant  mgm4968247.3  


