In [35]:
import great_expectations as ge
import pandas as pd

In [36]:
# Open the Great Expectations project (Data Context) that every later cell
# uses to create suites, load batches and run validations.
# NOTE(review): no root directory is passed, so this presumably relies on the
# notebook being run from the folder containing `great_expectations/` — confirm.
context_metadata = ge.data_context.DataContext()

In [37]:
# Create the (initially empty) expectation suite for the metadata table.
# overwrite_existing=True lets this cell be re-run without failing when a
# suite called 'check_meta_data' already exists.
suite = context_metadata.create_expectation_suite(expectation_suite_name='check_meta_data', overwrite_existing=True)

In [38]:
# Describe the reference metadata file: a tab-separated table read through the
# 'tsv' datasource with pandas' read_csv.
batch_kwargs = dict(
    path='tsv/metadata.tsv',
    datasource='tsv',
    data_asset_name='metadata',
    reader_method='read_csv',
    reader_options=dict(sep='\t'),
)

# Load the file as a batch bound to the suite, so that every expectation
# evaluated on `batch` below is also recorded in that suite.
batch = context_metadata.get_batch(batch_kwargs, suite)

In [39]:
# Preview the first rows to confirm the TSV was parsed into the expected columns.
# NOTE(review): the 'Unnamed: 0' column in the output suggests the file carries
# a saved row index that is being read as data — confirm whether it should be
# dropped or loaded as the index.
batch.head()

Unnamed: 0,sample,latitude,longitude,location,country,collection_time,material,feature,metagenome_id
0,mgs561374,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713902.3
1,mgs561365,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713905.3
2,mgs561353,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713907.3
3,mgs561359,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713903.3
4,mgs561350,35.689,139.69,Tokyo,Japan,13:00:00,air,city,mgm4713899.3


In [40]:
# Latitudes must lie in [-90, 90] degrees.
# The COMPLETE result format makes the result list every offending value and
# its index instead of only a partial sample.
batch.expect_column_values_to_be_between(
    'latitude',
    min_value=-90,
    max_value=90,
    result_format={'result_format': 'COMPLETE'},
)

{
  "result": {
    "element_count": 76,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [41]:
# Longitudes must lie in [-180, 180] degrees.
# The COMPLETE result format makes the result list every offending value and
# its index instead of only a partial sample.
batch.expect_column_values_to_be_between(
    'longitude',
    min_value=-180,
    max_value=180,
    result_format={'result_format': 'COMPLETE'},
)

{
  "result": {
    "element_count": 76,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [42]:
# Collection times must be formatted as HH:MM:SS (e.g. 13:00:00).
batch.expect_column_values_to_match_strftime_format(
    'collection_time',
    "%H:%M:%S",
    result_format={'result_format': 'COMPLETE'},
)

{
  "result": {
    "element_count": 76,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [43]:
# The descriptive columns must hold strings. All three expectations are added
# to the suite, but a notebook cell only displays its final expression, so the
# 'feature' check is kept as the last statement and is the one shown below.
for text_column in ('material', 'country'):
    batch.expect_column_values_to_be_of_type(
        text_column,
        'str',
        result_format={'result_format': 'COMPLETE'},
    )
batch.expect_column_values_to_be_of_type(
    'feature',
    'str',
    result_format={'result_format': 'COMPLETE'},
)

{
  "result": {
    "element_count": 76,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [44]:
# Display the suite accumulated on the batch so far, to confirm that every
# expectation evaluated above was recorded with the intended arguments.
batch.get_expectation_suite()

{
  "expectation_suite_name": "check_meta_data",
  "data_asset_type": "Dataset",
  "meta": {
    "great_expectations_version": "0.15.7"
  },
  "ge_cloud_id": null,
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_be_between",
      "meta": {},
      "kwargs": {
        "column": "latitude",
        "min_value": -90,
        "max_value": 90
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_between",
      "meta": {},
      "kwargs": {
        "column": "longitude",
        "min_value": -180,
        "max_value": 180
      }
    },
    {
      "expectation_type": "expect_column_values_to_match_strftime_format",
      "meta": {},
      "kwargs": {
        "column": "collection_time",
        "strftime_format": "%H:%M:%S"
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_of_type",
      "meta": {},
      "kwargs": {
        "column": "material",
        "type_": "str"
      }
    },
    {
      "expectation_type"

In [45]:
# Persist the suite so it can be referenced by name ('check_meta_data') later,
# e.g. from the checkpoint file written further down.
batch.save_expectation_suite()

In [46]:
# Register a validation operator that runs two actions after each validation:
# store the validation result, then refresh the Data Docs site.
# NOTE(review): the CLI checkpoint run further down reports that it cannot find
# this operator, which suggests it is only registered on this in-memory
# context — confirm whether it also needs to be declared in the project config.
validation_operator_name = 'metadata_validation_operator'
validation_operator_config = {
    'class_name': 'ActionListValidationOperator',
    'action_list': [
        {
            'name': 'store_validation_result',
            'action': {'class_name': 'StoreValidationResultAction'},
        },
        {
            'name': 'update_data_docs',
            'action': {'class_name': 'UpdateDataDocsAction'},
        },
    ],
}
context_metadata.add_validation_operator(
    validation_operator_name,
    validation_operator_config,
)

<great_expectations.validation_operators.validation_operators.ActionListValidationOperator at 0x7fa002010d30>

In [47]:
# Validate the reference batch against its suite through the operator
# registered above ('metadata_validation_operator').
results = context_metadata.run_validation_operator(
    validation_operator_name,
    assets_to_validate=[batch],
)

  return concat(self.root_render_func(self.new_context(vars)))


In [48]:
# Inspect the operator output: the operator configuration plus, per validated
# batch, the validation statistics (evaluated / successful expectations).
results

{
  "validation_operator_config": {
    "class_name": "ActionListValidationOperator",
    "module_name": "great_expectations.validation_operators",
    "name": "metadata_validation_operator",
    "kwargs": {
      "action_list": [
        {
          "name": "store_validation_result",
          "action": {
            "class_name": "StoreValidationResultAction"
          }
        },
        {
          "name": "update_data_docs",
          "action": {
            "class_name": "UpdateDataDocsAction"
          }
        }
      ],
      "result_format": {
        "result_format": "SUMMARY",
        "partial_unexpected_count": 20
      }
    }
  },
  "run_results": {
    "ValidationResultIdentifier::check_meta_data/20220531T090008.272469Z/20220531T090008.272469Z/e1495c52ca70b3573a099aa2f54115da": {
      "validation_result": {
        "statistics": {
          "evaluated_expectations": 6,
          "successful_expectations": 6,
          "unsuccessful_expectations": 0,
          "succes

In [49]:
# Open the Data Docs site to review the suite and validation results.
# NOTE(review): presumably launches a local browser window, so this has no
# visible effect on a headless run — confirm.
context_metadata.open_data_docs()

In [50]:
%%writefile great_expectations/checkpoints/test_meta_data.yml
# Checkpoint: validate tsv/test_metadata.tsv against the 'check_meta_data'
# suite using the 'metadata_validation_operator' validation operator.
validation_operator_name: metadata_validation_operator
batches:
  - batch_kwargs:
      path: tsv/test_metadata.tsv
      datasource: tsv
      data_asset_name: test_metadata
      reader_method: read_csv
      reader_options:
        # Must be double-quoted: YAML only expands escapes such as \t inside
        # double-quoted scalars. Single quotes would pass a literal backslash
        # followed by "t" instead of a tab character.
        sep: "\t"
    expectation_suite_names:
      - check_meta_data

Overwriting great_expectations/checkpoints/test_meta_data.yml


In [51]:
# Run the checkpoint written above from the command line; the summary printed
# below reports, per suite, how many expectations the test file met.
!great_expectations checkpoint run test_meta_data

Using v3 (Batch Request) API[0m
Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.
Could not find Validation Operator "metadata_validation_operator" when running Checkpoint "test_meta_data". Using default action_list_operator.
Validation failed![0m

Suite Name                                   Status     Expectations met[0m
- check_meta_data                            [31m✖ Failed[0m   5 of 6 (83.33 %)[0m
[0m

In [52]:
# Load the failing test file directly with pandas (tab-separated) ...
test_meta_df = pd.read_csv(
    'tsv/test_metadata.tsv',
    sep='\t',
)
# ... and wrap it as a Great Expectations dataset so it can be validated
# in-process and the offending rows inspected.
ge_df = ge.from_pandas(test_meta_df)

In [53]:
# Re-run the full suite against the test file. "COMPLETE" makes each
# expectation result carry the full list of unexpected values and their
# indices, which the next cell uses to locate the offending rows.
result = ge_df.validate(
    batch.get_expectation_suite(),
    result_format="COMPLETE",
)

In [54]:
# Show every row that violated at least one expectation.
# - Look at all failed expectation results rather than assuming the failing
#   one is first in the suite (results[0]).
# - Expectations that do not report per-row indices are skipped via .get().
# - unexpected_index_list holds DataFrame index labels, so select with .loc
#   (label-based) rather than .iloc (position-based); the two only coincide
#   for a default 0..n-1 index.
unexpected_labels = sorted({
    label
    for expectation_result in result.results
    if not expectation_result.success
    for label in expectation_result.result.get("unexpected_index_list", [])
})
ge_df.loc[unexpected_labels]

Unnamed: 0,sample,latitude,longitude,location,country,collection_time,material,feature,metagenome_id
25,mgs879325,141.579113,38.872848,Nogimachi,Japan,00:00:00,waste water,wastewater treatment plant,mgm4968249.3
