In [50]:


import great_expectations as ge
from great_expectations import DataContext

from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.exceptions import DataContextError


In [51]:
# Data Context handle; the cells below use it to register the datasource,
# load/save the expectation suite, and create and run the checkpoint.
context_seq: DataContext = ge.get_context()

In [52]:
# Register a Pandas-backed datasource whose inferred filesystem connector
# turns every file name matching the regex below into a data asset.
datasource_name = "seq_datasource"

# NOTE(review): the batch_spec path echoed by the checkpoint run suggests this
# is resolved relative to the great_expectations/ directory — confirm.
data_folder = "../tsv"

datasource_config = {
    "name": datasource_name,
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        "default_inferred_data_connector_name": {
            "class_name": "InferredAssetFilesystemDataConnector",
            "base_directory": data_folder,
            # Raw string: "\." is not a valid Python string escape, so the
            # non-raw form emits an invalid-escape warning. The regex value
            # itself is unchanged; its single group is the data asset name.
            "default_regex": {"group_names": ["data_asset_name"], "pattern": r"(.*\.tsv)"},
        }
    }
}
context_seq.add_datasource(**datasource_config)

# Last expression: display every datasource known to the context.
context_seq.list_datasources()

  "default_regex": {"group_names": ["data_asset_name"], "pattern": "(.*\.tsv)"},


[{'execution_engine': {'class_name': 'PandasExecutionEngine',
   'module_name': 'great_expectations.execution_engine'},
  'class_name': 'Datasource',
  'data_connectors': {'default_inferred_data_connector_name': {'class_name': 'InferredAssetFilesystemDataConnector',
    'module_name': 'great_expectations.datasource.data_connector',
    'base_directory': '../tsv',
    'default_regex': {'group_names': ['data_asset_name'],
     'pattern': '(.*\\.tsv)'}}},
  'module_name': 'great_expectations.datasource',
  'name': 'metadata_datasource'},
 {'execution_engine': {'class_name': 'PandasExecutionEngine',
   'module_name': 'great_expectations.execution_engine'},
  'class_name': 'Datasource',
  'data_connectors': {'default_inferred_data_connector_name': {'class_name': 'InferredAssetFilesystemDataConnector',
    'module_name': 'great_expectations.datasource.data_connector',
    'base_directory': '../tsv',
    'default_regex': {'group_names': ['data_asset_name'],
     'pattern': '(.*\\.tsv)'}}},
  

In [53]:
# Name of the suite to work on. Changing it switches to (or creates) another
# suite; a suite saved under the previous name is not removed.
expectation_suite_name = "seq_suite"

# Reuse the suite if the context already has it, otherwise create it.
try:
    suite = context_seq.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
except DataContextError:
    suite = context_seq.create_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    print(f'Created ExpectationSuite "{suite.expectation_suite_name}".')
else:
    print(f'Loaded ExpectationSuite "{suite.expectation_suite_name}" containing {len(suite.expectations)} expectations.')

Loaded ExpectationSuite "seq_suite" containing 2 expectations.


In [54]:
# Expectation on column "ctg_L50": values must fall between 80 and 600.
# The COMPLETE result format is requested for this expectation's results.
ctg_l50_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs=dict(
        column="ctg_L50",
        min_value=80,
        max_value=600,
        result_format={"result_format": "COMPLETE"},
    ),
)
# Last expression: the configuration returned by add_expectation is displayed.
suite.add_expectation(expectation_configuration=ctg_l50_expectation)

{"meta": {}, "expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "ctg_L50", "min_value": 80, "max_value": 600, "result_format": {"result_format": "COMPLETE"}}}

In [55]:
# Expectation on column "n_contigs": values must fall between 90 and 100.
# The COMPLETE result format is requested for this expectation's results.
n_contigs_expectation = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs=dict(
        column="n_contigs",
        min_value=90,
        max_value=100,
        result_format={"result_format": "COMPLETE"},
    ),
)
# Last expression: the configuration returned by add_expectation is displayed.
suite.add_expectation(expectation_configuration=n_contigs_expectation)

{"meta": {}, "expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "n_contigs", "min_value": 90, "max_value": 100, "result_format": {"result_format": "COMPLETE"}}}

In [56]:
# Optional debugging aid (disabled):
# print(context_seq.get_expectation_suite(expectation_suite_name=expectation_suite_name))

# Optional (disabled): rebuild and open the Data Docs page for this suite.
# ExpectationSuiteIdentifier is not imported in the visible cells, so import
# it before re-enabling these lines.
# suite_identifier = ExpectationSuiteIdentifier(expectation_suite_name=expectation_suite_name)
# context_seq.build_data_docs(resource_identifiers=[suite_identifier])
# context_seq.open_data_docs(resource_identifier=suite_identifier)

# Persist the suite; the returned value is displayed as the cell output.
context_seq.save_expectation_suite(
    expectation_suite=suite,
    expectation_suite_name=expectation_suite_name,
)

'c:\\Users\\dgg32\\Documents\\mgrast_ge\\great_expectations\\expectations/seq_suite.json'

In [57]:
# Checkpoint name (originally populated from the CLI command).
my_checkpoint_name = "seq_checkpoint"

# Data asset (file name) the checkpoint validates.
which_file_to_check = "seq_stats.tsv"

In [58]:
# Actions attached to the checkpoint's validation, in the order listed:
# store the validation result, update Data Docs, and send a Slack
# notification on failure.
# NOTE(review): "${validation_notification_slack_webhook}" looks like a
# placeholder substituted by Great Expectations at run time — confirm it is
# defined before relying on the Slack action.
action_list = [
    {
        "name": "store_validation_result",
        "action": {"class_name": "StoreValidationResultAction"},
    },
    {
        "name": "update_data_docs",
        "action": {"class_name": "UpdateDataDocsAction"},
    },
    {
        "name": "send_slack_notification_on_validation_result",
        "action": {
            "class_name": "SlackNotificationAction",
            "slack_webhook": "${validation_notification_slack_webhook}",
            "notify_on": "failure",
            "renderer": {
                "module_name": "great_expectations.render.renderer.slack_renderer",
                "class_name": "SlackRenderer",
            },
        },
    },
]


In [59]:



# Batch to validate: the chosen file from the datasource registered earlier,
# read with the "read_csv" reader method using a tab separator.
seq_batch_request = {
    "datasource_name": datasource_name,
    "data_connector_name": "default_inferred_data_connector_name",
    "data_asset_name": which_file_to_check,
    "batch_spec_passthrough": {
        "reader_method": "read_csv",
        "reader_options": {"sep": "\t"},
    },
}

# One validation: the batch above checked against the expectation suite,
# followed by the configured actions.
seq_validation = {
    "batch_request": seq_batch_request,
    "expectation_suite_name": expectation_suite_name,
    "action_list": action_list,
}

checkpoint_config = {
    "name": f"{my_checkpoint_name}",
    "config_version": 1,
    "class_name": "SimpleCheckpoint",
    "run_name_template": "%Y%m%d-%H%M%S-check",
    "validations": [seq_validation],
}

# Last expression: the checkpoint returned by add_checkpoint is displayed.
context_seq.add_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "seq_checkpoint",
  "profilers": [],
  "run_name_template": "%Y%m%d-%H%M%S-check",
  "runtime_configuration": {},
  "validations": [
    {
      "batch_request": {
        "datasource_name": "seq_datasource",
        "data_connector_name": "default_inferred_data_connector_name",
        "data_asset_name": "seq_stats.tsv",
        "batch_spec_passthrough": {
          "reader_method": "read_c

In [60]:
# Sanity check: print the data asset names the context reports as available,
# to confirm the file targeted by the checkpoint is among them.
available_asset_names = context_seq.get_available_data_asset_names()
print(available_asset_names)

{'metadata_datasource': {'default_inferred_data_connector_name': ['test_metadata.tsv', 'seq_stats.tsv', 'metadata.tsv', 'test_seq_stats.tsv']}, 'seq_datasource': {'default_inferred_data_connector_name': ['test_metadata.tsv', 'seq_stats.tsv', 'metadata.tsv', 'test_seq_stats.tsv']}}


In [61]:
# Run the checkpoint from Python; the result object is displayed as the
# cell output.
checkpoint_result = context_seq.run_checkpoint(
    checkpoint_name=my_checkpoint_name
)
checkpoint_result

Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

{
  "run_id": {
    "run_time": "2022-06-06T00:22:26.462606+00:00",
    "run_name": "20220606-002226-check"
  },
  "run_results": {
    "ValidationResultIdentifier::seq_suite/20220606-002226-check/20220606T002226.462606Z/e688278e3ee08b4f55445e802be9f557": {
      "validation_result": {
        "meta": {
          "great_expectations_version": "0.15.7",
          "expectation_suite_name": "seq_suite",
          "run_id": {
            "run_time": "2022-06-06T00:22:26.462606+00:00",
            "run_name": "20220606-002226-check"
          },
          "batch_spec": {
            "path": "c:\\Users\\dgg32\\Documents\\mgrast_ge\\great_expectations\\..\\tsv\\seq_stats.tsv",
            "reader_method": "read_csv",
            "reader_options": {
              "sep": "\t"
            }
          },
          "batch_markers": {
            "ge_load_time": "20220605T162226.497513Z",
            "pandas_data_fingerprint": "13bedfb10548710cfa4855faa8109586"
          },
          "active_batch_

In [62]:
# Open the context's Data Docs to review the checkpoint run above
# (presumably in the default browser — confirm for your environment).
context_seq.open_data_docs()

Or run the checkpoint with the CLI

In [63]:
!great_expectations checkpoint run seq_checkpoint

Using v3 (Batch Request) API
Validation failed!

Suite Name                                   Status     Expectations met
- seq_suite                                  ✖ Failed   1 of 2 (50.0 %)



Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]
Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]
Calculating Metrics:  15%|█▌        | 2/13 [00:00<00:00, 2001.58it/s]
Calculating Metrics:  23%|██▎       | 3/13 [00:00<00:00, 429.55it/s] 
Calculating Metrics:  54%|█████▍    | 7/13 [00:00<00:00, 412.83it/s]
Calculating Metrics: 100%|██████████| 13/13 [00:00<00:00, 478.70it/s]
Calculating Metrics: 100%|██████████| 13/13 [00:00<00:00, 417.39it/s]
Calculating Metrics: 100%|██████████| 13/13 [00:00<00:00, 417.39it/s]


In [64]:
# Re-open the Data Docs after the CLI run of the checkpoint
# (presumably in the default browser — confirm for your environment).
context_seq.open_data_docs()

In [65]:
# Load the TSV with ge.read_csv and validate it against the suite, then print
# the offending rows for every expectation that did not succeed.
# NOTE(review): this path is relative to the notebook's working directory,
# unlike `data_folder` used by the datasource — confirm both point at the
# same file.
ge_df = ge.read_csv('tsv/seq_stats.tsv', sep='\t')

result = ge_df.validate(suite, result_format="COMPLETE")

for r in result.results:
    if r["success"]:
        continue

    expectation_kwargs = r.expectation_config["kwargs"]
    # Not every expectation has a "column" kwarg, so do not assume the key
    # exists.
    column = expectation_kwargs.get("column", "(no column)")
    expectation_type = r.expectation_config["expectation_type"]
    print(f"{column} {expectation_type} failed")

    # "unexpected_index_list" may be missing from a result; print the rows
    # only when it is present, instead of raising KeyError.
    unexpected_index_list = (r.result or {}).get("unexpected_index_list")
    if unexpected_index_list is not None:
        print(ge_df.iloc[unexpected_index_list])
    print("\n")

n_contigs expect_column_values_to_be_between failed
    n_contigs  contig_bp  gap_pct  ctg_N50  ctg_L50  ctg_N90  ctg_L90  \
23          2        180      0.0        2       90        2       90   
26         34       3038      0.0       32       90       32       90   
27          3        270      0.0        3       90        3       90   
33          7        630      0.0        7       90        7       90   
35         52       4667      0.0       51       90       51       90   

    ctg_max   gc_avg   gc_std  \
23       90  0.56111  0.00550   
26       90  0.55497  0.02717   
27       90  0.55926  0.04477   
33       90  0.57460  0.03022   
35       90  0.54746  0.03373   

                                             filename  
23  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
26  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
27  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
33  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
35  /home/huangsixing/Document