In [1]:
import great_expectations as ge

In [2]:
# Data context shared by every later cell in this notebook.
# NOTE(review): constructed with no arguments, so the project directory is
# presumably discovered from the current working directory — confirm before
# running the notebook from a different location.
context_seq = ge.data_context.DataContext()

In [3]:
# Create the suite that collects the sequence-statistics expectations.
# overwrite_existing=True replaces a suite of the same name instead of failing,
# so the cell can be re-run.
suite = context_seq.create_expectation_suite('check_seq_data', overwrite_existing=True)

In [4]:
# Register a pandas-backed datasource under the name "tsv"; the batch kwargs
# used later refer to it by this name.
context_seq.add_datasource(
    "tsv",
    class_name='PandasDatasource',
)

<great_expectations.datasource.pandas_datasource.PandasDatasource at 0x1e192862ca0>

In [5]:
# Describe how the reference statistics table is loaded: a tab-separated file
# read with pandas' read_csv through the "tsv" datasource.
batch_kwargs = dict(
    path='tsv/seq_stats.tsv',
    datasource='tsv',
    data_asset_name='seq_stats',
    reader_method='read_csv',
    reader_options=dict(sep='\t'),
)

# Bind the loaded data to the suite so expectations run on `batch` are recorded in it.
batch = context_seq.get_batch(batch_kwargs, suite)

In [6]:
# Preview the first rows to confirm the TSV was parsed into the expected columns.
batch.head()

Unnamed: 0,n_contigs,contig_bp,gap_pct,ctg_N50,ctg_L50,ctg_N90,ctg_L90,ctg_max,gc_avg,gc_std,filename
0,100,11670,0.0,34,130,83,74,265,0.47215,0.12871,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
1,100,22815,0.018,25,420,69,84,512,0.51269,0.10778,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
2,100,22025,0.005,26,375,74,91,495,0.54561,0.10954,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
3,100,23148,0.009,26,378,70,99,503,0.53599,0.09698,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
4,100,12116,0.0,30,139,81,67,493,0.48382,0.10773,/home/huangsixing/Documents/mgrast_ge/fasta/mg...


In [7]:
# ctg_L50 must lie within [80, 600]; COMPLETE returns the full list of
# offending values and row indices rather than a truncated sample.
batch.expect_column_values_to_be_between(
    'ctg_L50',
    min_value=80,
    max_value=600,
    result_format={'result_format': 'COMPLETE'},
)

{
  "success": true,
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [8]:
# n_contigs must lie within [90, 100]; COMPLETE returns the full list of
# offending values and row indices rather than a truncated sample.
batch.expect_column_values_to_be_between(
    'n_contigs',
    min_value=90,
    max_value=100,
    result_format={'result_format': 'COMPLETE'},
)

{
  "success": false,
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 5,
    "unexpected_percent": 10.204081632653061,
    "unexpected_percent_total": 10.204081632653061,
    "unexpected_percent_nonmissing": 10.204081632653061,
    "partial_unexpected_list": [
      2,
      34,
      3,
      7,
      52
    ],
    "partial_unexpected_index_list": [
      23,
      26,
      27,
      33,
      35
    ],
    "partial_unexpected_counts": [
      {
        "value": 2,
        "count": 1
      },
      {
        "value": 3,
        "count": 1
      },
      {
        "value": 34,
        "count": 1
      },
      {
        "value": 52,
        "count": 1
      },
      {
        "value": 7,
        "count": 1
      }
    ],
    "unexpected_list": [
      2,
      34,
      3,
      7,
      52
    ],
    "unexpected_index_list": [
      23,
      26,
      27,
      33,
      35
    ]
  },
  "exception_info": {
    "raise

In [9]:
# gc_avg must lie within [0.4, 0.6]; COMPLETE returns the full list of
# offending values and row indices rather than a truncated sample.
batch.expect_column_values_to_be_between(
    'gc_avg',
    min_value=0.4,
    max_value=0.6,
    result_format={'result_format': 'COMPLETE'},
)

{
  "success": true,
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [10]:
# Capture the suite built so far. Failed expectations are kept on purpose:
# the n_contigs check fails on the reference data but must still be enforced.
seq_expectation_suite = batch.get_expectation_suite(
    discard_failed_expectations=False,
)

In [11]:
# Persist the suite, again keeping expectations that failed on this batch.
batch.save_expectation_suite(
    discard_failed_expectations=False,
)

In [12]:
# Validation operator with three actions run in order: store the validation
# result, update the data docs, and send a Slack notification on failure.
validation_operator_name = 'seq_validation_operator'
validation_operator_config = {
    'class_name': 'ActionListValidationOperator',
    'action_list': [
        {
            'name': 'store_validation_result',
            'action': {'class_name': 'StoreValidationResultAction'},
        },
        {
            'name': 'update_data_docs',
            'action': {'class_name': 'UpdateDataDocsAction'},
        },
        {
            'name': 'send_slack_notification_on_validation_result',
            'action': {
                'class_name': 'SlackNotificationAction',
                # Placeholder, not a literal URL — presumably substituted from
                # the project's config variables at runtime (TODO confirm).
                'slack_webhook': '${validation_notification_slack_webhook}',
                'notify_on': 'failure',
                'renderer': {
                    'module_name': 'great_expectations.render.renderer.slack_renderer',
                    'class_name': 'SlackRenderer',
                },
            },
        },
    ],
}
context_seq.add_validation_operator(validation_operator_name, validation_operator_config)

<great_expectations.validation_operators.validation_operators.ActionListValidationOperator at 0x1e19285bf10>

In [13]:
# Validate the reference batch through the operator registered earlier; its
# action list runs as part of this call.
result = context_seq.run_validation_operator(
    'seq_validation_operator',
    assets_to_validate=[batch],
)

Request to Slack webhook returned error 404: no_service


In [15]:
# Only one asset was validated, so take the first (and only) run result and
# pull out its per-expectation results.
temp_results = next(iter(result.run_results.values()))["validation_result"]["results"]

# Re-read the reference table so offending rows can be looked up by position.
seq_df = ge.read_csv('tsv/seq_stats.tsv', sep='\t')

# For every failed expectation, name it and show the rows that violated it.
for r in temp_results:
    if r["success"]:
        continue
    column = r.expectation_config["kwargs"]["column"]
    expectation_type = r.expectation_config["expectation_type"]
    print(f"{column} {expectation_type} failed")
    # The operator result exposes the "partial" index list, which may be a
    # truncated sample of the offending rows — TODO confirm the result format.
    print(seq_df.iloc[r.result["partial_unexpected_index_list"]])

n_contigs expect_column_values_to_be_between failed
    n_contigs  contig_bp  gap_pct  ctg_N50  ctg_L50  ctg_N90  ctg_L90  \
23          2        180      0.0        2       90        2       90   
26         34       3038      0.0       32       90       32       90   
27          3        270      0.0        3       90        3       90   
33          7        630      0.0        7       90        7       90   
35         52       4667      0.0       51       90       51       90   

    ctg_max   gc_avg   gc_std  \
23       90  0.56111  0.00550   
26       90  0.55497  0.02717   
27       90  0.55926  0.04477   
33       90  0.57460  0.03022   
35       90  0.54746  0.03373   

                                             filename  
23  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
26  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
27  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
33  /home/huangsixing/Documents/mgrast_ge/fasta/mg...  
35  /home/huangsixing/Document

In [16]:
# Open the rendered data docs for visual inspection of the validation results.
# NOTE(review): presumably launches a local browser, so this is of no use on a
# headless run — confirm.
context_seq.open_data_docs()

Use a checkpoint to validate new data

In [17]:
%%writefile great_expectations/checkpoints/test_seq_data.yml

# Checkpoint that validates the new test statistics table against the
# check_seq_data suite using the seq_validation_operator.
validation_operator_name: seq_validation_operator
batches:
  - batch_kwargs:
      path: tsv/test_seq_stats.tsv
      datasource: tsv
      data_asset_name: test_seq_stats
      reader_method: read_csv
      reader_options:
        # Must be double-quoted: YAML expands escapes such as \t only in
        # double-quoted scalars. With single quotes the separator is the two
        # characters backslash + t, which pandas treats as a regex and parses
        # with the slower Python engine (emitting a ParserWarning).
        sep: "\t"
    expectation_suite_names:
      - check_seq_data

Overwriting great_expectations/checkpoints/test_seq_data.yml


In [18]:
# Run the checkpoint file written by the previous cell; it validates
# tsv/test_seq_stats.tsv against the check_seq_data suite.
context_seq.run_checkpoint(
    checkpoint_name="test_seq_data",
)

Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.
  df = reader_fn(path, **reader_options)
Request to Slack webhook returned error 404: no_service


{
  "success": false,
  "run_id": {
    "run_time": "2022-06-01T13:17:55.079979+00:00",
    "run_name": "20220601T131755.079979Z"
  },
  "evaluation_parameters": null,
  "validation_operator_config": {
    "class_name": "ActionListValidationOperator",
    "module_name": "great_expectations.validation_operators",
    "name": "seq_validation_operator",
    "kwargs": {
      "action_list": [
        {
          "name": "store_validation_result",
          "action": {
            "class_name": "StoreValidationResultAction"
          }
        },
        {
          "name": "update_data_docs",
          "action": {
            "class_name": "UpdateDataDocsAction"
          }
        },
        {
          "name": "send_slack_notification_on_validation_result",
          "action": {
            "class_name": "SlackNotificationAction",
            "slack_webhook": "https://hooks.slack.com/services/<REDACTED>",
            "notify_on": "failure",
         

Alternatively, run the checkpoint from the CLI. Note that this approach does not trigger any Slack notification.

In [19]:
# Run the same checkpoint through the great_expectations command-line tool
# (shell escape); the summary table is printed to the cell output.
!great_expectations checkpoint run test_seq_data

Using v3 (Batch Request) API
Validation failed!

Suite Name                                   Status     Expectations met
- check_seq_data                             ✖ Failed   0 of 3 (0.0 %)


Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.


Or we can validate the new data directly and inspect the detailed results

In [20]:
# Load the new (test) statistics table as a Great Expectations dataset.
ge_df = ge.read_csv(
    'tsv/test_seq_stats.tsv',
    sep='\t',
)

In [21]:
# Validate the test table against the suite built from the reference data.
# COMPLETE is requested so each result carries the full unexpected_index_list.
result = ge_df.validate(
    seq_expectation_suite,
    result_format="COMPLETE",
)

In [22]:
# For every failed expectation, name it and print the full rows of the test
# table that violated it.
for r in result.results:
    if r["success"]:
        continue
    column = r.expectation_config["kwargs"]["column"]
    expectation_type = r.expectation_config["expectation_type"]
    print(f"{column} {expectation_type} failed")
    print(ge_df.iloc[r.result["unexpected_index_list"]])
    print("\n")

ctg_L50 expect_column_values_to_be_between failed
   n_contigs  contig_bp  gap_pct  ctg_N50  ctg_L50  ctg_N90  ctg_L90  ctg_max  \
0         31       2775      0.0       30       60       30       90       90   

    gc_avg   gc_std                                           filename  
0  0.55423  0.02767  /home/huangsixing/Documents/mgrast_ge/test_fas...  


n_contigs expect_column_values_to_be_between failed
   n_contigs  contig_bp  gap_pct  ctg_N50  ctg_L50  ctg_N90  ctg_L90  ctg_max  \
0         31       2775    0.000       30       60       30       90       90   
1         69       9257    0.032       23      149       55       85      363   

    gc_avg   gc_std                                           filename  
0  0.55423  0.02767  /home/huangsixing/Documents/mgrast_ge/test_fas...  
1  0.31323  0.09431  /home/huangsixing/Documents/mgrast_ge/test_fas...  


gc_avg expect_column_values_to_be_between failed
   n_contigs  contig_bp  gap_pct  ctg_N50  ctg_L50  ctg_N90  ctg_L90  ctg

Or we can build a suite based on the example data

In [23]:
# Same report as the previous cell, reduced to the source file behind each
# offending row.
for r in result.results:
    if r["success"]:
        continue
    column = r.expectation_config["kwargs"]["column"]
    expectation_type = r.expectation_config["expectation_type"]
    print(f"{column} {expectation_type} failed")
    offending_rows = ge_df.iloc[r.result["unexpected_index_list"]]
    print(offending_rows["filename"])
    print("\n")

ctg_L50 expect_column_values_to_be_between failed
0    /home/huangsixing/Documents/mgrast_ge/test_fas...
Name: filename, dtype: object


n_contigs expect_column_values_to_be_between failed
0    /home/huangsixing/Documents/mgrast_ge/test_fas...
1    /home/huangsixing/Documents/mgrast_ge/test_fas...
Name: filename, dtype: object


gc_avg expect_column_values_to_be_between failed
1    /home/huangsixing/Documents/mgrast_ge/test_fas...
Name: filename, dtype: object




In [24]:
# NOTE: this rebinds ge_df from the test table to the reference table, so
# re-running the earlier reporting cells after this one would index the wrong frame.
ge_df = ge.read_csv(
    'tsv/seq_stats.tsv',
    sep='\t',
)

# Let the profiler propose an expectation suite from the reference data; the
# returned value is displayed as the cell output.
ge.profile.BasicSuiteBuilderProfiler().profile(ge_df)

Profiling Columns:   0%|          | 0/11 [00:00<?, ?it/s, n_contigs]



({
   "data_asset_type": "Dataset",
   "expectations": [
     {
       "kwargs": {
         "min_value": 44,
         "max_value": 53
       },
       "expectation_type": "expect_table_row_count_to_be_between",
       "meta": {
         "BasicSuiteBuilderProfiler": {
           "confidence": "very low"
         }
       }
     },
     {
       "kwargs": {
         "value": 11
       },
       "expectation_type": "expect_table_column_count_to_equal",
       "meta": {
         "BasicSuiteBuilderProfiler": {
           "confidence": "very low"
         }
       }
     },
     {
       "kwargs": {
         "column_list": [
           "n_contigs",
           "contig_bp",
           "gap_pct",
           "ctg_N50",
           "ctg_L50",
           "ctg_N90",
           "ctg_L90",
           "ctg_max",
           "gc_avg",
           "gc_std",
           "filename"
         ]
       },
       "expectation_type": "expect_table_columns_to_match_ordered_list",
       "meta": {
         "BasicSui