In [6]:
import great_expectations as ge
import pandas as pd

In [7]:
context_seq = ge.data_context.DataContext()

In [8]:
suite = context_seq.create_expectation_suite(
    'check_seq_data',
    overwrite_existing=True
)

In [9]:
batch_kwargs = {
    'path': 'tsv/seq_stats.tsv',
    'datasource': 'tsv',
    'data_asset_name': 'seq_stats',
    'reader_method': 'read_csv',
    'reader_options': {
        'sep': '\t'
    }
}
batch = context_seq.get_batch(batch_kwargs, suite)

In [10]:
batch.head()

Unnamed: 0,n_contigs,contig_bp,gap_pct,ctg_N50,ctg_L50,ctg_N90,ctg_L90,ctg_max,gc_avg,gc_std,filename
0,100,11670,0.0,34,130,83,74,265,0.47215,0.12871,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
1,100,22815,0.018,25,420,69,84,512,0.51269,0.10778,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
2,100,22025,0.005,26,375,74,91,495,0.54561,0.10954,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
3,100,23148,0.009,26,378,70,99,503,0.53599,0.09698,/home/huangsixing/Documents/mgrast_ge/fasta/mg...
4,100,12116,0.0,30,139,81,67,493,0.48382,0.10773,/home/huangsixing/Documents/mgrast_ge/fasta/mg...


In [14]:
batch.expect_column_min_to_be_between('ctg_L50', min_value=80, max_value=150, result_format={'result_format': 'COMPLETE'})

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 90,
    "element_count": 49,
    "missing_count": null,
    "missing_percent": null
  }
}

In [15]:
batch.expect_column_max_to_be_between('ctg_L50', min_value=140, max_value=380, result_format={'result_format': 'COMPLETE'})

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 420,
    "element_count": 49,
    "missing_count": null,
    "missing_percent": null
  }
}

In [16]:
batch.expect_column_min_to_be_between('n_contigs', min_value=90, max_value=120, result_format={'result_format': 'COMPLETE'})

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 2,
    "element_count": 49,
    "missing_count": null,
    "missing_percent": null
  }
}

In [17]:
batch.expect_column_values_to_be_between('gc_avg', min_value=0.40, max_value=0.6, result_format={'result_format': 'COMPLETE'})

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 49,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  }
}

In [18]:
batch.get_expectation_suite()

{
  "data_asset_type": "Dataset",
  "ge_cloud_id": null,
  "expectations": [
    {
      "kwargs": {
        "column": "ctg_L50",
        "min_value": 80,
        "max_value": 150
      },
      "expectation_type": "expect_column_min_to_be_between",
      "meta": {}
    },
    {
      "kwargs": {
        "column": "gc_avg",
        "min_value": 0.4,
        "max_value": 0.6
      },
      "expectation_type": "expect_column_values_to_be_between",
      "meta": {}
    }
  ],
  "meta": {
    "great_expectations_version": "0.15.7"
  },
  "expectation_suite_name": "check_seq_data"
}

In [19]:
batch.save_expectation_suite()

In [20]:
validation_operator_name = 'seq_validation_operator'
validation_operator_config = {
    'class_name': 'ActionListValidationOperator',
    'action_list': [{'name': 'store_validation_result', 'action': {'class_name': 'StoreValidationResultAction'}}, {'name': 'update_data_docs', 'action': {'class_name': 'UpdateDataDocsAction'}}]
}
context_seq.add_validation_operator(validation_operator_name, validation_operator_config)

<great_expectations.validation_operators.validation_operators.ActionListValidationOperator at 0x7faa66a25a90>

In [21]:
results = context_seq.run_validation_operator('seq_validation_operator', assets_to_validate=[batch])

  return concat(self.root_render_func(self.new_context(vars)))


In [22]:
results

{
  "run_id": {
    "run_time": "2022-05-31T10:11:45.653160+00:00",
    "run_name": "20220531T101145.653160Z"
  },
  "evaluation_parameters": null,
  "success": false,
  "run_results": {
    "ValidationResultIdentifier::check_seq_data/20220531T101145.653160Z/20220531T101145.653160Z/e26b49f837c84090cd19c58a1ae22a29": {
      "validation_result": {
        "statistics": {
          "evaluated_expectations": 4,
          "successful_expectations": 2,
          "unsuccessful_expectations": 2,
          "success_percent": 50.0
        },
        "evaluation_parameters": {},
        "success": false,
        "meta": {
          "great_expectations_version": "0.15.7",
          "expectation_suite_name": "check_seq_data",
          "run_id": {
            "run_time": "2022-05-31T10:11:45.653160+00:00",
            "run_name": "20220531T101145.653160Z"
          },
          "batch_kwargs": {
            "path": "tsv/seq_stats.tsv",
            "datasource": "tsv",
            "data_asset_name"

In [23]:
context_seq.open_data_docs()

In [24]:
%%writefile great_expectations/checkpoints/test_seq_data.yml

validation_operator_name: seq_validation_operator
batches:
  - batch_kwargs:
      path: tsv/test_seq_stats.tsv
      datasource: tsv
      data_asset_name: test_seq_stats
      reader_method: read_csv
      reader_options:
        'sep': '\t'
    expectation_suite_names:
      - check_seq_data

Writing great_expectations/checkpoints/test_seq_data.yml


In [25]:
!great_expectations checkpoint run test_seq_data

Using v3 (Batch Request) API[0m
Checkpoint store named "checkpoint_store" is not a configured store, so will try to use default Checkpoint store.
  Please update your configuration to the new version number 3.0 in order to use the new "Checkpoint Store" feature.
  Visit https://docs.greatexpectations.io/docs/guides/miscellaneous/migration_guide#migrating-to-the-batch-request-v3-api to learn more about the upgrade process.
Could not find Validation Operator "seq_validation_operator" when running Checkpoint "test_seq_data". Using default action_list_operator.
Validation succeeded![0m

Suite Name                                   Status     Expectations met[0m
- check_seq_data                             [32m✔ Passed[0m   2 of 2 (100.0 %)[0m
[0m

In [26]:
test_seq_df = pd.read_csv('tsv/test_seq_stats.tsv', sep='\t')
ge_df = ge.from_pandas(test_seq_df)

In [27]:
result = ge_df.validate(batch.get_expectation_suite(), result_format="COMPLETE")

In [28]:
ge_df.iloc[result.results[0].result["unexpected_index_list"]]

KeyError: 'unexpected_index_list'