<a href="https://colab.research.google.com/github/dchatterjee/machine-learning-workspace/blob/main/data_KPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install great_expectations

Collecting great_expectations
  Downloading great_expectations-0.14.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
Collecting ruamel.yaml<0.17.18,>=0.16
  Downloading ruamel.yaml-0.17.17-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 72.6 MB/s 
Collecting pyparsing<3,>=2.4
  Downloading pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.2 MB/s 
Collecting jsonpatch>=1.22
  Downloading jsonpatch-1.32-py2.py3-none-any.whl (12 kB)
Collecting jsonpointer>=1.9
  Downloading jsonpointer-2.2-py2.py3-none-any.whl (7.5 kB)
Collecting ruamel.yaml.clib>=0.1.2
  Downloading ruamel.yaml.clib-0.2.6-cp37-cp37m-manylinux1_x86_64.whl (546 kB)
[K     |████████████████████████████████| 546 kB 70.1 MB/s 
Installing collected packages: pyparsing, ruamel.yaml.clib, jsonpointer, ruamel.yaml, jsonpatch, great-expectations
  Attempting uninstall: pyparsing
    Found existing installation: pyparsi

In [1]:
import pandas as pd
import numpy as np
import great_expectations as ge
from great_expectations.data_context.types.base import DataContextConfig, DatasourceConfig, FilesystemStoreBackendDefaults
from great_expectations.data_context import BaseDataContext
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.core.batch import RuntimeBatchRequest
import warnings
# Ignore every DeprecationWarning from here on (presumably to keep the cell
# output free of library deprecation noise).
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Run the Great Expectations CLI `init` command non-interactively (`--yes`)
# against the v3 (Batch Request) API.
# NOTE(review): the data context built later in this notebook is configured with
# root_directory="/work/great_expectations" — confirm this CLI init is still needed.
!great_expectations --yes --v3-api init 

Using v3 (Batch Request) API[0m
[36m
  ___              _     ___                  _        _   _
 / __|_ _ ___ __ _| |_  | __|_ ___ __  ___ __| |_ __ _| |_(_)___ _ _  ___
| (_ | '_/ -_) _` |  _| | _|\ \ / '_ \/ -_) _|  _/ _` |  _| / _ \ ' \(_-<
 \___|_| \___\__,_|\__| |___/_\_\ .__/\___\__|\__\__,_|\__|_\___/_||_/__/
                                |_|
             ~ Always know what to expect from your data ~
[0m[0m

[0m
[36mCongratulations! You are now ready to customize your Great Expectations configuration.[0m[0m

[36mYou can customize your configuration in many ways. Here are some examples:[0m

  [36mUse the CLI to:[0m
    - Run `[32mgreat_expectations datasource new[0m` to connect to your data.
    - Run `[32mgreat_expectations checkpoint new <checkpoint_name>[0m` to bundle data with Expectation Suite(s) in a Checkpoint for later re-validation.
    - Run `[32mgreat_expectations suite --help[0m` to create, edit, list, profile Expectation Suites.
    - Run `[32m

In [3]:
# Build a small random sample frame to validate.
# The generator is seeded so that Restart & Run All reproduces the same rows;
# change SEED to draw a different frame (and different validation outcomes).
SEED = 42
N_ROWS = 5
PRODUCT_NAMES = ['camera', 'phone', 'computer', 'speaker', 'TV',
                 'cable', 'movie', 'guitar', 'printer']

rng = np.random.default_rng(SEED)

# Product names are drawn with replacement, so duplicates can occur.
products = rng.choice(PRODUCT_NAMES, size=N_ROWS)

# Quantities are 0-9 plus None, so a missing value can occur.
quantities = rng.choice(list(range(10)) + [None], size=N_ROWS)

# Dates are drawn from the ten days 2020-12-30 .. 2021-01-08.
dates = rng.choice(pd.date_range(start="2020-12-30", end="2021-01-8"), size=N_ROWS)

df = pd.DataFrame({'products': products, 'quantities': quantities, 'dates': dates})
df

Unnamed: 0,products,quantities,dates
0,movie,4,2021-01-06
1,cable,1,2021-01-07
2,TV,0,2021-01-07
3,cable,1,2021-01-08
4,printer,9,2021-01-06


In [4]:
# Wrap the frame in a Great Expectations dataset so the expect_* methods are available.
df = ge.from_pandas(df)

# Pass rates below are over the random draw of 5 rows made when the frame is built.

# 5 products drawn with replacement from 9 names: all distinct ≈ 26% of the time.
df.expect_column_values_to_be_unique('products');

# Each quantity is None with probability 1/11: no nulls ≈ 62% of the time.
df.expect_column_values_to_not_be_null('quantities');

# 8 of the 10 candidate days fall inside the bounds (inclusive by default):
# all five in range ≈ 33% of the time.
df.expect_column_values_to_be_between(
    'dates',
    min_value='2021-01-01',
    max_value='2021-01-8',
    parse_strings_as_datetimes=True,
);

In [5]:
# Pandas-backed datasource with a runtime data connector, so an in-memory
# DataFrame can be supplied as the batch when a validation is run.
pandas_engine = {
    "class_name": "PandasExecutionEngine",
    "module_name": "great_expectations.execution_engine"
}
runtime_connector = {
    "class_name": "RuntimeDataConnector",
    "batch_identifiers": ["default_identifier_name"],
}
my_datasource = DatasourceConfig(
    class_name="Datasource",
    module_name="great_expectations.datasource",
    execution_engine=pandas_engine,
    data_connectors={"default_runtime_data_connector_name": runtime_connector},
)

# Filesystem-backed store defaults rooted at this directory.
store_defaults = FilesystemStoreBackendDefaults(root_directory="/work/great_expectations")

data_context_config = DataContextConfig(
    datasources={"my_datasource": my_datasource},
    store_backend_defaults=store_defaults,
)

# Build the context from the in-code configuration.
context = BaseDataContext(project_config=data_context_config)

# Save every expectation recorded on `df`, including any that failed on this sample.
suite = df.get_expectation_suite(discard_failed_expectations=False)
context.save_expectation_suite(
    expectation_suite=suite,
    expectation_suite_name='my_expectation_suite',
);

In [6]:
# Describe the in-memory frame as a runtime batch for the datasource/connector
# declared in the data context configuration.
batch_request = RuntimeBatchRequest(
    batch_identifiers={"default_identifier_name": "df"},
    runtime_parameters={"batch_data": df},
    data_asset_name="df",
    data_connector_name="default_runtime_data_connector_name",
    datasource_name="my_datasource",
)

# Checkpoint definition tying the checkpoint name to the saved expectation suite.
checkpoint_config = dict(
    name="my_checkpoint",
    config_version=1,
    class_name="SimpleCheckpoint",
    expectation_suite_name="my_expectation_suite",
)
context.add_checkpoint(**checkpoint_config);

# Run the checkpoint against the runtime batch; the outcome is kept in `results`.
checkpoint_validations = [{"batch_request": batch_request}]
results = context.run_checkpoint(
    checkpoint_name="my_checkpoint",
    validations=checkpoint_validations,
    run_id="my_run_id",
)

{
  "name": "my_checkpoint",
  "config_version": 1.0,
  "template_name": null,
  "module_name": "great_expectations.checkpoint",
  "class_name": "Checkpoint",
  "run_name_template": null,
  "expectation_suite_name": "my_expectation_suite",
  "batch_request": null,
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "evaluation_parameters": {},
  "runtime_configuration": {},
  "validations": [],
  "profilers": [],
  "ge_cloud_id": null,
  "expectation_suite_ge_cloud_id": null
}


Calculating Metrics:   0%|          | 0/16 [00:00<?, ?it/s]

In [14]:
# Print the stored JSON for "my_expectation_suite" from the expectations folder
# under the context's root directory (presumably written by the earlier
# save_expectation_suite call — verify the path if root_directory changes).
!cat /work/great_expectations/expectations/my_expectation_suite.json

{
  "data_asset_type": "Dataset",
  "expectation_suite_name": "my_expectation_suite",
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "products"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "quantities"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "dates",
        "max_value": "2021-01-8",
        "min_value": "2021-01-01",
        "parse_strings_as_datetimes": true
      },
      "meta": {}
    }
  ],
  "ge_cloud_id": null,
  "meta": {
    "great_expectations_version": "0.14.2"
  }
}