In [1]:
!pip install great_expectations sqlalchemy sqlite3

Defaulting to user installation because normal site-packages is not writeable
Collecting great_expectations
  Downloading great_expectations-0.17.7-py3-none-any.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting altair<5.0.0,>=4.2.1 (from great_expectations)
  Downloading altair-4.2.2-py3-none-any.whl (813 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m813.6/813.6 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Click<=8.1.3,>=7.1.2 (from great_expectations)
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorama>=0.4.3 (from great_expectations)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting ipywidgets>=7.5.1 (from great_expectations)
  Downloading ipywidgets-8.1.0-py3-none-any

In [33]:
# Create a Great Expectations project
!great_expectations init

[36m
  ___              _     ___                  _        _   _
 / __|_ _ ___ __ _| |_  | __|_ ___ __  ___ __| |_ __ _| |_(_)___ _ _  ___
| (_ | '_/ -_) _` |  _| | _|\ \ / '_ \/ -_) _|  _/ _` |  _| / _ \ ' \(_-<
 \___|_| \___\__,_|\__| |___/_\_\ .__/\___\__|\__\__,_|\__|_\___/_||_/__/
                                |_|
             ~ Always know what to expect from your data ~
[0m
This looks like an existing project that [32mappears complete![0m You are [32mready to roll.[0m

[0m

<h3> 1. Imports </h3>

In [1]:
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
import os

In [2]:
import great_expectations as gx
import great_expectations.jupyter_ux

2023-08-29T10:06:58-0300 - INFO - Great Expectations logging enabled at 20 level by JupyterUX module.


<h3> 2. Setting up the data, Data Context and Data Source </h3>

In [18]:
filename = 'downloaded_content.csv'
db_filename = 'jobsalary.db'
source_name = 'salary'
if filename not in os.listdir():
    !wget --output-file="logs.csv" "https://docs.google.com/spreadsheets/d/1IPS5dBSGtwYVbjsfbaMCYIWnOuRmJcbequohNxCyGVw/export?format=csv&gid=1625408792" -O "downloaded_content.csv"

if db_filename not in os.listdir():
    df = pd.read_csv('downloaded_content.csv')
    # new_columns = ['Timestamp', 'Age', 'Industry', 'Title', 'Title_Description', 'Salary', 'Bonus', 'CurrencyA', 'CurrencyB', 'Income_Context', 'Country', 'US_State', 'City', 'Overall_Experience', 'Field_Experience', 'Education', 'Gender', 'Race']
    new_columns = [x.lower() for x in ['Timestamp', 'Age', 'Industry', 'Title', 'Title_Description', 'Salary', 'Bonus', 'CurrencyA', 'CurrencyB', 'Income_Context', 'Country', 'US_State', 'City', 'Overall_Experience', 'Field_Experience', 'Education', 'Gender', 'Race']]
    df.columns = new_columns
    disk_engine = create_engine('sqlite:///jobsalary.db')
    df.to_sql('salary', disk_engine, if_exists='replace')
    
context = gx.get_context()
try:
    context.sources.add_sql(name=source_name, connection_string='sqlite:///jobsalary.db')
except:
    print('The datasource of that name already exists')

2023-08-29T17:46:14-0300 - INFO - FileDataContext loading fluent config
2023-08-29T17:46:14-0300 - INFO - Loading 'datasources' ->
[{'assets': [...],
  'connection_string': 'sqlite:///jobsalary.db',
  'name': 'salary',
  'type': 'sql'},
 {'name': 'default_pandas_datasource', 'type': 'pandas'}]
2023-08-29T17:46:14-0300 - INFO - Loaded 'salary' from fluent config
2023-08-29T17:46:14-0300 - INFO - Saving 1 Fluent Datasources to /home/andro/great_expectations/great_expectations.yml
2023-08-29T17:46:14-0300 - INFO - SQLDatasource.dict() - substituting config values
The datasource of that name already exists


<h3> 3. Create a Batch Request </h3>

In [28]:
salary_asset = context.datasources[
    "salary"
].get_asset("salary")

full_batch_salary_batch_request = (
    salary_asset.build_batch_request()
)

<h3> 4. Configure Expectation Suite in Context </h3>

In [29]:
expectation_suite_name = "salary_expectation_suite"

expectation_suite = context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

<h2> <u> We have a few options when approaching setting un Expectations </u></h2>

<h2> Option 1: Creating an Expectation Suiste using Data Assistant </h2>

<h3> 5. Run the Data Assistant on the Batch </h3>

In [30]:
data_assistant_result = context.assistants.onboarding.run(
    batch_request=full_batch_salary_batch_request)

2023-08-29T17:59:13-0300 - INFO - SQLDatasource.dict() - substituting config values
2023-08-29T17:59:13-0300 - INFO - A new sqlite connection was created: <sqlite3.Connection object at 0x7fd9e798a740>, <sqlalchemy.pool.base._ConnectionRecord object at 0x7fd9e773b580>
2023-08-29T17:59:13-0300 - INFO - Adding custom sqlite functions to connection <sqlite3.Connection object at 0x7fd9e798a740>
2023-08-29T17:59:13-0300 - INFO - Adding custom sqlite functions to connection <sqlalchemy.pool.base._ConnectionFairy object at 0x7fd9e773bd60>
2023-08-29T17:59:14-0300 - INFO - batch_slice: None was parsed to: slice(0, None, None)



Generating Expectations:   0%|          | 0/8 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/42 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/61 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/61 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/17 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/34 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/42 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [31]:
data_assistant_result.get_expectation_suite()

{
  "expectation_suite_name": "tmp.OnboardingDataAssistantResult.suite.0a5daef0",
  "ge_cloud_id": null,
  "expectations": [
    {
      "expectation_type": "expect_table_row_count_to_be_between",
      "kwargs": {
        "max_value": 27971,
        "min_value": 27971
      },
      "meta": {
        "profiler_details": {
          "metric_configuration": {
            "metric_name": "table.row_count",
            "domain_kwargs": {},
            "metric_value_kwargs": null
          },
          "num_batches": 1
        }
      }
    },
    {
      "expectation_type": "expect_table_columns_to_match_set",
      "kwargs": {
        "exact_match": null,
        "column_set": [
          "timestamp",
          "age",
          "title",
          "currencya",
          "city",
          "overall_experience",
          "country",
          "gender",
          "index",
          "bonus",
          "currencyb",
          "income_context",
          "industry",
          "race",
          "fi

<h3> 6. Add the formed Expectation Suite to the Context </h3> 

In [32]:
expectation_suite = data_assistant_result.get_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

In [33]:
context.add_or_update_expectation_suite(expectation_suite=expectation_suite)

{
  "expectation_suite_name": "salary_expectation_suite",
  "ge_cloud_id": null,
  "expectations": [
    {
      "expectation_type": "expect_table_row_count_to_be_between",
      "kwargs": {
        "max_value": 27971,
        "min_value": 27971
      },
      "meta": {
        "profiler_details": {
          "metric_configuration": {
            "metric_name": "table.row_count",
            "domain_kwargs": {},
            "metric_value_kwargs": null
          },
          "num_batches": 1
        }
      }
    },
    {
      "expectation_type": "expect_table_columns_to_match_set",
      "kwargs": {
        "exact_match": null,
        "column_set": [
          "timestamp",
          "age",
          "title",
          "currencya",
          "city",
          "overall_experience",
          "country",
          "gender",
          "index",
          "bonus",
          "currencyb",
          "income_context",
          "industry",
          "race",
          "field_experience",
       

<h3> 7. Run a Checkpoint on the Batch using our Expectation Suite </h3>

In [11]:
checkpoint = context.add_or_update_checkpoint(
    name=f"salary_expectation",
    validations=[
        {
            "batch_request": full_batch_salary_batch_request,
            "expectation_suite_name": expectation_suite_name,
        }
    ],
)
checkpoint_result = checkpoint.run()

2023-08-29T10:07:09-0300 - INFO - SQLDatasource.dict() - substituting config values
2023-08-29T10:07:09-0300 - INFO - batch_slice: None was parsed to: slice(0, None, None)
2023-08-29T10:07:09-0300 - INFO - 	77 expectation(s) included in expectation_suite.


Calculating Metrics:   0%|          | 0/393 [00:00<?, ?it/s]

<h3> 8. Update the Data Docs to see the result of the Expectations </h3>

In [12]:
context.build_data_docs()

{'local_site': 'file:///home/andro/great_expectations/uncommitted/data_docs/local_site/index.html'}

<h3> Option 2: Manually adding Expectations and creating a Suite out of them </h3>

<h3> 3. Create a validator with our datasource </h3>

In [35]:
validator = context.get_validator(batch_request=full_batch_salary_batch_request)

2023-08-29T18:02:05-0300 - INFO - SQLDatasource.dict() - substituting config values
2023-08-29T18:02:05-0300 - INFO - batch_slice: None was parsed to: slice(0, None, None)


<h3> 4. Start adding individual exceptions for each column of interest </h3>

In [36]:
validator.expect_column_values_to_not_be_null("age")
validator.expect_column_max_to_be_between("bonus", strict_max=1200000)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 120000000.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

<h3> 5. Save the expectations to a default suite </h3>

In [42]:
validator.save_expectation_suite()

2023-08-29T20:00:52-0300 - INFO - 	2 expectation(s) included in expectation_suite.


<h3> 6. Create a checkpoint to evaluate the data using the default suite </h3>

In [43]:
checkpoint = context.add_or_update_checkpoint(
    name="salary_checkpoint",
    validator=validator,
)

checkpoint_result = checkpoint.run()

2023-08-29T20:00:53-0300 - INFO - SQLDatasource.dict() - substituting config values
2023-08-29T20:00:53-0300 - INFO - batch_slice: None was parsed to: slice(0, None, None)
2023-08-29T20:00:53-0300 - INFO - 	2 expectation(s) included in expectation_suite.


Calculating Metrics:   0%|          | 0/12 [00:00<?, ?it/s]

<h3> 7. Check the docs for the results of the checkpoint on how the suite performed </h3>

In [44]:
context.build_data_docs()

{'local_site': 'file:///home/andro/great_expectations/uncommitted/data_docs/local_site/index.html'}

<h3> 8. Save the expectation suite in the desired path once all needed expectations are configured </h3>

In [46]:
validator.save_expectation_suite('./manual_salary_expectation_suite.json')

2023-08-29T20:07:06-0300 - INFO - 	2 expectation(s) included in expectation_suite.


<h2> <u> We can also... </u> </h2>

<h3> Modify/Add/Remove Expectations from a Suite </h3>

In [14]:
suite = context.get_expectation_suite(expectation_suite_name)
# Get a list of all expectations in the suite
suite.show_expectations_by_expectation_type()

[ { 'expect_table_columns_to_match_set': { 'column_set': [ 'timestamp',
                                                           'age',
                                                           'title',
                                                           'currencya',
                                                           'city',
                                                           'overall_experience',
                                                           'country',
                                                           'gender',
                                                           'index',
                                                           'bonus',
                                                           'currencyb',
                                                           'income_context',
                                                           'industry',
                                                           'race',
            

<h4> 1. Modify/Add </h4>

In [15]:
# We can copy the configuration from the list in the previous cell. For example:
# {
#   "expectation_type": "expect_column_values_to_be_between",
#   "kwargs": {
#     "column": "Bonus",
#     "min_value": 0.0,
#     "strict_min": false,
#     "max_value": 120000000.0,
#     "mostly": 1.0,
#     "strict_max": false
#   }
# }

# We turn this configuration into an ExpectationConfiguration
from great_expectations.core.expectation_suite import ExpectationConfiguration

updated_config = ExpectationConfiguration(
  expectation_type="expect_column_values_to_be_between",
  kwargs={
    "column": "Bonus",
    "min_value": 0.0,
    "strict_min": False,
    "max_value": 2000.0,
    "mostly": 1.0,
    "strict_max": False
  }
)

In [16]:
# Upserting the config
suite.add_expectation(updated_config)
# By altering the expectation type and/or column in this config we can then use it to 
# insert a new expectation to the suite in the same way as we modify it

{"expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "Bonus", "min_value": 0.0, "strict_min": false, "max_value": 2000.0, "mostly": 1.0, "strict_max": false}, "meta": {}}

In [32]:
# We can check that the expectation updated by
config_to_search = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={"column": "Bonus"},
)
found_expectation = suite.find_expectations(config_to_search, match_type="domain")

# This assertion will succeed because the ExpectationConfiguration has been updated.
if found_expectation == [updated_config]:
    print("Expectation updated, saving Suite")
    context.save_expectation_suite(suite)

Expectation updated, saving Suite


<h4> Remove </h4>

In [17]:
remove_config = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={
        "column": "Bonus"}
)
suite.remove_expectation(remove_config)

# We can check that the expectation updated by
config_to_search = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={"column": "Bonus"},
)
found_expectation = suite.find_expectations(config_to_search, match_type="domain")

# This assertion will succeed because the ExpectationConfiguration has been updated.
if found_expectation != [updated_config]:
    print("Expectation updated, saving Suite")
    context.save_expectation_suite(suite)

Expectation updated, saving Suite
