# Creating & Editing Expectation Suites
Use this example notebook as a "boilerplate" template for creating and modifying your expectation suites.

While the same notebook can be used to manage multiple expectation suites, developers often find it helpful to dedicate a separate notebook for each expectation suite, because it makes the organization of the expectation suites in the code repository more explicit and improves the code readability.

## IMPORTANT
Be sure to commit your notebook to GitHub as part of your repository!  This notebook is the source of truth, capturing your expectations with respect to the given data asset.  (To facilitate code review, you may wish to "Restart Kernel and Clear All Outputs" before committing the notebook to Git).

## _We are here to help!_

You can always **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)

## Initialize Spark Context and Import Python Basics


In [None]:
import os
import sys
import io

import time
import datetime

import findspark

from pyspark import SQLContext

from pyspark.context import SparkContext
from pyspark.sql import SparkSession


In [None]:
from pyspark.sql import functions as F

In [None]:
sys.version_info

In [None]:
findspark.init() 

In [None]:
os.environ.get('PYSPARK_PYTHON')

In [None]:
# Build (or reuse) a local two-thread Spark session with Hive support enabled,
# then keep a handle on its SparkContext for the cells that follow.
spark_session = (
    SparkSession.builder.appName("pytest-pyspark-local-notebook-ge-in-memory")
    .master("local[2]")
    .config("spark.executor.memory", "6g")
    .config("spark.driver.memory", "6g")
    .config("spark.ui.showConsoleProgress", "false")
    .config("spark.sql.shuffle.partitions", "2")
    .config("spark.default.parallelism", "4")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark_session.sparkContext

In [None]:
sc.getConf().getAll()

In [None]:
# Despite its name, `spark` is a SQLContext wrapped around the SparkContext `sc`
# (not the SparkSession). It is the object later passed to load_csv() as
# `spark_context`, which only needs its `.read` attribute.
# NOTE(review): SQLContext is deprecated in Spark 3.x; `spark_session` also
# exposes `.read` -- confirm before switching.
spark = SQLContext(sc)

## Import Python Basics


In [None]:
import os
import sys
import io

import time
import datetime



In [None]:
sys.version_info

## Import Useful Python Utilities

The Great Expectations library itself is imported in the next section.

In [None]:

import json
import re

import pandas as pd


## GreatExpectations Basics

Check the Great Expectations version.


In [None]:
import great_expectations as ge

In [None]:
ge.__version__

## Add Repository to Spark Context

Also import frequently used utilities from your repository.

### _Important_
Make sure that the path to your repository archive in S3 for the `sc.addPyFile(s3_path_to_repo_zip)` call below is correct and that the contents are up to date.

In [None]:
# sc.addPyFile('s3://alex-ge-test/code-0.0.0.zip')

In [None]:
def load_csv(spark_context, path, delimiter):
    """Read a delimited text file that has a header row.

    Args:
        spark_context: Any object exposing a Spark-style ``.read`` reader
            (in this notebook, the ``SQLContext`` named ``spark``).
        path: Location handed unchanged to the reader's ``load()``.
        delimiter: Field separator, set as the ``delimiter`` reader option.

    Returns:
        Whatever the reader's ``load(path)`` returns.
    """
    reader = spark_context.read.format("com.databricks.spark.csv")
    reader = reader.option("delimiter", delimiter)
    # The first row is treated as column names.
    reader = reader.option("header", "true")
    return reader.load(path)


def load_parquet(spark_context, path, prefix_path=None, select_cols=None):
    """Read one or more Parquet locations, optionally projecting columns.

    Args:
        spark_context: Any object exposing a Spark-style ``.read`` reader.
        path: A single path, or a list/tuple of paths. A list or tuple is
            unpacked into separate positional arguments of ``parquet()``;
            anything else is passed through as one argument.
        prefix_path: When not ``None``, set as the reader's ``basePath``
            option before reading.
        select_cols: Optional iterable of columns; when truthy, the result is
            narrowed with ``select(*select_cols)``.

    Returns:
        The object returned by ``parquet()``, or by ``select()`` when
        ``select_cols`` is given.
    """
    reader = spark_context.read
    if prefix_path is not None:
        reader = reader.option("basePath", prefix_path)

    # Tuples are unpacked like lists; previously a tuple of paths was passed
    # to parquet() as a single argument. Strings are deliberately excluded
    # because a lone path must stay one argument.
    if isinstance(path, (list, tuple)):
        df = reader.parquet(*path)
    else:
        df = reader.parquet(path)

    if select_cols:
        df = df.select(*select_cols)

    return df


In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
backend_ecosystem = "aws"

In [None]:
datasource_type = "spark"
# datasource_type = "pandas"

In [None]:
expectations_store_bucket = "alex.dev"

In [None]:
expectations_store_prefix = "great_expectations/JSON/EXPERIMENTATION/ExpectationSuites"

In [None]:
validations_store_bucket = "alex.dev"

In [None]:
validations_store_prefix = "great_expectations/JSON/EXPERIMENTATION/Validations"

In [None]:
data_docs_store_bucket = "alex.dev"

In [None]:
data_docs_store_prefix = "great_expectations/HTML/EXPERIMENTATION"

In [None]:
expectations_store_name = "test_expectations_store_0400"

In [None]:
validations_store_name = "test_validations_store_0500"

In [None]:
data_docs_site_name = "test_data_docs_site_0600"

In [None]:
expectations_store_kwargs = None

In [None]:
validations_store_kwargs = None

In [None]:
data_docs_store_kwargs = None

In [None]:
project_config_bucket = None

In [None]:
project_config_prefix = None

In [None]:
project_config_kwargs = None

In [None]:
# SECURITY: a Slack incoming-webhook URL is a credential and must not be stored
# in a notebook that is committed to version control. Read it from the
# environment instead (None when the variable is unset). The URL that was
# previously hard-coded here should be revoked and re-issued.
# NOTE(review): confirm DataContextConfig.build() treats slack_webhook=None as
# "no Slack notification".
slack_webhook = os.environ.get("GE_SLACK_WEBHOOK_URL")

In [None]:
show_how_to_buttons = True

In [None]:
show_cta_footer = True

In [None]:
include_profiling = True

In [None]:
runtime_environment = None

In [None]:
overwrite_existing = False
# overwrite_existing = True

In [None]:
usage_statistics_enabled = True

In [None]:
# Assemble the in-memory project configuration from the settings defined in the
# preceding cells. Keyword arguments are grouped by the component they configure.
project_config = ge.data_context.types.base.DataContextConfig.build(
    # Backend and datasource flavour.
    backend_ecosystem=backend_ecosystem,
    datasource_type=datasource_type,
    # Expectations store.
    expectations_store_name=expectations_store_name,
    expectations_store_bucket=expectations_store_bucket,
    expectations_store_prefix=expectations_store_prefix,
    expectations_store_kwargs=expectations_store_kwargs,
    # Validations store.
    validations_store_name=validations_store_name,
    validations_store_bucket=validations_store_bucket,
    validations_store_prefix=validations_store_prefix,
    validations_store_kwargs=validations_store_kwargs,
    # Data Docs site.
    data_docs_site_name=data_docs_site_name,
    data_docs_store_bucket=data_docs_store_bucket,
    data_docs_store_prefix=data_docs_store_prefix,
    data_docs_store_kwargs=data_docs_store_kwargs,
    # Display/profiling toggles (names suggest Data Docs rendering options -- confirm).
    show_how_to_buttons=show_how_to_buttons,
    show_cta_footer=show_cta_footer,
    include_profiling=include_profiling,
    # Project-config location settings (all None in this notebook).
    project_config_bucket=project_config_bucket,
    project_config_prefix=project_config_prefix,
    project_config_kwargs=project_config_kwargs,
    # Notifications, runtime overrides, and general flags.
    slack_webhook=slack_webhook,
    runtime_environment=runtime_environment,
    overwrite_existing=overwrite_existing,
    usage_statistics_enabled=usage_statistics_enabled,
)

In [None]:
project_config

In [None]:
print(project_config.anonymous_usage_statistics.data_context_id)

In [None]:
print(project_config.to_yaml_str())

In [None]:
# An in-memory DataContextConfig is consumed by BaseDataContext, which takes
# `project_config`; DataContext is the variant that loads its configuration from
# a project directory on disk. This also matches the validation half of this
# notebook, which instantiates BaseDataContext for the same configuration.
data_context = ge.data_context.BaseDataContext(project_config=project_config)

In [None]:
data_context

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
data_context.list_datasources()

In [None]:
# data_context.stores

In [None]:
data_context.list_stores()

In [None]:
# Print one summary line per configured store. Entries whose class name
# contains "Backend" get the short form; all others also show the type and
# value of their "store_backend" entry.
for store in data_context.list_stores():
    if "Backend" in store["class_name"]:
        print(f'STORE_NAME: "{store["name"]}" ; CLASS_NAME: "{store["class_name"]}" ; IS_BACKEND')
    else:
        print(f'STORE_NAME: "{store["name"]}" ; CLASS_NAME: "{store["class_name"]}" ; STORE_BACKEND(TYPE="{str(type(store["store_backend"]))}"): "{store["store_backend"]}"')

In [None]:
data_context.list_expectation_suite_names()

In [None]:
data_context.list_expectation_suites()

In [None]:
data_context.list_validation_operator_names()

In [None]:
data_context.list_validation_operators()

In [None]:
# data_context.get_data_docs_sites()  # How come the "data_docs_sites" is a dict as opposed to a list (as is the case with other entities)?

In [None]:
data_context.get_docs_sites_urls(only_if_exists=False)

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
print(data_context.data_context_id)

In [None]:
print(data_context.get_project_config().to_yaml_str())

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

# Author/Manage Your Expectation Suite
Use this notebook to create and modify the expectation suite named below (write the name down for future reference):

**Expectation Suite Name**: `Titanic_Expectation_Suite`

You can always **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)

## Data Asset Specification

Specify the S3 path to the data asset that you wish to reason about (by characterising it with expectations) in this notebook.  Then use the previously imported utilities to load this asset into a PySpark DataFrame (we also recommend printing some basic information about your dataframe).

### Terminology
We use the term "check dataframe" when referring to the dataframe corresponding to your data asset, because this is the dataframe, on which the various checks against what is expected will be performed in the course of building the expectation suite.  As part of this process, you may need to create additional columns (e.g., to combine existing columns), join different dataframes, and so on in order to produce a check dataframe for expectations. 

In [None]:
data_asset_path = 's3a://alex.dev/data_assets/Titanic.csv'

In [None]:
df_check = load_csv(
    spark_context=spark,
    path=data_asset_path,
    delimiter=','
)

In [None]:
print(df_check.columns)

In [None]:
print((df_check.count(), len(df_check.columns)))

In [None]:
# df_check.show(n=200, truncate=False)

## Define Expectation Suite Name

Now create the name for your expectation suite.

We recommend the naming convention that concatenates the root of your output file name (or project ID) with the suffix "_Expectation_Suite" at the end.  While the name of an expectation suite can be any alphanumeric string, this naming convention facilitates clarity, standardization, and repeatability.

In [None]:
expectation_suite_name = 'Titanic_Expectation_Suite'

## Create Expectation Suite

Use the GreatExpectations context to create your expectation suite with the above name.


In [None]:
# Create the suite under `expectation_suite_name`.
# NOTE(review): overwrite_existing is hard-coded to True here and does not use
# the `overwrite_existing` variable defined with the configuration settings, so
# re-running this cell replaces any suite already stored under this name.
data_context.create_expectation_suite(
    expectation_suite_name=expectation_suite_name,
    overwrite_existing=True
)

## Obtain Data Batch

Now wrap your check dataframe into a batch of data within the Great Expectations context.

This is a 2-step process.  First, we create keyword arguments as metadata for your data asset.  Then we use the GreatExpectations context to generate the batch of data from your data asset and place it within the scope of your expectation suite.  We also display several rows of the batch to make sure that the contents are the same as in your original check dataframe.  Finally, we print out the batch keyword arguments for diagnostic purposes.


In [None]:
# Batch metadata: the in-memory check dataframe plus the name of the datasource
# that should handle it.
# NOTE(review): the datasource name presumably has to match one reported by
# data_context.list_datasources() -- confirm.
batch_kwargs = {
    'datasource': 's3_files_spark_datasource',
    'dataset': df_check
}

In [None]:
# Bind the check dataframe to the expectation suite, then show the first 10 rows
# as a sanity check against the original dataframe.
batch = data_context.get_batch(
    expectation_suite_name=expectation_suite_name,
    batch_kwargs=batch_kwargs
)
batch.head(10)

In [None]:
batch.batch_kwargs

## Use GreatExpectations API

The GreatExpectations API provides information about the data batch.  For example, `batch.get_table_columns()` returns the columns of your data asset.  In the remainder of this notebook, you will be expressing your reasoning about the data in these columns by creating various expectations on them.

In [None]:
data_source_column_names_list = batch.get_table_columns()
print(data_source_column_names_list, len(data_source_column_names_list))

## Create & Edit Expectations

Add expectations by calling specific expectation methods on the `batch` object. They all begin with `.expect_` which makes autocompleting easy using the "tab" key.

You can see all the available expectations in the **[expectation glossary](https://docs.greatexpectations.io/en/latest/expectation_glossary.html?utm_source=notebook&utm_medium=create_expectations)**.

In [None]:
column_list = data_source_column_names_list

In [None]:
# Expect the table's columns to equal `column_list`, in the same order.
result = batch.expect_table_columns_to_match_ordered_list(
    column_list=column_list,
    result_format='SUMMARY',
    include_config=True,
    catch_exceptions=None,
    meta=None
)
# f-string for consistency with the other formatted output in this notebook.
print(result, f'Success: {result.success}')

In [None]:
min_value = 1300

In [None]:
max_value = 1500

In [None]:
# Expect the row count to lie between `min_value` and `max_value`.
result = batch.expect_table_row_count_to_be_between(
    min_value=min_value,
    max_value=max_value,
    result_format='SUMMARY',
    include_config=True,
    catch_exceptions=None,
    meta=None
)
# f-string for consistency with the other formatted output in this notebook.
print(result, f'Success: {result.success}')

In [None]:
column_names = ['Name', 'PClass', 'Age', 'Sex', 'Survived', 'SexCode']

In [None]:
# Add a not-null expectation for every column in `column_names`.
# mostly=None sets no tolerance fraction (per the Great Expectations docs, every
# value must then be non-null for the expectation to succeed).
for column_name in column_names:
    result = batch.expect_column_values_to_not_be_null(
        column=column_name,
        mostly=None,
        result_format='SUMMARY',
        include_config=True,
        catch_exceptions=None,
        meta=None
    )
    # f-string for consistency with the other formatted output in this notebook.
    print(result, f'Success: {result.success}')
    print("\n")

In [None]:
column_name = '_c0'

In [None]:
# Not-null expectation on `column_name` with a tolerance: mostly=9.8e-1 (0.98)
# is the fraction of values that must be non-null.
result = batch.expect_column_values_to_not_be_null(
    column=column_name,
    mostly=9.8e-1,
    result_format='SUMMARY',
    include_config=True,
    catch_exceptions=None,
    meta=None
)
# f-string for consistency with the other formatted output in this notebook.
print(result, f'Success: {result.success}')

In [None]:
# column_name = 'Zip'

In [None]:
# regex_pattern = '^[0-9]{5}(?:-[0-9]{4})?$'

In [None]:
# result = batch.expect_column_values_to_match_regex(
#     column=column_name,
#     regex=regex_pattern,
#     mostly=9.0e-1,
#     result_format='SUMMARY',
#     include_config=True,
#     catch_exceptions=None,
#     meta=None
# )
# print(result, 'Success: {0}'.format(result.success))

In [None]:
# column_name = 'Year'

In [None]:
# value_set = [
#     2019,
#     2020
# ]

In [None]:
# result = batch.expect_column_values_to_be_in_set(
#     column=column_name,
#     value_set=value_set,
#     mostly=None,
#     result_format='SUMMARY',
#     include_config=True,
#     catch_exceptions=None,
#     meta=None
# )
# print(result, 'Success: {0}'.format(result.success))

In [None]:
# column_name = 'Week'

In [None]:
# min_value = 1

In [None]:
# max_value = 52

In [None]:
# result = batch.expect_column_values_to_be_between(
#     column=column_name,
#     min_value=min_value,
#     max_value=max_value,
#     mostly=None,
#     result_format='SUMMARY',
#     include_config=True,
#     catch_exceptions=None,
#     meta=None
# )
# print(result, "Success: {0}".format(result.success))

## Save & Review Your Expectations

Let's save the expectation suite as a JSON file in the `great_expectations/expectations` directory of your project.
If you decide not to save some expectations that you created, use the [`remove_expectation` method](https://docs.greatexpectations.io/en/latest/module_docs/data_asset_module.html?highlight=remove_expectation&utm_source=notebook&utm_medium=edit_expectations#great_expectations.data_asset.data_asset.DataAsset.remove_expectation).

Let's now rebuild your Data Docs, which helps you communicate about your data with both machines and humans.

In [None]:
batch.get_expectation_suite(discard_failed_expectations=False)

In [None]:
batch.save_expectation_suite(discard_failed_expectations=False)

In [None]:
data_context.build_data_docs()

In [None]:
sc.stop()

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

# Now Restart The Kernel And Continue From This Cell
Next: Running Data Validation based on the Expectation Suite (using the authoring example above).

**Expectation Suite Name**: `Titanic_Expectation_Suite`

You can always **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

# Perform Data Validation Based On Your Expectation Suite
Use this part of the notebook to validate a data asset against a previously saved expectation suite (write down the name of the expectation suite below for future reference):

**Expectation Suite Name**: `Titanic_Expectation_Suite`

You can always **reach out to us on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)

## Initialize Spark Context and Import Python Basics


In [None]:
import os
import sys
import io

import time
import datetime

import findspark

from pyspark import SQLContext

from pyspark.context import SparkContext
from pyspark.sql import SparkSession


In [None]:
from pyspark.sql import functions as F

In [None]:
sys.version_info

In [None]:
findspark.init() 

In [None]:
os.environ.get('PYSPARK_PYTHON')

In [None]:
# Build (or reuse) a local two-thread Spark session with Hive support enabled,
# then keep a handle on its SparkContext for the cells that follow.
spark_session = (
    SparkSession.builder.appName("pytest-pyspark-local-notebook-ge-in-memory")
    .master("local[2]")
    .config("spark.executor.memory", "6g")
    .config("spark.driver.memory", "6g")
    .config("spark.ui.showConsoleProgress", "false")
    .config("spark.sql.shuffle.partitions", "2")
    .config("spark.default.parallelism", "4")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark_session.sparkContext

In [None]:
sc.getConf().getAll()

In [None]:
# Despite its name, `spark` is a SQLContext wrapped around the SparkContext `sc`
# (not the SparkSession). It is the object later passed to load_csv() as
# `spark_context`, which only needs its `.read` attribute.
# NOTE(review): SQLContext is deprecated in Spark 3.x; `spark_session` also
# exposes `.read` -- confirm before switching.
spark = SQLContext(sc)

## Import Python Basics


In [None]:
import os
import sys
import io

import time
import datetime



In [None]:
sys.version_info

## Import Useful Python Utilities

The Great Expectations library itself is imported in the next section.

In [None]:

import json
import re

import pandas as pd


## GreatExpectations Basics

Check the Great Expectations version.


In [None]:
import great_expectations as ge

In [None]:
ge.__version__

## Add Repository to Spark Context

Also import frequently used utilities from your repository.

### _Important_
Make sure that the path to your repository archive in S3 for the `sc.addPyFile(s3_path_to_repo_zip)` call below is correct and that the contents are up to date.

In [None]:
# sc.addPyFile('s3://alex-ge-test/code-0.0.0.zip')

In [None]:
def load_csv(spark_context, path, delimiter):
    """Read a delimited text file that has a header row.

    Args:
        spark_context: Any object exposing a Spark-style ``.read`` reader
            (in this notebook, the ``SQLContext`` named ``spark``).
        path: Location handed unchanged to the reader's ``load()``.
        delimiter: Field separator, set as the ``delimiter`` reader option.

    Returns:
        Whatever the reader's ``load(path)`` returns.
    """
    reader = spark_context.read.format("com.databricks.spark.csv")
    reader = reader.option("delimiter", delimiter)
    # The first row is treated as column names.
    reader = reader.option("header", "true")
    return reader.load(path)


def load_parquet(spark_context, path, prefix_path=None, select_cols=None):
    """Read one or more Parquet locations, optionally projecting columns.

    Args:
        spark_context: Any object exposing a Spark-style ``.read`` reader.
        path: A single path, or a list/tuple of paths. A list or tuple is
            unpacked into separate positional arguments of ``parquet()``;
            anything else is passed through as one argument.
        prefix_path: When not ``None``, set as the reader's ``basePath``
            option before reading.
        select_cols: Optional iterable of columns; when truthy, the result is
            narrowed with ``select(*select_cols)``.

    Returns:
        The object returned by ``parquet()``, or by ``select()`` when
        ``select_cols`` is given.
    """
    reader = spark_context.read
    if prefix_path is not None:
        reader = reader.option("basePath", prefix_path)

    # Tuples are unpacked like lists; previously a tuple of paths was passed
    # to parquet() as a single argument. Strings are deliberately excluded
    # because a lone path must stay one argument.
    if isinstance(path, (list, tuple)):
        df = reader.parquet(*path)
    else:
        df = reader.parquet(path)

    if select_cols:
        df = df.select(*select_cols)

    return df


In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
backend_ecosystem = "aws"

In [None]:
datasource_type = "spark"
# datasource_type = "pandas"

In [None]:
expectations_store_bucket = "alex.dev"

In [None]:
expectations_store_prefix = "great_expectations/JSON/EXPERIMENTATION/ExpectationSuites"

In [None]:
validations_store_bucket = "alex.dev"

In [None]:
validations_store_prefix = "great_expectations/JSON/EXPERIMENTATION/Validations"

In [None]:
data_docs_store_bucket = "alex.dev"

In [None]:
data_docs_store_prefix = "great_expectations/HTML/EXPERIMENTATION"

In [None]:
expectations_store_name = "test_expectations_store_0400"

In [None]:
validations_store_name = "test_validations_store_0500"

In [None]:
data_docs_site_name = "test_data_docs_site_0600"

In [None]:
expectations_store_kwargs = None

In [None]:
validations_store_kwargs = None

In [None]:
data_docs_store_kwargs = None

In [None]:
project_config_bucket = None

In [None]:
project_config_prefix = None

In [None]:
project_config_kwargs = None

In [None]:
# SECURITY: a Slack incoming-webhook URL is a credential and must not be stored
# in a notebook that is committed to version control. Read it from the
# environment instead (None when the variable is unset). The URL that was
# previously hard-coded here should be revoked and re-issued.
# NOTE(review): confirm DataContextConfig.build() treats slack_webhook=None as
# "no Slack notification".
slack_webhook = os.environ.get("GE_SLACK_WEBHOOK_URL")

In [None]:
show_how_to_buttons = True

In [None]:
show_cta_footer = True

In [None]:
include_profiling = True

In [None]:
runtime_environment = None

In [None]:
overwrite_existing = False
# overwrite_existing = True

In [None]:
usage_statistics_enabled = True

In [None]:
# Assemble the in-memory project configuration from the settings defined in the
# preceding cells. Keyword arguments are grouped by the component they configure.
project_config = ge.data_context.types.base.DataContextConfig.build(
    # Backend and datasource flavour.
    backend_ecosystem=backend_ecosystem,
    datasource_type=datasource_type,
    # Expectations store.
    expectations_store_name=expectations_store_name,
    expectations_store_bucket=expectations_store_bucket,
    expectations_store_prefix=expectations_store_prefix,
    expectations_store_kwargs=expectations_store_kwargs,
    # Validations store.
    validations_store_name=validations_store_name,
    validations_store_bucket=validations_store_bucket,
    validations_store_prefix=validations_store_prefix,
    validations_store_kwargs=validations_store_kwargs,
    # Data Docs site.
    data_docs_site_name=data_docs_site_name,
    data_docs_store_bucket=data_docs_store_bucket,
    data_docs_store_prefix=data_docs_store_prefix,
    data_docs_store_kwargs=data_docs_store_kwargs,
    # Display/profiling toggles (names suggest Data Docs rendering options -- confirm).
    show_how_to_buttons=show_how_to_buttons,
    show_cta_footer=show_cta_footer,
    include_profiling=include_profiling,
    # Project-config location settings (all None in this notebook).
    project_config_bucket=project_config_bucket,
    project_config_prefix=project_config_prefix,
    project_config_kwargs=project_config_kwargs,
    # Notifications, runtime overrides, and general flags.
    slack_webhook=slack_webhook,
    runtime_environment=runtime_environment,
    overwrite_existing=overwrite_existing,
    usage_statistics_enabled=usage_statistics_enabled,
)

In [None]:
project_config

In [None]:
print(project_config.anonymous_usage_statistics.data_context_id)

In [None]:
print(project_config.to_yaml_str())

In [None]:
data_context = ge.data_context.BaseDataContext(project_config=project_config)

In [None]:
data_context

In [None]:
#############DIVIDER#############

In [None]:
print(data_context.data_context_id)

In [None]:
print(data_context.get_project_config().to_yaml_str())

In [None]:
#############DIVIDER#############

In [None]:
data_asset_path_for_validations = 's3a://alex.dev/data_assets/Titanic.csv'

In [None]:
df_check_for_validations = load_csv(
    spark_context=spark,
    path=data_asset_path_for_validations,
    delimiter=','
)

In [None]:
print(df_check_for_validations.columns)

In [None]:
print((df_check_for_validations.count(), len(df_check_for_validations.columns)))

In [None]:
# df_check_for_validations.show(n=200, truncate=False)

## Provide Expectation Suite Name

Now specify the name for your expectation suite (this name must match the name of an expectation suite that you created previously for this data asset).

In [None]:
expectation_suite_name_for_validations = 'Titanic_Expectation_Suite'

In [None]:
df_check_for_validations.persist()

In [None]:
# Batch metadata for validation: the in-memory dataframe to validate plus the
# name of the datasource that should handle it.
# NOTE(review): the datasource name presumably has to match one configured in
# the data context -- confirm.
batch_kwargs_for_validations = {
    'datasource': 's3_files_spark_datasource',
    'dataset': df_check_for_validations
}

In [None]:
# Bind the dataframe to the previously saved suite; this batch is what gets
# handed to the validation operator in the next cell.
batch_for_validations = data_context.get_batch(
    expectation_suite_name=expectation_suite_name_for_validations,
    batch_kwargs=batch_kwargs_for_validations
)

In [None]:
# Note: "run_id" needs to be a simple sortable timestamp, which can be readily generated by any pipeline runner.
# Take an explicitly timezone-aware UTC "now": datetime.utcnow() returns a naive
# value and is deprecated since Python 3.12. The rendered string is unchanged,
# so the literal "Z" suffix still denotes UTC.
run_id = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%S.%fZ')
# Run the 'action_list_operator' validation operator on the batch under this run_id.
validation_results = data_context.run_validation_operator(
    validation_operator_name='action_list_operator',
    assets_to_validate=[batch_for_validations],
    run_id=run_id
)

In [None]:
# Overall outcome of the validation run; f-string for consistency with the
# other formatted output in this notebook.
print(f"Success: {validation_results['success']}")

In [None]:
#############DIVIDER#############

In [None]:
validation_results

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
#############DIVIDER#############

In [None]:
sc.stop()