# What is Great Expectations?

# Getting started

In [23]:
import datetime
import great_expectations as ge
import great_expectations.jupyter_ux
from great_expectations.data_context.types.resource_identifiers import ValidationResultIdentifier

# Create the Data Context object; later cells use it to create the expectation
# suite and to load the batch.
# NOTE(review): DataContext() is called with no arguments, so the project
# configuration is presumably discovered from the working directory — confirm.
context = ge.data_context.DataContext()

In [24]:
# Create the suite named "check_avocado_data"; overwrite_existing=True means a
# suite that already exists under this name is replaced rather than kept.
suite = context.create_expectation_suite(
    expectation_suite_name="check_avocado_data",
    overwrite_existing=True,
)

In [25]:
# Describe the batch to load: the file data/avocado.csv, requested from the
# datasource named "data_dir".
batch_kwargs = {"datasource": "data_dir", "path": "data/avocado.csv"}

# Load the batch together with the suite created above.
batch = context.get_batch(batch_kwargs, suite)

In [26]:
# Preview the first rows of the loaded batch to see which columns are available.
batch.head()

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [27]:
# Structural check: the batch must contain a column named 'Date'.
batch.expect_column_to_exist(column='Date')

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {},
  "meta": {}
}

In [33]:
# Completeness check: no value in the 'region' column may be null.
batch.expect_column_values_to_not_be_null(column='region')

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 18249,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [42]:
# Strict range check: every 'AveragePrice' value must fall in the 0.5-3.0 range.
batch.expect_column_values_to_be_between(
    column='AveragePrice',
    min_value=0.5,
    max_value=3.0,
)

{
  "success": false,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 18249,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 11,
    "unexpected_percent": 0.06027727546714888,
    "unexpected_percent_nonmissing": 0.06027727546714888,
    "partial_unexpected_list": [
      0.49,
      0.46,
      3.03,
      3.12,
      3.25,
      0.44,
      0.49,
      0.48,
      3.05,
      3.04,
      3.17
    ]
  },
  "meta": {}
}

In [43]:
# Same range check, relaxed with `mostly`: the expectation succeeds as long as
# at least 90% of the 'AveragePrice' values fall in the 0.5-3.0 range.
batch.expect_column_values_to_be_between(
    column='AveragePrice',
    min_value=0.5,
    max_value=3.0,
    mostly=0.9,
)

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 18249,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 11,
    "unexpected_percent": 0.06027727546714888,
    "unexpected_percent_nonmissing": 0.06027727546714888,
    "partial_unexpected_list": [
      0.49,
      0.46,
      3.03,
      3.12,
      3.25,
      0.44,
      0.49,
      0.48,
      3.05,
      3.04,
      3.17
    ]
  },
  "meta": {}
}

In [31]:
# Categorical check: the distinct values of 'type' must all belong to the
# allowed set below.
batch.expect_column_distinct_values_to_be_in_set(
    column='type',
    value_set=['conventional', 'organic'],
)

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      "conventional",
      "organic"
    ],
    "element_count": 18249,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {}
}

In [51]:
# Reference distribution for 'type': both categories carry equal weight.
partition = {
    'values': ['conventional', 'organic'],
    'weights': [0.5, 0.5],
}

# Distribution check: the KL divergence of the observed 'type' distribution
# from the reference partition must stay below 0.1.
batch.expect_column_kl_divergence_to_be_less_than(
    column='type',
    partition_object=partition,
    threshold=0.1,
)

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 1.351245850704074e-08,
    "element_count": 18249,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {}
}

In [52]:
# Show the expectation suite accumulated on the batch by the expect_* calls above.
batch.get_expectation_suite()

2021-01-13T11:27:37+0100 - INFO - 	5 expectation(s) included in expectation_suite. result_format settings filtered.


{
  "meta": {
    "great_expectations_version": "0.13.4"
  },
  "expectations": [
    {
      "kwargs": {
        "column": "Date"
      },
      "expectation_type": "expect_column_to_exist",
      "meta": {}
    },
    {
      "kwargs": {
        "column": "type",
        "value_set": [
          "conventional",
          "organic"
        ]
      },
      "expectation_type": "expect_column_distinct_values_to_be_in_set",
      "meta": {}
    },
    {
      "kwargs": {
        "column": "region"
      },
      "expectation_type": "expect_column_values_to_not_be_null",
      "meta": {}
    },
    {
      "kwargs": {
        "column": "AveragePrice",
        "min_value": 0.5,
        "max_value": 3.0,
        "mostly": 0.9
      },
      "expectation_type": "expect_column_values_to_be_between",
      "meta": {}
    },
    {
      "kwargs": {
        "column": "type",
        "partition_object": {
          "values": [
            "conventional",
            "organic"
          ],
       

## Expectations

## Metrics

## Profiling: generating expectations

## Tests == docs

## Setting up data context and source

## (Airflow integration)

## (Spark)


In [None]:
# Print the contents of great_expectations/great_expectations.yml using the shell `cat` command.
!cat great_expectations/great_expectations.yml

In [1]:
import great_expectations as ge

In [2]:
# Read the avocado CSV directly with ge.read_csv (no data context is used in this cell);
# the returned object is the one the expect_* call in the next cell runs against.
my_df = ge.read_csv("data/avocado.csv")

In [3]:
# Only the last expression of a cell is rendered automatically, so the preview
# is shown explicitly with display(); as a bare statement its result was dropped.
display(my_df.head())

# Expect every value of 'type' to be "conventional".
# The saved output shows this expectation failing on the avocado data: roughly
# half of the rows are "organic".
my_df.expect_column_values_to_be_in_set("type", ["conventional"])

{
  "success": false,
  "meta": {},
  "result": {
    "element_count": 18249,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 9123,
    "unexpected_percent": 49.991780371527206,
    "unexpected_percent_nonmissing": 49.991780371527206,
    "partial_unexpected_list": [
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic",
      "organic"
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [4]:
# Inspect the expectation suite recorded on my_df.
# NOTE(review): the saved output lists no expectations even though one was
# evaluated in the previous cell. That expectation failed, and failed
# expectations are presumably left out by default (see the
# discard_failed_expectations argument) — confirm.
my_df.get_expectation_suite()

{
  "meta": {
    "great_expectations_version": "0.13.4"
  },
  "expectation_suite_name": "default",
  "data_asset_type": "Dataset",
  "expectations": []
}

## Expectations

## Metrics

## Profiling: generating expectations

## Tests == docs

## Setting up data context and source

## (Airflow integration)

## (Spark)