In [1]:
import great_expectations as ge

from ruamel import yaml

from great_expectations.core.batch import BatchRequest

from great_expectations.rule_based_profiler.rule.rule import Rule
from great_expectations.rule_based_profiler.rule_based_profiler import RuleBasedProfiler, RuleBasedProfilerResult

from great_expectations.rule_based_profiler.domain_builder import (
    DomainBuilder,
    ColumnDomainBuilder,
)
from great_expectations.rule_based_profiler.parameter_builder import (
    MetricMultiBatchParameterBuilder,
)
from great_expectations.rule_based_profiler.expectation_configuration_builder import (
    DefaultExpectationConfigurationBuilder,
)
from great_expectations.validator.validator import Validator

  warn_incompatible_dep(


# Self-Initializing Expectations
- Self-initializing `Expectations` utilize `RuleBasedProfilers` to automate parameter estimation for Expectations using a Batch or Batches that have been loaded. 

### Does this mean they work for all `Expectations`?
- No, not all `Expectations` have parameters that can be estimated. As an example, `ExpectColumnToExist` only takes in a Domain (`column_name`) and outputs whether that column exists in the data. It would be an example of an `Expectation` that would not work under the self-initializing framework. 
- An example of an `Expectation` that would work under the self-initializing framework would be ones that have numeric ranges, like `ExpectColumnMeanToBeBetween`, `ExpectColumnMaxToBeBetween`, and `ExpectColumnSumToBeBetween`
- To check whether the `Expectation` you are interested in can be self-initialized, run the `is_expectation_self_initializing()` method on `Validator`. 

In [3]:
# Ask the Validator class whether this Expectation supports self-initialization
# (expect_column_to_exist only takes a Domain, so the answer shown below is False).
Validator.is_expectation_self_initializing(name="expect_column_to_exist")

The Expectation expect_column_to_exist is not able to be self-initialized.


False

In [4]:
# Same check for a numeric-range Expectation; the output below reports True and
# points to the auto=True parameter.
Validator.is_expectation_self_initializing(name="expect_column_mean_to_be_between")

The Expectation expect_column_mean_to_be_between is able to be self-initialized. Please run by using the auto=True parameter.


True

# Set-up

* To set up an example use case for self-initializing `Expectations`, we will start from a new Great Expectations Data Context (i.e., the `great_expectations` folder created by running `great_expectations init`), and begin by adding the `Datasource` and configuring a `BatchRequest`.

In [None]:
# Obtain the Data Context that the rest of the notebook adds a Datasource to.
data_context: ge.DataContext = ge.get_context()

### Adding taxi_data Datasource
Add `taxi_data` as a new `Datasource`
We are using an `InferredAssetFilesystemDataConnector` to connect to data in the `test_sets/taxi_yellow_tripdata_samples` folder and get one `DataAsset` (`yellow_tripdata_sample_2018`) that has 12 Batches (1 Batch/month).

In [None]:
# Folder holding the sample taxi CSV files, relative to this notebook.
data_path: str = "../../../../test_sets/taxi_yellow_tripdata_samples"

# Pandas-backed execution engine used by the Datasource.
execution_engine_config = {
    "module_name": "great_expectations.execution_engine",
    "class_name": "PandasExecutionEngine",
}

# Connector that derives the asset name and the month from each CSV file name.
data_connector_2018_config = {
    "class_name": "InferredAssetFilesystemDataConnector",
    "base_directory": data_path,
    "default_regex": {
        "group_names": ["data_asset_name", "month"],
        "pattern": "(yellow_tripdata_sample_2018)-(\\d.*)\\.csv",
    },
}

# Full Datasource definition, assembled from the pieces above.
datasource_config = {
    "name": "taxi_multi_batch_datasource",
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": execution_engine_config,
    "data_connectors": {
        "2018_data": data_connector_2018_config,
    },
}

# Validate the configuration before registering it with the context.
datasource_yaml = yaml.dump(datasource_config)
data_context.test_yaml_config(datasource_yaml)


In [None]:
# Register the Datasource only when the context does not already have one by this name.
datasource_name = datasource_config["name"]
try:
    data_context.get_datasource(datasource_name)
except ValueError:
    # The lookup failed, so the Datasource still has to be added.
    data_context.add_datasource(**datasource_config)

### Configuring BatchRequest
In this example, we will be using a `BatchRequest` that returns 12 `Batches` of data from the 2018 `taxi_data` dataset.

In [None]:
# Identify the 2018 taxi asset through the Datasource and connector configured earlier.
batch_request_kwargs = {
    "datasource_name": "taxi_multi_batch_datasource",
    "data_connector_name": "2018_data",
    "data_asset_name": "yellow_tripdata_sample_2018",
}
batch_request_2018_data: BatchRequest = BatchRequest(**batch_request_kwargs)

### Get Validator

Load `taxi_data` into a `Validator` using the `BatchRequest` from the previous step.

In [None]:
# Start from an empty suite, replacing any earlier suite that has the same name.
suite = data_context.create_expectation_suite(
    expectation_suite_name="new_expectation_suite",
    overwrite_existing=True,
)

In [None]:
# Load the 2018 taxi data into a Validator bound to the new suite.
# The BatchRequest built earlier in this notebook is `batch_request_2018_data`;
# the previously referenced `multi_batch_batch_request` is never defined and
# raised a NameError on a fresh kernel.
validator = data_context.get_validator(
    expectation_suite=suite,
    batch_request=batch_request_2018_data,
)

Check that the number of batches in our validator is 12 (1 batch / month for 2018)

In [None]:
# One Batch per month of 2018 should have been loaded.
loaded_batch_count = len(validator.batches)
assert loaded_batch_count == 12

# Running Self-Initializing Expectation

Now we have all the components we need to build an ExpectationSuite by using a Validator. Let's first look at our data by running `validator.head()` which will output the first few rows of our most recent (December) Batch.

In [None]:
# Preview the first few rows of the data loaded in the Validator.
validator.head()

Let's say that you were interested in constructing an Expectation that captured the average distance for taxi trips during a year. 

You find the `expect_column_mean_to_be_between()` Expectation, but realize that there are `min_value` and `max_value` to input. How do you calculate these upper and lower bounds?  Well the good news is that `ExpectColumnMeanToBeBetween` is a self-initializing Expectation, which means the only thing you need to do is run the Expectation with `auto=True`. 

In [None]:
# auto=True lets the Expectation estimate min_value and max_value from the
# Batches loaded in the Validator instead of requiring them as arguments.
validator.expect_column_mean_to_be_between(column="trip_distance", auto=True)

Then the Expectation will calculate the `min_value` (`2.83`) and `max_value` (`3.06`) using all the `Batches` that are loaded into the Validator, in our case the 12 batches associated with 2018 `taxi_data`. 

Now the Expectation can be saved to the ExpectationSuite associated with the Validator, with the upper and lower bounds having come from the Batches.

# What has happened? How can I write one on my own?

Inside each of the Expectations is a RuleBasedProfiler configuration that is run by the Validator when building the `ExpectationConfiguration`. The values that are calculated come from the configuration below (shown for `expect_column_mean_to_be_between`):

```python
mean_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="mean_range_estimator",
        metric_name="column.mean",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME, 
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
```

```python
validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        mean_range_estimator_parameter_builder_config,
    ] 
```

```python
default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_mean_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_mean_to_be_between_rule": {
                "variables": {
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": None,
                        "upper_bound": None,
                    },
                    "round_decimals": 2,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_mean_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "min_value": f"{PARAMETER_KEY}{mean_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{mean_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{mean_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )
```

# From Rachel

- domain builder key : 
    - class name and module name. 
- all of these together will look like this. 


1. make this human readable. 
2. go through the list of keys and explain what values are appropriate for each key:
    
    - example of all of this together. 
    - refer to RBP notebook  (heading blah blah) in repo. just state what sort of class each piece is. 


In [None]:
* What should we talk about? *
* `ParameterBuilderConfiguration`, which 

In [None]:
* they have a `RuleBasedProfilerConfig` with some default value. 
* and when we run it... what
* `RuleBasedProfiler` is the engine with which we can do this... create the value.
* we only want to make something.
* want to have the data.
  - load batch.
  - estimate value.
  - what are the parameters?

# How to write a self-initializing Expectation
  - how to write self-initializing Expectation?
  - Let's start with
    - ProfilerConfiguration
    - DomainBuilder
    - ParameterBuilder
    - test


In [None]:
# What range do the trip_distance values fall in?
# Let the value-range Expectation estimate its own bounds (auto=True) and keep
# the full validation result for inspection.
auto_expectation_kwargs = {
    "column": "trip_distance",
    "result_format": "COMPLETE",
    "include_config": True,
    "auto": True,
    "profiler_config": None,
}
res = validator.expect_column_values_to_be_between(**auto_expectation_kwargs)


In [None]:
# Save the Expectation Suite associated with the Validator, including the
# Expectations added by the cells above.
validator.save_expectation_suite()

In [None]:
# max_value:63.3
# min_value:0

In [None]:
 batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    validator: Validator = get_validator_with_expectation_suite(
        batch_request=batch_request,
        data_context=context,
        expectation_suite_name=None,
        expectation_suite=None,
        component_name="profiler",
    )


In [None]:

# Scratch example: materialize the BatchRequest and build a Validator from it.
# NOTE(review): `materialize_batch_request` is not imported in this notebook —
# confirm where it should come from before running this cell.
batch_request = materialize_batch_request(batch_request=batch_request)
validator: Validator = data_context.get_validator(
    batch_request=batch_request,
    # The suite created earlier in this notebook is named `suite`;
    # `expectation_suite` was never defined.
    expectation_suite=suite,
)

# `return` is a SyntaxError outside a function; end the cell with the bare
# expression so the notebook displays the Validator instead.
validator

# How do you write a self-initializing Expectation?
### How do I know if my Expectations can be self-initializing?  
  - If the Expectation only requires a `Domain` value as a parameter, like Column Name, Table, or Column Pairs, then it cannot be self-initializing.
  - If the Expectation requires a numeric range, like `ExpectColumnMeanToBeBetween` or `ExpectColumnMedianToBeBetween`, then it **can** be self-initializing. 
  - If the Expectation requires a set, like `expect_column_values_to_match_regex`, then it **can** be self-initializing in limited cases. 

The constants used in the configuration above can be found in `parameter_container.py`; they break down to the following:
* `PARAMETER_KEY`: `$parameter.`
* `VARIABLES_KEY`: `$variables.` (for example, `$variables.false_positive_rate`)
* `DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME`: `$domain.domain_kwargs`
* `FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER`: `.`
* `FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY`: `value`
* `FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY`: `details`