# Hero Experience
- This notebook outlines the steps for retrieving a single `Batch` of data and running the `RuleBasedProfiler` on it. It is adapted from the `Bobby` fixture that we have been using in our integration tests.

In [1]:
import great_expectations as ge
from great_expectations.core.batch import BatchRequest
from great_expectations.core import ExpectationSuite

from great_expectations.rule_based_profiler.config.base import RuleBasedProfilerConfig
from great_expectations.rule_based_profiler.rule_based_profiler import BaseRuleBasedProfiler

from great_expectations.core.yaml_handler import YAMLHandler
from great_expectations.checkpoint.checkpoint import SimpleCheckpoint

In [2]:
# Create the Data Context once; later cells reuse it to register the datasource,
# build the validator, construct the profiler, and run the checkpoint.
data_context: ge.DataContext = ge.get_context()

In [3]:
# Relative path to the directory holding the first three monthly 2019 taxi sample files.
data_path: str = "../../../../test_sets/taxi_yellow_tripdata_samples/first_3_files"

# YAML (de)serializer; it is reused further down to load the profiler configuration.
yaml_handler = YAMLHandler()

# Execution engine section of the datasource configuration.
execution_engine_config = {
    "module_name": "great_expectations.execution_engine",
    "class_name": "PandasExecutionEngine",
}

# The only asset of the connector. The pattern has three capture groups and
# "group_names" has three entries -- presumably matched positionally (confirm
# against the data connector documentation).
my_reports_asset_config = {
    "base_directory": "./",
    "group_names": ["name", "year", "month"],
    "pattern": "^(.+)_(\\d.*)-(\\d.*)\\.csv",
    "module_name": "great_expectations.datasource.data_connector.asset",
    "class_name": "Asset",
}

# Data connector rooted at `data_path`, exposing the single "my_reports" asset.
monthly_connector_config = {
    "class_name": "ConfiguredAssetFilesystemDataConnector",
    "base_directory": data_path,
    "assets": {"my_reports": my_reports_asset_config},
}

# Complete datasource configuration assembled from the pieces above.
datasource_config = {
    "name": "taxi_pandas",
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": execution_engine_config,
    "data_connectors": {"monthly": monthly_connector_config},
}

# Serialize the dict to YAML text and let the context try to instantiate it;
# the printed report lists the data asset names and data references it found.
datasource_yaml = yaml_handler.dump(datasource_config)
data_context.test_yaml_config(datasource_yaml)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	monthly : ConfiguredAssetFilesystemDataConnector

	Available data_asset_names (1 of 1):
		my_reports (3 of 3): ['yellow_tripdata_sample_2019-01.csv', 'yellow_tripdata_sample_2019-02.csv', 'yellow_tripdata_sample_2019-03.csv']

	Unmatched data_references (1 of 1):['.DS_Store']



<great_expectations.datasource.new_datasource.Datasource at 0x7f87c5c40220>

In [4]:
# Register the datasource only when the context does not already have one under
# this name. The lookup is relied on to raise ValueError for an unknown name.
datasource_name = datasource_config["name"]
try:
    data_context.get_datasource(datasource_name)
except ValueError:
    # Not configured yet: add it, passing the config dict as keyword arguments.
    data_context.add_datasource(**datasource_config)

## Build BatchRequest and Retrieve Batch
In this example, we use a `BatchRequest` that returns a single `Batch`: the most recent of the three monthly 2019 taxi files (March 2019) in our `my_reports` asset.

In [5]:
# Request exactly one Batch from the "my_reports" asset; index "-1" picks the
# last one, which is the March 2019 file shown in the preview below.
hero_batch: BatchRequest = BatchRequest(
    datasource_name="taxi_pandas",
    data_connector_name="monthly",
    data_asset_name="my_reports",
    data_connector_query={"index": "-1"},
)

In [6]:
# Create a scratch suite for the validator. overwrite_existing=True is passed so
# that re-running this cell does not trip over a "temp_suite" left by a previous run.
expectation_suite = data_context.create_expectation_suite(
    expectation_suite_name="temp_suite",
    overwrite_existing=True,
)

In [7]:
# Pair the scratch suite with the hero Batch so the data can be inspected.
validator = data_context.get_validator(
    expectation_suite=expectation_suite,
    batch_request=hero_batch,
)

In [8]:
# Preview the first five rows of the hero Batch as a quick sanity check.
validator.head(n_rows=5, fetch_all=False)

Calculating Metrics: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.82it/s]


Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-03-01 18:43:31,2019-03-01 18:48:42,2,1.1,1,N,143,238,1,6.0,3.5,0.5,1.55,0.0,0.3,11.85,2.5
1,2,2019-03-20 16:30:11,2019-03-20 16:44:49,1,2.63,1,N,231,88,1,12.5,1.0,0.5,3.36,0.0,0.3,20.16,2.5
2,1,2019-03-07 19:01:51,2019-03-07 19:04:36,1,0.3,1,N,237,237,1,4.0,3.5,0.5,1.66,0.0,0.3,9.96,2.5
3,1,2019-03-02 16:33:01,2019-03-02 16:37:41,1,0.9,1,N,42,41,1,5.5,0.0,0.5,0.0,0.0,0.3,6.3,0.0
4,2,2019-03-28 14:10:47,2019-03-28 14:49:37,1,15.68,3,N,231,1,1,63.0,0.0,0.0,14.76,10.5,0.3,88.56,0.0


# Load Profiler


In [None]:
# this part will be taken care of by the Data Assistants

In [9]:
# Relative path to the profiler configuration YAML that the next cell reads.
configuration_path = "../bobby_user_workflow_verbose_profiler_config.yml"

In [10]:
# Read the profiler configuration file and parse it with the shared YAML handler.
# The encoding is pinned explicitly: without it, open() falls back to a
# locale-dependent default, which can mis-decode the file on some platforms.
with open(configuration_path, encoding="utf-8") as config_file:
    yaml_config = yaml_handler.load(config_file.read())

In [11]:
# Display the parsed configuration to check the variables and rules that were loaded.
yaml_config

{'name': 'bobby user workflow',
 'class_name': 'RuleBaseProfiler',
 'config_version': 1.0,
 'variables': {'jan_feb_2019_monthly_tripdata_batch_request': {'datasource_name': 'taxi_pandas',
   'data_connector_name': 'monthly',
   'data_asset_name': 'my_reports',
   'data_connector_query': {'index': ':-1'}},
  'estimator': 'oneshot',
  'false_positive_rate': 0.01,
  'mostly': 1.0},
 'rules': {'row_count_range_rule': {'domain_builder': {'class_name': 'TableDomainBuilder'},
   'parameter_builders': [{'name': 'row_count_range',
     'class_name': 'NumericMetricRangeMultiBatchParameterBuilder',
     'batch_request': '$variables.jan_feb_2019_monthly_tripdata_batch_request',
     'metric_name': 'table.row_count',
     'estimator': '$variables.estimator',
     'false_positive_rate': '$variables.false_positive_rate',
     'round_decimals': 0,
     'truncate_values': {'lower_bound': 0}}],
   'expectation_configuration_builders': [{'expectation_type': 'expect_table_row_count_to_be_between',
     'c

In [12]:
# Unpack the parsed YAML mapping into keyword arguments for the typed config object.
profiler_configuration: RuleBasedProfilerConfig = RuleBasedProfilerConfig(**yaml_config)

In [14]:
# Instantiate the profiler from the typed configuration, handing it the same
# Data Context that holds the "taxi_pandas" datasource.
profiler: BaseRuleBasedProfiler = BaseRuleBasedProfiler(
    profiler_configuration,
    data_context=data_context,
)

In [15]:
# Run the profiler and keep the suite it returns. Note that this rebinds
# `expectation_suite`, replacing the scratch "temp_suite" created earlier.
expectation_suite: ExpectationSuite = profiler.run(
    expectation_suite_name="NewExpectationSuite",
    batch_request=hero_batch,
    include_citation=True,
)

Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 268.68it/s]
Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 125.68it/s]
Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 124.89it/s]
Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 285.37it/s]
Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 298.76it/s]
Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 335.15it/s]
Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 304.10it/s]
Calculating Metrics: 100%|█████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 291.12it/s]
Calculating Metr

In [16]:
# Inspect the Expectations that the profiler generated.
expectation_suite

{
  "expectation_suite_name": "NewExpectationSuite",
  "expectations": [
    {
      "kwargs": {
        "min_value": 10000.0,
        "max_value": 10000.0
      },
      "expectation_type": "expect_table_row_count_to_be_between",
      "meta": {
        "profiler_details": {
          "metric_configuration": {
            "metric_name": "table.row_count",
            "domain_kwargs": {},
            "metric_value_kwargs": null,
            "metric_dependencies": null
          },
          "num_batches": 2
        }
      }
    },
    {
      "kwargs": {
        "column": "vendor_id",
        "min_value": 1.0,
        "max_value": 1.0,
        "mostly": 1.0
      },
      "expectation_type": "expect_column_min_to_be_between",
      "meta": {
        "profiler_details": {
          "metric_configuration": {
            "metric_name": "column.min",
            "domain_kwargs": {
              "column": "vendor_id"
            },
            "metric_value_kwargs": null,
            "metr

In [None]:
# TODO : output of which Expectations are being included?

In [17]:
# Persist the profiled suite through the Data Context; the checkpoint below
# refers to it by its name, "NewExpectationSuite".
data_context.save_expectation_suite(expectation_suite)

'/Users/work/Development/great_expectations/tests/test_fixtures/rule_based_profiler/example_notebooks/great_expectations/expectations/NewExpectationSuite.json'

# Use Profiled ExpectationSuite to run Checkpoint

In [18]:
# Checkpoint that validates the hero Batch against the profiled suite.
checkpoint: SimpleCheckpoint = SimpleCheckpoint(
    name="hero_checkpoint",
    data_context=data_context,
    validations=[
        {
            "batch_request": hero_batch,
            "expectation_suite_name": "NewExpectationSuite",
        },
    ],
)

In [19]:
# Execute the checkpoint; the returned object is kept in `results` for inspection.
results = checkpoint.run()

Calculating Metrics: 100%|███████████████████████████████████████████████████████████████| 33/33 [00:00<00:00, 485.50it/s]


In [20]:
# Build Data Docs; the cell output maps each site name to the URL of its index page.
data_context.build_data_docs()

{'local_site': 'file:///Users/work/Development/great_expectations/tests/test_fixtures/rule_based_profiler/example_notebooks/great_expectations/uncommitted/data_docs/local_site/index.html'}