# Hero Experience
- This notebook outlines the steps we can take to get a single `Batch` of data and run the `RuleBasedProfiler` on it.

In [4]:
import great_expectations as ge
from great_expectations.core.batch import BatchRequest
from great_expectations.core import ExpectationSuite

from great_expectations.rule_based_profiler.config.base import RuleBasedProfilerConfig
from great_expectations.rule_based_profiler.rule_based_profiler import BaseRuleBasedProfiler

from great_expectations.core.yaml_handler import YAMLHandler
from great_expectations.checkpoint.checkpoint import SimpleCheckpoint

In [5]:
# Obtain the Data Context; later cells use it to register the datasource,
# build a validator, save the profiled suite, and build Data Docs.
data_context: ge.DataContext = ge.get_context()

In [6]:
# YAML helper; the datasource cell uses it to dump the config dict before
# handing it to test_yaml_config.
yaml_handler = YAMLHandler()

In [7]:
# Directory containing the first 3 months of 2019 taxi sample data.
data_path: str = "../../../../test_sets/taxi_yellow_tripdata_samples/first_3_files"

# Execution engine section of the datasource.
execution_engine_config = {
    "module_name": "great_expectations.execution_engine",
    "class_name": "PandasExecutionEngine",
}

# The single configured asset: `pattern` has three capture groups and
# `group_names` supplies three names ("name", "year", "month").
my_reports_asset_config = {
    "base_directory": "./",
    "group_names": ["name", "year", "month"],
    "pattern": "^(.+)_(\\d.*)-(\\d.*)\\.csv",
    "module_name": "great_expectations.datasource.data_connector.asset",
    "class_name": "Asset",
}

# Filesystem data connector rooted at `data_path`.
monthly_connector_config = {
    "class_name": "ConfiguredAssetFilesystemDataConnector",
    "base_directory": data_path,
    "assets": {"my_reports": my_reports_asset_config},
}

# Full datasource definition, assembled from the pieces above.
datasource_config = {
    "name": "taxi_pandas",
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": execution_engine_config,
    "data_connectors": {"monthly": monthly_connector_config},
}

# Try instantiating the datasource from its YAML form; the returned object is
# left as the last expression so it is shown as the cell output.
data_context.test_yaml_config(yaml_handler.dump(datasource_config))

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	monthly : ConfiguredAssetFilesystemDataConnector

	Available data_asset_names (1 of 1):
		my_reports (3 of 3): ['yellow_tripdata_sample_2019-01.csv', 'yellow_tripdata_sample_2019-02.csv', 'yellow_tripdata_sample_2019-03.csv']

	Unmatched data_references (1 of 1):['.DS_Store']



<great_expectations.datasource.new_datasource.Datasource at 0x7fd3e97c1af0>

In [8]:
# Register the datasource only when the context cannot already look it up.
datasource_name = datasource_config["name"]
try:
    data_context.get_datasource(datasource_name)
except ValueError:
    # Lookup failed with ValueError, so add the datasource from the config dict.
    data_context.add_datasource(**datasource_config)

## Build BatchRequest and Retrieve Batch
In this example, we will be using a `BatchRequest` that returns a single `Batch`, namely the most recent month of 2019 taxi data in our `my_reports` data asset.

In [9]:
# Query used to narrow the asset down to one Batch; index "-1" is intended to
# select the most recent file of the asset.
latest_batch_query = {"index": "-1"}

# Note: despite its name, `hero_batch` is a BatchRequest, not a Batch.
hero_batch: BatchRequest = BatchRequest(
    datasource_name="taxi_pandas",
    data_connector_name="monthly",
    data_asset_name="my_reports",
    data_connector_query=latest_batch_query,
)

In [10]:
# Throwaway suite ("temp_suite") so that a validator can be built for the batch;
# `expectation_suite` is rebound later by the profiler run.
expectation_suite = data_context.create_expectation_suite(
    overwrite_existing=True,
    expectation_suite_name="temp_suite",
)

validator = data_context.get_validator(
    expectation_suite=expectation_suite,
    batch_request=hero_batch,
)

# Preview the first 5 rows of the batch as the cell output.
validator.head(n_rows=5, fetch_all=False)

Calculating Metrics: 100%|████████████████████████████████| 1/1 [00:00<00:00, 75.84it/s]


Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-03-01 18:43:31,2019-03-01 18:48:42,2,1.1,1,N,143,238,1,6.0,3.5,0.5,1.55,0.0,0.3,11.85,2.5
1,2,2019-03-20 16:30:11,2019-03-20 16:44:49,1,2.63,1,N,231,88,1,12.5,1.0,0.5,3.36,0.0,0.3,20.16,2.5
2,1,2019-03-07 19:01:51,2019-03-07 19:04:36,1,0.3,1,N,237,237,1,4.0,3.5,0.5,1.66,0.0,0.3,9.96,2.5
3,1,2019-03-02 16:33:01,2019-03-02 16:37:41,1,0.9,1,N,42,41,1,5.5,0.0,0.5,0.0,0.0,0.3,6.3,0.0
4,2,2019-03-28 14:10:47,2019-03-28 14:49:37,1,15.68,3,N,231,1,1,63.0,0.0,0.0,14.76,10.5,0.3,88.56,0.0


In [11]:
# this will be used later when we are displaying our Checkpoint results
# Column labels of the active batch's head() frame.
# NOTE(review): no later cell in this notebook reads `columns` -- confirm it is still needed.
columns = validator.active_batch.head().columns

# Load Profiler


In [12]:
# this part will be taken care of by the Data Assistants

In [13]:
# Relative path of the profiler YAML configuration that is read and parsed next.
configuration_path = "../bobby_user_workflow_verbose_profiler_config.yml"

In [14]:
# Read the profiler configuration file as text. The encoding is given
# explicitly so the result does not depend on the platform's default locale.
with open(configuration_path, encoding="utf-8") as config_file:
    raw_profiler_yaml = config_file.read()

# Parse only after the file has been closed; `yaml_config` holds the parsed
# YAML document used to build the RuleBasedProfilerConfig.
yaml_handler = YAMLHandler()
yaml_config = yaml_handler.load(raw_profiler_yaml)

In [15]:
# Build the profiler config object by unpacking the parsed YAML mapping as
# keyword arguments.
profiler_configuration: RuleBasedProfilerConfig = RuleBasedProfilerConfig(**yaml_config)

In [19]:
# Instantiate the profiler from the loaded configuration, bound to the same
# Data Context that owns the datasource.
profiler: BaseRuleBasedProfiler = BaseRuleBasedProfiler(
    profiler_configuration, data_context=data_context
)

In [20]:
# Run the profiler with the single-Batch request and collect the resulting
# suite under the name "NewExpectationSuite". This rebinds `expectation_suite`,
# replacing the temporary suite created for the validator.
expectation_suite: ExpectationSuite = profiler.run(
    # open question from the author: what does a batch_request do here?
    batch_request=hero_batch,
    expectation_suite_name="NewExpectationSuite",
    include_citation=False,
)

Calculating Metrics: 100%|███████████████████████████████| 2/2 [00:00<00:00, 303.58it/s]
Calculating Metrics: 100%|███████████████████████████████| 2/2 [00:00<00:00, 139.26it/s]
Calculating Metrics: 100%|███████████████████████████████| 1/1 [00:00<00:00, 132.12it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 420.99it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 471.14it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 474.58it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 379.59it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 368.09it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 405.72it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 480.77it/s]
Calculating Metrics: 100%|███████████████████████████████| 8/8 [00:00<00:00, 450.61it/s]
Calculating Metrics: 

In [21]:
# TODO: output which Expectations are being included

In [22]:
# Persist the profiled suite through the Data Context; in the recorded output
# the call returns the path of the saved JSON file.
data_context.save_expectation_suite(expectation_suite)

'/Users/work/Development/great_expectations/tests/test_fixtures/rule_based_profiler/example_notebooks/great_expectations/expectations/NewExpectationSuite.json'

# Use Profiled ExpectationSuite to run Checkpoint

In [23]:
# One validation: the same batch request that was profiled, checked against the
# suite the profiler produced.
hero_validations = [
    {
        "batch_request": hero_batch,
        "expectation_suite_name": "NewExpectationSuite",
    }
]

checkpoint: SimpleCheckpoint = SimpleCheckpoint(
    name="hero_checkpoint",
    data_context=data_context,
    validations=hero_validations,
)

In [24]:
# Execute the checkpoint, i.e. validate `hero_batch` against "NewExpectationSuite".
results = checkpoint.run()

Calculating Metrics: 100%|█████████████████████████████| 33/33 [00:00<00:00, 285.91it/s]


In [25]:
# Build Data Docs; in the recorded output the call returns a dict mapping the
# site name ('local_site') to the file:// URL of its index.html.
data_context.build_data_docs()

{'local_site': 'file:///Users/work/Development/great_expectations/tests/test_fixtures/rule_based_profiler/example_notebooks/great_expectations/uncommitted/data_docs/local_site/index.html'}