# CAS Data Engineering FHNW
## Data Wrangling Module - Author F.Burnelli


### Exercise 6 Validation
### Note: use the kernel "Python (wrangling)"

Run the Notebook and create your own Expectations

In [1]:
import os
## G R E A T  E X P E C T A T I O N S 
import great_expectations as gx


### Data Path

In [2]:
# Location of the loan CSV, relative to the notebook's working directory.
# The segments are joined by os.path.join so the separator fits the host OS.
path_segments = ("..", "data", "dirty-loan-data.csv")
file_path = os.path.join(*path_segments)



### Create a Data Context
- configurations
- expectation suites
- data sources 
- etc.

#### Note: for this exercise we skip the "great_expectations init" step

In [3]:
# Check for an existing "gx" folder in the working directory first, then
# obtain a context. When the folder was missing, the freshly obtained context
# is converted to a file context; otherwise it is used as returned.
gx_dir_present = os.path.isdir("gx")
context = gx.get_context()
if not gx_dir_present:
    context = context.convert_to_file_context()

Some files are created in the `gx/` directory:

In [4]:
# Shell escape: list the contents of the gx/ directory.
!ls gx/

[34mcheckpoints[m[m            great_expectations.yml [34mprofilers[m[m
[34mexpectations[m[m           [34mplugins[m[m                [34muncommitted[m[m


## Create a validator

In [5]:
# Read the CSV at file_path through the context's default pandas source.
# The returned object is used below as the validator on which expectations
# are declared.
validator = context.sources.pandas_default.read_csv(file_path)

### Create an Expectation: Value in Range

In [6]:
# Expect every annual_inc value to lie between 1 and 10,000,000.
# Being the cell's last expression, the result object is displayed below.
validator.expect_column_values_to_be_between(
    column="annual_inc",
    min_value=1,
    max_value=10_000_000,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 17847,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 1,
    "missing_percent": 0.005603182607721186,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

##### `default.json` stores the relevant information about the expectations used for validation

In [7]:
# Shell escape: long-format listing (permissions, size, timestamp) of the
# default.json file under gx/expectations.
# NOTE(review): the suite is only saved explicitly in the next cell
# (save_expectation_suite) — on a fresh project this file may not exist yet
# when this cell runs; confirm the intended cell order.
!ls -rtl  ./gx/expectations/default.json

-rw-r--r--  1 coding  staff  383 Jul 13 18:31 ./gx/expectations/default.json


In [8]:
# Save the validator's expectation suite; discard_failed_expectations=False
# asks for failed expectations to be kept rather than dropped.
validator.save_expectation_suite(discard_failed_expectations=False)



### Create a Checkpoint
A checkpoint is a mechanism for running data validation pipelines with predefined expectations and configurations.

In [9]:
# Register the checkpoint under a fixed name (or update it if it already
# exists), bound to the validator created above.
checkpoint_name = "checkpoint_1"
checkpoint = context.add_or_update_checkpoint(name=checkpoint_name, validator=validator)

### Run the Validation

In [10]:
# Options passed to the checkpoint run: the "COMPLETE" result format plus the
# column name(s) to report for unexpected rows.
# NOTE(review): "event_id" is not referenced anywhere else in this notebook —
# confirm the loan CSV actually contains this column.
result_format: dict = dict(
    result_format="COMPLETE",
    unexpected_index_column_names=["event_id"],
)

# Execute the checkpoint and keep the result for the cells below.
checkpoint_results = checkpoint.run(result_format=result_format)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

### View Results (HTML)

In [13]:
# Show the result of the checkpoint run (the section heading describes this
# as the HTML view).
context.view_validation_result(checkpoint_results)

### View Results (json)

In [16]:
# Bare expression: the notebook displays the checkpoint result object.
checkpoint_results

{
  "run_id": {
    "run_name": null,
    "run_time": "2024-07-13T18:42:27.259390+02:00"
  },
  "run_results": {
    "ValidationResultIdentifier::default/__none__/20240713T164227.259390Z/default_pandas_datasource-#ephemeral_pandas_asset": {
      "validation_result": {
        "success": true,
        "results": [
          {
            "success": true,
            "expectation_config": {
              "expectation_type": "expect_column_values_to_be_between",
              "kwargs": {
                "column": "annual_inc",
                "max_value": 10000000,
                "min_value": 1,
                "batch_id": "default_pandas_datasource-#ephemeral_pandas_asset"
              },
              "meta": {}
            },
            "result": {
              "element_count": 17847,
              "unexpected_count": 0,
              "unexpected_percent": 0.0,
              "partial_unexpected_list": [],
              "unexpected_index_column_names": [
                "event_id"


### Was everything successful?

In [17]:
# Overall success flag of the checkpoint run.
checkpoint_results.success

True