# IMPORT LIBS

In [1]:
import great_expectations as gx
from great_expectations.data_context.types.base import DataContextConfig, DatasourceConfig, FilesystemStoreBackendDefaults
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest

from ruamel import yaml

# Filesystem locations used by the rest of the notebook:
#   root_directory -> handed to FilesystemStoreBackendDefaults as the store root
#   data_directory -> base directory scanned by the folder data connector
root_directory, data_directory = (
    '/development/wsl/DataQualityTest/config/quality/',
    '/development/wsl/DataQualityTest/data/',
)

# CONFIGURATION PROCESS

## DEFINE DATA CONTEXT CONFIG

In [2]:
# Filesystem-backed store defaults, all rooted at `root_directory`.
filesystem_store_defaults = FilesystemStoreBackendDefaults(root_directory=root_directory)

# In-code project configuration (no great_expectations.yml is read here);
# it is passed to gx.get_context() to build the data context.
dataSourceGE = DataContextConfig(store_backend_defaults=filesystem_store_defaults)

### CREATE DATA CONTEXT

In [3]:
# Build the data context from the in-code project configuration defined above;
# every later cell (datasources, suites, checkpoints) goes through `context`.
context = gx.get_context(project_config=dataSourceGE)

## DEFINE DATA SOURCE CONFIG

In [4]:
# Datasource definition: one pandas execution engine exposed through two
# data connectors.
#   runtimeConnector -> RuntimeDataConnector keyed by "default_identifier_name"
#   folderConnector  -> InferredAssetFilesystemDataConnector over `data_directory`
datasource_config = {
    "name": "pandasDataSource",
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        "runtimeConnector": {
            "class_name": "RuntimeDataConnector",
            "module_name": "great_expectations.datasource.data_connector",
            "batch_identifiers": ["default_identifier_name"],
        },
        "folderConnector": {
            "class_name": "InferredAssetFilesystemDataConnector",
            "base_directory": data_directory,
            # Raw string: `\.` is a regex escape (literal dot), not a Python
            # string escape. Without the r-prefix Python emits an
            # invalid-escape-sequence warning. The capture group becomes the
            # data asset name, so only "*.csv" files are matched.
            "default_regex": {"group_names": ["data_asset_name"], "pattern": r"(.*)\.csv$"},
        },
    },
}

### TEST DATA SOURCE CONFIG

In [5]:
# Serialise the datasource definition to YAML, then let the context try to
# instantiate it. The call is left as the last expression so its return value
# is displayed under the cell.
datasource_yaml = yaml.dump(datasource_config)
context.test_yaml_config(yaml_config=datasource_yaml)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	folderConnector : InferredAssetFilesystemDataConnector

	Available data_asset_names (1 of 1):
		Looks_vs_Personality (1 of 1): ['Looks_vs_Personality.csv']

	Unmatched data_references (1 of 1):['README.md']

	runtimeConnector:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x24b117abbb0>

### ADD DATA SOURCE

In [6]:
# Register the datasource on the context, expanding the config dict into
# keyword arguments (name, class_name, execution_engine, data_connectors, ...).
context.add_datasource(**datasource_config)

<great_expectations.datasource.new_datasource.Datasource at 0x24b117d26d0>

### CHECK DATA SOURCES

In [7]:
# Sanity check: show the datasource configurations currently known to the context.
context.list_datasources()

[{'execution_engine': {'class_name': 'PandasExecutionEngine',
   'module_name': 'great_expectations.execution_engine'},
  'class_name': 'Datasource',
  'name': 'pandasDataSource',
  'module_name': 'great_expectations.datasource',
  'data_connectors': {'runtimeConnector': {'class_name': 'RuntimeDataConnector',
    'batch_identifiers': ['default_identifier_name'],
    'module_name': 'great_expectations.datasource.data_connector'},
   'folderConnector': {'default_regex': {'group_names': ['data_asset_name'],
     'pattern': '(.*)\\.csv$'},
    'class_name': 'InferredAssetFilesystemDataConnector',
    'module_name': 'great_expectations.datasource.data_connector',
    'base_directory': '/development/wsl/DataQualityTest/data/'}}}]

# VALIDATION PROCESS

## DEFINE BATCH REQUEST

In [8]:
# Resolve the datasource and connector names once, from the configuration that
# was registered, so the BatchRequest and the asset lookup can never disagree.
# (Previously one came from list_datasources()[0] and the other was hard-coded.)
datasource_name = datasource_config["name"]
data_connector_name = "folderConnector"

# get_available_data_asset_names() is indexed by datasource name, then by
# data connector name, and yields the asset names found by that connector.
available_asset_names = context.get_available_data_asset_names()[datasource_name][data_connector_name]

batch_request = BatchRequest(
    datasource_name=datasource_name,
    data_connector_name=data_connector_name,
    # First asset reported by the connector.
    # NOTE(review): assumes the data directory holds a single CSV asset — confirm.
    data_asset_name=available_asset_names[0],
)

In [9]:
# Display the batch request to confirm the datasource, connector and asset names.
batch_request

{
  "datasource_name": "pandasDataSource",
  "data_connector_name": "folderConnector",
  "data_asset_name": "Looks_vs_Personality"
}

## CREATE EXPECTATION SUITE

In [10]:
# Name shared by the suite, the validator and the checkpoint below.
expectation_suite_name = 'DataQuality'

# overwrite_existing=True is passed so the cell can be re-run when a suite with
# this name already exists. The call is the last expression so the new (empty)
# suite is displayed under the cell.
suite_options = {
    "expectation_suite_name": expectation_suite_name,
    "overwrite_existing": True,
}
context.create_expectation_suite(**suite_options)

{
  "data_asset_type": null,
  "expectations": [],
  "expectation_suite_name": "DataQuality",
  "meta": {
    "great_expectations_version": "0.15.43"
  },
  "ge_cloud_id": null
}

## CREATE VALIDATOR

In [11]:
# Bind the batch described by `batch_request` to the expectation suite; the
# expectations in the following cells are evaluated through this validator.
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Quick look at the first two rows of the loaded batch.
preview_rows = validator.head(2)
print(preview_rows)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

   Unweighted_Sample  Weighted_Sample               Question Nationality  \
0                454              530  They are good looking    American   
1                454              530  They are good looking    American   

  Gender    Rank (text)  Rank (number)  Percentage  
0    Men   Ranked first              1       18.00  
1    Men  Ranked second              2        0.13  


### DEFINE RULES

In [12]:
# Rule 1: the 'Question' column must not contain null values.
validator.expect_column_values_to_not_be_null(column='Question')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 1440,
    "unexpected_count": 3,
    "unexpected_percent": 0.20833333333333334,
    "partial_unexpected_list": [
      null,
      null,
      null
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": false
}

In [13]:
# Rule 2: 'Percentage' must lie between 0 and 1.
percentage_bounds = {"min_value": 0, "max_value": 1}
validator.expect_column_values_to_be_between(column="Percentage", **percentage_bounds)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 1440,
    "unexpected_count": 2,
    "unexpected_percent": 0.1388888888888889,
    "partial_unexpected_list": [
      18.0,
      8.0
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.1388888888888889,
    "unexpected_percent_nonmissing": 0.1388888888888889
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": false
}

### SAVE RULES TO THE EXPECTATION SUITE

In [14]:
# Save the validator's expectation suite. discard_failed_expectations=False is
# passed so that rules which failed on this batch are still kept in the suite.
validator.save_expectation_suite(discard_failed_expectations=False)

## CREATE CHECKPOINT

In [19]:
my_checkpoint_name = "dataQuality_Check"

# The single validation this checkpoint performs: the batch selected earlier,
# checked against the 'DataQuality' expectation suite.
suite_validation = {
    "batch_request": batch_request,
    "expectation_suite_name": expectation_suite_name,
}

# SimpleCheckpoint definition; run_name_template is a strftime-style pattern
# used to name each run.
checkpoint_config = {
    "name": my_checkpoint_name,
    "config_version": 1.0,
    "class_name": "SimpleCheckpoint",
    "run_name_template": "qualityCheck_%d-%m-%Y_%H-%M",
    "validations": [suite_validation],
}

### TEST CHECKPOINT CONFIG

In [20]:
# Serialise the checkpoint definition to YAML and let the context try to
# instantiate it; the instantiated object is kept in `my_checkpoint`.
checkpoint_yaml = yaml.dump(checkpoint_config)
my_checkpoint = context.test_yaml_config(yaml_config=checkpoint_yaml)

Attempting to instantiate class from config...
	Instantiating as a SimpleCheckpoint, since class_name is SimpleCheckpoint
	Successfully instantiated SimpleCheckpoint


Checkpoint class name: SimpleCheckpoint


### ADD CHECKPOINT

In [21]:
# Register the checkpoint on the context, expanding the config dict into
# keyword arguments; the resulting checkpoint is displayed under the cell.
context.add_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "dataQuality_Check",
  "profilers": [],
  "run_name_template": "qualityCheck_%d-%m-%Y_%H-%M",
  "runtime_configuration": {},
  "validations": [
    {
      "batch_request": {
        "datasource_name": "pandasDataSource",
        "data_connector_name": "folderConnector",
        "data_asset_name": "Looks_vs_Personality"
      },
      "expectation_suite_name": "DataQuality"
    }
  ]
}

### RUN CHECKPOINT

In [22]:
# Run the registered checkpoint by name and keep the result object for later
# inspection.
checkpoint_result = context.run_checkpoint(checkpoint_name=my_checkpoint_name)

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]