# IMPORT LIBS

In [1]:
import os

import great_expectations as gx
from great_expectations.data_context.types.base import DataContextConfig, DatasourceConfig, FilesystemStoreBackendDefaults
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest

from ruamel import yaml

# Filesystem locations used throughout the notebook:
#   root_directory  - base directory handed to the Great Expectations store backends
#   data_directory  - base directory scanned by the filesystem data connector
root_directory = "/development/wsl/DataQualityTest/config/quality/"
data_directory = "/development/wsl/DataQualityTest/data/"

# CONFIGURATION PROCESS

## DEFINE DATA CONTEXT CONFIG

In [2]:
# Programmatic Data Context configuration: filesystem-backed stores rooted at
# `root_directory`, plus a PostgreSQL-backed metric store.

# The metric-store password is read from the environment so that no secret is
# saved with the notebook. Export GE_METRIC_STORE_PASSWORD before starting the
# kernel; GE_METRIC_STORE_USER optionally overrides the default username.
# NOTE(review): the password that used to be hard-coded here should be treated
# as exposed and rotated.
metric_store_password = os.environ.get("GE_METRIC_STORE_PASSWORD")
if not metric_store_password:
    raise RuntimeError(
        "Environment variable GE_METRIC_STORE_PASSWORD is not set; "
        "it is required to connect to the metric store database."
    )

dataSourceGE = DataContextConfig(
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=root_directory
    ),
    stores={
        "expectations_store": {
            "class_name": "ExpectationsStore",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": "expectations/",
                "root_directory": root_directory,
            },
        },
        # Validation results are written to a separate output tree rather than
        # under `root_directory`.
        "validations_store": {
            "class_name": "ValidationsStore",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": "validations/",
                "root_directory": "/development/wsl/DataQualityTest/output/",
            },
        },
        "evaluation_parameter_store": {
            "class_name": "EvaluationParameterStore",
        },
        "checkpoint_store": {
            "class_name": "CheckpointStore",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "suppress_store_backend_id": True,
                "base_directory": "checkpoints/",
                "root_directory": root_directory,
            },
        },
        "profiler_store": {
            "class_name": "ProfilerStore",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "suppress_store_backend_id": True,
                "base_directory": "profilers/",
                "root_directory": root_directory,
            },
        },
        # Target of the StoreMetricsAction entries (referenced by name as
        # "metric_store").
        "metric_store": {
            "class_name": "MetricStore",
            "store_backend": {
                "class_name": "DatabaseStoreBackend",
                "credentials": {
                    "drivername": "postgresql",
                    "username": os.environ.get("GE_METRIC_STORE_USER", "guilo"),
                    "password": metric_store_password,
                    "host": "localhost",
                    "port": "5432",
                    "database": "ge",
                },
            },
        },
    },
    validations_store_name="validations_store",
    # NOTE(review): the checkpoint defined later in this notebook registers the
    # same StoreMetricsAction in its own action_list; confirm whether this
    # validation operator is still needed.
    validation_operators={
        "action_list_operator": {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_metrics",
                    "action": {
                        "class_name": "StoreMetricsAction",
                        "target_store_name": "metric_store",
                        "requested_metrics": {
                            # "*" key: the listed statistics are requested
                            # under a wildcard rather than a named suite.
                            "*": [
                                "statistics.evaluated_expectations",
                                "statistics.success_percent",
                                "statistics.unsuccessful_expectations",
                            ],
                        },
                    },
                },
            ],
        },
    },
)

In [3]:
# validation_operators={
#     "action_list_operator": {
#         "class_name": "ActionListValidationOperator",
#         "action_list": [
#             {
#                 "name": "store_metrics",
#                 "action": {
#                     "class_name": "StoreMetricsAction",
#                     "target_store_name": "metric_store",
#                     "requested_metrics": {
#                         "*": {
#                             "statistics.evaluated_expectations",
#                             "statistics.successful_expectations",
#                             "statistics.unsuccessful_expectations"
#                         },
#                     },
#                 },
#             },
#         ],
#     }
# },

### CREATE DATA CONTEXT

In [4]:
# Build the Data Context from the in-code project configuration defined above
# (no great_expectations.yml is read here).
context = gx.get_context(
    project_config=dataSourceGE,
)

## DEFINE DATA SOURCE CONFIG

In [5]:
# Pandas-backed datasource with two data connectors:
#   runtimeConnector - RuntimeDataConnector keyed by "default_identifier_name"
#   folderConnector  - InferredAssetFilesystemDataConnector over `data_directory`;
#                      the regex capture group becomes the data asset name, so
#                      only files ending in ".csv" are matched.
datasource_config = {
    "name": "pandasDataSource",
    "class_name": "Datasource",
    "module_name": "great_expectations.datasource",
    "execution_engine": {
        "module_name": "great_expectations.execution_engine",
        "class_name": "PandasExecutionEngine",
    },
    "data_connectors": {
        "runtimeConnector": {
            "class_name": "RuntimeDataConnector",
            "module_name": "great_expectations.datasource.data_connector",
            "batch_identifiers": ["default_identifier_name"],
        },
        "folderConnector": {
            "class_name": "InferredAssetFilesystemDataConnector",
            "base_directory": data_directory,
            "default_regex": {
                "group_names": ["data_asset_name"],
                # Raw string: "\." is not a valid Python escape sequence and
                # triggers an invalid-escape warning in a normal string literal.
                # The resulting pattern text is unchanged.
                "pattern": r"(.*)\.csv$",
            },
        },
    },
}

### TEST DATA SOURCE CONFIG

In [6]:
# Serialize the datasource config to YAML and ask the context to try
# instantiating it; the cell output reports the result of that attempt.
datasource_yaml = yaml.dump(datasource_config)
context.test_yaml_config(datasource_yaml)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	folderConnector : InferredAssetFilesystemDataConnector

	Available data_asset_names (1 of 1):
		Looks_vs_Personality (1 of 1): ['Looks_vs_Personality.csv']

	Unmatched data_references (1 of 1):['README.md']

	runtimeConnector:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x1d44c68c940>

### ADD DATA SOURCE

In [7]:
# Register the datasource on the context, passing every entry of
# `datasource_config` as a keyword argument.
context.add_datasource(**datasource_config)

<great_expectations.datasource.new_datasource.Datasource at 0x1d44c6b0d60>

### CHECK DATA SOURCES

In [8]:
# Display the datasources currently registered on the context.
context.list_datasources()

[{'data_connectors': {'runtimeConnector': {'class_name': 'RuntimeDataConnector',
    'module_name': 'great_expectations.datasource.data_connector',
    'batch_identifiers': ['default_identifier_name']},
   'folderConnector': {'default_regex': {'group_names': ['data_asset_name'],
     'pattern': '(.*)\\.csv$'},
    'class_name': 'InferredAssetFilesystemDataConnector',
    'module_name': 'great_expectations.datasource.data_connector',
    'base_directory': '/development/wsl/DataQualityTest/data/'}},
  'name': 'pandasDataSource',
  'class_name': 'Datasource',
  'module_name': 'great_expectations.datasource',
  'execution_engine': {'class_name': 'PandasExecutionEngine',
   'module_name': 'great_expectations.execution_engine'}}]

# VALIDATION PROCESS

## DEFINE BATCH REQUEST

In [9]:
# Address the datasource and connector by the names configured above instead of
# by position in context.list_datasources(), so the lookup does not depend on
# which other datasources happen to be registered on the context.
datasource_name = datasource_config["name"]
data_connector_name = "folderConnector"

available_assets = context.get_available_data_asset_names()[datasource_name][data_connector_name]
if not available_assets:
    # Fail with an actionable message rather than a bare IndexError.
    raise ValueError(
        f"No data assets found for connector {data_connector_name!r} "
        f"of datasource {datasource_name!r}; check the data directory."
    )

# Validate the first asset reported by the connector.
batch_request = BatchRequest(
    datasource_name=datasource_name,
    data_connector_name=data_connector_name,
    data_asset_name=available_assets[0],
)

In [10]:
# Display the resolved batch request (datasource, connector and asset names).
batch_request

{
  "datasource_name": "pandasDataSource",
  "data_connector_name": "folderConnector",
  "data_asset_name": "Looks_vs_Personality"
}

## CREATE EXPECTATION SUITE

In [11]:
# Name under which the expectation suite is created and later referenced.
expectation_suite_name = "DataQuality"

# overwrite_existing=True is passed so the cell can be re-run against the same
# suite name.
context.create_expectation_suite(
    expectation_suite_name=expectation_suite_name,
    overwrite_existing=True,
)

{
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.15.43"
  },
  "ge_cloud_id": null,
  "expectations": [],
  "expectation_suite_name": "DataQuality"
}

## CREATE VALIDATOR

In [12]:
# Bind the batch described by `batch_request` to the expectation suite.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first two rows. A bare last expression lets the notebook use the
# object's rich display instead of the plain-text output of print().
validator.head(2)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

   Unweighted_Sample  Weighted_Sample               Question Nationality  \
0                454              530  They are good looking    American   
1                454              530  They are good looking    American   

  Gender    Rank (text)  Rank (number)  Percentage  
0    Men   Ranked first              1        0.18  
1    Men  Ranked second              2        0.13  


### DEFINE RULES

In [13]:
# Rule: the "Question" column must not contain null values.
validator.expect_column_values_to_not_be_null(
    column="Question",
)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 1440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [14]:
# Rule: "Percentage" values must lie between 0 and 1.
validator.expect_column_values_to_be_between(
    column="Percentage",
    min_value=0,
    max_value=1,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 1440,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {}
}

### SAVE RULES TO THE EXPECTATION SUITE

In [15]:
# Save the validator's expectation suite; discard_failed_expectations=False is
# passed so that failed expectations are not dropped on save.
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

## CREATE CHECKPOINT

In [16]:
my_checkpoint_name = "dataQuality_Check"

# Statistics handed to StoreMetricsAction under the "*" key.
requested_statistics = [
    "statistics.evaluated_expectations",
    "statistics.success_percent",
    "statistics.unsuccessful_expectations",
]

# Actions listed in the checkpoint's action_list, in the order given here.
checkpoint_actions = [
    {
        "name": "store_validation_result",
        "action": {"class_name": "StoreValidationResultAction"},
    },
    {
        "name": "store_evaluation_params",
        "action": {"class_name": "StoreEvaluationParametersAction"},
    },
    {
        "name": "update_data_docs",
        "action": {"class_name": "UpdateDataDocsAction", "site_names": []},
    },
    {
        "name": "store_metrics",
        "action": {
            "class_name": "StoreMetricsAction",
            "target_store_name": "metric_store",
            "requested_metrics": {"*": requested_statistics},
        },
    },
]

# Full checkpoint definition: one validation pairing `batch_request` with the
# expectation suite created earlier.
checkpoint_config = {
    "action_list": checkpoint_actions,
    "batch_request": {},
    "class_name": "Checkpoint",
    "config_version": 1.0,
    "evaluation_parameters": {},
    "module_name": "great_expectations.checkpoint",
    "name": my_checkpoint_name,
    "profilers": [],
    "run_name_template": "qualityCheck_%d-%m-%Y_%H-%M-%S",
    "runtime_configuration": {},
    "validations": [
        {
            "batch_request": batch_request,
            "expectation_suite_name": expectation_suite_name,
        },
    ],
}

In [17]:
# my_checkpoint_name = "dataQuality_Check"

# checkpoint_config = {
#     "name": my_checkpoint_name,
#     "config_version": 1.0,
#     "class_name": "SimpleCheckpoint",
#     "run_name_template": "qualityCheck_%d-%m-%Y_%H-%M-%S",
#     "validations": [
#         {
#             "batch_request": batch_request,
#             "expectation_suite_name": expectation_suite_name,
#         }
#     ],
# }

### TEST CHECKPOINT CONFIG

In [18]:
# Serialize the checkpoint config to YAML and ask the context to try
# instantiating it; the returned object is kept in `my_checkpoint`.
checkpoint_yaml = yaml.dump(checkpoint_config)
my_checkpoint = context.test_yaml_config(checkpoint_yaml)

Attempting to instantiate class from config...
	Instantiating as a Checkpoint, since class_name is Checkpoint
	Successfully instantiated Checkpoint


Checkpoint class name: Checkpoint


### ADD CHECKPOINT

In [19]:
# Register the checkpoint on the context, passing every entry of
# `checkpoint_config` as a keyword argument.
context.add_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction"
      }
    },
    {
      "name": "store_metrics",
      "action": {
        "class_name": "StoreMetricsAction",
        "target_store_name": "metric_store",
        "requested_metrics": {
          "*": [
            "statistics.evaluated_expectations",
            "statistics.success_percent",
            "statistics.unsuccessful_expectations"
          ]
        }
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "dataQuality_Check",
  "profilers": [],
  "run_na

### RUN CHECKPOINT

In [22]:
# Run the checkpoint registered under `my_checkpoint_name` and keep the result
# object for inspection.
checkpoint_result = context.run_checkpoint(checkpoint_name=my_checkpoint_name)

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]

In [21]:
# Display the context's effective configuration.
# NOTE(review): this output can include store settings; check it for
# credentials before sharing the notebook.
context

{
  "anonymous_usage_statistics": {
    "explicit_id": true,
    "data_context_id": "36eeef9c-23b8-49b3-9de3-ec97aa6ceff9",
    "explicit_url": false,
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "enabled": true
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_version": 3,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "uncommitted/data_docs/local_site/",
        "root_directory": "/development/wsl/DataQualityTest/config/quality/"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {
    "pandasDataSource": {
      "class_name": "Datasource",
      "data_connectors": {
        "runtimeConnector": {
          "class_name": "RuntimeDataConnector",
          "module_name": "gr