In [1]:
pip install great_expectations

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Import Libraries

In [3]:
# NOTE(review): looks like an accidental IDE auto-import. The name `context` is
# rebound by `context = gx.get_context(...)` further down, and this cell's
# output echoes this very line (apparently from a warning). Confirm and drop.
from lib2to3.fixes.fix_input import context

import pandas as pd
import json
import great_expectations as gx
import sys
import re
import os
import smtplib

# NOTE(review): likely another accidental auto-import. The name `checkpoint` is
# rebound by `checkpoint = context.checkpoints.add(...)` later in the notebook.
# Confirm and drop.
from dask.graph_manipulation import checkpoint
from great_expectations.checkpoint.actions import EmailAction
from great_expectations.checkpoint.actions import UpdateDataDocsAction
from great_expectations.expectations import ExpectColumnValuesToBeBetween
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.execution_engine.pandas_execution_engine import PandasExecutionEngine
from great_expectations.execution_engine.sqlalchemy_execution_engine import SqlAlchemyExecutionEngine
from great_expectations.expectations.metrics import ColumnMapMetricProvider
# Decorators/helpers used when defining custom column-map metrics below.
from great_expectations.expectations.metrics.map_metric_provider import (column_map_metric_provider,
                                                                         column_condition_partial)


# Record the library and interpreter versions this notebook ran against,
# one per line.
print(gx.__version__, pd.__version__, sys.version, sep="\n")

  from lib2to3.fixes.fix_input import context


1.5.1
2.1.4
3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]


### Creating GX Context

In [4]:
# Request a GX context in "ephemeral" mode
# (presumably in-memory only, nothing persisted — confirm against GX docs).
context = gx.get_context(
    mode="ephemeral",
)

### Creating Data Source

In [5]:
# Register a pandas data source on the context under a fixed name.
data_source_name = "temperature_data"
data_source = context.data_sources.add_pandas(
    name=data_source_name,
)

### Creating Data Asset

In [6]:
# Add a DataFrame asset to the data source; the actual DataFrame is supplied
# later through the batch parameters.
data_asset_name = "emp_asset"
data_asset = data_source.add_dataframe_asset(
    name=data_asset_name,
)

### Creating Batch Definition

In [7]:
# Define a batch that covers the whole DataFrame passed in at run time.
batch_definition_name = "emp_batch"
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    batch_definition_name
)

### Creating Metric for PAN Validation

In [8]:
# Anchored PAN pattern: five capital letters, four digits, one capital letter.
# Shared by both engine implementations so they cannot drift apart.
PAN_REGEX = r"^[A-Z]{5}[0-9]{4}[A-Z]{1}$"


class ColumnValuesToMatchPanFormat(ColumnMapMetricProvider):
    """Row-level condition metric: does each value match the PAN pattern?

    Registered under the metric name ``column_values.match_pan_format`` and
    implemented for the pandas and SQLAlchemy execution engines.
    """

    condition_metric_name = "column_values.match_pan_format"

    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        """Return a boolean Series, True where ``str(value)`` is a well-formed PAN."""
        pattern = re.compile(PAN_REGEX)
        # fullmatch rather than match: with match, the trailing `$` would also
        # accept a value that ends in a newline (e.g. "ABCDE1234F\n").
        return column.apply(lambda value: pattern.fullmatch(str(value)) is not None)

    @column_condition_partial(engine=SqlAlchemyExecutionEngine)
    def _sqlalchemy(cls, column, **kwargs):
        """Return a SQL boolean expression that is true where the column matches the PAN pattern."""
        # The previous implementation fed an f-string into LIKE: the braces were
        # consumed by string formatting (yielding "[A-Z]5[0-9]4[A-Z]1") and LIKE
        # is not a regex matcher anyway. Use SQLAlchemy's regex operator instead.
        # NOTE(review): regexp_match needs SQLAlchemy >= 1.4 and a dialect that
        # supports regular expressions — confirm for the target database.
        return column.regexp_match(PAN_REGEX)

### Create Custom Expectation

In [9]:
class ExpectColumnValuesToMatchPanFormat(ColumnMapExpectation):
    """Column-map expectation backed by the ``column_values.match_pan_format`` metric."""

    # Name of the condition metric that evaluates each row.
    map_metric = "column_values.match_pan_format"

    # Only `mostly` is listed as a success key.
    success_keys = ("mostly",)

    # Defaults: every row must match, BASIC result format, config included.
    default_kwarg_values = dict(
        mostly=1.0,
        result_format="BASIC",
        include_config=True,
    )

### Creating Expectation

In [10]:
# NOTE(review): this applies the PAN-format check to the "Date" column, so every
# row is reported as unexpected (see the validation output). Confirm this is the
# intended demonstration column.
expectation_date = ExpectColumnValuesToMatchPanFormat(column="Date")

# Temperature bounds and the allowed city set are looked up at run time from the
# `expectation_parameters` passed to `checkpoint.run(...)`. With the previous
# hard-coded literals, the runtime parameter dictionaries had no effect and
# re-running with new values validated exactly the same thing.
expectation_temp = ExpectColumnValuesToBeBetween(
    column="Temperature",
    min_value={"$PARAMETER": "temp_min_value"},
    max_value={"$PARAMETER": "temp_max_value"},
)
expectation_city = gx.expectations.ExpectColumnDistinctValuesToBeInSet(
    column="City",
    value_set={"$PARAMETER": "city"},
)

# Completeness checks: none of the three columns may contain nulls.
expectation_city_not_null = gx.expectations.ExpectColumnValuesToNotBeNull(column="City")
expectation_date_not_null = gx.expectations.ExpectColumnValuesToNotBeNull(column="Date")
expectation_temp_not_null = gx.expectations.ExpectColumnValuesToNotBeNull(column="Temperature")

### Creating ExpectationSuite and Adding Expectations

In [11]:
# Build an empty suite, then register it with the context. The object returned
# by the context is the one expectations are added to afterwards.
expectation_suite_name = "emp_suite"
expectation_suite_ref = gx.ExpectationSuite(
    name=expectation_suite_name,
)
expectation_suite = context.suites.add(
    expectation_suite_ref
)

In [12]:
# Attach the first five expectations to the suite in their original order.
for _expectation in (
    expectation_date,
    expectation_temp,
    expectation_city,
    expectation_city_not_null,
    expectation_date_not_null,
):
    expectation_suite.add_expectation(_expectation)

# Kept as the cell's final bare expression so its return value is still displayed.
expectation_suite.add_expectation(expectation_temp_not_null)

ExpectColumnValuesToNotBeNull(id='283720af-41af-4cfe-8255-990fdb7189ed', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='Temperature', mostly=1, row_condition=None, condition_parser=None)

### Creating Validation Definition

In [13]:
# Pair the batch definition (what data) with the suite (which checks) under a name.
validation_def_name = "emp_data_validation"
validation_definition_ref = gx.ValidationDefinition(
    name=validation_def_name,
    data=batch_definition,
    suite=expectation_suite,
)

In [14]:
# Register the validation definition with the context; the returned object is
# what the checkpoint is built from below.
validation_definition = context.validation_definitions.add(validation_definition_ref)

### Creating Update Data Docs Action

In [15]:
# Action configuration (plain dict) asking the checkpoint to update Data Docs
# for the site named "local_site".
update_action = dict(
    name="my_docs_action",
    type="update_data_docs",
    site_names=["local_site"],
)

### Creating Send Email Action

In [16]:
# Load email credentials from a local JSON file so they are not hard-coded in
# the notebook. The encoding is stated explicitly so the file is read the same
# way regardless of the platform's default encoding.
with open('Email Credentials.json', encoding="utf-8") as f:
    data = json.load(f)

mailserveraddr = data['email']['mailServer']
fromaddr = data['email']['from']
password = data['email']['password']
to = data['email']['to']
port = data['email']['port']

# The action below is given the recipients as one comma-separated string.
# Accept either a list of addresses or a single address string in the JSON;
# joining a plain string would otherwise put a comma between every character.
receiver_emails = to if isinstance(to, str) else ",".join(to)

# Test email connection. This is a best-effort probe: a failure is reported but
# does not stop the notebook. The timeout keeps an unreachable server from
# hanging the kernel indefinitely.
try:
    with smtplib.SMTP(mailserveraddr, port, timeout=30) as server:
        server.starttls()
        server.login(fromaddr, password)
        print("Email connection successful!")
except Exception as e:
    print("Email connection failed:", str(e))

# Email action configuration (plain dict) handed to the checkpoint.
email_action = {
    "name": "send_email_notification",  # action name
    "type": "email",  # selects the email action type
    "notify_on": "all",
    "smtp_address": mailserveraddr,
    "smtp_port": port,
    "sender_login": fromaddr,
    "sender_password": password,
    "receiver_emails": receiver_emails,  # comma-separated string
    "use_tls": True
}

Email connection successful!


### Creating Checkpoint

In [17]:
# Assemble the checkpoint: one validation definition plus the two action
# configurations defined above, with COMPLETE result detail.
checkpoint_name = "temp_run_checkpoint"
checkpoint_to_add = gx.Checkpoint(
    name=checkpoint_name,
    result_format="COMPLETE",
    validation_definitions=[validation_definition],
    actions=[email_action, update_action],
)

# Register it with the context and keep the returned checkpoint for running.
checkpoint = context.checkpoints.add(checkpoint_to_add)

### Reading Data in Pandas DataFrame

In [18]:
# Load the temperature readings from the CSV next to the notebook and preview
# the first rows (the bare last expression is what the cell displays).
data_df = pd.read_csv(
    'temperature.csv',
)
data_df.head()

Unnamed: 0,Date,City,Temperature
0,8/1/2024,Mumbai,28
1,8/2/2024,Mumbai,30
2,8/3/2024,Mumbai,32
3,8/4/2024,Mumbai,31
4,8/5/2024,Mumbai,33


### Creating Batch Parameters

In [19]:
# Hand the in-memory DataFrame to the batch definition under the "dataframe" key.
batch_parameters = dict(dataframe=data_df)

### Creating runtime parameter object

In [20]:
# Values passed to the checkpoint run as expectation parameters.
runtime_parameters = dict(
    temp_min_value=29,
    temp_max_value=50,
    city=["Mumbai", "Delhi"],
)

### Running Checkpoint

In [21]:
# Run the checkpoint against the DataFrame with the current runtime parameters.
validation_results = checkpoint.run(
    expectation_parameters=runtime_parameters,
    batch_parameters=batch_parameters,
)


Calculating Metrics:   0%|          | 0/29 [00:00<?, ?it/s]

### Updating runtime parameter object

In [26]:
# Replacement values for the second run: a lower temperature window, same cities.
runtime_parameters = dict(
    temp_min_value=19,
    temp_max_value=25,
    city=["Mumbai", "Delhi"],
)

### Rerunning Checkpoint

In [27]:
# Re-run the same checkpoint, now with the updated runtime parameters.
validation_results = checkpoint.run(
    expectation_parameters=runtime_parameters,
    batch_parameters=batch_parameters,
)

Calculating Metrics:   0%|          | 0/29 [00:00<?, ?it/s]

### Displaying validation results

In [28]:
# Print the result object returned by the most recent checkpoint run.
print(validation_results)

run_id={"run_name": null, "run_time": "2025-06-23T14:07:48.924329+06:00"} run_results={ValidationResultIdentifier::emp_suite/__none__/20250623T080748.924329Z/temperature_data-emp_asset: {
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_match_pan_format",
        "kwargs": {
          "batch_id": "temperature_data-emp_asset",
          "column": "Date"
        },
        "meta": {},
        "id": "0421f92b-0e07-48fa-b3e3-9f0a14cb993d"
      },
      "result": {
        "element_count": 62,
        "unexpected_count": 62,
        "unexpected_percent": 100.0,
        "partial_unexpected_list": [
          "8/1/2024",
          "8/2/2024",
          "8/3/2024",
          "8/4/2024",
          "8/5/2024",
          "8/6/2024",
          "8/7/2024",
          "8/8/2024",
          "8/9/2024",
          "8/10/2024",
          "8/11/2024",
          "8/12/2024",
          "8/13/2024",
          "8/14/2024",

### Opening Data Docs

In [25]:
# Ask the context to open the Data Docs
# (presumably launches the rendered site in a browser — confirm against GX docs).
context.open_data_docs()