## Set up the tool
- Prepare the environment
- Read data asset (synthetic data)

In [53]:
import pandas as pd
from datetime import datetime, timedelta
import great_expectations as gx
import numpy as np

# Load the synthetic transactions dataset from a path relative to the
# notebook's working directory and preview the first ten rows.
df = pd.read_csv('data/synthetic_data.csv')
print(df.head(10))

# Obtain a Great Expectations data context.
# NOTE(review): no arguments are passed, so the context type (ephemeral vs.
# file-backed) depends on the local GX configuration - confirm if it matters.
context = gx.get_context()

# Register a pandas data source named "pandas", attach a dataframe asset to
# it, and define a batch that covers the whole dataframe.
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
# The in-memory dataframe is supplied at batch-retrieval time; `batch` is the
# object the later cells call `.validate(...)` on.
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

   transaction_id  customer_id            transaction_date  \
0               1         2685  2025-03-06 17:50:46.055480   
1               2         1769  2024-05-06 17:50:46.055480   
2               3         7949  2024-12-20 17:50:46.055480   
3               4         3433  2024-07-16 17:50:46.055480   
4               5         6311  2024-08-07 17:50:46.055480   
5               6         6051  2024-11-19 17:50:46.055480   
6               7         7420  2025-02-02 17:50:46.055480   
7               8         2184  2025-03-06 17:50:46.055480   
8               9         5555  2024-07-16 17:50:46.055480   
9              10         4385  2024-11-03 17:50:46.055480   

   transaction_amount account_type transaction_type     status  
0         2310.159223     checking       withdrawal  completed  
1         2069.799784      savings       withdrawal    pending  
2         6383.228325   investment       withdrawal  completed  
3         2576.269638     checking       withdrawal  comp

## Implementing Data Quality Checks with Great Expectations

We'll now define and execute expectations for our two focus dimensions:

### 1. Accuracy Expectations
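- Transaction amounts should be positive (the check is configured with `mostly=0.95`, so it still succeeds when up to 5% of the values violate it)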

In [49]:

# Accuracy check: transaction amounts must be strictly positive.
# The outcome is recorded in `expectations_results` (one dict per expectation)
# and rendered as a GitHub-style table.
from tabulate import tabulate

# Define expectations
# Start from an empty list so re-running this cell never duplicates rows.
expectations_results = []

# Accuracy Expectations
print("Executing Accuracy Expectations...")

# 1. Transaction Amount Validation
# - strict_min=True makes the lower bound exclusive, so an amount of exactly 0
#   is reported as unexpected; without it 0 would pass a check that is
#   labelled "positive".
# - mostly=0.95 lets the expectation succeed when at least 95% of the checked
#   values are in range, so a few bad rows are reported without failing it.
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="transaction_amount",
    min_value=0,
    strict_min=True,
    mostly=0.95,
)

validation_result = batch.validate(expectation)

# Flatten the fields of interest from the validation result into one row.
expectations_results.append({
    'Data Quality Issue': 'Accuracy',
    'Expectation': 'Positive Transaction Amounts',
    'Success': validation_result['success'],
    'Total records': validation_result['result']['element_count'],
    'Unexpected records': validation_result['result']['unexpected_count'],
    'Unexpected percentage': f"{validation_result['result']['unexpected_percent']:.2f}%",
    'Partial List': validation_result['result']['partial_unexpected_list'],
})
df_results = pd.DataFrame(expectations_results)

print(tabulate(df_results, headers='keys', tablefmt='github'))


Executing Accuracy Expectations...


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

|    | Data Quality Issue   | Expectation                  | Success   |   Total records |   Unexpected records | Unexpected percentage   | Partial List                                                                                                                                             |
|----|----------------------|------------------------------|-----------|-----------------|----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | Accuracy             | Positive Transaction Amounts | True      |            1000 |                   19 | 1.96%                   | [-100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0] |


### 2. Completeness Expectations
- Identifying missing transaction_type values

In [50]:
# Completeness check: `transaction_type` must not contain missing values.
# Relies on `gx`, `pd`, `batch` and `expectations_results` created by the
# earlier cells; its row is added to the shared results table.
from tabulate import tabulate

# 1. Transaction Type Completeness Validation
completeness_label = 'Expect transaction_type column values to not be null'

expectation = gx.expectations.ExpectColumnValuesToNotBeNull(
    column="transaction_type",
)

print("Executing Completeness Expectations...")

validation_result = batch.validate(expectation)

# Remove the row left by a previous run of this cell so that re-running it
# replaces the completeness result instead of appending a duplicate.
# Slice assignment keeps the same list object that the other cells reference.
expectations_results[:] = [
    entry for entry in expectations_results
    if entry['Expectation'] != completeness_label
]

# Flatten the fields of interest from the validation result into one row.
expectations_results.append({
        'Data Quality Issue': 'Completeness',
        'Expectation': completeness_label,
        'Success': validation_result['success'],
        'Total records': validation_result['result']['element_count'],
        'Unexpected records': validation_result['result']['unexpected_count'],
        'Unexpected percentage': f"{validation_result['result']['unexpected_percent']:.2f}%",
        'Partial List': validation_result['result']['partial_unexpected_list'],
    })
df_results = pd.DataFrame(expectations_results)

print(tabulate(df_results, headers='keys', tablefmt='github'))

Executing Completeness Expectations...


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

|    | Data Quality Issue   | Expectation                                          | Success   |   Total records |   Unexpected records | Unexpected percentage   | Partial List                                                                                                                                             |
|----|----------------------|------------------------------------------------------|-----------|-----------------|----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | Accuracy             | Positive Transaction Amounts                         | True      |            1000 |                   19 | 1.96%                   | [-100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0] |
|  1 | Completeness         | Expect transa