In [0]:
#%pip install great_expectations==1.10.0

In [0]:
#dbutils.library.restartPython()

In [0]:
%run "./01_config"

In [0]:
# Project configuration object; `Config` is presumably supplied by the
# `%run "./01_config"` cell above — confirm.
Conf = Config()

# Recursively remove the quarantine data folder under the project directory.
print("Deleting data quality folder...", end='')
dbutils.fs.rm(f"{Conf.project_dir}gx/data_quality_quarantine/", True)
print("Done")

In [0]:
import os
import shutil
import great_expectations as gx
import great_expectations.expectations as gxe

# Root directory of the Great Expectations file context (see get_context()).
gx_root_dir = "/Volumes/fitbit_dev_catalog/gx/gx_configs"

# Start from a clean slate: wipe any existing GX configuration directory.
# (The previously computed `/dbfs`-prefixed `path_to_delete` was never used,
# so it has been dropped; the path removed is exactly `gx_root_dir`.)
if os.path.exists(gx_root_dir):
    print(f"Âº∫Âà∂Ê∏ÖÁ©∫ÁõÆÂΩï: {gx_root_dir}")
    # The second argument (True) requests a recursive delete.
    dbutils.fs.rm(gx_root_dir, True)

# Recreate the (now empty) directory so the context can be initialised in it.
os.makedirs(gx_root_dir, exist_ok=True)

def get_context():
    """Return a Great Expectations context whose root directory is ``gx_root_dir``."""
    gx_context = gx.get_context(context_root_dir=gx_root_dir)
    return gx_context

In [0]:
# Notebook-wide Great Expectations context, obtained via get_context();
# the cells below use it to look up and create expectation suites.
context = get_context()

In [0]:
# Tables to validate, each paired with the name of its expectation suite.
tables_to_validate = [
    {"table": "calories_min_bz", "suite": "calories_min_bz_suite"},
    {"table": "heartrate_sec_bz", "suite": "heartrate_sec_bz_suite"},
    {"table": "intensities_min_bz", "suite": "intensities_min_bz_suite"},
    {"table": "mets_min_bz", "suite": "mets_min_bz_suite"},
    {"table": "sleep_min_bz", "suite": "sleep_min_bz_suite"},
    {"table": "steps_min_bz", "suite": "steps_min_bz_suite"},
    {"table": "weight_daily_bz", "suite": "weight_daily_bz_suite"},
]

# Make sure every listed suite exists in the context.
for item in tables_to_validate:
    # Only the suite name is needed here; the table name is not used by this loop.
    suite_name = item["suite"]

    # Get-or-create pattern for suites in GX 1.10+.
    try:
        # Fetch the suite through `.suites.get`.
        suite = context.suites.get(name=suite_name)
        print(f"‚úÖ Â∑≤Âä†ËΩΩÁé∞Êúâ Suite: {suite_name}")
    except Exception:
        # NOTE(review): any failure of the lookup is treated as "suite does not
        # exist" — consider narrowing this to the specific not-found error.
        # Create the suite with `.suites.add` and `gx.ExpectationSuite`.
        # Original author's note: this automatically generates the matching
        # .json file under Files/gx_config/expectations/ — TODO confirm the path.
        suite = context.suites.add(gx.ExpectationSuite(name=suite_name))
        print(f"‚ú® Â∑≤ÊàêÂäüÊñ∞Âª∫Âπ∂ÊåÅ‰πÖÂåñ Suite: {suite_name}")

print("\nÊâÄÊúâ Suite ÂàùÂßãÂåñÂÆåÊàê„ÄÇ")

In [0]:
def _rules_for(columns, ranges):
    """Assemble the expectation list for a single table.

    The returned list contains, in order:
      1. an exact column-set match against ``columns``,
      2. a not-null check on ``user_id``,
      3. one between-check per entry of ``ranges`` (column name -> keyword
         bounds such as ``{"min_value": 0}``), in insertion order.
    """
    rules = [
        gxe.ExpectTableColumnsToMatchSet(column_set=columns, exact_match=True),
        gxe.ExpectColumnValuesToNotBeNull(column="user_id"),
    ]
    for column, bounds in ranges.items():
        rules.append(gxe.ExpectColumnValuesToBeBetween(column=column, **bounds))
    return rules


# Validation rules per table: expected schema plus value-range checks.
table_rules_mapping = {
    "calories_min_bz": _rules_for(
        [
            "user_id", "activity_minute", "calories", "date",
            "timeKey", "load_time", "source_file", "_dq_batch_id",
        ],
        {"calories": {"min_value": 0}},
    ),
    "heartrate_sec_bz": _rules_for(
        [
            "user_id", "time", "value", "date",
            "timeKey", "load_time", "source_file", "_dq_batch_id",
        ],
        {"value": {"min_value": 0, "max_value": 200}},
    ),
    "intensities_min_bz": _rules_for(
        [
            "user_id", "activity_minute", "intensity", "date",
            "timeKey", "load_time", "source_file", "_dq_batch_id",
        ],
        {"intensity": {"min_value": 0, "max_value": 3}},
    ),
    "mets_min_bz": _rules_for(
        [
            "user_id", "activity_minute", "mets", "date",
            "timeKey", "load_time", "source_file", "_dq_batch_id",
        ],
        {"mets": {"min_value": 10, "max_value": 200}},
    ),
    "sleep_min_bz": _rules_for(
        [
            "user_id", "activity_minute", "value", "log_id", "date",
            "timeKey", "load_time", "source_file", "_dq_batch_id",
        ],
        {"value": {"min_value": 1, "max_value": 3}},
    ),
    "steps_min_bz": _rules_for(
        [
            "user_id", "activity_minute", "steps", "date",
            "timeKey", "load_time", "source_file", "_dq_batch_id",
        ],
        {"steps": {"min_value": 0}},
    ),
    "weight_daily_bz": _rules_for(
        [
            "user_id", "date", "weight_kg", "weight_pounds", "fat", "bmi",
            "is_manual_report", "log_id", "activity_minute",
            "load_time", "source_file", "_dq_batch_id",
        ],
        {
            "weight_kg": {"min_value": 0},
            "weight_pounds": {"min_value": 0},
            "fat": {"min_value": 0},
            "bmi": {"min_value": 0},
        },
    ),
}

In [0]:
def _unattached(expectation):
    """Return ``expectation`` itself if it has no ``id``, else a copy with ``id`` cleared.

    An expectation that has already been added to a suite carries an ``id``;
    GX 1.x is expected to reject adding such an object again (confirm against
    the ExpectationSuite.add_expectation reference). Copying it and clearing
    the id lets the same rule definitions be applied on a re-run.
    """
    import copy  # local import: only needed by this helper

    if getattr(expectation, "id", None) is None:
        return expectation
    clone = copy.copy(expectation)
    clone.id = None
    return clone


def initialize_all_suites(context, rules_mapping):
    """Rebuild one expectation suite per table from ``rules_mapping``.

    For each ``table_name -> expectations`` entry, the suite named
    ``f"{table_name}_suite"`` is fetched from ``context.suites`` (or created if
    the lookup fails), emptied, and repopulated with the given expectations.

    Args:
        context: Object exposing ``suites.get(name=...)`` and ``suites.add(...)``.
        rules_mapping: Mapping of table name to a sized iterable of expectations.
    """
    for table_name, expectations in rules_mapping.items():
        suite_name = f"{table_name}_suite"

        # 1. Get or create the suite.
        try:
            suite = context.suites.get(name=suite_name)
        except Exception:
            # NOTE(review): any lookup failure is treated as "suite missing".
            suite = context.suites.add(gx.ExpectationSuite(name=suite_name))
            print(f"‚ú® ÂàõÂª∫Êñ∞ Suite: {suite_name}")
        else:
            # Clear the existing rules and persist the now-empty suite, so that
            # stale rules do not survive in the store next to the new ones.
            suite.expectations = []
            suite.save()
            print(f"üîÑ Êõ¥Êñ∞Áé∞Êúâ Suite: {suite_name}")

        # 2. Add every rule defined for this table.
        for exp in expectations:
            # Original author's note: add_expectation in 1.x de-duplicates
            # rules that are completely identical.
            suite.add_expectation(_unattached(exp))

        print(f"   Â∑≤Ê∑ªÂä† {len(expectations)} Êù°ËßÑÂàôÂà∞ {suite_name}")

In [0]:
# Populate every "<table>_suite" in the context with the rules from table_rules_mapping.
initialize_all_suites(context, table_rules_mapping)

In [0]:
# 1. Drop the old quarantine table if it exists.
spark.sql("DROP TABLE IF EXISTS fitbit_dev_catalog.gx.data_quality_quarantine")

# 2. Recreate the table.
# NOTE(review): the original comment said Unity Catalog manages the storage
# path automatically, but the statement below passes an explicit LOCATION built
# from Conf.project_dir — confirm which of the two is intended.
create_table_sql = f"""
CREATE TABLE fitbit_dev_catalog.gx.data_quality_quarantine (
    table_name STRING COMMENT 'The name of the source table where the data originated',
    gx_batch_id STRING COMMENT 'The identifier for the GX validation run (casted to string)',
    violated_rules STRING COMMENT 'A list or description of the rules that failed validation',
    raw_data STRING COMMENT 'The original record stored in JSON format',
    ingestion_time TIMESTAMP COMMENT 'The timestamp when the record was quarantined'
)
USING DELTA
LOCATION '{Conf.project_dir}gx/data_quality_quarantine/'
TBLPROPERTIES (
    'delta.columnMapping.mode' = 'name',
    'delta.minReaderVersion' = '2',
    'delta.minWriterVersion' = '5'
)
COMMENT 'Universal data quality quarantine table for storing records that failed GX validation'
"""

# Execute the DDL assembled above.
spark.sql(create_table_sql)

print("Table 'data_quality_quarantine' has been recreated successfully.")