# Data Quality Validation Notebook

This notebook performs comprehensive data quality validation using AWS Deequ framework.

## Steps:
1. Load raw data from Delta Lake
2. Define data quality rules
3. Run AWS Deequ validation checks
4. Identify and flag anomalies
5. Store validation results


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import logging

# Retrieve AWS credentials from Databricks Secrets
access_key = dbutils.secrets.get("aws-keys", "aws-access-key")
secret_key = dbutils.secrets.get("aws-keys", "aws-secret-key")

# Build the Spark session with Delta Lake and S3 support
spark = (
    SparkSession.builder
    .appName("PsychoBunny-DataQuality")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.access.key", access_key)
    .config("spark.hadoop.fs.s3a.secret.key", secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.jars.packages", "com.amazon.deequ:deequ:2.0.4-spark-3.4") \
    .getOrCreate()
)

spark.conf.set("fs.s3a.access.key", access)
spark.conf.set("fs.s3a.secret.key", secret)
spark.conf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")   

# logger code
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Spark session initialized with Delta Lake and AWS S3 support")


INFO:__main__:Spark session initialized with Delta Lake and AWS S3 support


In [0]:


# Initialize Spark session with Delta Lake and Deequ support
spark = SparkSession.builder \
    .appName("PsychoBunny-DataQuality") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "com.amazon.deequ:deequ:2.0.4-spark-3.4") \
    .getOrCreate()

logger.info("Spark session initialized with Delta Lake and Deequ support")


INFO:__main__:Spark session initialized with Delta Lake and Deequ support
INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Setup PyDeequ
import os
os.environ["SPARK_VERSION"] = "3.3"

from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite

# Data paths
RAW_DATA_PATH = "s3://psycho-bunny-data-lake/raw-data/"

print("PyDeequ setup completed")


PyDeequ setup completed


In [0]:
# Load data
customers_df = spark.read.format("delta").load(f"{RAW_DATA_PATH}customers")
transactions_df = spark.read.format("delta").load(f"{RAW_DATA_PATH}transactions")
calendar_df = spark.read.format("delta").load(f"{RAW_DATA_PATH}calendar")

print("Data loaded:")
print(f"Customers: {customers_df.count()}")
print(f"Transactions: {transactions_df.count()}")
print(f"Calendar: {calendar_df.count()}")


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


Data loaded:


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Python Server ready to receive messages
INFO:py4j.clientserver:Received command c on object id p0


Customers: 2000


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


Transactions: 2823


INFO:py4j.clientserver:Received command c on object id p0


Calendar: 6944


INFO:py4j.clientserver:Received command c on object id p0


In [0]:
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite

# Test 1: Customer Data Size Check
check = Check(spark, CheckLevel.Warning, "Customer Size Check") \
    .hasSize(lambda size: size >= 1000)

result = VerificationSuite(spark) \
    .onData(customers_df) \
    .addCheck(check) \
    .run()

print("Customer Size Check:")
print(f"Overall Status: {result.status}\n")

for check_result in result.checkResults:
    print(f"Results for check: {check_result}")
    for cr in check_result:
        if isinstance(cr, dict):
            print(f"  Constraint: {cr['constraint']}")
            print(f"  Status:     {cr['status']}")
            print(f"  Message:    {cr['message']}\n")

INFO:py4j.clientserver:Received command c on object id p0


Customer Size Check:
Overall Status: Success



In [0]:
%%writefile /databricks/driver/dq.py

from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite

class DataQuality:
    def __init__(self, spark, df):
        self.spark = spark
        self.df = df
        self.suite = VerificationSuite(spark).onData(df)
        self.checks = []

    def add_size_check(self, min_rows, level=CheckLevel.Error):
        self.checks.append(
            Check(self.spark, level, f"size_at_least_{min_rows}").hasSize(lambda s: s >= min_rows)
        )
        return self

    def add_not_null(self, column, level=CheckLevel.Error):
        """Ensure no NULLs in `column`."""
        self.checks.append(
            Check(self.spark, level, f"{column}_not_null").isComplete(column)
        )
        return self

    def add_unique(self, column, level=CheckLevel.Error):
        """Ensure all values in `column` are unique."""
        self.checks.append(
            Check(self.spark, level, f"{column}_unique").isUnique(column)
        )
        return self

    def run(self):
        # attach all checks
        for c in self.checks:
            self.suite = self.suite.addCheck(c)
        result = self.suite.run()

        if result.status != "Success":
            failures = []
            for _, checks in result.checkResults.items():
                for cr in checks:
                    if cr["status"] != "Success":
                        failures.append(f"{cr['constraint']}: {cr['message']}")
            raise AssertionError("Data Quality failed:\n" + "\n".join(failures))

        return result



INFO:py4j.clientserver:Received command c on object id p0


Overwriting /databricks/driver/dq.py


In [0]:
import sys
if '/databricks/driver' not in sys.path:
    sys.path.insert(0, '/databricks/driver')

from dq import DataQuality
from pyspark.sql.functions import col

dq = (
    DataQuality(spark, customers_df)
    .add_size_check(1000)
    .add_not_null("customer_id")
    .add_unique("customer_id")
)
dq.run()   

email_regex = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
invalids = customers_df.filter(~col("EMAIL").rlike(email_regex))
cnt = invalids.count()
if cnt:
    print(f"{cnt} invalid emails")
    invalids.show(truncate=False)
    raise ValueError("Email format validation failed")

print("All checks passed!")


[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-7178245291264403>, line 13[0m
[1;32m      6[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01msql[39;00m[38;5;21;01m.[39;00m[38;5;21;01mfunctions[39;00m [38;5;28;01mimport[39;00m col
[1;32m      8[0m [38;5;66;03m# 1) Built-in checks via Deequ[39;00m
[1;32m      9[0m dq [38;5;241m=[39m (
[1;32m     10[0m     DataQuality(spark, customers_df)
[1;32m     11[0m     [38;5;241m.[39madd_size_check([38;5;241m1000[39m)
[1;32m     12[0m     [38;5;66;03m#.add_not_null("customer_id")[39;00m
[0;32m---> 13[0m     [38;5;241m.[39madd_unique([38;5;124m"[39m[38;5;124mcustomer_id[39m[38;5;124m"[39m)
[1;32m     14[0m )
[1;32m     15[0m dq[38;5;241m.[39mrun()   [38;5;66;03m# raises if any built-in check fails[39;00m
[1;32m    