## Data Quality checks
- Check fo missing data or gaps in time series 
- Check Outliers (if applicable)
- Schema validation - already done during ingestion phase

In [None]:
start_time = ""
end_time = ""

In [10]:
import os
import json
from dotenv import load_dotenv
from datetime import datetime, timedelta, timezone
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, max, expr, to_date, count, mean, stddev, col,to_timestamp, lit

In [11]:
#load_dotenv()
#os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home"
# Get Jar path needed for spark session
# For simplicity using locally downloaded jars for delta format
cwd = os.getcwd()
if cwd.endswith("notebooks"):
    proj_dir = os.path.abspath("..")
else:
    proj_dir = cwd
jar_dir = os.path.join(proj_dir, "jars")
jar1 = os.path.join(jar_dir, "delta-spark_2.13-4.0.0.jar")
jar2 = os.path.join(jar_dir, "delta-storage-4.0.0.jar")

In [12]:
spark = SparkSession.builder.appName("EnergyUseCase") \
            .config("spark.jars", f"{jar1},{jar2}") \
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
            .getOrCreate()

## Checking DQ of public power data ..

In [20]:

power_data = spark.read.format("delta").load(f"{proj_dir}/data/silver/public_power_data")  
power_data_24hr = power_data.filter((col("timestamp") >= to_timestamp(lit(start_time))) & (col("timestamp") <= to_timestamp(lit(end_time))))


In [27]:
# DQ functions 
def check_for_gaps():
    """ Records per day should have 96 distinct values because of 15 mins interval"""

    res = power_data_24hr.withColumn("day", to_date("timestamp")) \
        .groupBy("production_type","day") \
        .agg(count("*").alias("records_per_day")) \
        .orderBy("production_type", "day") 
    cnt = res.select("records_per_day").distinct().count()
    print(cnt)
    invalid_days = res.filter(col("records_per_day") != 96).select("day").distinct().collect()
    print(invalid_days)
    invalid_days_dict = [row.asDict() for row in invalid_days]
    return {"check": f"Data Gaps", "status": "Pass" if cnt == 1 else "Fail", "failed_records": f"failed on days {invalid_days_dict}"}

def check_missing_field(id_column):
    missing_id_count = power_data_24hr.filter(col(id_column).isNull()).count()
    return {"check": f"{id_column} not null", "status": "Pass" if missing_id_count == 0 else "Fail", "failed_records": missing_id_count}


def get_outliers():
    
    stats = power_data_24hr.groupBy("production_type").agg(
        mean("net_power_produced").alias("mean"),
        stddev("net_power_produced").alias("std")
    )
    df_with_stats = power_data_24hr.join(stats, on="production_type", how="left")
    outliers = df_with_stats.filter(
        (col("net_power_produced") > col("mean") + 3 * col("std")) |
        (col("net_power_produced") < col("mean") - 3 * col("std"))
    )
    summary= outliers.groupBy("production_type").count()
    outlier_info = {row['production_type']: row['count'] for row in summary.collect()}
    return {"check": f"Data Outliers", "status": "Pass" if bool(outlier_info) == False else "Fail", "failed_records": f"Have outliers {outlier_info}"}
    
    

In [28]:
def run_data_quality_checks():
    results = []

    results.append(check_missing_field("production_type"))
    results.append(check_for_gaps())
    results.append(get_outliers())

    return results

In [29]:
import pandas as pd
from IPython.display import display, Markdown

dq_results = run_data_quality_checks()
dq_report_df = pd.DataFrame(dq_results)
display(Markdown("### Data Quality Report"))
display(dq_report_df)

1
[Row(day=datetime.date(2025, 7, 1))]


### Data Quality Report

Unnamed: 0,check,status,failed_records
0,production_type not null,Pass,0
1,Data Gaps,Pass,"failed on days [{'day': datetime.date(2025, 7,..."
2,Data Outliers,Fail,"Have outliers {'Fossil coal-derived gas': 2, '..."


In [None]:
spark.stop()