In [0]:
# -------------------------------
# Logging setup (self-contained)
# -------------------------------

from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp
import uuid

# Storage location of the Delta table that collects pipeline log events.
LOG_PATH = "/FileStore/project/logs/pipeline_logs"

# Schema of a single log event row.
log_schema = StructType([
    StructField("run_id", StringType()),
    StructField("pipeline_layer", StringType()),
    StructField("notebook_name", StringType()),
    StructField("event_type", StringType()),   # START / END / ERROR / REJECTED
    StructField("record_count", LongType()),
    StructField("status", StringType()),       # RUNNING / SUCCESS / FAILED
    StructField("error_message", StringType()),
    StructField("event_timestamp", TimestampType())
])

# Create the logging table if it does not exist.
# Save mode "ignore" turns the write into a no-op when data already exists at
# LOG_PATH, so existing log history is never overwritten. This replaces a
# catalog lookup through the private `spark._jsparkSession` handle, which
# queried a path-style identifier and, whenever it answered "false", let an
# "overwrite" write wipe the log table.
spark.createDataFrame([], log_schema) \
    .write.format("delta") \
    .mode("ignore") \
    .save(LOG_PATH)

# One identifier per execution of this setup cell, so that the START / END /
# ERROR / REJECTED events written during the same run share a run_id and can
# be correlated in the log table.
PIPELINE_RUN_ID = str(uuid.uuid4())


def log_event(layer, notebook, event_type, record_count, status, error_msg=None, run_id=None):
    """Append one event row to the Delta log table stored at LOG_PATH.

    Args:
        layer: Value for the ``pipeline_layer`` column (e.g. "silver").
        notebook: Value for the ``notebook_name`` column.
        event_type: Value for the ``event_type`` column
            (START / END / ERROR / REJECTED).
        record_count: Number of records tied to the event; converted with
            ``int()``. ``None`` is stored as a null count.
        status: Value for the ``status`` column (RUNNING / SUCCESS / FAILED).
        error_msg: Optional error text; stored as null when omitted.
        run_id: Optional explicit run identifier. Defaults to
            ``PIPELINE_RUN_ID`` so all events of one session share an id.

    Raises:
        Whatever Spark / Delta raises if the append fails; nothing is caught.
    """
    row = [(
        run_id if run_id is not None else PIPELINE_RUN_ID,
        layer,
        notebook,
        event_type,
        int(record_count) if record_count is not None else None,
        status,
        error_msg,          # None is fine: the column is nullable
        None                # placeholder, replaced by current_timestamp() below
    )]

    # The timestamp is assigned by Spark when the row is evaluated rather than
    # taken from the driver's clock.
    df = spark.createDataFrame(row, schema=log_schema) \
              .withColumn("event_timestamp", current_timestamp())

    df.write.format("delta").mode("append").save(LOG_PATH)



In [0]:
# Task 4.1: Start Silver processing

# Identity of this notebook as it appears in the log table.
notebook_name = "silver_processing"

# Output roots for accepted and rejected records.
silver_path = "/FileStore/project/silver/"
quarantine_path = "/FileStore/project/quarantine/"

# Record the start of the run before any data is read.
log_event("silver", notebook_name, "START", 0, "RUNNING")

# Bronze sales table that feeds the Silver layer.
bronze_sales = spark.read.load("/FileStore/project/bronze/sales", format="delta")


In [0]:
# Task 4.2: Remove duplicate transactions

# Keep a single row for each transaction_id.
silver_dedup = bronze_sales.drop_duplicates(subset=["transaction_id"])


In [0]:
# Task 4.3: Recalculate totals correctly

from pyspark.sql.functions import col

# Row-level total: quantity * unit_price, minus the discount column.
# NOTE(review): discount is subtracted as an absolute amount — confirm it is
# not stored as a percentage upstream.
recomputed_total = col("quantity") * col("unit_price") - col("discount")

silver_calc = silver_dedup.withColumn("correct_total", recomputed_total)


In [0]:
# Task 4.4: Apply validation rules

# Store identifiers accepted by the Silver layer.
VALID_STORE_IDS = ["S01", "S02", "S03"]

# A sale is valid only when every rule evaluates to true.
is_valid_sale = (
    (col("quantity") > 0) &
    (col("unit_price") > 0) &
    (col("store_id").isin(VALID_STORE_IDS))
)

valid_sales = silver_calc.filter(is_valid_sale)

# Everything that is not valid goes to quarantine. The predicate is NULL when
# a checked column is NULL, and a plain `~is_valid_sale` would drop those rows
# from both outputs, so NULL results are routed to quarantine explicitly.
# This replaces `silver_calc.subtract(valid_sales)`, an EXCEPT DISTINCT that
# compares every column of every row and recomputes the upstream lineage.
invalid_sales = silver_calc.filter(is_valid_sale.isNull() | ~is_valid_sale)


In [0]:
# Task 4.5: Write outputs and log metrics

try:
    silver_sales_path = silver_path + "sales"
    quarantine_sales_path = quarantine_path + "sales"

    valid_sales.write.format("delta") \
        .mode("overwrite") \
        .save(silver_sales_path)

    invalid_sales.write.format("delta") \
        .mode("overwrite") \
        .save(quarantine_sales_path)

    # Count what was actually persisted. Calling .count() on valid_sales /
    # invalid_sales would re-run the whole read -> dedup -> validate lineage
    # once per count, and the recount is not guaranteed to match the rows
    # that were written.
    valid_count = spark.read.format("delta").load(silver_sales_path).count()
    rejected_count = spark.read.format("delta").load(quarantine_sales_path).count()

    log_event("silver", notebook_name, "END",
              valid_count, "SUCCESS")

    log_event("silver", notebook_name, "REJECTED",
              rejected_count, "SUCCESS")

except Exception as e:
    # Record the failure, then re-raise so the notebook run is marked failed.
    log_event("silver", notebook_name, "ERROR", 0, "FAILED", str(e))
    raise
