In [0]:
# -------------------------------
# Logging setup (self-contained)
# -------------------------------

from pyspark.sql.types import *
from pyspark.sql.functions import current_timestamp
import uuid

# Location of the Delta table that stores pipeline log events.
LOG_PATH = "/FileStore/project/logs/pipeline_logs"

# Column layout of the log table. log_event() builds its rows against this
# schema, so the tuple order there must match the field order here.
log_schema = StructType([
    StructField("run_id", StringType()),
    StructField("pipeline_layer", StringType()),
    StructField("notebook_name", StringType()),
    StructField("event_type", StringType()),   # START / END / ERROR / REJECTED
    StructField("record_count", LongType()),
    StructField("status", StringType()),       # RUNNING / SUCCESS / FAILED
    StructField("error_message", StringType()),
    StructField("event_timestamp", TimestampType())
])

# Create the logging table if it does not exist.
# Save mode "ignore" leaves existing data untouched, so re-running this cell
# can never truncate the log history. This replaces an existence check that
# went through the private spark._jsparkSession handle and then wrote with
# mode("overwrite"): a false "does not exist" answer would have wiped the logs.
spark.createDataFrame([], log_schema) \
    .write.format("delta") \
    .mode("ignore") \
    .save(LOG_PATH)

# One identifier per execution of this cell, shared by every event logged
# afterwards, so the START / END / ERROR rows of a run can be joined on run_id.
_SESSION_RUN_ID = str(uuid.uuid4())


def log_event(layer, notebook, event_type, record_count, status, error_msg=None, run_id=None):
    """Append a single event row to the Delta log table at LOG_PATH.

    Args:
        layer: Value stored in ``pipeline_layer`` (e.g. "bronze").
        notebook: Value stored in ``notebook_name``.
        event_type: Value stored in ``event_type``
            (START / END / ERROR / REJECTED per the schema comment).
        record_count: Record count for the event. Converted with ``int()``;
            ``None`` is stored as null instead of raising.
        status: Value stored in ``status``
            (RUNNING / SUCCESS / FAILED per the schema comment).
        error_msg: Optional error text; stored as null when omitted.
        run_id: Optional explicit run identifier. Defaults to the id
            generated once when this cell was executed.

    Raises:
        Whatever Spark raises if the row cannot be built or the append fails.
    """
    effective_run_id = run_id if run_id is not None else _SESSION_RUN_ID
    count_value = None if record_count is None else int(record_count)

    row = [(
        effective_run_id,
        layer,
        notebook,
        event_type,
        count_value,
        status,
        error_msg,          # None is stored as null
        None                # placeholder; replaced by current_timestamp() below
    )]

    # Build against the declared schema so null fields keep their column types,
    # then stamp the event time on the Spark side.
    df = spark.createDataFrame(row, schema=log_schema) \
              .withColumn("event_timestamp", current_timestamp())

    df.write.format("delta").mode("append").save(LOG_PATH)



In [0]:
# Task 3.1: Initialize Bronze ingestion
# Uses log_event() defined in the logging-setup cell; run that cell first.

notebook_name = "bronze_ingestion"
source_path = "/FileStore/project/source/"
bronze_path = "/FileStore/project/bronze/"

# Record the start of the run before any data is touched.
log_event(
    layer="bronze",
    notebook=notebook_name,
    event_type="START",
    record_count=0,
    status="RUNNING",
)


In [0]:
# Task 3.2: Read raw sales data
#
# The reader resolves the source path when it is called, so a missing or
# unreadable file fails in this cell rather than in the later write step.
# Log that failure as an ERROR event (same shape as the write step's handler)
# so the run is not left in RUNNING state, then re-raise.

try:
    sales_raw = spark.read.option("header", True).csv(
        source_path + "sales_transactions.csv"
    )
except Exception as e:
    log_event("bronze", notebook_name, "ERROR", 0, "FAILED", str(e))
    raise


In [0]:
# Task 3.3: Add ingestion metadata

from pyspark.sql.functions import current_timestamp, lit

# Metadata columns added to every ingested row, applied in this order.
ingestion_metadata = [
    ("ingestion_timestamp", current_timestamp()),
    ("source_system", lit("manual_csv")),
]

# Start from the raw frame each time so re-running the cell is idempotent.
sales_bronze = sales_raw
for column_name, column_expr in ingestion_metadata:
    sales_bronze = sales_bronze.withColumn(column_name, column_expr)


In [0]:
# Task 3.4: Write Bronze Delta table with error handling
#
# sales_bronze is lazy: without caching, the count() after the write would
# re-read the CSV source (a second full scan) and is not guaranteed to match
# the rows that were actually appended. Persisting lets the count reuse the
# data materialized by the write.

try:
    sales_bronze.persist()

    sales_bronze.write.format("delta") \
        .mode("append") \
        .save(bronze_path + "sales")

    count = sales_bronze.count()

except Exception as e:
    log_event("bronze", notebook_name, "ERROR", 0, "FAILED", str(e))
    raise

else:
    # Logged outside the try body so that a failure of the logging call itself
    # is not recorded as a FAILED ingestion after the data was written.
    log_event("bronze", notebook_name, "END", count, "SUCCESS")

finally:
    # Release the cached blocks whether the write succeeded or not.
    sales_bronze.unpersist()
