In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Paths for reading and saving data
silver_path = "dbfs:/Volumes/workspace/default/test/silver"
gold_path = "dbfs:/Volumes/workspace/default/test/gold"

# Load Silver Data
# NOTE: `spark` is not created in this cell; it is assumed to be the session
# object supplied by the notebook runtime.
silver_df = spark.read.format("delta").load(silver_path)

# ==============================
# Gold Layer Transformation
# ==============================
# One row per JOB_ID carrying the average, maximum and minimum DOUBLE_SALARY
# plus the number of silver rows in that group.
# No ordering is applied here: sorting before the write would add a global
# sort to the write job, and row order is not guaranteed when the table is
# read back, so the sort is applied only for display further down.
job_salary_stats = silver_df.groupBy("JOB_ID").agg(
    F.avg("DOUBLE_SALARY").alias("AVG_DOUBLE_SALARY"),
    F.count("*").alias("EMPLOYEE_COUNT"),
    F.max("DOUBLE_SALARY").alias("MAX_DOUBLE_SALARY"),
    F.min("DOUBLE_SALARY").alias("MIN_DOUBLE_SALARY"),
)

# Write to Gold Layer as Delta Table
# "overwrite" replaces the existing data at gold_path; "overwriteSchema"
# lets the stored schema be replaced together with the data.
(
    job_salary_stats.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_path)
)

# Rebuild gold_df from the table that was just written instead of reusing the
# lazy aggregation plan: this avoids re-running the silver scan and
# aggregation for the display, and shows exactly what was persisted.
gold_df = (
    spark.read.format("delta")
    .load(gold_path)
    .orderBy(F.desc("AVG_DOUBLE_SALARY"))
)

# Display the final aggregated data
gold_df.show()


+----------+-----------------+--------------+-----------------+-----------------+
|    JOB_ID|AVG_DOUBLE_SALARY|EMPLOYEE_COUNT|MAX_DOUBLE_SALARY|MIN_DOUBLE_SALARY|
+----------+-----------------+--------------+-----------------+-----------------+
|   AD_PRES|          48000.0|             1|            48000|            48000|
|     AD_VP|          34000.0|             2|            34000|            34000|
|    MK_MAN|          26000.0|             1|            26000|            26000|
|    AC_MGR|          24016.0|             1|            24016|            24016|
|    FI_MGR|          24016.0|             1|            24016|            24016|
|    PU_MAN|          22000.0|             1|            22000|            22000|
|    PR_REP|          20000.0|             1|            20000|            20000|
|AC_ACCOUNT|          16600.0|             1|            16600|            16600|
|FI_ACCOUNT|          15840.0|             5|            18000|            13800|
|    ST_MAN|    