In [None]:
# Create the `gold` schema if it is not already present; the IF NOT EXISTS
# clause makes this cell safe to re-run.
# NOTE(review): `spark` is not defined in the visible cells - presumably the
# session object is injected by the notebook runtime; confirm.
spark.sql("CREATE SCHEMA IF NOT EXISTS gold")

In [None]:
# Initial building of gold fact table

from pyspark.sql.functions import  col, count, sum, when

# Inputs: the cleaned appointment rows from silver, plus the two gold
# dimensions that supply `practice_key` and `date_key`.
df_fact_base = spark.table("silver.appointments_clean")

df_practice = spark.table("gold.dim_practice")
df_date = spark.table("gold.dim_date")

# Aggregate to one row per (practice_code, appointment_date) and count how many
# appointments ended in each tracked slot status ("Left" is what is counted as
# completed). `sum` here is pyspark.sql.functions.sum from the import above,
# not the Python builtin.
#
# NOTE(review): `dna_rate` and `walked_out_rate` used to be added with
# withColumn at this point, but the final select never listed them, so they
# were computed and then thrown away. They are omitted here so the written
# table keeps exactly the same columns as before; derive them from the counts
# at query time, or add them to the select deliberately if they should be
# stored.
df_fact = (
    df_fact_base
    .groupBy("practice_code", "appointment_date")
    .agg(
        count("*").alias("total_appointments"),
        sum(when(col("current_slot_status") == "DNA", 1).otherwise(0)).alias("dna_count"),
        sum(when(col("current_slot_status") == "Walked Out", 1).otherwise(0)).alias("walked_out_count"),
        sum(when(col("current_slot_status") == "Left", 1).otherwise(0)).alias("completed_count"),
    )
    # Both joins use the default (inner) join type, so aggregated rows whose
    # practice_code or appointment_date has no match in the dimension are
    # dropped. Joining on the column name requires dim_date to expose an
    # `appointment_date` column.
    .join(df_practice, "practice_code")
    .join(df_date, "appointment_date")
    .select(
        "practice_key",
        "practice_code",
        "date_key",
        "total_appointments",
        "dna_count",
        "walked_out_count",
        "completed_count",
    )
)

# Persist the fact table, partitioned by date key.
# An explicit save mode is set because the default (error if the table exists)
# makes this cell fail on every run after the first. The table is rebuilt in
# full from silver each time, so overwriting it is repeatable.
(
    df_fact.write
    .mode("overwrite")
    .partitionBy("date_key")
    .saveAsTable("gold.fact_appointments")
)



In [None]:

# Create the metadata control table if it does not already exist.
# It holds one DATE value (`last_processed_date`) per `pipeline_name`.
# NOTE(review): presumably this is the watermark an incremental load reads and
# advances - confirm against the cells that consume it.
spark.sql("""
CREATE TABLE IF NOT EXISTS gold.pipeline_metadata (
    pipeline_name STRING,
    last_processed_date DATE
)
""")



In [None]:
# Insert the initial control row for the fact pipeline.
# The insert is guarded: an unconditional INSERT adds another
# 'fact_appointments_pipeline' row every time the cell is re-run, leaving
# duplicate (and possibly stale) rows in the control table.
seed_row_exists = (
    spark.table("gold.pipeline_metadata")
    .filter("pipeline_name = 'fact_appointments_pipeline'")
    .limit(1)
    .count()
    > 0
)

if not seed_row_exists:
    # A typed DATE literal is used so the value matches the DATE column
    # without relying on an implicit string-to-date conversion at insert time.
    spark.sql("""
INSERT INTO gold.pipeline_metadata
VALUES ('fact_appointments_pipeline', DATE'2025-01-01')
""")
