In [0]:
from pyspark.sql.functions import col, sum, avg, count, date_trunc

# --- PART 1: SETUP & MOUNTING ---
# Mounts the "gold" blob container under /mnt/<container_name> so that later
# cells can write Delta files to it through a DBFS path.
storage_account_name = "STORAGE ACCOUNT NAME" # <--- UPDATE THIS
container_name = "gold"

# The account key must never be pasted into the notebook: notebooks get shared,
# exported and committed, and the literal would leak with them. Read it from a
# Databricks secret scope instead (values are redacted in cell output).
secret_scope = "SECRET SCOPE NAME" # <--- UPDATE THIS
secret_key_name = "SECRET KEY NAME" # <--- UPDATE THIS
storage_account_key = dbutils.secrets.get(scope=secret_scope, key=secret_key_name)

source_url = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/"
mount_point = f"/mnt/{container_name}"
extra_configs = {f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key}

# Mount Gold if not already mounted (mounting an existing mount point raises,
# so the check keeps this cell safe to re-run).
if not any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(source=source_url, mount_point=mount_point, extra_configs=extra_configs)
    print(f"Mounted {mount_point} successfully!")
else:
    print(f"{mount_point} is already mounted.")


In [0]:
# --- PART 2: BUSINESS LOGIC (Aggregations) ---
# We read from the Clean Silver Tables we created in Notebook 2
# Note: We group by Month and Zone to get a high-level summary.
#
# Result: one row per (month, Borough, Zone) with trip count, revenue and
# average distance / passenger count. The LEFT JOIN keeps trips whose
# PULocationID has no match in the lookup table; those rows end up grouped
# under NULL Borough / Zone rather than being dropped.

gold_df = spark.sql("""
    SELECT
        -- 1. Keep the raw date for sorting/Time Intelligence
        date_trunc('month', t.pickup_time) AS report_date,

        -- 2. Extract readable columns for Power BI
        year(t.pickup_time) AS report_year,
        concat('Q', quarter(t.pickup_time)) AS report_quarter,
        date_format(t.pickup_time, 'MMMM') AS report_month_name,

        -- 3. The Business Dimensions
        z.Borough,
        z.Zone,

        -- 4. The Metrics
        -- count(*) counts every trip row; count(t.vendor_id) would silently
        -- skip trips whose vendor_id is NULL and under-report total_trips.
        count(*) AS total_trips,
        sum(t.total_amount) AS total_revenue,
        avg(t.trip_distance) AS avg_distance,
        avg(t.passenger_count) AS avg_passenger_count
    FROM urban_mobility.silver_taxi t
    LEFT JOIN urban_mobility.taxi_zone_lookup z
        ON t.PULocationID = z.LocationID
    -- Group by the expressions themselves (not ordinals) so that reordering or
    -- inserting SELECT columns cannot silently change the grouping.
    GROUP BY
        date_trunc('month', t.pickup_time),
        year(t.pickup_time),
        concat('Q', quarter(t.pickup_time)),
        date_format(t.pickup_time, 'MMMM'),
        z.Borough,
        z.Zone
""")

In [0]:
# --- PART 3: WRITE TO GOLD ---
# Single source of truth for the output location and table name: the path is
# derived from the mount point set up in Part 1 and reused for both the Delta
# write and the table registration, so the two can never drift apart.
gold_path = f"{mount_point}/monthly_revenue_by_zone"
gold_table = "urban_mobility_gold.monthly_revenue_by_zone"

# Full refresh: replace the data and, if the column set changed, the schema too.
(
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_path)
)

# --- PART 4: REGISTER TABLE ---
spark.sql("CREATE DATABASE IF NOT EXISTS urban_mobility_gold")

# Registers an external table on top of the files written above; a no-op when
# the table is already registered.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {gold_table}
    USING DELTA
    LOCATION '{gold_path}'
""")

print("Success! Gold Table created.")
# Show what was actually persisted. Displaying gold_df here would re-run the
# whole silver join + aggregation a second time, because DataFrames are lazy.
display(spark.read.format("delta").load(gold_path))