In [None]:
import dlt
from pyspark.sql import functions as F
import pyspark.sql.types as T

**ML GOLD: Features for Predictive Modeling**

In [None]:

@dlt.table(
    name="gold_ml",
    comment="Gold layer: ML features for weather forecasting"
)
def gold_ml():
    """Build hourly per-city weather features from ``silver_table``.

    Rows are grouped by city (``city_id``, ``name``, ``country``) and by a
    1-hour tumbling window over ``local_time``. For each measurement the
    mean, approximate median (``percentile_approx`` at 0.5) and mode are
    computed. Missing ``rain_1h`` values are treated as 0 for all three
    rain statistics so they describe the same set of readings.

    Returns:
        A DataFrame with ``event_hour_local`` / ``event_hour_end`` (window
        bounds), ``city_id``, ``city_name``, ``country_code`` and the
        ``<metric>_mean`` / ``_median`` / ``_mode`` feature columns.
    """

    def _summary_stats(value_sql, prefix):
        # Mean / approximate median / mode of one SQL value expression,
        # aliased as <prefix>_mean, <prefix>_median, <prefix>_mode.
        return [
            F.mean(F.expr(value_sql)).alias(f"{prefix}_mean"),
            F.expr(f"percentile_approx({value_sql}, 0.5)").alias(f"{prefix}_median"),
            F.expr(f"mode({value_sql})").alias(f"{prefix}_mode"),
        ]

    # Batch read on purpose: a streaming read would need a watermark before
    # this windowed aggregation could be written in append mode.
    silver_df = dlt.read("silver_table")

    # (value expression, output column prefix), in output column order.
    metrics = [
        ("temperature", "temp"),
        ("humidity", "humidity"),
        ("pressure", "pressure"),
        ("windspeed", "windspeed"),
        # Null rain means "no rain recorded"; apply the same default to the
        # mean, median AND mode so the three rain features stay consistent.
        ("coalesce(rain_1h, 0)", "rain"),
    ]
    aggregations = []
    for value_sql, prefix in metrics:
        aggregations.extend(_summary_stats(value_sql, prefix))

    hourly = silver_df.groupBy(
        F.col("city_id"),
        F.col("name").alias("city_name"),
        F.col("country").alias("country_code"),
        F.window("local_time", "1 hour").alias("hour_window"),
    ).agg(*aggregations)

    df_ml = (
        hourly
        .select(
            # Flatten the window struct into explicit start/end columns ...
            F.col("hour_window.start").alias("event_hour_local"),
            F.col("hour_window.end").alias("event_hour_end"),
            # ... keep the grouping keys and every aggregate ...
            "*",
        )
        # ... and drop the struct itself now that its fields are extracted.
        .drop("hour_window")
    )

    return df_ml