In [None]:
from databricks.connect import DatabricksSession
spark = DatabricksSession.builder.getOrCreate()

In [10]:
from pyspark.sql import functions as F
import pyspark.sql.types as T

In [None]:
catalog = dbutils.widgets.get("catalog")
schema_landing = dbutils.widgets.get("schema_landing")
schema_silver = dbutils.widgets.get("schema_silver")
schema_gold = dbutils.widgets.get("schema_gold")
volume = dbutils.widgets.get("volume")

**LOAD DATA**

In [13]:
# ---------------------------
# 1. Load silver Delta table 
# ----------------------------

source_table = f"{catalog}.{schema_silver}.weather_clean"
silver_df = spark.readStream.table(source_table)

**ML GOLD:for Predictive Modeling**

In [14]:
target_table_ml = f"{catalog}.{schema_gold}.ml_weather_features"
checkpoint_ml = f"/Volumes/{catalog}/{schema_landing}/{volume}/checkpoints/iot_gold/df_ml"

In [15]:
df_ml = (
    silver_df
    .groupBy(
        F.col("city_id"),
        F.col("name").alias("city_name"),
        F.col("country").alias("country_code"),
        F.window("local_time", "1 hour").alias("hour_window")
    )
    .agg(
        # --- Core Numeric Stats ---
        F.mean("temperature").alias("temp_mean"),
        F.expr("percentile_approx(temperature, 0.5)").alias("temp_median"),
        F.expr("mode(temperature)").alias("temp_mode"), 

        F.mean("humidity").alias("humidity_mean"),
        F.expr("percentile_approx(humidity, 0.5)").alias("humidity_median"),
        F.expr("mode(humidity)").alias("humidity_mode"), 

        F.mean("pressure").alias("pressure_mean"),
        F.expr("percentile_approx(pressure, 0.5)").alias("pressure_median"),
        F.expr("mode(pressure)").alias("pressure_mode"), # FIXED

        F.mean("windspeed").alias("windspeed_mean"),
        F.expr("percentile_approx(windspeed, 0.5)").alias("windspeed_median"),
        F.expr("mode(windspeed)").alias("windspeed_mode"), # FIXED

        # --- Rain (Handle Nulls) ---
        F.mean(F.coalesce("rain_1h", F.lit(0))).alias("rain_mean"),
        F.expr("percentile_approx(coalesce(rain_1h, 0), 0.5)").alias("rain_median"),
        F.expr("mode(rain_1h)").alias("rain_mode") # FIXED
    )

    .select(
        # 1. Extract Window Columns
        F.col("hour_window.start").alias("event_hour_local"),
        F.col("hour_window.end").alias("event_hour_end"),
        
        # 2. Select Everything Else (This includes city_id, city_name, country_code)
        "*" 
    )
    # 3. Drop the struct column (since we extracted start/end)
    .drop("hour_window") 
)

In [16]:
(df_ml.writeStream
    .format("delta")
    
    # CRITICAL: 'complete' mode is required for Aggregations without Watermark.
    # It means: "Write the current calculated value for every window."
    # Delta Lake handles the "Upsert" (updating the existing row) automatically.
    .outputMode("complete")
    
    .option("checkpointLocation", checkpoint_ml)
    
    # TRIGGER: Process all available data as a batch, then stop.
    .trigger(availableNow=True)
    
    .toTable(target_table_ml)
)

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x1a5b2dd4560>

In [None]:
spark.sql(f"OPTIMIZE {catalog}.{schema_gold}.ml_weather_features")

Unnamed: 0,path,metrics
0,,"{'numFilesAdded': 0, 'numFilesRemoved': 0, 'filesAdded': {'min': None, 'max': None, 'avg': 0.0, 'totalFiles': 0, 'totalSize': 0}, 'filesRemoved': {'min': None, 'max': None, 'avg': 0.0, 'totalFiles': 0, 'totalSize': 0}, 'partitionsOptimized': 0, 'zOrderStats': None, 'clusteringStats': None, 'numBins': 0, 'numBatches': 0, 'totalConsideredFiles': 0, 'totalFilesSkipped': 0, 'preserveInsertionOrder': True, 'numFilesSkippedToReduceWriteAmplification': 0, 'numBytesSkippedToReduceWriteAmplification': 0, 'startTimeMs': 1770579863454, 'endTimeMs': 1770579865657, 'totalClusterParallelism': 8, 'totalScheduledTasks': 0, 'autoCompactParallelismStats': None, 'deletionVectorStats': {'numDeletionVectorsRemoved': 0, 'numDeletionVectorRowsRemoved': 0}, 'recompressionCodec': None, 'numTableColumns': 20, 'numTableColumnsWithStats': 20, 'totalTaskExecutionTimeMs': 0, 'skippedArchivedFiles': 0, 'clusteringMetrics': None}"
