In [None]:
from databricks.connect import DatabricksSession 
spark = DatabricksSession.builder.getOrCreate()

In [None]:
from pyspark.sql import functions as F
import pyspark.sql.types as T

In [None]:
catalog = dbutils.widgets.get("catalog")
schema_landing = dbutils.widgets.get("schema_landing")
schema_silver = dbutils.widgets.get("schema_silver")
schema_gold = dbutils.widgets.get("schema_gold")
volume = dbutils.widgets.get("volume")

**LOAD DATA**

In [None]:
# ---------------------------
# 1. Load silver Delta table 
# ----------------------------

source_table = f"{catalog}.{schema_silver}.weather_clean"
silver_df = spark.readStream.table(source_table)

**BI GOLD Table**

In [7]:
target_table_bi = f"{catalog}.{schema_gold}.bi_weather_observations"
checkpoint_bi   = f"/Volumes/{catalog}/{schema_landing}/{volume}/checkpoints/iot_gold/bi_obs"

In [8]:
df_bi = (
    silver_df
    .select(
        # ---------------------------
        # 1. Location / Entity
        # ---------------------------
        F.col("city_id"),
        F.col("name").alias("city_name"),
        F.col("country").alias("country_code"),

        # ---------------------------
        # 2. Time (Local + UTC)
        # ---------------------------
        F.col("local_time").alias("observation_time_local"),
        F.col("datetime").alias("observation_time_utc"),

        # ---------------------------
        # 3. Temperature (explicit units)
        # ---------------------------
        F.round("temperature", 1).alias("temperature_c"),
        F.round("feels_like", 1).alias("feels_like_c"),
        F.round("temperature_min", 1).alias("temp_min_c"),
        F.round("temperature_max", 1).alias("temp_max_c"),

        # ---------------------------
        # 4. Atmosphere
        # ---------------------------
        F.col("humidity").alias("humidity_pct"),
        F.col("pressure").alias("pressure_hpa"),
        F.col("clouds_all").alias("cloud_cover_pct"),

        # ---------------------------
        # 5. Conditions (human readable)
        # ---------------------------
        F.col("weather_main").alias("condition_category"),
        F.initcap("weather_description").alias("condition_text"),
        F.col("weather_icon"),

        # ---------------------------
        # 6. Wind
        # ---------------------------
        F.round("windspeed", 1).alias("wind_speed_ms"),
        F.col("wind_deg").alias("wind_direction_deg"),
        F.round("wind_gust", 1).alias("wind_gust_ms"),

        # ---------------------------
        # 7. Rain (clean nulls)
        # ---------------------------
        F.coalesce("rain_1h", F.lit(0.0)).alias("rain_mm")
    )
)


In [9]:
(df_bi.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_bi)
    .trigger(availableNow=True)
    .toTable(target_table_bi)
)

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x22cc80fe750>

In [None]:
# ==============================================================================
# 1. BI OBSERVATIONS (Reporting Layer)
# ==============================================================================
# BI dashboards almost always filter by City first ("Show me London") 
# and then by Date Range ("Last 7 days").
# Clustering ensures all London data for a specific week is packed into ONE file,
# allowing the dashboard to skip 99% of the data instantly.
spark.sql(f"OPTIMIZE {catalog}.{schema_gold}.bi_weather_observations")


Unnamed: 0,path,metrics
0,,"{'numFilesAdded': 0, 'numFilesRemoved': 0, 'filesAdded': {'min': None, 'max': None, 'avg': 0.0, 'totalFiles': 0, 'totalSize': 0}, 'filesRemoved': {'min': None, 'max': None, 'avg': 0.0, 'totalFiles': 0, 'totalSize': 0}, 'partitionsOptimized': 0, 'zOrderStats': None, 'clusteringStats': None, 'numBins': 0, 'numBatches': 0, 'totalConsideredFiles': 0, 'totalFilesSkipped': 0, 'preserveInsertionOrder': True, 'numFilesSkippedToReduceWriteAmplification': 0, 'numBytesSkippedToReduceWriteAmplification': 0, 'startTimeMs': 1770579844772, 'endTimeMs': 1770579845558, 'totalClusterParallelism': 8, 'totalScheduledTasks': 0, 'autoCompactParallelismStats': None, 'deletionVectorStats': {'numDeletionVectorsRemoved': 0, 'numDeletionVectorRowsRemoved': 0}, 'recompressionCodec': None, 'numTableColumns': 19, 'numTableColumnsWithStats': 19, 'totalTaskExecutionTimeMs': 0, 'skippedArchivedFiles': 0, 'clusteringMetrics': None}"
