In [3]:
from databricks.connect import DatabricksSession  # running against Databricks

# Obtain the Spark session used by every cell below: getOrCreate() hands back
# an existing session when there is one and builds a new one otherwise.
session_builder = DatabricksSession.builder
spark = session_builder.getOrCreate()


ValueError: default auth: cannot configure default credentials, please check https://docs.databricks.com/en/dev-tools/auth.html#databricks-client-unified-authentication to configure credentials for your preferred authentication method.

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Create the Silver schema (02_silver) in iot_catalog; IF NOT EXISTS makes
# re-running this cell a no-op once the schema is there.
silver_schema_ddl = "CREATE SCHEMA IF NOT EXISTS iot_catalog.02_silver"
spark.sql(silver_schema_ddl)

In [None]:
# ------------------------------------------------------------
# 1. Open a streaming read on the Bronze table
# ------------------------------------------------------------
# WHY:
# readStream (rather than read) is used so Silver can consume
# Bronze incrementally instead of rescanning it on every run.
# Source table:
#   iot_catalog.01_bronze.iot_bronze_weather
# The cells below expect it to expose the nested weather payload
# (coord, main, wind, rain, weather[]) plus the ingestion metadata
# columns _ingest_timestamp, _ingest_file_name and ingestion_date.
# ------------------------------------------------------------
bronze_table_name = "iot_catalog.01_bronze.iot_bronze_weather"
bronze_df = spark.readStream.table(bronze_table_name)

In [None]:
# ------------------------------------------------------------
# 2. Flatten the nested Bronze payload
# ------------------------------------------------------------
# WHY:
# Every nested field (coord, main, wind, rain, weather[]) is
# projected to a top-level column with a descriptive name so
# SQL / BI consumers never have to navigate structs or arrays.
# The columns are grouped by origin below; the final select
# concatenates the groups in the order they are defined.
# ------------------------------------------------------------

# City-level attributes, renamed for consistent naming
city_columns = [
    col("id").alias("city_id"),
    col("name").alias("city_name"),
    col("base"),
    col("timezone").alias("timezone_offset"),
]

# Coordinates pulled out of the `coord` struct
location_columns = [
    col("coord.lat").alias("latitude"),
    col("coord.lon").alias("longitude"),
]

# Core measurements pulled out of the `main` struct
measurement_columns = [
    col("main.temp").alias("temperature_celsius"),
    col("main.pressure").alias("pressure_hpa"),
    col("main.humidity").alias("humidity_pct"),
    col("main.temp_min").alias("temp_min_celsius"),
    col("main.temp_max").alias("temp_max_celsius"),
]

# Only the first element of the `weather` array is kept
first_weather = col("weather")[0]
condition_columns = [
    first_weather["main"].alias("weather_main"),
    first_weather["description"].alias("weather_description"),
    first_weather["icon"].alias("weather_icon"),
]

# Wind readings pulled out of the `wind` struct
wind_columns = [
    col("wind.deg").alias("wind_direction_deg"),
    col("wind.speed").alias("wind_speed_ms"),
]

# Payload time field `dt`, cast to a timestamp
event_time_columns = [
    col("dt").cast("timestamp").alias("source_event_time"),
]

# Rainfall fields from the `rain` struct
rain_columns = [
    col("rain.1h").alias("rain_1h_mm"),
    col("rain.3h").alias("rain_3h_mm"),
]

# Bronze ingestion metadata carried through unchanged, plus the
# time at which this Silver transformation processed the row
lineage_columns = [
    col("_ingest_timestamp"),
    col("_ingest_file_name"),
    col("ingestion_date"),
    current_timestamp().alias("_processed_timestamp"),
]

flattened_df = bronze_df.select(
    *city_columns,
    *location_columns,
    *measurement_columns,
    *condition_columns,
    *wind_columns,
    *event_time_columns,
    *rain_columns,
    *lineage_columns,
)

In [None]:
# ------------------------------------------------------------
# 3. Data Quality Enforcement (Silver requirement #2)
# ------------------------------------------------------------
# A row is kept only when ALL of the following hold:
#  - latitude and longitude are both present
#  - temperature_celsius lies in [-100, 100]
#  - humidity_pct lies in [0, 100]
#  - pressure_hpa lies strictly between 300 and 1200
# A NULL in any checked column makes its condition NULL, so the
# row is dropped as well.
#
# In real production you'd add many more expectations.
# ------------------------------------------------------------
temperature = col("temperature_celsius")
humidity = col("humidity_pct")
pressure = col("pressure_hpa")

has_coordinates = col("latitude").isNotNull() & col("longitude").isNotNull()
temperature_plausible = (temperature >= -100) & (temperature <= 100)
humidity_plausible = (humidity >= 0) & (humidity <= 100)
pressure_plausible = (pressure > 300) & (pressure < 1200)

clean_df = flattened_df.where(
    has_coordinates
    & temperature_plausible
    & humidity_plausible
    & pressure_plausible
)

In [None]:
# ------------------------------------------------------------
# 4. Deduplication (Silver requirement #4)
# ------------------------------------------------------------
# Dedupe key for weather data:
# city_id + source_event_time uniquely identifies a weather reading.
#
# clean_df is a STREAMING DataFrame (it derives from spark.readStream),
# and Structured Streaming rejects non-time-based window functions such
# as row_number() OVER (PARTITION BY ... ORDER BY ...). The streaming-safe
# way to dedupe is dropDuplicates() on the key, with a watermark on the
# event-time column so old keys can be evicted from state.
#
# Trade-offs of this approach:
#  - the FIRST row seen for a key is kept (not necessarily the one with
#    the latest _ingest_timestamp);
#  - rows whose source_event_time is more than DEDUPE_LATENESS behind the
#    newest event time seen so far are treated as too late and dropped.
#    Widen DEDUPE_LATENESS if Bronze can be backfilled far out of order.
# ------------------------------------------------------------
from pyspark.sql.window import Window  # no longer used here; kept in case other cells rely on it

DEDUPE_KEYS = ["city_id", "source_event_time"]
DEDUPE_LATENESS = "1 day"

deduped_df = (
    clean_df
    .withWatermark("source_event_time", DEDUPE_LATENESS)
    .dropDuplicates(DEDUPE_KEYS)
)


In [None]:
# ------------------------------------------------------------
# 5. Survivorship & Standardization (Silver #6)
# ------------------------------------------------------------
# Apply:
#  - lower-case the weather labels
#  - title-case the city name
#  - derive event_time_utc by shifting source_event_time by
#    timezone_offset seconds
# The raw (unstandardized) string columns are dropped afterwards.
# ------------------------------------------------------------
standard_df = deduped_df.select(
    "*",
    # Convert weather to lowercase for consistency
    lower(col("weather_main")).alias("weather_main_std"),
    lower(col("weather_description")).alias("weather_description_std"),

    # Standardizing city names
    initcap(col("city_name")).alias("city_name_std"),

    # Shift the source timestamp by the per-row offset.
    # An INTERVAL literal cannot reference a column ("INTERVAL timezone_offset
    # SECOND" does not parse), so the interval is built with make_interval(),
    # whose 7th argument is the number of seconds.
    # NOTE(review): if `dt` is a Unix epoch value it already denotes a UTC
    # instant and this shift would move it away from UTC - confirm the
    # semantics of `dt` / `timezone` in the source feed.
    (
        col("source_event_time")
        - expr("make_interval(0, 0, 0, 0, 0, 0, timezone_offset)")
    ).alias("event_time_utc"),
).drop("weather_main", "weather_description", "city_name")  # remove unstandardized values


In [None]:
# ------------------------------------------------------------
# 6. Incremental Silver Write (Silver #7)
# ------------------------------------------------------------
# Silver MUST be incremental, never full-refresh: the checkpoint
# records how far into Bronze the stream has read, so each run only
# processes rows that arrived since the previous one.
#
# The target lives in the `02_silver` schema created earlier in this
# notebook (the previous `iot_catalog.silver` target pointed at a schema
# this notebook never creates). The checkpoint is kept in a `checkpoints`
# volume of the same schema, created here if it is missing.
# ------------------------------------------------------------
SILVER_SCHEMA = "iot_catalog.`02_silver`"   # back-quoted: the name starts with a digit
SILVER_TABLE = f"{SILVER_SCHEMA}.weather_clean"
CHECKPOINT_PATH = "/Volumes/iot_catalog/02_silver/checkpoints/weather/"

spark.sql(f"CREATE VOLUME IF NOT EXISTS {SILVER_SCHEMA}.checkpoints")

silver_query = (
    standard_df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", CHECKPOINT_PATH)
        .trigger(availableNow=True)      # process everything currently available, then stop
        .toTable(SILVER_TABLE)           # DataStreamWriter has toTable(), not table(); this starts the query
)

# Block until the availableNow run has drained the backlog, so the cell
# (and any job running this notebook) only completes once the write is done.
silver_query.awaitTermination()

In [None]:
# ------------------------------------------------------------
# 7. Optional Delta Optimization (Silver #8)
# ------------------------------------------------------------
# Run manually or via a scheduled job:
# spark.sql("OPTIMIZE iot_catalog.silver.weather_clean ZORDER BY (event_time_utc)")
# spark.sql("VACUUM iot_catalog.silver.weather_clean RETAIN 168 HOURS")
