In [1]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Silver-layer build: read `bronze_data`, standardise column names, derive
# price metrics and rolling averages, drop rows that fail the quality filters,
# then overwrite the `silver_data` Delta table. Any failure is logged and
# re-raised so the notebook run is marked as failed.
try:
    # Step 1: Validate bronze data exists
    try:
        bronze_df = spark.table("bronze_data")
        # Only "at least one row" matters here, so cap the scan with limit(1)
        # instead of counting the whole table.
        if bronze_df.limit(1).count() == 0:
            raise ValueError("Bronze table is empty")
    except AnalysisException as err:
        # Chain the Spark error so the underlying reason stays in the traceback.
        # NOTE(review): AnalysisException covers more than a missing table;
        # the message below assumes "not found" is the cause.
        raise ValueError("Bronze table 'bronze_data' not found") from err

    logger.info("Starting silver layer transformation...")

    # Define window specification for rolling calculations: the current row
    # plus the four preceding rows per ticker, ordered by date.
    # NOTE(review): the window counts rows, not calendar days, and rows that
    # share a date have no defined relative order - confirm one row per
    # ticker/date is expected.
    window_spec = Window.partitionBy("ticker").orderBy("date").rowsBetween(-4, 0)

    # Column expressions reused below; they refer to the renamed columns.
    open_price = F.col("price_open")
    intraday_move = F.col("price_close") - open_price
    # Guard the division: a zero open price yields NULL instead of raising a
    # divide-by-zero error when ANSI mode is enabled (NULL is also what the
    # unguarded division returns with ANSI mode off).
    daily_change_pct = F.when(
        open_price != 0,
        F.round((intraday_move / open_price) * 100, 2),
    )

    # Step 2: Transform and clean the bronze DataFrame
    silver_df = (
        bronze_df
        # Standardize column names
        .withColumnRenamed("price", "price_close")
        .withColumnRenamed("open", "price_open")
        .withColumnRenamed("high", "price_high")
        .withColumnRenamed("low", "price_low")

        # Handle timestamps and dates
        # NOTE(review): `date` comes from `timestamp` while the market-hours
        # flag uses `trade_timestamp` - confirm both columns are intended.
        .withColumn("date", F.to_date("timestamp"))
        .withColumn("trade_timestamp", F.col("trade_timestamp").cast("timestamp"))

        # Calculate derived metrics
        .withColumn("price_change", intraday_move)
        .withColumn("daily_change_pct", daily_change_pct)
        .withColumn("price_range", F.col("price_high") - F.col("price_low"))

        # Market status flags
        # dayofweek: 1 = Sunday, 7 = Saturday.
        # NOTE(review): between(9, 16) is inclusive, so every trade with hour
        # 9 through 16 (e.g. 09:05 or 16:59) is flagged open - confirm this
        # matches the exchange's session and the timestamp's time zone.
        .withColumn("is_market_open",
                    (~F.dayofweek("date").isin([1, 7]))  # Not weekend
                    & F.hour("trade_timestamp").between(9, 16))  # Market hours

        # Data quality checks (rows where a comparison evaluates to NULL are
        # dropped as well, since filter keeps only TRUE)
        .filter(F.col("price_close").isNotNull())
        .filter(F.col("volume") > 0)
        .filter(F.col("price_high") >= F.col("price_low"))
        .filter(F.year("date") >= 2000)  # Reasonable date filter

        # Window calculations (computed over the rows that survived the filters)
        .withColumn("rolling_5d_avg_price",
                    F.round(F.avg("price_close").over(window_spec), 2))
        .withColumn("rolling_5d_avg_volume",
                    F.avg("volume").over(window_spec).cast("long"))

        # Select final columns
        .select(
            "ticker",
            "date",
            "price_close",
            "price_open",
            "price_high",
            "price_low",
            "price_range",
            "volume",
            "daily_change_pct",
            "price_change",
            "trade_timestamp",
            "is_market_open",
            "rolling_5d_avg_price",
            "rolling_5d_avg_volume"
        )
    )

    # Step 3: Validate silver data before saving
    # NOTE(review): silver_df is not cached, so this count, the write below
    # and the final display each re-run the transformation; consider
    # persisting it if bronze_data grows.
    record_count = silver_df.count()
    if record_count == 0:
        raise ValueError("Transformed silver data is empty")

    logger.info("Transformed %d records for silver layer", record_count)

    # Step 4: Save with Delta Lake best practices
    # overwriteSchema lets the overwrite replace the table schema as well as
    # the data.
    (silver_df.write
     .format("delta")
     .mode("overwrite")
     .option("overwriteSchema", "true")
     .saveAsTable("silver_data"))

    # Step 5: Optimize the table
    spark.sql("OPTIMIZE silver_data ZORDER BY (ticker, date)")

    # Add table properties
    spark.sql("""
    ALTER TABLE silver_data SET TBLPROPERTIES (
        'description' = 'Cleaned and standardized stock price data',
        'layer' = 'silver',
        'source' = 'bronze_data'
    )
    """)

    logger.info("Silver layer transformation completed successfully")
    display(silver_df.limit(5))

except Exception as e:
    # Top-level boundary: record the failure, then re-raise so the cell fails.
    logger.error("Silver transformation failed: %s", e)
    raise

StatementMeta(, 94a85d51-a1ea-4917-9b8b-2e99a338f51f, 3, Finished, Available, Finished)

INFO:__main__:Starting silver layer transformation...
INFO:__main__:Transformed 90 records for silver layer
INFO:__main__:Silver layer transformation completed successfully


SynapseWidget(Synapse.DataFrame, df27471c-ef24-427b-92f9-54331328113b)