In [2]:
# Bronze layer: read the raw finance records from the `raw_finance_ai_data` table
# through the Delta reader. `spark` is the session provided by the notebook runtime.
delta_reader = spark.read.format("delta")
bronze_df = delta_reader.table("raw_finance_ai_data")

# Peek at the first five raw rows to sanity-check the load.
bronze_df.show(n=5)


StatementMeta(, 1fa0ca57-7f15-4eb6-9ea5-b675710fc015, 4, Finished, Available, Finished)

+------+------+-------------------+---------+------+------+------+------+
|ticker| price|          timestamp|   volume|  open|  high|   low|change|
+------+------+-------------------+---------+------+------+------+------+
|  AAPL|223.19|2025-04-01 04:00:00| 36336840|219.81|223.68| 218.9|  3.38|
|  AAPL|223.89|2025-04-02 04:00:00| 35905904|221.31|225.19|221.02|  2.58|
|  AAPL|203.19|2025-04-03 04:00:00|103419006|205.54|207.49|201.25| -2.35|
|  AAPL|188.38|2025-04-04 04:00:00|125897269|193.89|199.88|187.34| -5.51|
|  AAPL|181.46|2025-04-07 04:00:00|160406286| 177.2|194.15|174.62|  4.26|
+------+------+-------------------+---------+------+------+------+------+
only showing top 5 rows



In [3]:
from pyspark.sql.functions import col, to_timestamp

# Step 1: normalise column types on the frame loaded from the raw table.
# `trade_timestamp` is a parsed copy of `timestamp`; the original column is kept.
# NOTE(review): "float" is Spark's 32-bit floating type — confirm single precision
# is acceptable for price data before building aggregates on these columns.
numeric_casts = {
    "price": "float",
    "volume": "long",
    "open": "float",
    "high": "float",
    "low": "float",
    "change": "float",
}

typed_df = bronze_df.withColumn("trade_timestamp", to_timestamp("timestamp"))
for column_name, spark_type in numeric_casts.items():
    # Replacing a column under its own name keeps its position in the schema.
    typed_df = typed_df.withColumn(column_name, col(column_name).cast(spark_type))

# Keep only rows where price, volume and the parsed timestamp are all present.
# Values that fail a cast or fail timestamp parsing become null and are dropped here.
bronze_df = (
    typed_df
    .where(col("price").isNotNull())
    .where(col("volume").isNotNull())
    .where(col("trade_timestamp").isNotNull())
)

# Persist the cleaned frame as the Delta table `bronze_data`, replacing its contents.
# NOTE(review): the table name says "bronze" although it holds cleaned data —
# confirm which medallion layer this table is meant to represent.
(
    bronze_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze_data")
)

# Preview the first five cleaned rows.
bronze_df.show(n=5)

StatementMeta(, 1fa0ca57-7f15-4eb6-9ea5-b675710fc015, 5, Finished, Available, Finished)

+------+------+-------------------+---------+------+------+------+------+-------------------+
|ticker| price|          timestamp|   volume|  open|  high|   low|change|    trade_timestamp|
+------+------+-------------------+---------+------+------+------+------+-------------------+
|  AAPL|223.19|2025-04-01 04:00:00| 36336840|219.81|223.68| 218.9|  3.38|2025-04-01 04:00:00|
|  AAPL|223.89|2025-04-02 04:00:00| 35905904|221.31|225.19|221.02|  2.58|2025-04-02 04:00:00|
|  AAPL|203.19|2025-04-03 04:00:00|103419006|205.54|207.49|201.25| -2.35|2025-04-03 04:00:00|
|  AAPL|188.38|2025-04-04 04:00:00|125897269|193.89|199.88|187.34| -5.51|2025-04-04 04:00:00|
|  AAPL|181.46|2025-04-07 04:00:00|160406286| 177.2|194.15|174.62|  4.26|2025-04-07 04:00:00|
+------+------+-------------------+---------+------+------+------+------+-------------------+
only showing top 5 rows

