In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp


In [0]:
# Obtain the Spark session used by every later cell. getOrCreate() hands back
# a session that is already active if one exists; otherwise it builds a new
# one carrying the application name "DailyETLJob".
session_builder = SparkSession.builder.appName("DailyETLJob")
spark = session_builder.getOrCreate()

INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Path of the JSON input file read by this job.
file_path = "/databricks-datasets/iot/iot_devices.json"

# No schema is passed to the reader, so the column types of raw_data_df are
# whatever the JSON reader derives from the file contents.
raw_data_df = spark.read.load(file_path, format="json")

INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Build the output frame from the raw readings:
#   1. keep only rows that carry a device_id,
#   2. project the four fields of interest (temp and humidity cast to float)
#      and stamp each row with the processing time,
#   3. collapse the result to a single row per device_id.
# NOTE(review): dropDuplicates does not let the caller choose which of several
# rows sharing a device_id is kept -- confirm an arbitrary survivor is fine.
transformed_df = (
    raw_data_df
    .where(col("device_id").isNotNull())
    .select(
        "device_id",
        "device_name",
        col("temp").cast("float"),
        col("humidity").cast("float"),
        current_timestamp().alias("processed_time"),
    )
    .dropDuplicates(["device_id"])
)

INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Name of the table that receives the transformed rows; the preview cell
# further down reads this same variable.
output_table = "daily_etl_output"

# Persist as a Delta table. "overwrite" mode replaces whatever the table
# held before, so each run leaves only the current run's rows in it.
transformed_df.write.saveAsTable(output_table, format="delta", mode="overwrite")

INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Sanity check: read ten rows back from the table that was just written.
# show() prints a text table and shortens long cell values with "...".
preview_sql = f"SELECT * FROM {output_table} LIMIT 10"
spark.sql(preview_sql).show()

+---------+--------------------+----+--------+--------------------+
|device_id|         device_name|temp|humidity|      processed_time|
+---------+--------------------+----+--------+--------------------+
|        7|meter-gauge-7GeDoanM|18.0|    26.0|2025-03-21 05:39:...|
|       19|meter-gauge-19eg1...|29.0|    75.0|2025-03-21 05:39:...|
|       22|  sensor-pad-22oWV2D|24.0|    58.0|2025-03-21 05:39:...|
|       26|sensor-pad-26rAyC...|10.0|    27.0|2025-03-21 05:39:...|
|       29|meter-gauge-29lyN...|15.0|    69.0|2025-03-21 05:39:...|
|       31|meter-gauge-31bcR...|11.0|    48.0|2025-03-21 05:39:...|
|       32|  sensor-pad-329KFvY|19.0|    30.0|2025-03-21 05:39:...|
|       34|sensor-pad-34F1Ju...|27.0|    89.0|2025-03-21 05:39:...|
|       43|meter-gauge-43RYo...|12.0|    39.0|2025-03-21 05:39:...|
|       50|  sensor-pad-50g2ukc|29.0|    69.0|2025-03-21 05:39:...|
+---------+--------------------+----+--------+--------------------+

