In [1]:
from pyspark.sql import SparkSession

In [5]:
# Create (or reuse) the Spark session for this load job, pointing at the
# cluster master at spark-master:7077.
# Resource settings are deliberately small: each executor gets one core and
# 512M of memory, and the application as a whole is limited to 6 cores.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("raw-kafka-log-data-load-tables-iceberg")
    .config("spark.executor.memory", "512M")
    .config("spark.executor.cores", 1)
    .config("spark.cores.max", 6)
    .getOrCreate()
)

25/02/14 17:07:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Echo the session object so the notebook renders it as this cell's output.
spark

# Home Power Usage Kafka Log Data

## Read Kafka Log Data

In [24]:
# Read home log data
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, TimestampType, FloatType

# Date suffix of the Kafka log file to load (one file per day).
data_file_date = "2025-02-14"

# Explicit schema for the JSON log records; every field is nullable.
_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("time_stamp", TimestampType()),
        ("current_consumption_w", FloatType()),
        ("consumption_accumulated_w", FloatType()),
    )
])

# Load the day's log file and tag each reading with a 15-minute interval
# number: (minutes since midnight - 60) floor-divided by 15, so a reading at
# 02:08 lands in interval 4.
# NOTE(review): the "- 60" shifts the numbering by one hour, which means
# readings between 00:00 and 00:59 get negative intervals (-4..-1). This is
# presumably a deliberate offset (time zone?) - confirm before relying on it.
home_power_usage_df = (
    spark.read
    .schema(_schema)
    .format("json")
    .load(f"/home/iceberg/warehouse/solarx_kafka_log_data/kafka_log_home_energy_consumption_{data_file_date}.log")
    .withColumn(
        "15_minutes_interval",
        F.floor((F.hour("time_stamp") * 60 + F.minute("time_stamp") - 60) / 15),
    )
)

In [26]:
# Confirm the applied schema, including the derived 15_minutes_interval column.
home_power_usage_df.printSchema()

root
 |-- time_stamp: timestamp (nullable = true)
 |-- current_consumption_w: float (nullable = true)
 |-- consumption_accumulated_w: float (nullable = true)
 |-- 15_minutes_interval: long (nullable = true)



In [25]:
# Preview the loaded rows to eyeball the parsed values and interval numbers.
home_power_usage_df.show()

+-------------------+---------------------+-------------------------+-------------------+
|         time_stamp|current_consumption_w|consumption_accumulated_w|15_minutes_interval|
+-------------------+---------------------+-------------------------+-------------------+
|2025-02-14 02:08:36|                 0.82|                     0.82|                  4|
|2025-02-14 02:08:37|                 0.79|                     1.61|                  4|
|2025-02-14 02:08:38|                 0.76|                     2.37|                  4|
|2025-02-14 02:08:39|                 0.83|                      3.2|                  4|
|2025-02-14 02:08:40|                 0.82|                     4.02|                  4|
|2025-02-14 02:08:41|                 0.79|                     4.81|                  4|
|2025-02-14 02:08:42|                 0.86|                     5.67|                  4|
|2025-02-14 02:08:43|                  0.8|                     6.47|                  4|
|2025-02-1

In [27]:
# Check Spark Shuffle Partition setting
# Prints, in order: the configured spark.sql.shuffle.partitions value, the
# partition count of the loaded DataFrame's RDD, and the Spark context's
# default parallelism.
# NOTE(review): the first label is misspelled ("shiffle"); it is left as is
# here so the recorded cell output still matches the code.

print("# shiffle: ", spark.conf.get("spark.sql.shuffle.partitions"))
print("# partitions: ", home_power_usage_df.rdd.getNumPartitions())
print("# Parallelism: ", spark.sparkContext.defaultParallelism)

# shiffle:  200
# partitions:  2
# Parallelism:  2


In [29]:
# Override spark.sql.shuffle.partitions (reported as 200 by the check cell)
# for the rest of this session.
# NOTE(review): 92 is a magic number; presumably it matches the number of
# 15-minute interval values expected per day (0..91 with the one-hour offset
# used when computing 15_minutes_interval) - confirm and document.
spark.conf.set("spark.sql.shuffle.partitions", 92)

## Iceberg Table

In [30]:
%%sql

-- Make sure the target database exists; IF NOT EXISTS keeps the cell re-runnable.
CREATE DATABASE IF NOT EXISTS SolarX_Raw_Transactions

25/02/14 17:55:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [31]:
%%sql

-- Remove any earlier version of the table so the CREATE TABLE that follows
-- starts from scratch. This is destructive: rows loaded by a previous run are lost.
-- NOTE(review): PURGE is expected to delete the table's files as well - confirm
-- against the catalog in use.
DROP TABLE IF EXISTS SolarX_Raw_Transactions.home_power_readings PURGE

In [32]:
%%sql

-- Iceberg table that receives the home power readings loaded in the ETL section.
-- Column provenance below is taken from the INSERT ... SELECT over temp_view.
CREATE TABLE SolarX_Raw_Transactions.home_power_readings(
    timestamp               TIMESTAMP NOT NULL,  -- filled from time_stamp
    15_minutes_interval     SMALLINT  NOT NULL,  -- filled from the DataFrame's 15_minutes_interval column
    min_consumption_wh      FLOAT     NOT NULL,  -- filled from current_consumption_w
    max_consumption_wh      FLOAT     NOT NULL   -- filled from current_consumption_w
)
USING iceberg
-- Partitioned by the day of the timestamp and by the interval number.
PARTITIONED BY (DAY(timestamp), 15_minutes_interval);

In [33]:
%%sql

-- Sanity check right after creation: the table should have its columns but no rows yet.
SELECT * FROM SolarX_Raw_Transactions.home_power_readings;

timestamp,15_minutes_interval,min_consumption_wh,max_consumption_wh


## ETL

In [34]:
# Register the DataFrame as the view "temp_view" so the %%sql cells can query it.
home_power_usage_df.createOrReplaceTempView("temp_view")

In [35]:
%%sql
    
-- Append every row of temp_view to the Iceberg table. The target columns are
-- listed explicitly, and each SELECT expression is aliased to the column it fills.
-- Both min_consumption_wh and max_consumption_wh receive the same raw reading.
-- NOTE(review): the source columns are nullable and 15_minutes_interval is a
-- long, while the target columns are NOT NULL and SMALLINT - confirm the write
-- tolerates this. Also confirm the unit: the source column is named *_w but the
-- target columns are named *_wh.
INSERT INTO SolarX_Raw_Transactions.home_power_readings
    (timestamp, 15_minutes_interval, min_consumption_wh, max_consumption_wh)
SELECT
    time_stamp            AS timestamp,
    15_minutes_interval   AS 15_minutes_interval,
    current_consumption_w AS min_consumption_wh,
    current_consumption_w AS max_consumption_wh
FROM temp_view

                                                                                

# Solar Power Kafka Log Data

In [4]:
# Stop the Spark session.
# NOTE(review): this cell's execution count (4) is lower than those of the load
# cells above, so it was last run out of order. On a top-to-bottom run it ends
# the session here, and the "Solar Power Kafka Log Data" section has no other
# content yet.
spark.stop()