In [1]:
from pyspark.sql import SparkSession

In [5]:
spark = (
    SparkSession
    .builder
    .appName("raw-kafka-log-data-load-tables-iceberg")
    .master("spark://spark-master:7077")
    .config("spark.executor.cores", 1)
    .config("spark.cores.max", 6)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

25/02/14 17:07:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
spark

# Home Power Usage Kafka Log Data

## Read Kafka Log Data

In [58]:
# Read home log data
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, TimestampType, FloatType, IntegerType, StringType, MapType

data_file_date = "2025-02-14"

_schema = StructType([
    StructField("time_stamp", TimestampType(), True),
    StructField("current_consumption_w", FloatType(), True),
    StructField("consumption_accumulated_w", FloatType(), True),
])

home_power_usage_df = spark.read.format("json").schema(_schema)\
                   .load(f"/home/iceberg/warehouse/solarx_kafka_log_data/kafka_log_home_energy_consumption_{data_file_date}.log")\
                   .withColumn("15_minutes_interval", F.floor((F.hour(F.col("time_stamp"))*60 + F.minute(F.col("time_stamp")) - 60) / 15))                                                                                                 

In [26]:
home_power_usage_df.printSchema()

root
 |-- time_stamp: timestamp (nullable = true)
 |-- current_consumption_w: float (nullable = true)
 |-- consumption_accumulated_w: float (nullable = true)
 |-- 15_minutes_interval: long (nullable = true)



In [25]:
home_power_usage_df.show()

+-------------------+---------------------+-------------------------+-------------------+
|         time_stamp|current_consumption_w|consumption_accumulated_w|15_minutes_interval|
+-------------------+---------------------+-------------------------+-------------------+
|2025-02-14 02:08:36|                 0.82|                     0.82|                  4|
|2025-02-14 02:08:37|                 0.79|                     1.61|                  4|
|2025-02-14 02:08:38|                 0.76|                     2.37|                  4|
|2025-02-14 02:08:39|                 0.83|                      3.2|                  4|
|2025-02-14 02:08:40|                 0.82|                     4.02|                  4|
|2025-02-14 02:08:41|                 0.79|                     4.81|                  4|
|2025-02-14 02:08:42|                 0.86|                     5.67|                  4|
|2025-02-14 02:08:43|                  0.8|                     6.47|                  4|
|2025-02-1

In [27]:
# Check Spark Shuffle Partition setting

print("# shiffle: ", spark.conf.get("spark.sql.shuffle.partitions"))
print("# partitions: ", home_power_usage_df.rdd.getNumPartitions())
print("# Parallelism: ", spark.sparkContext.defaultParallelism)

# shiffle:  200
# partitions:  2
# Parallelism:  2


In [29]:
spark.conf.set("spark.sql.shuffle.partitions", 92)

## Iceberg Table

In [30]:
%%sql

CREATE DATABASE IF NOT EXISTS SolarX_Raw_Transactions

25/02/14 17:55:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [31]:
%%sql

DROP TABLE IF EXISTS SolarX_Raw_Transactions.home_power_readings PURGE

In [32]:
%%sql

CREATE TABLE SolarX_Raw_Transactions.home_power_readings(
    timestamp               TIMESTAMP NOT NULL,
    15_minutes_interval     SMALLINT  NOT NULL,
    min_consumption_wh      FLOAT     NOT NULL,
    max_consumption_wh      FLOAT     NOT NULL
)
USING iceberg
PARTITIONED BY (DAY(timestamp), 15_minutes_interval);

In [33]:
%%sql

SELECT * FROM SolarX_Raw_Transactions.home_power_readings;

timestamp,15_minutes_interval,min_consumption_wh,max_consumption_wh


## ETL

In [34]:
home_power_usage_df.createOrReplaceTempView("temp_view")

In [35]:
%%sql
    
INSERT INTO SolarX_Raw_Transactions.home_power_readings (timestamp, 15_minutes_interval, min_consumption_wh, max_consumption_wh)
SELECT time_stamp                 as timestamp,
       15_minutes_interval        as 15_minutes_interval,
       current_consumption_w      as min_consumption_wh,
       current_consumption_w      as max_consumption_wh
       
FROM temp_view

                                                                                

# Solar Power Kafka Log Data

## Read Kafka Log Data

In [37]:
data_file_date = "2025-02-14"

_schema = StructType([
    StructField("time_stamp", TimestampType(), True),
    StructField("current_consumption_w", FloatType(), True),
    StructField("consumption_accumulated_w", FloatType(), True),
])

solar_power_usage_df = spark.read.format("json").schema(_schema)\
                   .load(f"/home/iceberg/warehouse/solarx_kafka_log_data/kafka_log_solar_energy_data_{data_file_date}.log")\
                   .withColumn("15_minutes_interval", F.floor((F.hour(F.col("time_stamp"))*60 + F.minute(F.col("time_stamp")) - 60) / 15))                                                                                                 

In [38]:
solar_power_usage_df.printSchema()

root
 |-- time_stamp: timestamp (nullable = true)
 |-- current_consumption_w: float (nullable = true)
 |-- consumption_accumulated_w: float (nullable = true)
 |-- 15_minutes_interval: long (nullable = true)



In [39]:
solar_power_usage_df.show()

+-------------------+---------------------+-------------------------+-------------------+
|         time_stamp|current_consumption_w|consumption_accumulated_w|15_minutes_interval|
+-------------------+---------------------+-------------------------+-------------------+
|2025-02-14 02:08:29|                  0.0|                      0.0|                  4|
|2025-02-14 02:08:30|                  0.0|                      0.0|                  4|
|2025-02-14 02:08:31|                  0.0|                      0.0|                  4|
|2025-02-14 02:08:32|                  0.0|                      0.0|                  4|
|2025-02-14 02:08:33|                  0.0|                      0.0|                  4|
|2025-02-14 02:08:34|                  0.0|                      0.0|                  4|
|2025-02-14 02:08:35|                  0.0|                      0.0|                  4|
|2025-02-14 02:08:36|                  0.0|                      0.0|                  4|
|2025-02-1

## Iceberg Table

In [41]:
%%sql

CREATE TABLE SolarX_Raw_Transactions.solar_panel_readings(
    timestamp TIMESTAMP NOT NULL,
    15_minutes_interval INT NOT NULL,
    panel_id INT NOT NULL,
    generation_power_wh FLOAT NOT NULL
)
USING iceberg
PARTITIONED BY (DAY(timestamp), panel_id, 15_minutes_interval);

## ETL

In [43]:
solar_power_usage_df.createOrReplaceTempView("temp_view")

In [45]:
%%sql
    
INSERT INTO SolarX_Raw_Transactions.solar_panel_readings (timestamp, 15_minutes_interval, panel_id, generation_power_wh)
SELECT time_stamp                 as timestamp,
       15_minutes_interval        as 15_minutes_interval,
       4                          as panel_id,
       current_consumption_w      as generation_power_wh
       
FROM temp_view

                                                                                

# Batteries Power Kafka Log Data

## Read Kafka Log Data

In [70]:
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, StructField, TimestampType, FloatType, IntegerType, StringType, MapType


data_file_date = "2025-02-14"

battery_schema = StructType([
    StructField("capacity_kwh", FloatType(), True),
    StructField("max_charge_speed_w", FloatType(), True),
    StructField("current_energy_wh", FloatType(), True),
    StructField("is_charging", IntegerType(), True),
    StructField("status", StringType(), True),
    StructField("max_output_w", FloatType(), True)
])

_schema = StructType([
    StructField("time_stamp", TimestampType(), True),
    StructField("batteries", MapType(StringType(), battery_schema), True),  # Dictionary of batteries
])

battery_power_usage_df = spark.read.format("json").schema(_schema)\
                   .load(f"/home/iceberg/warehouse/solarx_kafka_log_data/kafka_log_battery_data_{data_file_date}.log")\
                   .select("time_stamp", explode("batteries").alias("battery_name", "battery_data"))\
                   .select(
                        col("time_stamp"),
                        col("battery_name"),
                        col("battery_data.capacity_kwh").alias("capacity_kwh"),
                        col("battery_data.max_charge_speed_w").alias("max_charge_speed_w"),
                        col("battery_data.current_energy_wh").alias("current_energy_wh"),
                        col("battery_data.is_charging").alias("is_charging"),
                        col("battery_data.status").alias("status"),
                        col("battery_data.max_output_w").alias("max_output_w")
                   ).withColumn("15_minutes_interval", F.floor((F.hour(F.col("time_stamp"))*60 + F.minute(F.col("time_stamp")) - 60) / 15))                                                                                            

In [71]:
battery_power_usage_df.show()

+-------------------+------------+------------+------------------+-----------------+-----------+-----------+------------+-------------------+
|         time_stamp|battery_name|capacity_kwh|max_charge_speed_w|current_energy_wh|is_charging|     status|max_output_w|15_minutes_interval|
+-------------------+------------+------------+------------------+-----------------+-----------+-----------+------------+-------------------+
|2025-02-14 02:08:52|   battery_1|        12.0|               1.0|           9860.0|          0|      ideal|        3.33|                  4|
|2025-02-14 02:08:52|   battery_2|        12.0|               1.0|           8600.0|          0|      ideal|        3.33|                  4|
|2025-02-14 02:08:52|   battery_3|        12.0|               1.0|          9999.27|          0|discharging|        3.33|                  4|
|2025-02-14 02:08:53|   battery_1|        12.0|               1.0|           9860.0|          0|      ideal|        3.33|                  4|
|2025-

## Iceberg Table

In [75]:
%%sql

DROP TABLE IF EXISTS SolarX_Raw_Transactions.battery_readings;

In [76]:
%%sql

CREATE TABLE SolarX_Raw_Transactions.battery_readings(
    timestamp               TIMESTAMP NOT NULL,
    15_minutes_interval     INT NOT NULL,
    battery_name            VARCHAR(15) NOT NULL,
    capacity_kwh            FLOAT NOT NULL,
    max_charge_speed_w      FLOAT NOT NULL,
    current_energy_wh       FLOAT NOT NULL,
    is_charging             FLOAT NOT NULL,
    status                  VARCHAR(15) NOT NULL,
    max_output_w            FLOAT NOT NULL
)
USING iceberg
PARTITIONED BY (DAY(timestamp), battery_name, 15_minutes_interval);

## ETL

In [77]:
battery_power_usage_df.createOrReplaceTempView("temp_view")

In [79]:
%%sql
    
INSERT INTO SolarX_Raw_Transactions.battery_readings (timestamp, 15_minutes_interval, battery_name, capacity_kwh,
                                                          max_charge_speed_w, current_energy_wh, is_charging, status, max_output_w)
SELECT time_stamp                 as timestamp,
       15_minutes_interval        as 15_minutes_interval,
       battery_name               as battery_name,
       capacity_kwh               as capacity_kwh,
       max_charge_speed_w         as max_charge_speed_w,
       current_energy_wh          as current_energy_wh,
       is_charging                as is_charging,
       status                     as status,
       max_output_w               as max_output_w
       
FROM temp_view

                                                                                

In [80]:
%%sql

SELECT * FROM SolarX_Raw_Transactions.battery_readings LIMIT 10;

timestamp,15_minutes_interval,battery_name,capacity_kwh,max_charge_speed_w,current_energy_wh,is_charging,status,max_output_w
2025-02-14 12:00:00,44,battery_2,12.0,1.0,12000.0,0.0,ideal,3.3299999237060547
2025-02-14 12:00:01,44,battery_2,12.0,1.0,11999.2099609375,0.0,discharging,3.3299999237060547
2025-02-14 12:00:02,44,battery_2,12.0,1.0,11999.2099609375,0.0,discharging,3.3299999237060547
2025-02-14 12:00:03,44,battery_2,12.0,1.0,11999.2099609375,0.0,discharging,3.3299999237060547
2025-02-14 12:00:04,44,battery_2,12.0,1.0,11998.4501953125,0.0,discharging,3.3299999237060547
2025-02-14 12:00:05,44,battery_2,12.0,1.0,11998.4501953125,0.0,discharging,3.3299999237060547
2025-02-14 12:00:06,44,battery_2,12.0,1.0,11998.4501953125,0.0,discharging,3.3299999237060547
2025-02-14 12:00:07,44,battery_2,12.0,1.0,11997.669921875,0.0,discharging,3.3299999237060547
2025-02-14 12:00:08,44,battery_2,12.0,1.0,11997.669921875,0.0,discharging,3.3299999237060547
2025-02-14 12:00:09,44,battery_2,12.0,1.0,11997.669921875,0.0,discharging,3.3299999237060547


In [4]:
spark.stop()