In [1]:
from pyspark.sql import SparkSession

In [41]:
# Create (or reuse) the Spark session for this notebook.
# Resource settings: 1 core per executor, at most 6 cores overall, 512M of memory per executor.
# NOTE(review): the master host looks like a container-generated hostname — confirm it is
# still valid before re-running on a fresh environment.
spark = SparkSession.builder \
    .appName("solar-panel-tables-iceberg") \
    .master("spark://db21a0191477:7077") \
    .config("spark.executor.cores", 1) \
    .config("spark.cores.max", 6) \
    .config("spark.executor.memory", "512M") \
    .getOrCreate()

24/12/30 21:37:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [42]:
# Echo the session object so the notebook renders it and confirms the session exists.
spark

# Read Weather CSV data

In [43]:
# Read Weather CSV data
from pyspark.sql import functions as F

# Explicit schema (DDL string) for the three CSV columns, so nothing is inferred.
_schema = "timestamp timestamp, solar_intensity float, temp float"

# Minute of the day, shifted back by 60 minutes before bucketing into 15-minute slots.
# NOTE(review): the reason for the 60-minute shift is not visible in this notebook —
# confirm it is intended (timestamps before 01:00 get negative interval numbers).
_shifted_minute_of_day = F.hour(F.col("timestamp")) * 60 + F.minute(F.col("timestamp")) - 60

weather_df = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/home/iceberg/warehouse/weather_history_splitted_resampled/2013-01-01.csv")
    .withColumn("15_min_interval", F.floor(_shifted_minute_of_day / 15))
)

In [44]:
# Confirm the enforced schema and the derived 15_min_interval column.
weather_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- solar_intensity: float (nullable = true)
 |-- temp: float (nullable = true)
 |-- 15_min_interval: long (nullable = true)



# Optimize Number of Partitions and Shuffle

In [45]:
# Check the Spark shuffle partition setting against the partition count of the loaded DataFrame

print("# shuffle: ", spark.conf.get("spark.sql.shuffle.partitions"))
print("# partitions: ", weather_df.rdd.getNumPartitions())

# shuffle:  200
# partitions:  6


In [46]:
# Default parallelism of the underlying SparkContext, for comparison with the partition counts above.
spark.sparkContext.defaultParallelism

6

In [47]:
# Reduce the number of shuffle partitions from the value printed above (200) for this
# small single-day dataset.
# NOTE(review): 92 presumably matches the number of distinct 15-minute interval values
# in the input — confirm before reusing this notebook on other data.
N_SHUFFLE_PARTITIONS = 92
spark.conf.set("spark.sql.shuffle.partitions", N_SHUFFLE_PARTITIONS)
# Alternative (currently disabled): repartition explicitly by interval before writing.
# weather_partitioned_df = weather_df.repartition(N_SHUFFLE_PARTITIONS, F.col('15_min_interval'))

# Create solar panel tables

In [48]:
%%sql

-- Create the namespace that holds the solar panel tables; does nothing if it already exists.
CREATE DATABASE IF NOT EXISTS SolarX_Raw_Transactions

24/12/30 21:37:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [51]:
%%sql

-- Drop any previous copy so the plain CREATE TABLE for solar_panel can be re-run.
DROP TABLE IF EXISTS SolarX_Raw_Transactions.solar_panel PURGE

In [50]:
%%sql
    
-- Drop any previous copy so the plain CREATE TABLE for solar_panel_readings can be re-run.
DROP TABLE IF EXISTS SolarX_Raw_Transactions.solar_panel_readings PURGE

                                                                                

In [52]:
%%sql
    
-- Panel catalogue: one row per panel with its rating values.
-- `id` is the key the notebook filters on to look up a panel, so it is required
-- just like every other column (and like panel_id in solar_panel_readings).
CREATE TABLE SolarX_Raw_Transactions.solar_panel(
    id INT NOT NULL,
    name VARCHAR(25) NOT NULL,
    capacity_kwh FLOAT NOT NULL,
    intensity_power_rating FLOAT NOT NULL,
    temperature_power_rating FLOAT NOT NULL
)
USING iceberg

In [53]:
%%sql

-- Stores the generated-power readings per timestamp and panel.
-- Partitioned by the day of `timestamp`, then by panel, then by 15-minute interval number.
CREATE TABLE SolarX_Raw_Transactions.solar_panel_readings(
    timestamp TIMESTAMP NOT NULL,
    15_minutes_interval INT NOT NULL,
    panel_id INT NOT NULL,
    generation_power_wh FLOAT NOT NULL
)
USING iceberg
PARTITIONED BY (DAY(timestamp), panel_id, 15_minutes_interval);

In [54]:
%%sql

-- Seed the three panels used in the rest of the notebook.
-- Integer literals land in FLOAT columns (the SELECT output below shows 10 stored as 10.0).
INSERT INTO SolarX_Raw_Transactions.solar_panel (id, name, capacity_kwh, intensity_power_rating, temperature_power_rating) VALUES
(1, 'roof panel', 10, 1000, 25),
(2, 'pole panel', 12.5, 1300, 25),
(3, 'flush panel', 14, 1500, 25);

                                                                                

In [55]:
%%sql

-- Verify the seeded panel rows.
SELECT * from SolarX_Raw_Transactions.solar_panel;

id,name,capacity_kwh,intensity_power_rating,temperature_power_rating
1,roof panel,10.0,1000.0,25.0
2,pole panel,12.5,1300.0,25.0
3,flush panel,14.0,1500.0,25.0


In [57]:
%%sql

-- Nothing has been inserted into the readings table yet, so this returns only the header.
SELECT * from SolarX_Raw_Transactions.solar_panel_readings;

timestamp,15_minutes_interval,panel_id,generation_power_wh


In [58]:
%%sql

-- Inspect the `files` metadata table of solar_panel: data file paths, record counts and per-column metrics.
SELECT * FROM SolarX_Raw_Transactions.solar_panel.files

                                                                                

content,file_path,file_format,spec_id,record_count,file_size_in_bytes,column_sizes,value_counts,null_value_counts,nan_value_counts,lower_bounds,upper_bounds,key_metadata,split_offsets,equality_ids,sort_order_id,readable_metrics
0,s3://warehouse/SolarX_Raw_Transactions/solar_panel/data/00000-104-8de86d43-133d-4a0b-83dd-002a3d618a75-0-00001.parquet,PARQUET,0,1,1450,"{1: 36, 2: 46, 3: 36, 4: 36, 5: 36}","{1: 1, 2: 1, 3: 1, 4: 1, 5: 1}","{1: 0, 2: 0, 3: 0, 4: 0, 5: 0}","{4: 0, 5: 0, 3: 0}","{1: bytearray(b'\x01\x00\x00\x00'), 2: bytearray(b'roof panel'), 3: bytearray(b'\x00\x00 A'), 4: bytearray(b'\x00\x00zD'), 5: bytearray(b'\x00\x00\xc8A')}","{1: bytearray(b'\x01\x00\x00\x00'), 2: bytearray(b'roof panel'), 3: bytearray(b'\x00\x00 A'), 4: bytearray(b'\x00\x00zD'), 5: bytearray(b'\x00\x00\xc8A')}",,[4],,0,"Row(capacity_kwh=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=10.0, upper_bound=10.0), id=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=None, lower_bound=1, upper_bound=1), intensity_power_rating=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=1000.0, upper_bound=1000.0), name=Row(column_size=46, value_count=1, null_value_count=0, nan_value_count=None, lower_bound='roof panel', upper_bound='roof panel'), temperature_power_rating=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=25.0, upper_bound=25.0))"
0,s3://warehouse/SolarX_Raw_Transactions/solar_panel/data/00001-105-8de86d43-133d-4a0b-83dd-002a3d618a75-0-00001.parquet,PARQUET,0,1,1450,"{1: 36, 2: 46, 3: 36, 4: 36, 5: 36}","{1: 1, 2: 1, 3: 1, 4: 1, 5: 1}","{1: 0, 2: 0, 3: 0, 4: 0, 5: 0}","{4: 0, 5: 0, 3: 0}","{1: bytearray(b'\x02\x00\x00\x00'), 2: bytearray(b'pole panel'), 3: bytearray(b'\x00\x00HA'), 4: bytearray(b'\x00\x80\xa2D'), 5: bytearray(b'\x00\x00\xc8A')}","{1: bytearray(b'\x02\x00\x00\x00'), 2: bytearray(b'pole panel'), 3: bytearray(b'\x00\x00HA'), 4: bytearray(b'\x00\x80\xa2D'), 5: bytearray(b'\x00\x00\xc8A')}",,[4],,0,"Row(capacity_kwh=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=12.5, upper_bound=12.5), id=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=None, lower_bound=2, upper_bound=2), intensity_power_rating=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=1300.0, upper_bound=1300.0), name=Row(column_size=46, value_count=1, null_value_count=0, nan_value_count=None, lower_bound='pole panel', upper_bound='pole panel'), temperature_power_rating=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=25.0, upper_bound=25.0))"
0,s3://warehouse/SolarX_Raw_Transactions/solar_panel/data/00002-106-8de86d43-133d-4a0b-83dd-002a3d618a75-0-00001.parquet,PARQUET,0,1,1456,"{1: 35, 2: 47, 3: 36, 4: 36, 5: 36}","{1: 1, 2: 1, 3: 1, 4: 1, 5: 1}","{1: 0, 2: 0, 3: 0, 4: 0, 5: 0}","{4: 0, 5: 0, 3: 0}","{1: bytearray(b'\x03\x00\x00\x00'), 2: bytearray(b'flush panel'), 3: bytearray(b'\x00\x00`A'), 4: bytearray(b'\x00\x80\xbbD'), 5: bytearray(b'\x00\x00\xc8A')}","{1: bytearray(b'\x03\x00\x00\x00'), 2: bytearray(b'flush panel'), 3: bytearray(b'\x00\x00`A'), 4: bytearray(b'\x00\x80\xbbD'), 5: bytearray(b'\x00\x00\xc8A')}",,[4],,0,"Row(capacity_kwh=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=14.0, upper_bound=14.0), id=Row(column_size=35, value_count=1, null_value_count=0, nan_value_count=None, lower_bound=3, upper_bound=3), intensity_power_rating=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=1500.0, upper_bound=1500.0), name=Row(column_size=47, value_count=1, null_value_count=0, nan_value_count=None, lower_bound='flush panel', upper_bound='flush panel'), temperature_power_rating=Row(column_size=36, value_count=1, null_value_count=0, nan_value_count=0, lower_bound=25.0, upper_bound=25.0))"


In [59]:
# List the catalogs visible to this session.
spark.sql("SHOW CATALOGS").show()

+-------------+
|      catalog|
+-------------+
|         demo|
|spark_catalog|
+-------------+



In [60]:
# Show which catalog implementation is configured for the `demo` catalog.
spark.conf.get("spark.sql.catalog.demo")

'org.apache.iceberg.spark.SparkCatalog'

In [61]:
# List the namespaces (databases) registered in the `demo` catalog.
spark.sql("SHOW DATABASES IN demo").show()

+--------------------+
|           namespace|
+--------------------+
|SolarX_Raw_Transa...|
+--------------------+



# Calc Solar Power

In [62]:
# Load the panel table through the Iceberg reader, addressed with the explicit `demo` catalog prefix.
solar_panele_df = spark.read.format("iceberg").load("demo.SolarX_Raw_Transactions.solar_panel")

In [63]:
# Fetch the three rating values of panel 1 as a single Row and display it.
solar_panel_rating = (
    solar_panele_df
    .where(solar_panele_df["id"] == 1)
    .select("capacity_kwh", "intensity_power_rating", "temperature_power_rating")
    .first()
)
solar_panel_rating

Row(capacity_kwh=10.0, intensity_power_rating=1000.0, temperature_power_rating=25.0)

In [64]:
def calc_solar_readings(panel_id, weather_df, sample_interval_ms=5):
    """Compute the per-sample generation of one panel from weather samples.

    The panel's ratings are read from ``SolarX_Raw_Transactions.solar_panel`` using the
    notebook-level ``spark`` session. The rated output per sample is then scaled by a
    temperature factor and a solar-intensity factor derived from ``weather_df``.

    Args:
        panel_id: Value of ``id`` identifying the panel row to use.
        weather_df: DataFrame providing ``temp`` and ``solar_intensity`` columns; every
            other column is passed through unchanged.
        sample_interval_ms: Time between consecutive weather samples, in milliseconds.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        ``weather_df`` with ``solar_intensity`` and ``temp`` dropped and a
        ``current_generation_watt`` column added.

    Raises:
        ValueError: If ``sample_interval_ms`` is not positive, or if no panel with the
            given ``panel_id`` exists.
    """
    if sample_interval_ms <= 0:
        raise ValueError(f"sample_interval_ms must be positive, got {sample_interval_ms!r}")

    # Loading solar_panel table
    solar_panel_df = spark.read.format("iceberg").load("SolarX_Raw_Transactions.solar_panel")

    # Querying panel power ratings
    solar_panel_rating = (
        solar_panel_df
        .filter(solar_panel_df.id == panel_id)
        .select("capacity_kwh", "intensity_power_rating", "temperature_power_rating")
        .first()
    )
    # first() returns None when the filter matches nothing; fail with a clear message
    # instead of an opaque "NoneType is not subscriptable" error further down.
    if solar_panel_rating is None:
        raise ValueError(
            f"No solar panel with id={panel_id!r} in SolarX_Raw_Transactions.solar_panel"
        )

    capacity_kwh = solar_panel_rating["capacity_kwh"]
    intensity_rating = solar_panel_rating["intensity_power_rating"]
    temp_rating = solar_panel_rating["temperature_power_rating"]

    # Rated output attributed to a single sample: capacity * 1000, spread evenly over
    # the number of samples in one hour.
    samples_per_hour = 60 * 60 * 1000 / sample_interval_ms
    rated_output_per_sample = capacity_kwh * 1000 / samples_per_hour

    # NOTE(review): algebraically this factor is temp_rating / temp, so temp == 0 divides
    # by zero and temp < 0 produces a negative value — confirm the weather data cannot
    # reach those cases.
    temperature_factor = 1 / (1 - (temp_rating - F.col("temp")) / temp_rating)
    # Algebraically solar_intensity / intensity_rating.
    intensity_factor = 1 - (intensity_rating - F.col("solar_intensity")) / intensity_rating

    # Calculating the power value into a new df.
    # NOTE(review): the column is named *_watt, yet the value is an amount per sample and
    # is later stored in generation_power_wh — confirm the intended unit.
    return weather_df.withColumn(
        "current_generation_watt",
        rated_output_per_sample * temperature_factor * intensity_factor,
    ).drop("solar_intensity", "temp")

## Inserting panel 1 readings into the Iceberg solar_panel_readings table

In [65]:
# Compute the readings for panel 1 and check the resulting columns.
panel_id = 1
solar_panel_readings_df1 = calc_solar_readings(panel_id, weather_df)
solar_panel_readings_df1.printSchema()
# Expose the DataFrame to the following %%sql cell under the name temp_view_1.
solar_panel_readings_df1.createOrReplaceTempView("temp_view_1")

root
 |-- timestamp: timestamp (nullable = true)
 |-- 15_min_interval: long (nullable = true)
 |-- current_generation_watt: double (nullable = true)



In [66]:
%%sql
    
-- Append the panel 1 readings exposed as temp_view_1, mapping the view's column names
-- onto the table's column names. The panel id is a literal and must match the view.
INSERT INTO SolarX_Raw_Transactions.solar_panel_readings
    (timestamp, 15_minutes_interval, panel_id, generation_power_wh)
SELECT
    timestamp AS timestamp,
    15_min_interval AS 15_minutes_interval,
    1 AS panel_id,
    current_generation_watt AS generation_power_wh
FROM temp_view_1

                                                                                

## Inserting panel 2 and 3 readings into the Iceberg solar_panel_readings table

In [67]:
# Compute the readings for panel 2 and expose them to the following %%sql cell as temp_view_2.
panel_id = 2
solar_panel_readings_df2 = calc_solar_readings(panel_id, weather_df)
solar_panel_readings_df2.createOrReplaceTempView("temp_view_2")

In [68]:
%%sql
    
-- Append the panel 2 readings exposed as temp_view_2, mapping the view's column names
-- onto the table's column names. The panel id is a literal and must match the view.
INSERT INTO SolarX_Raw_Transactions.solar_panel_readings
    (timestamp, 15_minutes_interval, panel_id, generation_power_wh)
SELECT
    timestamp AS timestamp,
    15_min_interval AS 15_minutes_interval,
    2 AS panel_id,
    current_generation_watt AS generation_power_wh
FROM temp_view_2

                                                                                

In [69]:
# Compute the readings for panel 3 and expose them to the following %%sql cell as temp_view_3.
panel_id = 3
solar_panel_readings_df3 = calc_solar_readings(panel_id, weather_df)
solar_panel_readings_df3.createOrReplaceTempView("temp_view_3")

In [70]:
%%sql
    
-- Append the panel 3 readings exposed as temp_view_3, mapping the view's column names
-- onto the table's column names. The panel id is a literal and must match the view.
INSERT INTO SolarX_Raw_Transactions.solar_panel_readings
    (timestamp, 15_minutes_interval, panel_id, generation_power_wh)
SELECT
    timestamp AS timestamp,
    15_min_interval AS 15_minutes_interval,
    3 AS panel_id,
    current_generation_watt AS generation_power_wh
FROM temp_view_3

                                                                                

## Some analysis

In [71]:
%%sql

-- Peek at a few stored readings; there is no ORDER BY, so which rows come back is not fixed.
SELECT * from SolarX_Raw_Transactions.solar_panel_readings limit 10;

timestamp,15_minutes_interval,panel_id,generation_power_wh
2013-01-01 19:45:00,75,3,0.0
2013-01-01 19:45:00.005000,75,3,0.0
2013-01-01 19:45:00.010000,75,3,0.0
2013-01-01 19:45:00.015000,75,3,0.0
2013-01-01 19:45:00.020000,75,3,0.0
2013-01-01 19:45:00.025000,75,3,0.0
2013-01-01 19:45:00.030000,75,3,0.0
2013-01-01 19:45:00.035000,75,3,0.0
2013-01-01 19:45:00.040000,75,3,0.0
2013-01-01 19:45:00.045000,75,3,0.0


In [72]:
%%sql

-- Top 20 (panel, 15-minute interval) pairs by total generation.
-- ORDER BY produces a total ordering; SORT BY only orders rows within each partition,
-- so together with LIMIT it does not guarantee the globally largest rows.
SELECT panel_id, 15_minutes_interval, SUM(generation_power_wh) AS generation_power_wh
FROM SolarX_Raw_Transactions.solar_panel_readings
GROUP BY panel_id, 15_minutes_interval
ORDER BY generation_power_wh DESC
LIMIT 20

                                                                                

panel_id,15_minutes_interval,generation_power_wh
1,34,3707.9093862380832
1,32,3695.013436084613
1,37,3583.8991967197508
1,33,3567.1697686873376
2,34,3565.297486697324
2,32,3552.8975344654173
3,34,3460.715426978655
3,32,3448.679207050241
1,39,3446.10297219269
2,37,3446.0569196464494


In [73]:
# Stop the Spark session at the end of the notebook.
spark.stop()