In [51]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [52]:
# Build (or reuse) the Spark session shared by every cell below.
# The master URL can be overridden with the SPARK_MASTER_URL environment
# variable; the default is the original hard-coded master, whose host name
# presumably is a container id and therefore not stable across environments.
spark = (
    SparkSession
    .builder
    .appName("home-load-tables-iceberg")
    .master(os.environ.get("SPARK_MASTER_URL", "spark://db21a0191477:7077"))
    # One core per executor, at most 6 cores in total for this application,
    # and 512 MB of memory per executor.
    .config("spark.executor.cores", 1)
    .config("spark.cores.max", 6)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

24/12/30 18:07:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [53]:
# Show the session object as the cell output to confirm it was created.
spark

# Read Home Power Usage CSV Data

In [90]:
# Read day 1 (2013-01-01) of the home power usage CSV data.
# The schema is passed explicitly, so column types are not inferred from the file.
_schema = "timestamp timestamp, min_consumption_wh float, max_consumption_wh float, avg_consumption_wh float"

# Minutes elapsed since midnight for each reading.
_minutes_of_day = F.hour(F.col("timestamp")) * 60 + F.minute(F.col("timestamp"))

home_power_usage_df = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/home/iceberg/warehouse/home_power_usage_history/2013-01-01.csv")
    # Index of the 15-minute bucket each reading falls into. The -60 shifts the
    # origin so that 01:00 maps to bucket 0.
    # NOTE(review): readings before 01:00 get a negative index (-4..-1) —
    # confirm the data never contains them or that this offset is intended.
    .withColumn("15_minutes_interval", F.floor((_minutes_of_day - 60) / 15))
)


In [79]:
# Print the column names and types of the loaded frame, including the derived
# 15_minutes_interval column.
home_power_usage_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- min_consumption_wh: float (nullable = true)
 |-- max_consumption_wh: float (nullable = true)
 |-- avg_consumption_wh: float (nullable = true)
 |-- 15_minutes_interval: long (nullable = true)



# Optimize Number of Partitions and Shuffle

In [80]:
# Print the three values that determine how work is split up:
# the SQL shuffle partition setting, the number of partitions of the loaded
# frame, and the default parallelism of the Spark context.
print("# shuffle: ", spark.conf.get("spark.sql.shuffle.partitions"))
print("# partitions: ", home_power_usage_df.rdd.getNumPartitions())
print("# Parallelism: ", spark.sparkContext.defaultParallelism)

# shiffle:  92
# partitions:  11
# Parallelism:  6


In [81]:
# Set the number of partitions Spark SQL uses when it shuffles data.
# NOTE(review): 92 presumably mirrors the number of distinct
# 15_minutes_interval values per day — confirm, and consider a named constant.
spark.conf.set("spark.sql.shuffle.partitions", 92)

# Create home power usage tables

In [82]:
%%sql

-- Create the database that holds the raw readings; a no-op when it already exists.
CREATE DATABASE IF NOT EXISTS SolarX_Raw_Transactions

In [83]:
%%sql

-- Start from a clean slate so the notebook can be re-run from the top.
-- WARNING: this discards any readings already loaded into the table.
DROP TABLE IF EXISTS SolarX_Raw_Transactions.home_power_readings PURGE

In [84]:
%%sql

-- Iceberg table for the raw home power readings.
-- Only the min/max consumption columns are stored; the avg_consumption_wh
-- column present in the CSV schema is not part of this table.
-- Partitioned by the day of the reading and by its 15-minute interval index.
CREATE TABLE SolarX_Raw_Transactions.home_power_readings(
    timestamp               TIMESTAMP NOT NULL,
    15_minutes_interval     SMALLINT  NOT NULL,
    min_consumption_wh      FLOAT     NOT NULL,
    max_consumption_wh      FLOAT     NOT NULL
)
USING iceberg
PARTITIONED BY (DAY(timestamp), 15_minutes_interval);

In [85]:
%%sql

-- Sanity check: the freshly created table should return no rows yet.
SELECT * FROM SolarX_Raw_Transactions.home_power_readings;

timestamp,15_minutes_interval,min_consumption_wh,max_consumption_wh


In [86]:
# Expose the day 1 frame to SQL under the name `temp_view` so the INSERT cell
# below can select from it.
home_power_usage_df.createOrReplaceTempView("temp_view")

In [87]:
%%sql
    
-- Append the rows staged in temp_view (day 1) to the Iceberg table.
-- Only the four table columns are selected; avg_consumption_wh is not loaded.
INSERT INTO SolarX_Raw_Transactions.home_power_readings (
    timestamp,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
)
SELECT
    timestamp,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
FROM temp_view

                                                                                

# Add Day 2 Data

In [92]:
# Day 2 (2013-01-02): same explicit CSV schema and the same derived
# 15_minutes_interval column as the day 1 load.
_schema = "timestamp timestamp, min_consumption_wh float, max_consumption_wh float, avg_consumption_wh float"

home_power_usage_df2 = (
    spark.read
    .format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/home/iceberg/warehouse/home_power_usage_history/2013-01-02.csv")
    .withColumn(
        "15_minutes_interval",
        # Minutes since midnight, shifted by 60, divided into 15-minute buckets.
        F.floor(
            (F.hour(F.col("timestamp")) * 60 + F.minute(F.col("timestamp")) - 60) / 15
        ),
    )
)


In [93]:
# Expose the day 2 frame to SQL under the name `temp_view_2`.
home_power_usage_df2.createOrReplaceTempView("temp_view_2")

In [94]:
%%sql
    
-- Append the rows staged in temp_view_2 (day 2) to the Iceberg table.
INSERT INTO SolarX_Raw_Transactions.home_power_readings (
    timestamp,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
)
SELECT
    timestamp,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
FROM temp_view_2

                                                                                

# Add Day 3 Data

In [96]:
# Day 3 (2013-01-03): same explicit CSV schema and the same derived
# 15_minutes_interval column as the earlier loads.
_schema = "timestamp timestamp, min_consumption_wh float, max_consumption_wh float, avg_consumption_wh float"

home_power_usage_df3 = (
    spark.read
    .format("csv")
    .schema(_schema)
    .option("header", True)
    .load("/home/iceberg/warehouse/home_power_usage_history/2013-01-03.csv")
    .withColumn(
        "15_minutes_interval",
        # Minutes since midnight, shifted by 60, divided into 15-minute buckets.
        F.floor(
            (F.hour(F.col("timestamp")) * 60 + F.minute(F.col("timestamp")) - 60) / 15
        ),
    )
)


In [97]:
# Expose the day 3 frame to SQL under the name `temp_view_3`.
home_power_usage_df3.createOrReplaceTempView("temp_view_3")

In [98]:
%%sql
    
-- Append the rows staged in temp_view_3 (day 3) to the Iceberg table.
INSERT INTO SolarX_Raw_Transactions.home_power_readings (
    timestamp,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
)
SELECT
    timestamp,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
FROM temp_view_3

                                                                                

# Some Analysis

In [99]:
%%sql

-- Peek at a few loaded rows. There is no ORDER BY, so which rows come back
-- is not deterministic.
SELECT * FROM SolarX_Raw_Transactions.home_power_readings LIMIT 15;

timestamp,15_minutes_interval,min_consumption_wh,max_consumption_wh
2013-01-03 22:15:00,85,0.0009668199927546,0.0053314194083213
2013-01-03 22:15:00.005000,85,0.0009668199345469,0.0053314203396439
2013-01-03 22:15:00.010000,85,0.0009668199345469,0.0053314217366278
2013-01-03 22:15:00.015000,85,0.0009668198763392,0.0053314231336116
2013-01-03 22:15:00.020000,85,0.0009668198181316,0.0053314245305955
2013-01-03 22:15:00.025000,85,0.0009668197599239,0.0053314254619181
2013-01-03 22:15:00.030000,85,0.0009668197017163,0.0053314268589019
2013-01-03 22:15:00.035000,85,0.0009668196435086,0.0053314282558858
2013-01-03 22:15:00.040000,85,0.0009668195853009,0.0053314291872084
2013-01-03 22:15:00.045000,85,0.0009668195853009,0.0053314305841922


In [72]:
%%sql

-- Hourly sums of the min/max consumption readings for day-of-month 1.
-- ORDER BY is used instead of SORT BY: SORT BY only orders rows inside each
-- partition, so the overall result was not guaranteed to be ordered by hour.
SELECT 
    DAY(timestamp) as day,
    HOUR(timestamp) as hour, 
    SUM(min_consumption_wh) as min_consumption_wh, 
    SUM(max_consumption_wh) as max_consumption_wh
FROM SolarX_Raw_Transactions.home_power_readings
WHERE DAY(timestamp) = 1
GROUP BY day, hour
ORDER BY hour


                                                                                

day,hour,min_consumption_wh,max_consumption_wh
1,1,1688.8756494044792,9411.99205300957
1,2,1529.512551076943,9357.665394478478
1,3,1527.5180095451651,9382.351135632023
1,4,1680.5353006699588,7591.124603360891
1,5,1646.826776633272,7016.384929115884
1,6,3886.2872099610977,11014.719419708475
1,7,3799.24679281842,11138.108585019596
1,8,1533.8708930264693,7224.401512389071
1,9,1573.2611340137664,6672.83367728442
1,10,1579.798095861683,7621.791841390543


In [100]:
%%sql

-- Daily sums of the min/max consumption readings, divided by 1000 to report
-- kWh instead of Wh. ORDER BY makes the row order deterministic; GROUP BY
-- alone does not guarantee any ordering.
SELECT 
    DAY(timestamp) as day, 
    SUM(min_consumption_wh)/1000 as min_consumption_kwh, 
    SUM(max_consumption_wh)/1000 as max_consumption_kwh
FROM SolarX_Raw_Transactions.home_power_readings
GROUP BY day
ORDER BY day

                                                                                

day,min_consumption_kwh,max_consumption_kwh
1,31.03053565694584,126.82835777154985
2,33.01199622871832,121.20579759246624
3,31.291361060320924,121.99561212848174


In [101]:
# Shut down the Spark session now that all loads and queries are done.
spark.stop()