In [1]:
import os

from pyspark.sql import SparkSession

In [71]:
# Spark master to connect to. The default host looks like a container-generated
# name, so it can be overridden with the SPARK_MASTER_URL environment variable
# instead of editing the notebook (TODO confirm the default is still valid).
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://f7f8bb82a877:7077")

# Build the Spark session for this notebook, or reuse one that already exists
# (getOrCreate).
spark = (
    SparkSession
    .builder
    .appName("home-load-tables-iceberg")
    .master(SPARK_MASTER_URL)
    # One core per executor, at most six cores in total for this application.
    .config("spark.executor.cores", 1)
    .config("spark.cores.max", 6)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

24/12/31 17:40:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [72]:
# Echo the session object so the notebook renders its summary as the cell output.
spark

In [7]:
%%sql

-- List the tables available in the raw-transactions namespace.
SHOW TABLES FROM SolarX_Raw_Transactions

24/12/31 14:30:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace,tableName,isTemporary
SolarX_Raw_Transactions,home_power_readings,False
SolarX_Raw_Transactions,solar_panel,False
SolarX_Raw_Transactions,solar_panel_readings,False


# Create a new namespace/database for the warehouse

In [None]:
%%sql

-- Create the warehouse database; IF NOT EXISTS makes the cell safe to re-run.
CREATE DATABASE IF NOT EXISTS SolarX_WH

# Home Power Usage WH

## Investigate home raw data

In [8]:
%%sql

-- Show the columns, types and partitioning of the raw home readings table.
DESCRIBE TABLE SolarX_Raw_Transactions.home_power_readings

col_name,data_type,comment
timestamp,timestamp,
15_minutes_interval,int,
min_consumption_wh,float,
max_consumption_wh,float,
,,
# Partitioning,,
Part 0,days(timestamp),
Part 1,15_minutes_interval,


In [9]:
%%sql

-- Peek at a handful of raw rows; there is no ORDER BY, so which rows come back is arbitrary.
SELECT
    *
FROM
    SolarX_Raw_Transactions.home_power_readings
LIMIT 5

                                                                                

timestamp,15_minutes_interval,min_consumption_wh,max_consumption_wh
2013-01-01 03:30:00,10,0.0010607765289023,0.0065155210904777
2013-01-01 03:30:00.005000,10,0.0010607768781483,0.0065155201591551
2013-01-01 03:30:00.010000,10,0.0010607772273942,0.0065155192278325
2013-01-01 03:30:00.015000,10,0.0010607774602249,0.0065155178308486
2013-01-01 03:30:00.020000,10,0.0010607778094708,0.0065155168995261


In [50]:
%%sql

-- Same peek as before, plus the calendar date derived from the reading timestamp.
-- There is no ORDER BY, so which five rows come back is arbitrary.
SELECT
    timestamp,
    CAST(timestamp AS DATE) AS date,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
FROM SolarX_Raw_Transactions.home_power_readings
LIMIT 5

timestamp,date,15_minutes_interval,min_consumption_wh,max_consumption_wh
2013-01-03 22:15:00,2013-01-03,85,0.0009668199927546,0.0053314194083213
2013-01-03 22:15:00.005000,2013-01-03,85,0.0009668199345469,0.0053314203396439
2013-01-03 22:15:00.010000,2013-01-03,85,0.0009668199345469,0.0053314217366278
2013-01-03 22:15:00.015000,2013-01-03,85,0.0009668198763392,0.0053314231336116
2013-01-03 22:15:00.020000,2013-01-03,85,0.0009668198181316,0.0053314245305955


### Query low-frequency (15-minute) data for the warehouse

In [39]:
%%sql

-- Roll the raw readings up to one row per (date, 15-minute interval), summing
-- the min/max consumption columns. Only timestamps whose day-of-month is 1 are
-- included, to keep this exploratory query small.
SELECT
    CONCAT(DATE(timestamp), '--', 15_minutes_interval) as home_power_readings_id,
    DATE(timestamp) as date,
    15_minutes_interval,
    SUM(min_consumption_wh) as min_consumption_power_wh,
    SUM(max_consumption_wh) as max_consumption_power_wh
FROM 
    SolarX_Raw_Transactions.home_power_readings
WHERE 
    DAY(timestamp) = 1
GROUP BY 
    DATE(timestamp), 15_minutes_interval
-- ORDER BY (not SORT BY): SORT BY only orders rows within each partition, so
-- combined with LIMIT it returns an arbitrary set of rows. Ordering by date
-- first makes the ten returned rows deterministic when several dates match.
ORDER BY
    date, 15_minutes_interval
LIMIT 10

                                                                                

home_power_readings_id,date,15_minutes_interval,min_consumption_power_wh,max_consumption_power_wh
2013-01-01--0,2013-01-01,0,210.4949002956273,1234.37793413084
2013-01-01--1,2013-01-01,1,210.90460421564055,1195.791982460767
2013-01-01--2,2013-01-01,2,211.31430813518816,1157.2060307911595
2013-01-01--3,2013-01-01,3,211.72401205578356,1118.620079122018
2013-01-01--4,2013-01-01,4,206.7439376907423,1116.9222173416056
2013-01-01--5,2013-01-01,5,196.37402515218128,1152.1128553207964
2013-01-01--6,2013-01-01,6,186.00411261827685,1187.3034932990558
2013-01-01--7,2013-01-01,7,175.6342000772711,1222.494131277781
2013-01-01--8,2013-01-01,8,175.57182770763757,1223.2657041028142
2013-01-01--9,2013-01-01,9,185.8171100311447,1189.6178293367848


## Home readings dimension and fact tables

### Home dimension

In [55]:
%%sql

-- Drop the home dimension (with PURGE) if it already exists, so the CREATE TABLE in the next cell can be re-run.
DROP TABLE IF EXISTS SolarX_WH.dim_home PURGE

In [56]:
%%sql

-- Home dimension: one row per version of a home appliance record.
-- consumption_power and usage_time each carry their own start/end date pair;
-- the end date is nullable.
CREATE TABLE SolarX_WH.dim_home (
    home_key                      INT          NOT NULL,
    home_id                       INT          NOT NULL,
    appliance                     VARCHAR(25)  NOT NULL,
    consumption_power             FLOAT        NOT NULL,
    usage_time                    VARCHAR(50)  NOT NULL,

    -- SCD type 2 validity window for consumption_power
    consumption_power_start_date  TIMESTAMP    NOT NULL,
    consumption_power_end_date    TIMESTAMP,

    -- SCD type 2 validity window for usage_time
    usage_time_start_date         TIMESTAMP    NOT NULL,
    usage_time_end_date           TIMESTAMP
)
USING iceberg;

### Home fact

In [73]:
%%sql

-- Drop the home readings fact table (with PURGE) if it already exists, so the CREATE TABLE in the next cell can be re-run.
DROP TABLE IF EXISTS SolarX_WH.fact_home_power_readings PURGE

24/12/31 17:41:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [74]:
%%sql

-- Home power readings fact: one row per (date, 15-minute interval) with the
-- summed min/max consumption, partitioned by month of `date` and by interval.
CREATE TABLE SolarX_WH.fact_home_power_readings(
    home_power_reading_key          INT           NOT NULL,
    -- INT (was SMALLINT) so the key has the same type as dim_home.home_key.
    home_key                        INT           NOT NULL,   -- REFERENCES dim_home(home_key)
    -- NOTE(review): dim_date is not defined in this notebook; confirm SMALLINT matches dim_date.date_key.
    date_key                        SMALLINT      NOT NULL,   -- REFERENCES dim_date(date_key)

    home_power_reading_id           VARCHAR(25)   NOT NULL,
    date                            DATE          NOT NULL,
    15_minutes_interval             SMALLINT      NOT NULL,
    min_consumption_power_wh        FLOAT         NOT NULL,
    max_consumption_power_wh        FLOAT         NOT NULL 
)

USING iceberg
PARTITIONED BY (MONTH(date), 15_minutes_interval)

## Solar panel readings dimension and fact tables

## Investigate solar panel raw data

In [52]:
%%sql

-- Show the columns and types of the raw solar panel table.
DESCRIBE TABLE SolarX_Raw_Transactions.solar_panel

col_name,data_type,comment
id,int,
name,string,
capacity_kwh,float,
intensity_power_rating,float,
temperature_power_rating,float,


In [53]:
%%sql

-- Show the columns, types and partitioning of the raw solar panel readings table.
DESCRIBE TABLE SolarX_Raw_Transactions.solar_panel_readings

col_name,data_type,comment
timestamp,timestamp,
15_minutes_interval,int,
panel_id,int,
generation_power_wh,float,
,,
# Partitioning,,
Part 0,days(timestamp),
Part 1,panel_id,
Part 2,15_minutes_interval,


### Query low-frequency (15-minute) data for the warehouse

In [61]:
%%sql

-- Roll the raw panel readings up to one row per (date, 15-minute interval),
-- summing the generated power. Restricted to panel_id = 1 and to timestamps
-- whose day-of-month is 1, to keep this exploratory query small.
SELECT
    CONCAT(DATE(timestamp), '--', 15_minutes_interval) as solar_panel_power_readings_id,
    DATE(timestamp) as date,
    15_minutes_interval,
    SUM(generation_power_wh) as generation_power_wh
FROM 
    SolarX_Raw_Transactions.solar_panel_readings
WHERE 
    DAY(timestamp) = 1 AND panel_id = 1
GROUP BY 
    DATE(timestamp), 15_minutes_interval
-- ORDER BY (not SORT BY): SORT BY only orders rows within each partition, so
-- with LIMIT it does not reliably return the ten highest totals.
ORDER BY
    generation_power_wh DESC
LIMIT 10

                                                                                

solar_panel_power_readings_id,date,15_minutes_interval,generation_power_wh
2013-01-01--34,2013-01-01,34,1112.372815785464
2013-01-01--32,2013-01-01,32,1108.504030722659
2013-01-01--37,2013-01-01,37,1075.1697589475662
2013-01-01--33,2013-01-01,33,1070.1509305494835
2013-01-01--39,2013-01-01,39,1033.8308916147798
2013-01-01--35,2013-01-01,35,1033.8161300485954
2013-01-01--38,2013-01-01,38,1001.8857660298236
2013-01-01--36,2013-01-01,36,997.2686491240748
2013-01-01--42,2013-01-01,42,929.7826389158143
2013-01-01--41,2013-01-01,41,923.0105679007248


### Solar panel dimension

In [54]:
%%sql

-- Drop the solar panel dimension (with PURGE) if it already exists, so the CREATE TABLE in the next cell can be re-run.
DROP TABLE IF EXISTS SolarX_WH.dim_solar_panel PURGE

In [57]:
%%sql

-- Solar panel dimension: one row per version of a panel record.
-- capacity_kwh, intensity_power_rating_wh and temperature_power_rating_c each
-- carry their own start/end date pair; the end date is nullable.
CREATE TABLE SolarX_WH.dim_solar_panel (
    solar_panel_key                        SMALLINT     NOT NULL,
    solar_panel_id                         SMALLINT     NOT NULL,
    name                                   VARCHAR(20)  NOT NULL,
    capacity_kwh                           FLOAT        NOT NULL,
    intensity_power_rating_wh              FLOAT        NOT NULL,
    temperature_power_rating_c             FLOAT        NOT NULL,

    -- SCD type 2 validity window for capacity_kwh
    capacity_kwh_start_date                TIMESTAMP    NOT NULL,
    capacity_kwh_end_date                  TIMESTAMP,

    -- SCD type 2 validity window for intensity_power_rating_wh
    intensity_power_rating_wh_start_date   TIMESTAMP    NOT NULL,
    intensity_power_rating_wh_end_date     TIMESTAMP,

    -- SCD type 2 validity window for temperature_power_rating_c
    temperature_power_rating_c_start_date  TIMESTAMP    NOT NULL,
    temperature_power_rating_c_end_date    TIMESTAMP
)
USING iceberg;

### Solar panel fact

In [66]:
%%sql

-- Drop the solar panel readings fact table (with PURGE) if it already exists, so the CREATE TABLE in the next cell can be re-run.
DROP TABLE IF EXISTS SolarX_WH.fact_solar_panel_power_readings PURGE

In [67]:
%%sql

-- Solar panel readings fact: one row per (date, 15-minute interval) with the
-- summed generated power, partitioned by month of `date` and by interval.
CREATE TABLE SolarX_WH.fact_solar_panel_power_readings (
    solar_panel_reading_key  INT          NOT NULL,
    solar_panel_key          SMALLINT     NOT NULL,  -- REFERENCES dim_solar_panel(solar_panel_key)
    date_key                 SMALLINT     NOT NULL,  -- REFERENCES dim_date(date_key)

    solar_panel_reading_id   VARCHAR(25)  NOT NULL,
    date                     DATE         NOT NULL,
    15_minutes_interval      SMALLINT     NOT NULL,
    generation_power_wh      FLOAT        NOT NULL
)
USING iceberg
PARTITIONED BY (MONTH(date), 15_minutes_interval)

In [70]:
# Stop the Spark session once the notebook is done with it.
spark.stop()