In [26]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session that every later cell (Python and %%sql) runs against.
# getOrCreate() hands back an already-running session when one exists instead of
# building a new one.
spark = (
    SparkSession
    .builder
    .appName("wh-etl")
    # NOTE(review): the master URL is hard-coded; the host part looks like a
    # container hostname, so it is presumably specific to this environment — confirm.
    .master("spark://44952dfaa576:7077")
    # Resource settings for this application: cores per executor, total cores,
    # and memory per executor.
    .config("spark.executor.cores", 1)
    .config("spark.cores.max", 6)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

25/01/05 14:00:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
spark

In [5]:
%%sql

SHOW TABLES IN demo.SolarX_Raw_Transactions

25/01/02 22:03:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace,tableName,isTemporary
SolarX_Raw_Transactions,home_power_readings,False
SolarX_Raw_Transactions,solar_panel,False
SolarX_Raw_Transactions,solar_panel_readings,False


In [50]:
%%sql

SHOW TABLES IN demo.SolarX_WH

25/01/03 00:25:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace,tableName,isTemporary
SolarX_WH,dim_date,False
SolarX_WH,dim_home,False
SolarX_WH,dim_home_appliances,False
SolarX_WH,dim_solar_panel,False
SolarX_WH,fact_home_power_readings,False
SolarX_WH,fact_solar_panel_power_readings,False


# Date dimension

In [7]:
%%sql

DESCRIBE SolarX_WH.dim_date

col_name,data_type,comment
date_key,timestamp,
year,int,
quarter,int,
month,int,
week,int,
day,int,
hour,int,
minute,int,
is_weekend,boolean,
# Partition Information,,


In [8]:
import datetime

def generate_15min_intervals(start_time, end_time):
    """Return 1-tuples of datetimes from start_time to end_time in 15-minute steps.

    Both bounds are inclusive: ``end_time`` itself is part of the result only
    when it lies exactly on a step.  Every timestamp is wrapped in a 1-tuple,
    i.e. each element is a single-column row.  The result is an empty list
    when ``start_time`` is later than ``end_time``.
    """
    step = datetime.timedelta(minutes=15)
    # Whole steps that fit between the bounds (negative when the range is empty,
    # which makes the range() below empty as well).
    whole_steps = (end_time - start_time) // step
    return [(start_time + offset * step,) for offset in range(whole_steps + 1)]

# The date dimension starts at midnight on 2013-01-01 ...
start_time = datetime.datetime(year=2013, month=1, day=1)

# ... and runs until 365 days after the moment this cell is executed.
end_time = datetime.timedelta(days=365) + datetime.datetime.now()

# One 1-tuple per 15-minute slot between the two bounds.
timestamp_list = generate_15min_intervals(start_time=start_time, end_time=end_time)

In [9]:
timestamp_list[0:5]

[(datetime.datetime(2013, 1, 1, 0, 0),),
 (datetime.datetime(2013, 1, 1, 0, 15),),
 (datetime.datetime(2013, 1, 1, 0, 30),),
 (datetime.datetime(2013, 1, 1, 0, 45),),
 (datetime.datetime(2013, 1, 1, 1, 0),)]

In [10]:
# Turn the generated slots into a Spark DataFrame and tag each one with a slot index.
from pyspark.sql import functions as F

# Minutes since midnight, shifted back by one hour: 01:00 becomes slot 0, and the
# 00:00-00:45 slots come out as -4..-1.
shifted_minutes = F.hour("timestamp") * 60 + F.minute("timestamp") - 60

timestamp_df = (
    spark.createDataFrame(timestamp_list, schema="timestamp timestamp")
    .withColumn("15_minutes_interval", F.floor(shifted_minutes / 15))
)

In [11]:
timestamp_df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------------------+-------------------+
|          timestamp|15_minutes_interval|
+-------------------+-------------------+
|2013-01-01 00:00:00|                 -4|
|2013-01-01 00:15:00|                 -3|
|2013-01-01 00:30:00|                 -2|
|2013-01-01 00:45:00|                 -1|
|2013-01-01 01:00:00|                  0|
|2013-01-01 01:15:00|                  1|
|2013-01-01 01:30:00|                  2|
|2013-01-01 01:45:00|                  3|
|2013-01-01 02:00:00|                  4|
|2013-01-01 02:15:00|                  5|
|2013-01-01 02:30:00|                  6|
|2013-01-01 02:45:00|                  7|
|2013-01-01 03:00:00|                  8|
|2013-01-01 03:15:00|                  9|
|2013-01-01 03:30:00|                 10|
|2013-01-01 03:45:00|                 11|
|2013-01-01 04:00:00|                 12|
|2013-01-01 04:15:00|                 13|
|2013-01-01 04:30:00|                 14|
|2013-01-01 04:45:00|                 15|
+-------------------+-------------

                                                                                

In [13]:
timestamp_df.createOrReplaceTempView("temp_view")

In [15]:
%%sql
    
-- Populate the date dimension from temp_view (one row per 15-minute slot).
-- The SELECT list is written in the same order as the INSERT column list.
INSERT INTO SolarX_WH.dim_date (date_key, year, quarter, month, week, day, hour, minute, is_weekend)
SELECT timestamp                      AS date_key,
       YEAR(timestamp)                AS year,
       QUARTER(timestamp)             AS quarter,
       MONTH(timestamp)               AS month,
       EXTRACT(WEEK FROM timestamp)   AS week,
       DAY(timestamp)                 AS day,
       HOUR(timestamp)                AS hour,
       -- NOTE(review): `minute` receives the 15-minute slot index from temp_view,
       -- not the minute of the hour — confirm this is intended.
       15_minutes_interval            AS minute,
       -- WEEKDAY() counts from Monday: 0 = Monday ... 6 = Sunday,
       -- so the weekend days are 5 (Saturday) and 6 (Sunday).
       CASE
               WHEN WEEKDAY(timestamp) IN (5, 6) THEN TRUE
               ELSE FALSE
       END                            AS is_weekend

FROM temp_view

                                                                                

In [19]:
%%sql

SELECT * FROM SolarX_WH.dim_date 
WHERE day = 1 AND year = 2013
LIMIT 10

date_key,year,quarter,month,week,day,hour,minute,is_weekend
2013-04-01 02:45:00,2013,2,4,14,1,2,7,False
2013-10-01 05:30:00,2013,4,10,40,1,5,18,False
2013-10-01 04:45:00,2013,4,10,40,1,4,15,False
2013-10-01 04:15:00,2013,4,10,40,1,4,13,False
2013-03-01 11:45:00,2013,1,3,9,1,11,43,False
2013-03-01 15:15:00,2013,1,3,9,1,15,57,False
2013-09-01 17:45:00,2013,3,9,35,1,17,67,True
2013-03-01 14:30:00,2013,1,3,9,1,14,54,False
2013-10-01 07:00:00,2013,4,10,40,1,7,24,False
2013-03-01 14:15:00,2013,1,3,9,1,14,53,False


In [23]:
%%sql

SELECT DISTINCT(year) FROM SolarX_WH.dim_date 

                                                                                

year
2025
2018
2015
2023
2022
2013
2014
2019
2020
2016


In [24]:
%%sql

SELECT COUNT(year) FROM SolarX_WH.dim_date 

count(year)
455993


# Home appliances dimension

In [28]:
%%sql

DESCRIBE SolarX_WH.dim_home_appliances

col_name,data_type,comment
home_appliance_key,int,
home_key,int,
appliance,string,
min_consumption_power_wh,float,
max_consumption_power_wh,float,
usage_time,string,


In [29]:
import json
import pandas as pd

def get_home_appliances_df(
    path='/home/iceberg/warehouse/home_appliances_consumption.json',
    test_row=(1, "app", 452, 895, "545454"),
):
    """Load the home-appliance consumption JSON into a pandas DataFrame.

    The JSON file is read as a mapping from appliance name to an object with a
    ``consumption`` list (index 0 = minimum, index 1 = maximum) and a ``time``
    value.  Each appliance becomes one row with ``home_key`` fixed to 1.

    Parameters
    ----------
    path : str
        Location of the JSON file.  Defaults to the path this notebook uses.
    test_row : sequence of 5 values, or None
        Extra row appended after the JSON rows, in the order
        (home_key, name, min_consumption_rating, max_consumption_rating,
        usage_time).  The default is the synthetic "app" row this function has
        always appended; pass ``None`` to get only the rows from the file.

    Returns
    -------
    pandas.DataFrame
        Columns ``home_appliance_key`` (running number starting at 1),
        ``home_key``, ``name``, ``min_consumption_rating``,
        ``max_consumption_rating`` and ``usage_time``.

    Raises
    ------
    ValueError
        If ``test_row`` is given but does not have exactly 5 values.
    """
    columns = [
        "home_key",
        "name",
        "min_consumption_rating",
        "max_consumption_rating",
        "usage_time",
    ]

    # Only the parsing needs the file handle; close it before building the frame.
    with open(path) as f:
        home_usage_power = json.load(f)

    records = [
        {
            "home_key": 1,
            "name": name,
            "min_consumption_rating": info["consumption"][0],
            "max_consumption_rating": info["consumption"][1],
            "usage_time": info["time"],
        }
        for name, info in home_usage_power.items()
    ]

    if test_row is not None:
        if len(test_row) != len(columns):
            raise ValueError(
                f"test_row must have {len(columns)} values, got {len(test_row)}"
            )
        records.append(dict(zip(columns, test_row)))

    # Build the frame in one go; passing `columns` keeps the schema stable even
    # when there are no records at all.
    df = pd.DataFrame(records, columns=columns)
    df.index += 1  # surrogate key starts at 1, not 0
    df.index.name = 'home_appliance_key'
    return df.reset_index()

In [30]:
home_appliances_df = spark.createDataFrame(get_home_appliances_df())

In [31]:
home_appliances_df.show()

+------------------+--------+--------------------+----------------------+----------------------+--------------------+
|home_appliance_key|home_key|                name|min_consumption_rating|max_consumption_rating|          usage_time|
+------------------+--------+--------------------+----------------------+----------------------+--------------------+
|                 1|       1|        Refrigerator|                   300|                  1500|         00:00-24:00|
|                 2|       1|       Electric Oven|                  2000|                  5000|16:00-16:30,21:00...|
|                 3|       1|     Electric Kettle|                  1500|                  1500|07:00-07:15,12:00...|
|                 4|       1|     Air Conditioner|                   500|                  3000|         00:00-24:00|
|                 5|       1|Incandescent Ligh...|                    60|                    60|         00:00-24:00|
|                 6|       1|     LED Light Bulbs|      

In [32]:
home_appliances_df.createOrReplaceTempView("temp_view_2")

In [33]:
%%sql

-- Upsert the appliance dimension from temp_view_2, matched on home_appliance_key.
MERGE INTO SolarX_WH.dim_home_appliances dim_app
USING 
    -- Rename the staged columns to the dimension's column names.
    (SELECT    home_appliance_key        as home_appliance_key, 
               home_key                  as home_key,
               name                      as appliance,
               min_consumption_rating    as min_consumption_power_wh,
               max_consumption_rating    as max_consumption_power_wh,
               usage_time                as usage_time
    FROM temp_view_2) tmp
    
ON dim_app.home_appliance_key = tmp.home_appliance_key

-- Existing key: overwrite the consumption bounds in place when either one differs.
-- NOTE(review): a change to appliance or usage_time alone does not trigger an
-- update — confirm this is intended.
WHEN MATCHED AND (
    dim_app.min_consumption_power_wh != tmp.min_consumption_power_wh OR
    dim_app.max_consumption_power_wh != tmp.max_consumption_power_wh
) THEN UPDATE SET 
    dim_app.min_consumption_power_wh = tmp.min_consumption_power_wh,
    dim_app.max_consumption_power_wh = tmp.max_consumption_power_wh

-- New key: insert the staged row with all of its columns.
WHEN NOT MATCHED THEN INSERT *

In [34]:
%%sql

SELECT * FROM SolarX_WH.dim_home_appliances

home_appliance_key,home_key,appliance,min_consumption_power_wh,max_consumption_power_wh,usage_time
7,1,Laptop,50.0,100.0,00:00-24:00
6,1,LED Light Bulbs,10.0,10.0,00:00-24:00
5,1,Incandescent Light Bulbs,60.0,60.0,00:00-24:00
8,1,Computer,100.0,600.0,00:00-24:00
12,1,Blow Dryer,800.0,1800.0,"07:00-07:30,15:00-15:30"
11,1,Smartphone Charger,5.0,5.0,00:00-24:00
13,1,Iron,1000.0,1000.0,"07:00-07:30,15:00-15:30"
14,1,Washing Machine,500.0,1000.0,17:00-19:00
15,1,Water Heater,1000.0,4000.0,12:00-16:00
16,1,app,452.0,895.0,545454


# Home dimension

In [21]:
%%sql

DESCRIBE SolarX_WH.dim_home

col_name,data_type,comment
home_key,int,
home_id,int,
min_consumption_power_wh,float,
max_consumption_power_wh,float,
start_date,timestamp,
end_date,timestamp,
current_flag,boolean,


In [22]:
%%sql 
SELECT home_key, SUM(min_consumption_power_wh) as min_consumption_power_wh,
              SUM(max_consumption_power_wh) as max_consumption_power_wh 
       FROM SolarX_WH.dim_home_appliances
GROUP BY home_key

                                                                                

home_key,min_consumption_power_wh,max_consumption_power_wh
1,7880.0,19895.0


In [37]:
%%sql

-- SCD2, step 1: close the current dim_home row of any home whose summed
-- appliance consumption no longer matches what that row stores.
MERGE INTO SolarX_WH.dim_home dim_home
USING (
    -- Per-home totals over every appliance in the appliance dimension.
    SELECT 
        home_key, 
        SUM(min_consumption_power_wh) AS min_consumption_power_wh,
        SUM(max_consumption_power_wh) AS max_consumption_power_wh 
    FROM SolarX_WH.dim_home_appliances
    GROUP BY home_key
) dim_app
-- Only the row flagged current is a candidate for closing.
ON dim_home.home_id = dim_app.home_key AND dim_home.current_flag = TRUE

WHEN MATCHED AND (
    dim_home.max_consumption_power_wh != dim_app.max_consumption_power_wh OR
    dim_home.min_consumption_power_wh != dim_app.min_consumption_power_wh
) THEN UPDATE SET 
    -- The stored totals stay as they were; the row is only end-dated and un-flagged.
    dim_home.end_date = NOW(),
    dim_home.current_flag = FALSE;

In [38]:
%%sql

-- SCD2, step 2: open a new current row for every home that has no row flagged
-- current in dim_home (a home seen for the first time, or one whose current
-- row has been closed).
MERGE INTO SolarX_WH.dim_home dim_home
USING (
    -- Per-home totals over every appliance in the appliance dimension.
    SELECT 
        home_key, 
        SUM(min_consumption_power_wh) AS min_consumption_power_wh,
        SUM(max_consumption_power_wh) AS max_consumption_power_wh 
    FROM SolarX_WH.dim_home_appliances
    GROUP BY home_key
) dim_app
ON dim_home.home_id = dim_app.home_key AND dim_home.current_flag = TRUE

WHEN NOT MATCHED THEN 
INSERT (
    home_key,
    home_id,
    min_consumption_power_wh, 
    max_consumption_power_wh,
    start_date,
    end_date,
    current_flag
) VALUES (
    -- Surrogate key: number of rows already in dim_home, plus one.
    -- NOTE(review): the subquery does not depend on the source row, so homes
    -- inserted by the same MERGE would presumably all get the same key —
    -- confirm before loading more than one home at a time.
    (SELECT COUNT(*) FROM SolarX_WH.dim_home) + 1,
    -- Natural id of the home: the same source column the ON clause compares
    -- against dim_home.home_id, rather than a fixed literal.
    dim_app.home_key,
    dim_app.min_consumption_power_wh,
    dim_app.max_consumption_power_wh,
    NOW(),
    NULL,   -- open-ended: this is the current version
    TRUE
);

In [25]:
%%sql

SELECT * FROM SolarX_WH.dim_home LIMIT 10

home_key,home_id,min_consumption_power_wh,max_consumption_power_wh,start_date,end_date,current_flag
1,1,7880.0,19895.0,2025-01-05 14:02:41.682903,,True


### test scd2
I inserted an extra appliance, which changed the total consumption_power_wh.

In [36]:
%%sql 
SELECT home_key, SUM(min_consumption_power_wh) as min_consumption_power_wh,
              SUM(max_consumption_power_wh) as max_consumption_power_wh 
       FROM SolarX_WH.dim_home_appliances
GROUP BY home_key

home_key,min_consumption_power_wh,max_consumption_power_wh
1,8332.0,20790.0


In [39]:
%%sql

SELECT * FROM SolarX_WH.dim_home LIMIT 10

home_key,home_id,min_consumption_power_wh,max_consumption_power_wh,start_date,end_date,current_flag
1,1,7880.0,19895.0,2025-01-05 14:02:41.682903,2025-01-05 14:05:11.044428,False
2,1,8332.0,20790.0,2025-01-05 14:05:12.263195,,True


In [40]:
%%sql

CALL demo.system.create_changelog_view(
    table => 'SolarX_WH.dim_home',
    changelog_view => 'dim_home_clv',
    identifier_columns => array('home_id')
)

changelog_view
dim_home_clv


In [41]:
%%sql
    
SELECT *
FROM dim_home_clv WHERE home_id = 1
ORDER BY _change_ordinal, _change_type DESC

home_key,home_id,min_consumption_power_wh,max_consumption_power_wh,start_date,end_date,current_flag,_change_type,_change_ordinal,_commit_snapshot_id
1,1,7880.0,19895.0,2025-01-05 14:02:41.682903,,True,INSERT,1,8836579956006501925
1,1,7880.0,19895.0,2025-01-05 14:02:41.682903,,True,UPDATE_BEFORE,2,4097282869843868147
1,1,7880.0,19895.0,2025-01-05 14:02:41.682903,2025-01-05 14:05:11.044428,False,UPDATE_AFTER,2,4097282869843868147
2,1,8332.0,20790.0,2025-01-05 14:05:12.263195,,True,INSERT,3,6664356424762371718


# Home fact

## Investigate home raw data

In [79]:
%%sql

DESCRIBE SolarX_Raw_Transactions.home_power_readings

col_name,data_type,comment
timestamp,timestamp,
15_minutes_interval,int,
min_consumption_wh,float,
max_consumption_wh,float,
,,
# Partitioning,,
Part 0,days(timestamp),
Part 1,15_minutes_interval,


In [38]:
%%sql

SELECT * FROM SolarX_Raw_Transactions.home_power_readings LIMIT 5

                                                                                

timestamp,15_minutes_interval,min_consumption_wh,max_consumption_wh
2013-01-02 01:15:00,1,0.0012557294685393,0.0048566237092018
2013-01-02 01:15:00.005000,1,0.0012557295849546,0.0048566227778792
2013-01-02 01:15:00.010000,1,0.00125572970137,0.0048566223122179
2013-01-02 01:15:00.015000,1,0.0012557298177853,0.0048566218465566
2013-01-02 01:15:00.020000,1,0.0012557299342006,0.0048566213808953


In [54]:
%%sql

SELECT 
    timestamp,
    DATE_TRUNC('minute', timestamp) AS truncated_timestamp,
    DATE(timestamp) as date,
    15_minutes_interval,
    min_consumption_wh,
    max_consumption_wh
FROM 
    SolarX_Raw_Transactions.home_power_readings 
LIMIT 10


timestamp,truncated_timestamp,date,15_minutes_interval,min_consumption_wh,max_consumption_wh
2013-01-02 01:15:00,2013-01-02 01:15:00,2013-01-02,1,0.0012557294685393,0.0048566237092018
2013-01-02 01:15:00.005000,2013-01-02 01:15:00,2013-01-02,1,0.0012557295849546,0.0048566227778792
2013-01-02 01:15:00.010000,2013-01-02 01:15:00,2013-01-02,1,0.00125572970137,0.0048566223122179
2013-01-02 01:15:00.015000,2013-01-02 01:15:00,2013-01-02,1,0.0012557298177853,0.0048566218465566
2013-01-02 01:15:00.020000,2013-01-02 01:15:00,2013-01-02,1,0.0012557299342006,0.0048566213808953
2013-01-02 01:15:00.025000,2013-01-02 01:15:00,2013-01-02,1,0.0012557300506159,0.004856620915234
2013-01-02 01:15:00.030000,2013-01-02 01:15:00,2013-01-02,1,0.0012557301670312,0.0048566199839115
2013-01-02 01:15:00.035000,2013-01-02 01:15:00,2013-01-02,1,0.0012557302834466,0.0048566195182502
2013-01-02 01:15:00.040000,2013-01-02 01:15:00,2013-01-02,1,0.0012557303998619,0.0048566190525889
2013-01-02 01:15:00.045000,2013-01-02 01:15:00,2013-01-02,1,0.0012557305162772,0.0048566185869276


In [40]:
%%sql

-- Preview: total min/max consumption per calendar date and 15-minute slot,
-- with an id built as '<date>--<slot>'.
SELECT
    CONCAT(DATE(timestamp), '--', 15_minutes_interval) AS home_power_readings_id,
    DATE(timestamp) AS date,
    15_minutes_interval,
    SUM(min_consumption_wh) AS min_consumption_power_wh,
    SUM(max_consumption_wh) AS max_consumption_power_wh
FROM
    SolarX_Raw_Transactions.home_power_readings
WHERE
    -- Day-of-month filter: keeps readings taken on the 1st of any month.
    DAY(timestamp) = 1
GROUP BY
    DATE(timestamp), 15_minutes_interval
-- ORDER BY sorts the whole result before LIMIT is applied; SORT BY would only
-- order rows inside each partition, so the first 10 rows were not guaranteed
-- to be the 10 lowest slots.
ORDER BY
    15_minutes_interval
LIMIT 10

                                                                                

home_power_readings_id,date,15_minutes_interval,min_consumption_power_wh,max_consumption_power_wh
2013-01-01--0,2013-01-01,0,210.4949002956273,1234.37793413084
2013-01-01--1,2013-01-01,1,210.90460421564055,1195.791982460767
2013-01-01--2,2013-01-01,2,211.31430813518816,1157.2060307911595
2013-01-01--3,2013-01-01,3,211.72401205578356,1118.620079122018
2013-01-01--4,2013-01-01,4,206.7439376907423,1116.9222173416056
2013-01-01--5,2013-01-01,5,196.37402515218128,1152.1128553207964
2013-01-01--6,2013-01-01,6,186.00411261827685,1187.3034932990558
2013-01-01--7,2013-01-01,7,175.6342000772711,1222.494131277781
2013-01-01--8,2013-01-01,8,175.57182770763757,1223.2657041028142
2013-01-01--9,2013-01-01,9,185.8171100311447,1189.6178293367848


In [41]:
%%sql

-- Preview: total min/max consumption per minute.
SELECT
    DATE_TRUNC('minute', timestamp) AS truncated_timestamp,
    DATE(timestamp) AS date,
    SUM(min_consumption_wh) AS min_consumption_power_wh,
    SUM(max_consumption_wh) AS max_consumption_power_wh
FROM
    SolarX_Raw_Transactions.home_power_readings
WHERE
    -- Day-of-month filter: keeps readings taken on the 1st of any month.
    DAY(timestamp) = 1
GROUP BY
    DATE(timestamp), DATE_TRUNC('minute', timestamp)
-- ORDER BY sorts the whole result before LIMIT is applied; SORT BY would only
-- order rows inside each partition.
ORDER BY
    DATE_TRUNC('minute', timestamp)
LIMIT 10

                                                                                

truncated_timestamp,date,min_consumption_power_wh,max_consumption_power_wh
2013-01-01 01:00:00,2013-01-01,14.02024700876791,83.49231410492212
2013-01-01 01:01:00,2013-01-01,14.02206791518256,83.32082098629326
2013-01-01 01:02:00,2013-01-01,14.02388882136438,83.1493278676644
2013-01-01 01:03:00,2013-01-01,14.025709727895446,82.97783474903554
2013-01-01 01:04:00,2013-01-01,14.027530634077266,82.806341631338
2013-01-01 01:05:00,2013-01-01,14.029351540491916,82.63484851270914
2013-01-01 01:06:00,2013-01-01,14.031172446790151,82.46335539454594
2013-01-01 01:07:00,2013-01-01,14.032993352971973,82.29186227545142
2013-01-01 01:08:00,2013-01-01,14.03481425938662,82.1203691563569
2013-01-01 01:09:00,2013-01-01,14.036635165684856,81.9488760381937


In [86]:
%%sql

-- Preview: total min/max consumption per 15-minute bucket, keyed by the bucket's
-- start timestamp.
SELECT
     -- Bucket start: rebuild 'yyyy-MM-dd HH:mm:00' with the minute floored to a
     -- multiple of 15, then cast the string back to TIMESTAMP.
     CAST(CONCAT(
        YEAR(timestamp), '-', 
        LPAD(MONTH(timestamp), 2, '0'), '-', 
        LPAD(DAY(timestamp), 2, '0'), ' ',
        LPAD(HOUR(timestamp), 2, '0'), ':',
        LPAD(FLOOR(MINUTE(timestamp) / 15) * 15, 2, '0'), ':00'
    ) AS TIMESTAMP) AS home_power_reading_key,
    DATE(timestamp) AS date,
    15_minutes_interval,
    SUM(min_consumption_wh) AS min_consumption_power_wh,
    SUM(max_consumption_wh) AS max_consumption_power_wh
FROM 
    SolarX_Raw_Transactions.home_power_readings
WHERE 
    -- Day-of-month filter: keeps readings taken on the 1st of any month.
    DAY(timestamp) = 1
GROUP BY 
    -- home_power_reading_key refers to the alias defined in the SELECT list.
    15_minutes_interval, home_power_reading_key, DATE(timestamp)
ORDER BY 
    home_power_reading_key
LIMIT 10

                                                                                

home_power_reading_key,date,15_minutes_interval,min_consumption_power_wh,max_consumption_power_wh
2013-01-01 01:00:00,2013-01-01,0,210.4949002956273,1234.37793413084
2013-01-01 01:15:00,2013-01-01,1,210.90460421564055,1195.791982460767
2013-01-01 01:30:00,2013-01-01,2,211.31430813518816,1157.2060307911595
2013-01-01 01:45:00,2013-01-01,3,211.72401205578356,1118.620079122018
2013-01-01 02:00:00,2013-01-01,4,206.7439376907423,1116.9222173416056
2013-01-01 02:15:00,2013-01-01,5,196.37402515218128,1152.1128553207964
2013-01-01 02:30:00,2013-01-01,6,186.00411261827685,1187.3034932990558
2013-01-01 02:45:00,2013-01-01,7,175.6342000772711,1222.494131277781
2013-01-01 03:00:00,2013-01-01,8,175.57182770763757,1223.2657041028142
2013-01-01 03:15:00,2013-01-01,9,185.8171100311447,1189.6178293367848


In [16]:
%%sql
SELECT * FROM SolarX_WH.dim_home
WHERE dim_home.current_flag = TRUE

home_key,min_consumption_power_wh,max_consumption_power_wh,min_consumption_power_wh_start_date,min_consumption_power_wh_end_date,max_consumption_power_wh_start_date,max_consumption_power_wh_end_date,current_flag
1,7880.0,19895.0,2025-01-04 03:36:36.486163,,2025-01-04 03:36:36.486163,,True


### Insert day 1 data

In [85]:
%%sql

DESCRIBE SolarX_WH.fact_home_power_readings

col_name,data_type,comment
home_power_reading_key,timestamp,
home_key,int,
date_key,timestamp,
min_consumption_power_wh,float,
max_consumption_power_wh,float,
,,
# Partitioning,,
Part 0,months(date_key),


In [18]:
%%sql

-- Aggregate the raw home readings into 15-minute buckets and insert the buckets
-- that are not yet present in the fact table.
WITH staging_table AS (
    SELECT
         -- Bucket start: rebuild 'yyyy-MM-dd HH:mm:00' with the minute floored
         -- to a multiple of 15, then cast the string back to TIMESTAMP.
         CAST(CONCAT(
            YEAR(timestamp), '-', 
            LPAD(MONTH(timestamp), 2, '0'), '-', 
            LPAD(DAY(timestamp), 2, '0'), ' ',
            LPAD(HOUR(timestamp), 2, '0'), ':',
            LPAD(FLOOR(MINUTE(timestamp) / 15) * 15, 2, '0'), ':00'
        ) AS TIMESTAMP) AS home_power_reading_key,
        DATE(timestamp) AS date,
        15_minutes_interval,
        SUM(min_consumption_wh) AS min_consumption_power_wh,
        SUM(max_consumption_wh) AS max_consumption_power_wh
    FROM 
        SolarX_Raw_Transactions.home_power_readings
    WHERE 
        -- Day-of-month filter: keeps readings taken on the 1st of any month.
        DAY(timestamp) = 1
    GROUP BY 
        15_minutes_interval, home_power_reading_key, DATE(timestamp)
)


    
MERGE INTO SolarX_WH.fact_home_power_readings AS target
USING staging_table AS source
ON target.home_power_reading_key = source.home_power_reading_key
      
-- Insert-only: there is no WHEN MATCHED branch, so buckets already in the fact
-- table are left unchanged.
WHEN NOT MATCHED THEN
    INSERT (home_power_reading_key, 
            home_key, 
            date_key, 
            min_consumption_power_wh,
            max_consumption_power_wh
    
    ) 
    VALUES (source.home_power_reading_key,
            -- home_key of the dim_home row flagged current when this cell runs;
            -- the reading's own timestamp plays no part in the lookup.
            -- NOTE(review): used as a single value, so this presumably relies on
            -- exactly one current row existing in dim_home — confirm.
            (SELECT home_key FROM SolarX_WH.dim_home WHERE dim_home.current_flag = TRUE), 
            -- date_key reuses the bucket start timestamp.
            source.home_power_reading_key,
            source.min_consumption_power_wh,
            source.max_consumption_power_wh     
    );

                                                                                

In [112]:
%%sql
SELECT * FROM SolarX_WH.fact_home_power_readings
LIMIT 10

home_power_reading_key,home_key,date_key,min_consumption_power_wh,max_consumption_power_wh
2013-01-01 03:30:00,1,2013-01-01 03:30:00,196.06239318847656,1155.969970703125
2013-01-01 10:30:00,1,2013-01-01 10:30:00,199.0711975097656,977.2131958007812
2013-01-01 02:15:00,1,2013-01-01 02:15:00,196.3740234375,1152.1129150390625
2013-01-01 17:45:00,1,2013-01-01 17:45:00,281.0512390136719,1090.866943359375
2013-01-01 01:45:00,1,2013-01-01 01:45:00,211.72401428222656,1118.6201171875
2013-01-01 17:00:00,1,2013-01-01 17:00:00,287.3873596191406,1215.2205810546875
2013-01-01 16:45:00,1,2013-01-01 16:45:00,352.7113342285156,1371.6483154296875
2013-01-01 07:45:00,1,2013-01-01 07:45:00,254.65789794921875,1092.5604248046875
2013-01-01 07:30:00,1,2013-01-01 07:30:00,401.4898681640625,1292.362548828125
2013-01-01 15:30:00,1,2013-01-01 15:30:00,752.6585693359375,2330.212646484375


### Insert day 2 data after changing the scd2 in the home dimension

In [113]:
%%sql

-- Aggregate the raw home readings into 15-minute buckets and insert the buckets
-- that are not yet present in the fact table.
WITH staging_table AS (
    SELECT
         -- Bucket start: rebuild 'yyyy-MM-dd HH:mm:00' with the minute floored
         -- to a multiple of 15, then cast the string back to TIMESTAMP.
         CAST(CONCAT(
            YEAR(timestamp), '-', 
            LPAD(MONTH(timestamp), 2, '0'), '-', 
            LPAD(DAY(timestamp), 2, '0'), ' ',
            LPAD(HOUR(timestamp), 2, '0'), ':',
            LPAD(FLOOR(MINUTE(timestamp) / 15) * 15, 2, '0'), ':00'
        ) AS TIMESTAMP) AS home_power_reading_key,
        DATE(timestamp) AS date,
        15_minutes_interval,
        SUM(min_consumption_wh) AS min_consumption_power_wh,
        SUM(max_consumption_wh) AS max_consumption_power_wh
    FROM 
        SolarX_Raw_Transactions.home_power_readings
    WHERE 
        -- Day-of-month filter: keeps readings taken on the 2nd of any month.
        DAY(timestamp) = 2
    GROUP BY 
        15_minutes_interval, home_power_reading_key, DATE(timestamp)
)


    
MERGE INTO SolarX_WH.fact_home_power_readings AS target
USING staging_table AS source
ON target.home_power_reading_key = source.home_power_reading_key
      
-- Insert-only: there is no WHEN MATCHED branch, so buckets already in the fact
-- table are left unchanged.
WHEN NOT MATCHED THEN
    INSERT (home_power_reading_key, 
            home_key, 
            date_key, 
            min_consumption_power_wh,
            max_consumption_power_wh
    
    ) 
    VALUES (source.home_power_reading_key,
            -- home_key of the dim_home row flagged current when this cell runs;
            -- the reading's own timestamp plays no part in the lookup.
            -- NOTE(review): used as a single value, so this presumably relies on
            -- exactly one current row existing in dim_home — confirm.
            (SELECT home_key FROM SolarX_WH.dim_home WHERE dim_home.current_flag = TRUE), 
            -- date_key reuses the bucket start timestamp.
            source.home_power_reading_key,
            source.min_consumption_power_wh,
            source.max_consumption_power_wh     
    );

                                                                                

In [121]:
%%sql
SELECT * FROM SolarX_WH.fact_home_power_readings
WHERE home_key = 1
LIMIT 10

home_power_reading_key,home_key,date_key,min_consumption_power_wh,max_consumption_power_wh
2013-01-01 03:30:00,1,2013-01-01 03:30:00,196.06239318847656,1155.969970703125
2013-01-01 10:30:00,1,2013-01-01 10:30:00,199.0711975097656,977.2131958007812
2013-01-01 02:15:00,1,2013-01-01 02:15:00,196.3740234375,1152.1129150390625
2013-01-01 17:45:00,1,2013-01-01 17:45:00,281.0512390136719,1090.866943359375
2013-01-01 01:45:00,1,2013-01-01 01:45:00,211.72401428222656,1118.6201171875
2013-01-01 17:00:00,1,2013-01-01 17:00:00,287.3873596191406,1215.2205810546875
2013-01-01 16:45:00,1,2013-01-01 16:45:00,352.7113342285156,1371.6483154296875
2013-01-01 07:45:00,1,2013-01-01 07:45:00,254.65789794921875,1092.5604248046875
2013-01-01 07:30:00,1,2013-01-01 07:30:00,401.4898681640625,1292.362548828125
2013-01-01 15:30:00,1,2013-01-01 15:30:00,752.6585693359375,2330.212646484375


In [119]:
%%sql
SELECT COUNT(*) FROM SolarX_WH.fact_home_power_readings
WHERE DAY(date_key) = 1

count(1)
92


In [122]:
%%sql
SELECT * FROM SolarX_WH.fact_home_power_readings
WHERE home_key = 2
LIMIT 10

home_power_reading_key,home_key,date_key,min_consumption_power_wh,max_consumption_power_wh
2013-01-02 01:15:00,2,2013-01-02 01:15:00,227.9564666748047,865.0037231445312
2013-01-02 17:00:00,2,2013-01-02 17:00:00,302.9609375,999.7213745117188
2013-01-02 08:30:00,2,2013-01-02 08:30:00,186.36361694335935,867.9696044921875
2013-01-02 16:15:00,2,2013-01-02 16:15:00,640.4440307617188,1595.903564453125
2013-01-02 22:15:00,2,2013-01-02 22:15:00,194.01031494140625,1192.3475341796875
2013-01-02 05:45:00,2,2013-01-02 05:45:00,185.2525177001953,963.6498413085938
2013-01-02 20:30:00,2,2013-01-02 20:30:00,649.01611328125,1902.3560791015625
2013-01-02 20:45:00,2,2013-01-02 20:45:00,840.80859375,2255.948486328125
2013-01-02 12:30:00,2,2013-01-02 12:30:00,501.8223876953125,1733.60009765625
2013-01-02 19:15:00,2,2013-01-02 19:15:00,184.49920654296875,1024.291015625


In [120]:
%%sql
SELECT COUNT(*) FROM SolarX_WH.fact_home_power_readings
WHERE DAY(date_key) = 2

count(1)
92


# Solar Panel Dimension

In [109]:
%%sql

SELECT * FROM SolarX_Raw_Transactions.solar_panel

id,name,capacity_kwh,intensity_power_rating,temperature_power_rating
1,roof panel,3.0,1000.0,25.0
3,flush panel,10.0,1500.0,25.0
2,pole panel,6.0,1300.0,25.0


In [110]:
%%sql

SELECT 
    CAST(CONCAT(solar_panel.id, date_format(NOW(), 'yyyyMMdd')) AS INT) as key,
    solar_panel.id,
    solar_panel.name,
    solar_panel.capacity_kwh,
    solar_panel.intensity_power_rating,
    solar_panel.temperature_power_rating
FROM SolarX_Raw_Transactions.solar_panel solar_panel

key,id,name,capacity_kwh,intensity_power_rating,temperature_power_rating
220250104,2,pole panel,6.0,1300.0,25.0
320250104,3,flush panel,10.0,1500.0,25.0
120250104,1,roof panel,3.0,1000.0,25.0


In [117]:
%%sql

-- SCD2, step 1: close the current dim_solar_panel row of any panel whose
-- capacity or power ratings differ from the raw solar_panel table.
MERGE INTO SolarX_WH.dim_solar_panel dim_solar_panel
USING SolarX_Raw_Transactions.solar_panel solar_panel_raw
-- Only the row flagged current is a candidate for closing.
ON dim_solar_panel.solar_panel_id = solar_panel_raw.id AND dim_solar_panel.current_flag = TRUE

-- NOTE(review): `name` is not part of the change check, so a rename alone does
-- not close the row — confirm this is intended.
WHEN MATCHED AND (
    dim_solar_panel.capacity_kwh != solar_panel_raw.capacity_kwh OR
    dim_solar_panel.intensity_power_rating_wh != solar_panel_raw.intensity_power_rating OR
    dim_solar_panel.temperature_power_rating_c != solar_panel_raw.temperature_power_rating
) THEN UPDATE SET
    -- The stored attributes stay as they were; the row is only end-dated and un-flagged.
    dim_solar_panel.end_date   = NOW(),
    dim_solar_panel.current_flag = FALSE;


In [118]:
%%sql

-- SCD2, step 2: open a new current row for every raw panel that has no row
-- flagged current in dim_solar_panel.
MERGE INTO SolarX_WH.dim_solar_panel dim_solar_panel
USING SolarX_Raw_Transactions.solar_panel solar_panel_raw
ON dim_solar_panel.solar_panel_id = solar_panel_raw.id AND dim_solar_panel.current_flag = TRUE

WHEN NOT MATCHED THEN 
INSERT (
    solar_panel_key,
    solar_panel_id,
    name, 
    capacity_kwh,
    intensity_power_rating_wh,
    temperature_power_rating_c,
    start_date,
    end_date,
    current_flag
) VALUES (
    -- Surrogate key: the panel id followed by today's date as yyyyMMdd, read as an integer
    -- (e.g. id 2 on 2025-01-04 -> 220250104).
    -- NOTE(review): two versions of one panel opened on the same day would get the
    -- same key, and an id of 22 or more pushes the number past the 32-bit INT
    -- maximum (2147483647) — confirm both cases are acceptable.
    CAST(CONCAT(solar_panel_raw.id, date_format(NOW(), 'yyyyMMdd')) AS INT),
    solar_panel_raw.id,
    solar_panel_raw.name,
    solar_panel_raw.capacity_kwh,
    solar_panel_raw.intensity_power_rating,
    solar_panel_raw.temperature_power_rating,
    NOW(),
    -- end_date stays NULL while the row is the current version.
    NULL,
    TRUE
);

In [113]:
%%sql

SELECT * FROM SolarX_WH.dim_solar_panel

solar_panel_key,solar_panel_id,name,capacity_kwh,intensity_power_rating_wh,temperature_power_rating_c,start_date,end_date,current_flag
220250104,2,pole panel,6.0,1300.0,25.0,2025-01-05 13:44:36.833911,,True
320250104,3,flush panel,10.0,1500.0,25.0,2025-01-05 13:44:36.833911,,True
120250104,1,roof panel,3.0,1000.0,25.0,2025-01-05 13:44:36.833911,,True


### test scd2

In [114]:
%%sql

UPDATE SolarX_Raw_Transactions.solar_panel solar_panel_raw
SET capacity_kwh = 4
WHERE id = 1

In [115]:
%%sql

UPDATE SolarX_Raw_Transactions.solar_panel solar_panel_raw
SET capacity_kwh = 12
WHERE id = 3

In [116]:
%%sql

SELECT * FROM SolarX_Raw_Transactions.solar_panel

id,name,capacity_kwh,intensity_power_rating,temperature_power_rating
2,pole panel,6.0,1300.0,25.0
3,flush panel,12.0,1500.0,25.0
1,roof panel,4.0,1000.0,25.0


In [119]:
%%sql

SELECT * FROM SolarX_WH.dim_solar_panel

solar_panel_key,solar_panel_id,name,capacity_kwh,intensity_power_rating_wh,temperature_power_rating_c,start_date,end_date,current_flag
220250104,2,pole panel,6.0,1300.0,25.0,2025-01-05 13:44:36.833911,,True
320250104,3,flush panel,10.0,1500.0,25.0,2025-01-05 13:44:36.833911,2025-01-05 13:45:07.557670,False
120250104,1,roof panel,3.0,1000.0,25.0,2025-01-05 13:44:36.833911,2025-01-05 13:45:07.557670,False
120250105,1,roof panel,4.0,1000.0,25.0,2025-01-05 13:45:08.443905,,True
320250105,3,flush panel,12.0,1500.0,25.0,2025-01-05 13:45:08.443905,,True


# Solar Panel Fact

In [130]:
%%sql

DROP TABLE IF EXISTS SolarX_WH.fact_solar_panel_power_readings PURGE

In [131]:
%%sql

-- Fact table: generated power per solar panel and 15-minute slot.
CREATE TABLE SolarX_WH.fact_solar_panel_power_readings(
    -- INT rather than SMALLINT: the keys stored in dim_solar_panel are values such
    -- as 220250104, far outside the SMALLINT range.
    solar_panel_key                 INT           NOT NULL,   -- REFERENCES dim_solar_panel(solar_panel_key)
    date_key                        TIMESTAMP     NOT NULL,   -- REFERENCES dim_date(date_key)

    solar_panel_id                  INT           NOT NULL,
    generation_power_wh             FLOAT         NOT NULL
)

USING iceberg
-- Partitioned by the month of date_key and by panel.
PARTITIONED BY (MONTH(date_key), solar_panel_id)

In [115]:
%%sql

DESCRIBE SolarX_WH.fact_solar_panel_power_readings

col_name,data_type,comment
solar_panel_key,int,
date_key,timestamp,
solar_panel_id,int,
generation_power_wh,float,
,,
# Partitioning,,
Part 0,months(date_key),
Part 1,solar_panel_id,


In [116]:
%%sql

SELECT * FROM SolarX_Raw_Transactions.solar_panel_readings LIMIT 10

timestamp,15_minutes_interval,panel_id,generation_power_wh
2013-01-01 09:11:16.300000,32,1,0.0059584765695035
2013-01-01 09:11:16.305000,32,1,0.005958529189229
2013-01-01 09:11:16.310000,32,1,0.0059585813432931
2013-01-01 09:11:16.315000,32,1,0.0059586325660347
2013-01-01 09:11:16.320000,32,1,0.0059586851857602
2013-01-01 09:11:16.325000,32,1,0.0059587373398244
2013-01-01 09:11:16.330000,32,1,0.0059587899595499
2013-01-01 09:11:16.335000,32,1,0.0059588411822915
2013-01-01 09:11:16.340000,32,1,0.0059588933363556
2013-01-01 09:11:16.345000,32,1,0.0059589459560811


In [117]:
%%sql


SELECT
    panel_id,
    CAST(CONCAT(
        YEAR(timestamp), '-', 
        LPAD(MONTH(timestamp), 2, '0'), '-', 
        LPAD(DAY(timestamp), 2, '0'), ' ',
        LPAD(HOUR(timestamp), 2, '0'), ':',
        LPAD(FLOOR(MINUTE(timestamp) / 15) * 15, 2, '0'), ':00'
    ) AS TIMESTAMP) AS truncated_timestamp,
    DATE(timestamp) AS date,
    15_minutes_interval,
    SUM(generation_power_wh) AS generation_power_wh
FROM 
    SolarX_Raw_Transactions.solar_panel_readings
WHERE 
    DAY(timestamp) = 1
GROUP BY 
    panel_id, 15_minutes_interval, truncated_timestamp, DATE(timestamp)
LIMIT 15

                                                                                

panel_id,truncated_timestamp,date,15_minutes_interval,generation_power_wh
1,2013-01-01 20:30:00,2013-01-01,78,0.0
1,2013-01-01 04:45:00,2013-01-01,15,0.0
3,2013-01-01 11:00:00,2013-01-01,40,2040.5081551698968
3,2013-01-01 19:00:00,2013-01-01,72,0.0
1,2013-01-01 10:45:00,2013-01-01,39,1033.8308916147798
3,2013-01-01 08:30:00,2013-01-01,30,1021.4276418288064
3,2013-01-01 17:15:00,2013-01-01,65,458.7722477841271
1,2013-01-01 03:45:00,2013-01-01,11,0.0
1,2013-01-01 15:45:00,2013-01-01,59,778.789446229348
1,2013-01-01 07:00:00,2013-01-01,24,0.0


In [119]:
%%sql

-- Preview: attach the current dim_solar_panel key to a sample of aggregated
-- panel readings.
SELECT solar_panel_readings.*,
       dim_solar_panel.solar_panel_key
FROM (
    SELECT
        panel_id,
        -- Bucket start: rebuild 'yyyy-MM-dd HH:mm:00' with the minute floored
        -- to a multiple of 15, then cast the string back to TIMESTAMP.
        CAST(CONCAT(
            YEAR(timestamp), '-', 
            LPAD(MONTH(timestamp), 2, '0'), '-', 
            LPAD(DAY(timestamp), 2, '0'), ' ',
            LPAD(HOUR(timestamp), 2, '0'), ':',
            LPAD(FLOOR(MINUTE(timestamp) / 15) * 15, 2, '0'), ':00'
        ) AS TIMESTAMP) AS truncated_timestamp,
        DATE(timestamp) AS date,
        15_minutes_interval,
        SUM(generation_power_wh) AS generation_power_wh
    FROM 
        SolarX_Raw_Transactions.solar_panel_readings
    WHERE 
        -- Day-of-month filter: keeps readings taken on the 1st of any month.
        DAY(timestamp) = 1
    GROUP BY 
        panel_id, 15_minutes_interval, truncated_timestamp, DATE(timestamp)
    -- The sample is cut before the join and has no ORDER BY, so which 15 rows
    -- come back is not fixed.
    LIMIT 15
) AS solar_panel_readings

-- LEFT JOIN keeps readings whose panel has no current dimension row
-- (solar_panel_key is NULL for those).
LEFT JOIN SolarX_WH.dim_solar_panel dim_solar_panel
ON solar_panel_readings.panel_id = dim_solar_panel.solar_panel_id AND dim_solar_panel.current_flag = TRUE


                                                                                

panel_id,truncated_timestamp,date,15_minutes_interval,generation_power_wh,solar_panel_key
2,2013-01-01 05:30:00,2013-01-01,18,0.0,220250104
2,2013-01-01 12:00:00,2013-01-01,44,1147.125088701956,220250104
2,2013-01-01 19:15:00,2013-01-01,73,0.0,220250104
2,2013-01-01 11:00:00,2013-01-01,40,1177.216243297793,220250104
2,2013-01-01 16:00:00,2013-01-01,60,844.1341747590341,220250104
2,2013-01-01 16:15:00,2013-01-01,61,822.3532354556955,220250104
2,2013-01-01 23:00:00,2013-01-01,88,0.0,220250104
2,2013-01-01 01:45:00,2013-01-01,3,0.0,220250104
3,2013-01-01 03:30:00,2013-01-01,10,0.0,320250105
2,2013-01-01 07:30:00,2013-01-01,26,0.0,220250104


### Leverage a broadcast join to avoid shuffling latency

In [121]:
# Spark SQL text that sums the raw panel readings per panel and 15-minute bucket.
# truncated_timestamp is the bucket start: the reading's timestamp with the minute
# floored to a multiple of 15.  Only readings whose day-of-month is 1 are included.
staging_query = """
SELECT
    panel_id,
    CAST(CONCAT(
        YEAR(timestamp), '-', 
        LPAD(MONTH(timestamp), 2, '0'), '-', 
        LPAD(DAY(timestamp), 2, '0'), ' ',
        LPAD(HOUR(timestamp), 2, '0'), ':',
        LPAD(FLOOR(MINUTE(timestamp) / 15) * 15, 2, '0'), ':00'
    ) AS TIMESTAMP) AS truncated_timestamp,
    DATE(timestamp) AS date,
    15_minutes_interval,
    SUM(generation_power_wh) AS generation_power_wh
FROM 
    SolarX_Raw_Transactions.solar_panel_readings
WHERE 
    DAY(timestamp) = 1
GROUP BY 
    panel_id, 15_minutes_interval, truncated_timestamp, DATE(timestamp)
"""

In [122]:
# Spark SQL text that selects the key/id pair of every dim_solar_panel row
# flagged as current.
dim_solar_panel_current_query = """
SELECT 
    solar_panel_key,	
    solar_panel_id
FROM 
    SolarX_WH.dim_solar_panel
WHERE 
    dim_solar_panel.current_flag = TRUE
"""

In [126]:
from pyspark.sql.functions import broadcast


# Run both queries; each result is a lazily evaluated DataFrame.
staging_df = spark.sql(staging_query)
dimension_df = spark.sql(dim_solar_panel_current_query)

# Left join keeps every staged reading, matching on the panel identifier.
# The smaller dimension side is wrapped in a broadcast hint so the staging
# rows do not have to be shuffled for the join.
join_condition = staging_df.panel_id == dimension_df.solar_panel_id
joined_df = staging_df.join(broadcast(dimension_df), on=join_condition, how="left")

In [127]:
# Preview the joined rows: first 20 rows, long cell values truncated.
joined_df.show(n=20, truncate=True)

                                                                                

+--------+-------------------+----------+-------------------+-------------------+---------------+--------------+
|panel_id|truncated_timestamp|      date|15_minutes_interval|generation_power_wh|solar_panel_key|solar_panel_id|
+--------+-------------------+----------+-------------------+-------------------+---------------+--------------+
|       2|2013-01-01 01:00:00|2013-01-01|                  0|                0.0|      220250104|             2|
|       2|2013-01-01 19:15:00|2013-01-01|                 73|                0.0|      220250104|             2|
|       2|2013-01-01 11:00:00|2013-01-01|                 40|  1177.216243297793|      220250104|             2|
|       2|2013-01-01 10:45:00|2013-01-01|                 39| 1325.4242200478911|      220250104|             2|
|       2|2013-01-01 16:15:00|2013-01-01|                 61|  822.3532354556955|      220250104|             2|
|       2|2013-01-01 23:00:00|2013-01-01|                 88|                0.0|      220250104

In [129]:
# Register the joined DataFrame as a temp view so the %%sql MERGE cell can
# reference it as `staging_temp_view`.
joined_df.createOrReplaceTempView(name="staging_temp_view")

In [132]:
%%sql

-- Load staged readings into the fact table.
-- A source row is inserted only when no target row has the same panel id and
-- the same 15-minute timestamp. There is no WHEN MATCHED clause, so rows that
-- already exist in the target are left untouched.
MERGE INTO SolarX_WH.fact_solar_panel_power_readings AS target
USING staging_temp_view AS source
ON  target.solar_panel_id = source.panel_id
AND target.date_key = source.truncated_timestamp

WHEN NOT MATCHED THEN
    INSERT (
        solar_panel_key,
        date_key,
        solar_panel_id,
        generation_power_wh
    )
    VALUES (
        source.solar_panel_key,
        source.truncated_timestamp,
        source.panel_id,
        source.generation_power_wh
    );

                                                                                

In [139]:
%%sql

-- Peek at a few fact rows to sanity-check what the merge wrote.
SELECT *
FROM SolarX_WH.fact_solar_panel_power_readings
LIMIT 10

solar_panel_key,date_key,solar_panel_id,generation_power_wh
320250105,2013-01-01 16:00:00,3,1463.1658935546875
320250105,2013-01-01 05:15:00,3,0.0
320250105,2013-01-01 12:15:00,3,2019.437255859375
320250105,2013-01-01 20:00:00,3,0.0
320250105,2013-01-01 15:45:00,3,1730.6431884765625
320250105,2013-01-01 08:15:00,3,1263.3809814453125
320250105,2013-01-01 14:45:00,3,1873.4549560546875
320250105,2013-01-01 17:00:00,3,730.4469604492188
320250105,2013-01-01 23:00:00,3,0.0
320250105,2013-01-01 07:00:00,3,0.0


In [135]:
%%sql

-- Expected total: 276 = 3 panels x 92 records per panel for day 1.
SELECT COUNT(*)
FROM SolarX_WH.fact_solar_panel_power_readings

count(1)
276


In [136]:
# Stop the SparkSession now that the fact load has been verified above.
spark.stop()