In [26]:
from pyspark.sql import SparkSession

In [92]:
spark = (
    SparkSession
    .builder
    .appName("wh-etl")
    .master("spark://b0f96e8e1cf0:7077")
    .config("spark.executor.cores", 1)
    .config("spark.cores.max", 6)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

25/01/03 01:51:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [93]:
spark

In [5]:
%%sql

SHOW TABLES IN demo.SolarX_Raw_Transactions

25/01/02 22:03:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace,tableName,isTemporary
SolarX_Raw_Transactions,home_power_readings,False
SolarX_Raw_Transactions,solar_panel,False
SolarX_Raw_Transactions,solar_panel_readings,False


In [50]:
%%sql

SHOW TABLES IN demo.SolarX_WH

25/01/03 00:25:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace,tableName,isTemporary
SolarX_WH,dim_date,False
SolarX_WH,dim_home,False
SolarX_WH,dim_home_appliances,False
SolarX_WH,dim_solar_panel,False
SolarX_WH,fact_home_power_readings,False
SolarX_WH,fact_solar_panel_power_readings,False


# Date dimension

In [7]:
%%sql

DESCRIBE SolarX_WH.dim_date

col_name,data_type,comment
date_key,timestamp,
year,int,
quarter,int,
month,int,
week,int,
day,int,
hour,int,
minute,int,
is_weekend,boolean,
# Partition Information,,


In [8]:
import datetime

def generate_15min_intervals(start_time, end_time):
    current_time = start_time
    timestamps = []
    while current_time <= end_time:
        timestamps.append((current_time,))
        current_time += datetime.timedelta(minutes=15)
    return timestamps

# start date
start_time = datetime.datetime(2013, 1, 1, 0, 0, 0)

# future end date
end_time = datetime.datetime.now() + datetime.timedelta(days=365 * 1)

# generate a list of 15-minute intervals
timestamp_list = generate_15min_intervals(start_time, end_time)

In [9]:
timestamp_list[0:5]

[(datetime.datetime(2013, 1, 1, 0, 0),),
 (datetime.datetime(2013, 1, 1, 0, 15),),
 (datetime.datetime(2013, 1, 1, 0, 30),),
 (datetime.datetime(2013, 1, 1, 0, 45),),
 (datetime.datetime(2013, 1, 1, 1, 0),)]

In [10]:
# load this list into a df
from pyspark.sql import functions as F
timestamp_df = spark.createDataFrame(timestamp_list, schema="timestamp timestamp")\
                    .withColumn("15_minutes_interval", F.floor((F.hour(F.col("timestamp"))*60 + F.minute(F.col("timestamp")) - 60) / 15))

In [11]:
timestamp_df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------------------+-------------------+
|          timestamp|15_minutes_interval|
+-------------------+-------------------+
|2013-01-01 00:00:00|                 -4|
|2013-01-01 00:15:00|                 -3|
|2013-01-01 00:30:00|                 -2|
|2013-01-01 00:45:00|                 -1|
|2013-01-01 01:00:00|                  0|
|2013-01-01 01:15:00|                  1|
|2013-01-01 01:30:00|                  2|
|2013-01-01 01:45:00|                  3|
|2013-01-01 02:00:00|                  4|
|2013-01-01 02:15:00|                  5|
|2013-01-01 02:30:00|                  6|
|2013-01-01 02:45:00|                  7|
|2013-01-01 03:00:00|                  8|
|2013-01-01 03:15:00|                  9|
|2013-01-01 03:30:00|                 10|
|2013-01-01 03:45:00|                 11|
|2013-01-01 04:00:00|                 12|
|2013-01-01 04:15:00|                 13|
|2013-01-01 04:30:00|                 14|
|2013-01-01 04:45:00|                 15|
+-------------------+-------------

                                                                                

In [13]:
timestamp_df.createOrReplaceTempView("temp_view")

In [15]:
%%sql
    
INSERT INTO SolarX_WH.dim_date (date_key, year, quarter, month, week, day, hour, minute, is_weekend)
SELECT timestamp                      as date_key,
       YEAR(timestamp)                as year,
       QUARTER(timestamp)             as quarter,
       MONTH(timestamp)               as month,
       EXTRACT(WEEK FROM timestamp)   as week,
       DAY(timestamp)                 as day,
       HOUR(timestamp)                as hour,
       15_minutes_interval            as minute,
       CASE 
               WHEN WEEKDAY(timestamp) in (6,7) then true 
               ELSE FALSE
       END
       
FROM temp_view

                                                                                

In [19]:
%%sql

SELECT * FROM SolarX_WH.dim_date 
WHERE day = 1 AND year = 2013
LIMIT 10

date_key,year,quarter,month,week,day,hour,minute,is_weekend
2013-04-01 02:45:00,2013,2,4,14,1,2,7,False
2013-10-01 05:30:00,2013,4,10,40,1,5,18,False
2013-10-01 04:45:00,2013,4,10,40,1,4,15,False
2013-10-01 04:15:00,2013,4,10,40,1,4,13,False
2013-03-01 11:45:00,2013,1,3,9,1,11,43,False
2013-03-01 15:15:00,2013,1,3,9,1,15,57,False
2013-09-01 17:45:00,2013,3,9,35,1,17,67,True
2013-03-01 14:30:00,2013,1,3,9,1,14,54,False
2013-10-01 07:00:00,2013,4,10,40,1,7,24,False
2013-03-01 14:15:00,2013,1,3,9,1,14,53,False


In [23]:
%%sql

SELECT DISTINCT(year) FROM SolarX_WH.dim_date 

                                                                                

year
2025
2018
2015
2023
2022
2013
2014
2019
2020
2016


In [24]:
%%sql

SELECT COUNT(year) FROM SolarX_WH.dim_date 

count(year)
455993


# Home appliances dimension

In [None]:
%%sql

DROP TABLE IF EXISTS SolarX_WH.dim_home_appliances PURGE

In [55]:
%%sql

DESCRIBE SolarX_WH.dim_home_appliances

col_name,data_type,comment
home_appliance_key,int,
home_key,int,
appliance,string,
min_consumption_power_wh,float,
max_consumption_power_wh,float,
usage_time,string,


In [144]:
import json
import pandas as pd

def get_home_appliances_df():
    with open('/home/iceberg/warehouse/home_appliances_consumption.json') as f:
        HOME_USAGE_POWER = json.load(f)
        HOME_USAGE_POWER.items()
        
        df = pd.DataFrame([
                {
                    "home_key" : 1,
                    "name": name,
                    "min_consumption_rating": info["consumption"][0],
                    "max_consumption_rating": info["consumption"][1],
                    "usage_time": info["time"]
                }
                for name, info in HOME_USAGE_POWER.items()
            ])
        df.loc[len(df)] = [1, "app", 452, 895, "545454"]
        df.index += 1 
        df.index.name = 'home_appliance_key'
    return df.reset_index()

In [145]:
home_appliances_df = spark.createDataFrame(get_home_appliances_df())

In [146]:
home_appliances_df.show()

+------------------+--------+--------------------+----------------------+----------------------+--------------------+
|home_appliance_key|home_key|                name|min_consumption_rating|max_consumption_rating|          usage_time|
+------------------+--------+--------------------+----------------------+----------------------+--------------------+
|                 1|       1|        Refrigerator|                   300|                  1500|         00:00-24:00|
|                 2|       1|       Electric Oven|                  2000|                  5000|16:00-16:30,21:00...|
|                 3|       1|     Electric Kettle|                  1500|                  1500|07:00-07:15,12:00...|
|                 4|       1|     Air Conditioner|                   500|                  3000|         00:00-24:00|
|                 5|       1|Incandescent Ligh...|                    60|                    60|         00:00-24:00|
|                 6|       1|     LED Light Bulbs|      

In [148]:
home_appliances_df.createOrReplaceTempView("temp_view_2")

In [149]:
%%sql

MERGE INTO SolarX_WH.dim_home_appliances dim_app
USING 
    (SELECT    home_appliance_key        as home_appliance_key, 
               home_key                  as home_key,
               name                      as appliance,
               min_consumption_rating    as min_consumption_power_wh,
               max_consumption_rating    as max_consumption_power_wh,
               usage_time                as usage_time
    FROM temp_view_2) tmp
    
ON dim_app.home_appliance_key = tmp.home_appliance_key

WHEN MATCHED AND (
    dim_app.min_consumption_power_wh != tmp.min_consumption_power_wh OR
    dim_app.max_consumption_power_wh != tmp.max_consumption_power_wh
) THEN UPDATE SET 
    dim_app.min_consumption_power_wh = tmp.min_consumption_power_wh,
    dim_app.max_consumption_power_wh = tmp.max_consumption_power_wh

WHEN NOT MATCHED THEN INSERT *

In [150]:
%%sql

SELECT * FROM SolarX_WH.dim_home_appliances

home_appliance_key,home_key,appliance,min_consumption_power_wh,max_consumption_power_wh,usage_time
1,1,Refrigerator,300.0,1500.0,00:00-24:00
2,1,Electric Oven,2000.0,5000.0,"16:00-16:30,21:00-21:30"
3,1,Electric Kettle,1500.0,1500.0,"07:00-07:15,12:00-12:15,16:30-17:00,21:30-22:00"
4,1,Air Conditioner,500.0,3000.0,00:00-24:00
5,1,Incandescent Light Bulbs,60.0,60.0,00:00-24:00
6,1,LED Light Bulbs,10.0,10.0,00:00-24:00
7,1,Laptop,50.0,100.0,00:00-24:00
8,1,Computer,100.0,600.0,00:00-24:00
9,1,LCD Monitor,50.0,300.0,"10:00-12:00,16:00-20:00,22:00-24:00"
10,1,Router,5.0,20.0,00:00-24:00


# Home dimension

In [113]:
%%sql

DROP TABLE IF EXISTS SolarX_WH.dim_home PURGE

                                                                                

In [114]:
%%sql

CREATE TABLE SolarX_WH.dim_home(
    home_key                                   SMALLINT    NOT NULL,
    min_consumption_power_wh                   FLOAT       NOT NULL,
    max_consumption_power_wh                   FLOAT       NOT NULL,

    -- scd type2 for min_consumption_power_wh
    min_consumption_power_wh_start_date        TIMESTAMP   NOT NULL,
    min_consumption_power_wh_end_date          TIMESTAMP,

    -- scd type2 for max_consumption_power_wh
    max_consumption_power_wh_start_date        TIMESTAMP  NOT NULL,
    max_consumption_power_wh_end_date          TIMESTAMP,

    current_flag                               BOOLEAN NOT NULL
)
USING iceberg;

In [115]:
%%sql

DESCRIBE SolarX_WH.dim_home

col_name,data_type,comment
home_key,int,
min_consumption_power_wh,float,
max_consumption_power_wh,float,
min_consumption_power_wh_start_date,timestamp,
min_consumption_power_wh_end_date,timestamp,
max_consumption_power_wh_start_date,timestamp,
max_consumption_power_wh_end_date,timestamp,
current_flag,boolean,


In [116]:
%%sql 
SELECT home_key, SUM(min_consumption_power_wh) as min_consumption_power_wh,
              SUM(max_consumption_power_wh) as max_consumption_power_wh 
       FROM SolarX_WH.dim_home_appliances
GROUP BY home_key

home_key,min_consumption_power_wh,max_consumption_power_wh
1,7880.0,19895.0


In [151]:
%%sql

MERGE INTO SolarX_WH.dim_home dim_home
USING (
    SELECT 
        home_key, 
        SUM(min_consumption_power_wh) AS min_consumption_power_wh,
        SUM(max_consumption_power_wh) AS max_consumption_power_wh 
    FROM SolarX_WH.dim_home_appliances
    GROUP BY home_key
) dim_app
ON dim_home.home_key = dim_app.home_key AND dim_home.current_flag = TRUE

WHEN MATCHED AND (
    dim_home.max_consumption_power_wh != dim_app.max_consumption_power_wh OR
    dim_home.min_consumption_power_wh != dim_app.min_consumption_power_wh
) THEN UPDATE SET 
    dim_home.min_consumption_power_wh_end_date = NOW(),
    dim_home.max_consumption_power_wh_end_date = NOW(),
    dim_home.current_flag = FALSE;

                                                                                

In [152]:
%%sql

MERGE INTO SolarX_WH.dim_home dim_home
USING (
    SELECT 
        home_key, 
        SUM(min_consumption_power_wh) AS min_consumption_power_wh,
        SUM(max_consumption_power_wh) AS max_consumption_power_wh 
    FROM SolarX_WH.dim_home_appliances
    GROUP BY home_key
) dim_app
ON dim_home.home_key = dim_app.home_key AND dim_home.current_flag = TRUE

WHEN NOT MATCHED THEN 
INSERT (
    home_key,
    min_consumption_power_wh, 
    max_consumption_power_wh,
    min_consumption_power_wh_start_date,
    min_consumption_power_wh_end_date,
    max_consumption_power_wh_start_date,
    max_consumption_power_wh_end_date,
    current_flag
) VALUES (
    dim_app.home_key,
    dim_app.min_consumption_power_wh,
    dim_app.max_consumption_power_wh,
    NOW(),
    NULL,
    NOW(),
    NULL,
    TRUE
);

In [153]:
%%sql

SELECT * FROM SolarX_WH.dim_home LIMIT 10

home_key,min_consumption_power_wh,max_consumption_power_wh,min_consumption_power_wh_start_date,min_consumption_power_wh_end_date,max_consumption_power_wh_start_date,max_consumption_power_wh_end_date,current_flag
1,22880.0,34895.0,2025-01-03 02:53:51.311170,2025-01-03 02:59:04.925843,2025-01-03 02:53:51.311170,2025-01-03 02:59:04.925843,False
1,8332.0,20790.0,2025-01-03 02:59:06.361727,,2025-01-03 02:59:06.361727,,True
1,7880.0,19895.0,2025-01-03 02:36:45.591559,2025-01-03 02:53:42.315472,2025-01-03 02:36:45.591559,2025-01-03 02:53:42.315472,False


In [154]:
spark.stop()