In [0]:
%pip install dbl-tempo==0.1.29
%pip install holidays==0.74
# restart the Python process so the pinned packages installed above are the ones imported below
dbutils.library.restartPython()

In [0]:
import warnings
import holidays
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import minute, dayofweek, when, col, udf, expr, first
from tempo import *
warnings.filterwarnings("ignore")

# Overview
**Prepare data for ML Forecasting**


1. Load data from Bronze tables


2. Clean Data
  - Resample to common timebase
  - Combine datasets, drop duplicates
  - Filter datasets to desired time interval
  - Create new features (weekends, holidays, lagged dependent variable)


3. Save data to Silver table for ML Forecast
![Image](artifacts/DataProcess.png)

### 1. Load Data

In [0]:
# catalog and schema used to build fully qualified table names: {catalog}.{schema}.{table}
catalog = "users"
schema = "david_hurley"
# names of the source (bronze) tables for electricity load and weather data
bronze_electricity_load_data = "synthetic_electricity_load_data"
bronze_weather_data = "synthetic_weather_data"

In [0]:
# read the bronze electricity load table into a Spark DataFrame and preview it
df_electricity_load_data = spark.table(f"{catalog}.{schema}.{bronze_electricity_load_data}")
display(df_electricity_load_data)

In [0]:
# read the bronze weather table into a Spark DataFrame and preview it
df_weather_data = spark.table(f"{catalog}.{schema}.{bronze_weather_data}")
display(df_weather_data)

### 2. Clean Data

##### Resample Data to Minute Frequency

In [0]:
def resample_data(df, ts_col: str, partition_cols: list, freq: str,
                  func: str = "mean", method: str = "linear"):
  """
  Resample time series data to a fixed frequency and interpolate the result.

  The input is wrapped in a tempo TSDF, aggregated into `freq`-sized
  buckets with `func`, and then interpolated with `method`.

  Parameters:
  df (DataFrame): Input DataFrame containing time series data.
  ts_col (str): Name of the timestamp column.
  partition_cols (list): List of columns to partition by (one series per
    distinct combination); an empty list treats the data as a single series.
  freq (str): Frequency to resample the data (e.g., 'min' for minute).
  func (str): Aggregation applied within each resampled bucket; forwarded
    to TSDF.resample. Defaults to 'mean'.
  method (str): Interpolation method; forwarded to interpolate().
    Defaults to 'linear'.

  Returns:
  DataFrame: Resampled and interpolated DataFrame.
  """
  tsdf = TSDF(df, ts_col=ts_col, partition_cols=partition_cols)

  # aggregate first, then interpolate the resampled series
  resampled = tsdf.resample(freq=freq, func=func)
  interpolated = resampled.interpolate(method=method)

  # hand back the underlying Spark DataFrame rather than the TSDF wrapper
  return interpolated.df

In [0]:
# resample both sources onto a shared 1-minute grid;
# load data is resampled per tagId, weather data as a single series
df_electricity_load_data_resampled = resample_data(
    df_electricity_load_data,
    ts_col="datetime",
    partition_cols=["tagId"],
    freq="min",
)
df_weather_data_resampled = resample_data(
    df_weather_data,
    ts_col="datetime",
    partition_cols=[],
    freq="min",
)

In [0]:
# preview the resampled load data, ordered by time and then by tagId
display(df_electricity_load_data_resampled.orderBy("datetime", "tagId"))

In [0]:
# preview the resampled weather data in time order
display(df_weather_data_resampled.orderBy("datetime"))

##### Combine Datasets, Drop Duplicates, Filter to 15min Interval

In [0]:
# attach weather readings to every load reading that shares its timestamp,
# then collapse repeated (tagId, datetime) pairs to a single row
# (Spark does not guarantee which of the duplicate rows is the one kept)
combined_df = (
    df_electricity_load_data_resampled
    .join(df_weather_data_resampled, on="datetime", how="inner")
    .dropDuplicates(["tagId", "datetime"])
)

# retain only timestamps that fall on a quarter hour (:00, :15, :30, :45)
combined_df_15min = combined_df.filter((minute(col("datetime")) % 15) == 0)

display(combined_df_15min.orderBy("datetime", "tagId"))

##### Create New Columns for Weekend and Holidays

In [0]:
# Panama electricity data, so holidays come from the Panama calendar
panama_holidays = holidays.PA()

def is_holiday(date):
    """Return 1 when `date` is a Panama holiday, otherwise 0."""
    return int(date in panama_holidays)

is_holiday_udf = udf(is_holiday, IntegerType())

combined_df_15min = (
    combined_df_15min
    # Saturdays and Sundays get a "1", all other days a "0"
    # (dayofweek numbers Sunday as 1 and Saturday as 7)
    .withColumn("weekend", when(dayofweek(col("datetime")).isin(1, 7), 1).otherwise(0))
    # holidays are looked up by calendar date, so the time of day is dropped first
    .withColumn("is_holiday", is_holiday_udf(col("datetime").cast("date")))
)

display(combined_df_15min.orderBy("datetime", "tagId"))

##### Create New Column for Lagged Load Data (T - 1 day)

In [0]:
# copy of the load series with every timestamp pushed forward by one day, so that
# joining on (tagId, datetime) pairs each row with the load from one day earlier
lagged_load = combined_df_15min.select(
    "tagId",
    expr("datetime + interval 1 day").alias("datetime"),
    col("load").alias("lagged_load"),
)

# rows with no previous-day value (e.g. the first day of data) come back
# from the left join with a null lagged_load and are removed
combined_df_15min = (
    combined_df_15min
    .join(lagged_load, on=["tagId", "datetime"], how="left")
    .filter(col("lagged_load").isNotNull())
)

display(combined_df_15min.orderBy("datetime", "tagId"))

### 3. Save Silver Table


In [0]:
# persist the prepared feature set as the silver table, replacing any previous contents
(
    combined_df_15min.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.silver_load_forecasting")
)