In [0]:
%pip install dbl-tempo==0.1.29
# Restart the Python process after the notebook-scoped install so that the
# pinned dbl-tempo version is the one picked up by the imports in the next cell.
dbutils.library.restartPython()

In [0]:
from pyspark.sql.functions import col, to_date, date_sub, lit, rand
from tempo import *
import matplotlib.pyplot as plt
import pandas as pd

#### Unity Catalog Data Sources

In [0]:
# Unity Catalog location of the source data. `catalog` and `schema` are also
# used as the destination namespace for the Bronze tables written below.
catalog = "users"
schema = "david_hurley"
# Volume and file name that are combined into /Volumes/{catalog}/{schema}/{volume}/{filename}.
volume = "electricity_load_data"
filename = "panama_electricity_load_dataset.csv"

#### Create Synthetic Bronze Electricity Load Data

In [0]:
# Base seed for the synthetic load multipliers. Seeding rand() makes the
# generated Bronze table repeatable across "Run All" executions (for the same
# input file and partitioning) instead of changing on every run.
SYNTHETIC_LOAD_SEED = 42

# read the raw CSV from the Unity Catalog volume
df_load_data = spark.read.csv(f"/Volumes/{catalog}/{schema}/{volume}/{filename}", header=True, inferSchema=True)

# keep only the timestamp and demand columns, renaming the demand column to "load"
df_load_data = df_load_data.select(col("datetime"), col("nat_demand").alias("load"))

# keep only a subset of the data, based on date (both bounds are exclusive);
# the real measurements are published as tagId "1"
df_load_data = df_load_data.filter((col("datetime") > "2019-09-01") & (col("datetime") < "2020-01-31")).withColumn("tagId", lit("1"))

# create fake tagId and fake load data for each tag ("2" .. "9")
synthetic_load_data = []
for i in range(2, 10):
    # fake load data is the real load scaled by a per-row factor in [0.5, 1.0);
    # each tag gets its own seed so the tags differ from one another
    load_scale = rand(seed=SYNTHETIC_LOAD_SEED + i) * 0.5 + 0.5
    df_temp = df_load_data.withColumn("load", col("load") * load_scale).withColumn("tagId", lit(str(i)))
    synthetic_load_data.append(df_temp)

# create a new combined dataset with the real load plus the fake load data for all tags
for temp_df in synthetic_load_data:
    df_load_data = df_load_data.unionByName(temp_df)

# order by date and tagId and re-order columns
df_synthetic_load_data = df_load_data.orderBy(["datetime", "tagId"], ascending=[True, True])
df_synthetic_load_data = df_synthetic_load_data.select("datetime", "tagId", "load")

# save to Bronze table
df_synthetic_load_data.write.mode("overwrite").saveAsTable(f"{catalog}.{schema}.bronze_electricity_load_data")

#### Create Synthetic Bronze Weather Data

In [0]:
# read the raw CSV from the Unity Catalog volume (the weather columns live in the same file as the load data)
df_load_data = spark.read.csv(f"/Volumes/{catalog}/{schema}/{volume}/{filename}", header=True, inferSchema=True)

# keep only the timestamp and temperature columns, renaming T2M_toc to "temperature"
df_wx_data = df_load_data.select(col("datetime"), col("T2M_toc").alias("temperature"))

# keep only a subset of the data, based on date (both bounds are exclusive)
df_wx_data = df_wx_data.filter((col("datetime") > "2019-09-01") & (col("datetime") < "2020-01-31"))

# resample to every minute and filter to keep only 10min interval
# do this to demonstrate combining datasets of different datetime
wx_tsdf = TSDF(df_wx_data, ts_col="datetime", partition_cols=[])
wx_tsdf_resampled = wx_tsdf.resample(freq='min', func='mean')
wx_tsdf_interpolated = wx_tsdf_resampled.interpolate(method="linear")

# Keep only rows that fall on a 10-minute boundary. The filter and sort run in
# Spark so the minute-level dataset is never collected to the driver and
# round-tripped through pandas just to be written back out as a table.
wx_10min_df = (
    wx_tsdf_interpolated.df
    .filter("minute(datetime) % 10 = 0")
    .orderBy("datetime")
)

# save to Bronze table
wx_10min_df.write.mode("overwrite").saveAsTable(f"{catalog}.{schema}.bronze_weather_data")