#### Data Preparation
Split the raw CSV data into historical and incoming data and save each as a Unity Catalog Delta table

In [0]:
from pyspark.sql.functions import col, to_date, date_sub
import matplotlib.pyplot as plt

In [0]:
# Load the raw CSV from the Unity Catalog Volume, taking column names from the
# header row and letting Spark infer the column types.
raw_df = spark.read.options(header=True, inferSchema=True).csv(
    "/Volumes/dhurley_catalog/electricity_load_forecasting/raw/continuous_dataset.csv"
)

# Source columns to keep, and the friendlier names they are given (same order).
keep_columns = ['datetime', 'nat_demand', 'T2M_toc', 'QV2M_toc', 'TQL_toc', 'W2M_toc', 'holiday', 'school']
rename_columns = ['datetime', 'load', 'temperature', 'humidity', 'precipitation', 'wind_speed', 'holiday', 'school_day']

# Select and rename in a single pass by pairing each source column with its new name.
df = raw_df.select(
    [col(source).alias(target) for source, target in zip(keep_columns, rename_columns)]
)

In [0]:
# The series shows a weird drop after 2020-01-31; cut it off for demo purposes.
# NOTE(review): if `datetime` was inferred as a timestamp, the string literal is
# presumably compared as midnight, so rows later on 2020-01-31 would be excluded
# as well — confirm this is the intended cutoff.
df = df.filter(col("datetime") <= "2020-01-31")

# Split at a single boundary date: rows on or after it are "incoming",
# everything before it is "historical". The two sets do not overlap.
split_date = "2019-12-01"
incoming_df = df.filter(col("datetime") >= split_date)
historical_df = df.filter(col("datetime") < split_date)

# Write both splits to Unity Catalog tables. Overwrite explicitly so the cell
# can be re-run: the default save mode raises if the table already exists.
incoming_df.write.mode("overwrite").saveAsTable("dhurley_catalog.electricity_load_forecasting.incoming_load_data")
historical_df.write.mode("overwrite").saveAsTable("dhurley_catalog.electricity_load_forecasting.historical_load_data")

In [0]:
# Plot load for incoming and historical data.
# Sort by datetime before collecting: Spark does not guarantee row order, and
# the line plot connects points in the order they appear in the pandas frame.
incoming_pdf = incoming_df.select("datetime", "load").orderBy("datetime").toPandas()
historical_pdf = historical_df.select("datetime", "load").orderBy("datetime").toPandas()

# Draw both series on one figure so the split boundary is visible.
plt.figure(figsize=(12, 6))
plt.plot(incoming_pdf['datetime'], incoming_pdf['load'], label='Incoming Load', color='blue')
plt.plot(historical_pdf['datetime'], historical_pdf['load'], label='Historical Load', color='red')
plt.xlabel('Datetime')
plt.ylabel('Load')
plt.title('Electricity Load')
plt.legend()
plt.grid(True)
plt.show()