# Notebook Documentation: Daily Forecast with Prophet & Exogenous Variables (Unscaled)
This notebook generates daily energy consumption forecasts per feeder using a Prophet model augmented with selected exogenous variables.

We're looking at this part of the flow:

<img src="../docs/imgs/energy-sa-forecasting-prophet.png" width="300">

## Environment Setup and Configuration

In [0]:
%run ./includes/common_functions_and_imports

In [0]:
%pip install prophet==1.1.4
%pip install holidays


In [0]:
from prophet import Prophet
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType, DoubleType, LongType

## Data Ingestion & Aggregation

In [0]:
# Fully qualified names of the input feature table and the forecast output table.
_schema_path = f"{CONFIG.target_catalog}.{CONFIG.target_schema}"
source_table_name = f"{_schema_path}.unscaled_train_features"
target_table_name = f"{_schema_path}.predictions_daily_forecast_exogenous_noscaling_ext"

# Without the training features there is nothing to forecast from.
source_exists = spark.catalog.tableExists(source_table_name)
if not source_exists:
    dbutils.notebook.exit("Source table does not exist")

# Forecasts are already stored and overwriting is disabled: skip the run.
if spark.catalog.tableExists(target_table_name) and not CONFIG.overwrite_data:
    dbutils.notebook.exit("Target table already exists, skipping run to save on processing")

df_training = spark.table(source_table_name)


In [0]:
# Collapse the training data to one row per feeder per day:
#   * "ds" is the collection timestamp truncated to the day,
#   * "y" (the target) is the day's summed normalized_consumption_kwh,
#   * every column listed below is replaced by its daily mean.
_daily_mean_columns = [
    "cyc_halfhour",
    "cyc_hour",
    "approx_distance_to_weather_station",
    "aggregated_device_count_active",
    "t2m",
    "u10",
    "v10",
    "ssrd",
    "strd",
    "cyc_day",
    "cyc_week",
    "cyc_month",
    "cyc_halfhour_sin",
    "cyc_halfhour_cos",
    "cyc_hour_sin",
    "cyc_hour_cos",
    "cyc_day_sin",
    "cyc_day_cos",
    "cyc_week_sin",
    "cyc_week_cos",
    "cyc_month_sin",
    "cyc_month_cos",
]

_day_stamp = F.date_trunc("day", F.col("data_collection_log_timestamp"))

df_daily = (
    df_training
    .withColumn("ds", _day_stamp)
    # Group by feeder as well as day so each feeder keeps its own series.
    .groupBy("lv_feeder_unique_id", "ds")
    .agg(
        F.sum("normalized_consumption_kwh").alias("y"),
        *[F.avg(name).alias(name) for name in _daily_mean_columns],
    )
)

df_daily.show(5, truncate=False)


## Analysis of numerical input features

In [0]:

import math

target_col = "y"

# Candidate regressors: every numeric column of the daily frame except the target.
numeric_types = (IntegerType, FloatType, DoubleType, LongType)
candidate_cols = [
    field.name for field in df_daily.schema.fields
    if isinstance(field.dataType, numeric_types) and field.name != target_col
]

print("Candidate numerical columns:")
print(candidate_cols)

# Pearson correlation between the target and each candidate column.
# A failure on one column is reported and does not stop the others.
correlations = []
for col_name in candidate_cols:
    try:
        corr_val = df_daily.stat.corr(target_col, col_name)
    except Exception as e:
        print(f"Error computing correlation for {col_name}: {e}")
    else:
        correlations.append((col_name, corr_val))


def _corr_sort_key(item):
    """Return |corr| for ranking, pushing NaN correlations to the bottom.

    A correlation can come back as NaN (e.g. for a constant column). NaN
    compares False against everything, so sorting on abs(NaN) would leave
    the whole list in an arbitrary order.
    """
    corr = item[1]
    return -1.0 if math.isnan(corr) else abs(corr)


# Strongest absolute correlations first; undefined (NaN) correlations last.
sorted_correlations = sorted(correlations, key=_corr_sort_key, reverse=True)

print("Correlations with", target_col, "sorted by absolute value (highest to lowest):")
for col_name, corr_val in sorted_correlations:
    print(f"{col_name}: {corr_val:.3f}")

Based on these correlations, we restrict the exogenous regressors to the truly dynamic, non‐seasonal drivers, since Prophet already models weekly and annual cycles internally. In particular:

* t2m (mean daily temperature): |corr| ≈ 0.268
* ssrd (surface solar radiation downwards): |corr| ≈ 0.187
* strd (surface thermal radiation downwards): |corr| ≈ 0.140
* aggregated_device_count_active: |corr| ≈ 0.145

## Forecast Function (Pandas UDF)

t2m (mean daily temperature), ssrd (surface solar radiation downwards), strd (surface thermal radiation downwards), and aggregated_device_count_active were selected for their day‑to‑day variability and their correlation with daily consumption (|corr| ≈ 0.268, 0.187, 0.140, and 0.145, respectively). Adding these covariates complements Prophet’s trend and seasonality components by capturing external drivers such as temperature fluctuations, solar energy availability, and device usage.

In [0]:
# Number of future days each per-feeder model predicts
# (adjust as needed, e.g. to cover the test period).
forecast_horizon = 90

# Exogenous columns handed to Prophet as extra regressors
# (chosen in the correlation analysis).
selected_regressors = ["t2m", "ssrd", "strd", "aggregated_device_count_active"]

# Prophet hyperparameters (tune if necessary).
# Trend flexibility; 0.05 is also Prophet's documented default value.
changepoint_prior_scale = 0.05
# Seasonal effects are added to the trend; the other Prophet mode is "multiplicative".
seasonality_mode = "additive"
# Number of candidate trend changepoints. It only takes effect when it is
# passed to Prophet(...) as n_changepoints.
n_changepoints = 50

In [0]:
# Rebuild the daily frame with only what the model needs: the summed target
# "y" plus the daily mean of each selected regressor, per feeder and day.
_regressor_means = [F.avg(name).alias(name) for name in selected_regressors]

df_daily = (
    df_training
    .withColumn("ds", F.date_trunc("day", F.col("data_collection_log_timestamp")))
    .groupBy("lv_feeder_unique_id", "ds")
    .agg(F.sum("normalized_consumption_kwh").alias("y"), *_regressor_means)
)

In [0]:
def apply_forecast_daily_noscaling(pdf):
    """Fit a Prophet model on one feeder's daily history and forecast ahead.

    Written as the per-group function for ``groupBy(...).applyInPandas``.
    Reads the module-level settings ``selected_regressors``,
    ``changepoint_prior_scale``, ``seasonality_mode``, ``n_changepoints``
    and ``forecast_horizon``.

    Args:
        pdf: pandas DataFrame holding a single feeder's rows, with columns
            ``lv_feeder_unique_id``, ``ds``, ``y`` and any of the selected
            regressor columns. The frame is not modified.

    Returns:
        pandas DataFrame with columns ``lv_feeder_unique_id``, ``ds`` and
        ``yhat`` for the ``forecast_horizon`` days following the latest
        historical date. An empty frame with the same columns is returned
        when fewer than two rows carry a target value or when model
        fitting fails.
    """
    import logging

    result_columns = ["lv_feeder_unique_id", "ds", "yhat"]

    # Prophet cannot be fitted on fewer than two rows that have a target.
    if pdf.shape[0] < 2 or pdf["y"].notna().sum() < 2:
        return pd.DataFrame(columns=result_columns)

    # Work on a date-sorted copy: the rows of a group arrive in no guaranteed
    # order, and the "last observed" regressor values used further down must
    # come from the most recent day. Copying also leaves the input untouched.
    pdf = (
        pdf.assign(ds=pd.to_datetime(pdf["ds"]))
        .sort_values("ds")
        .reset_index(drop=True)
    )
    feeder_id = pdf["lv_feeder_unique_id"].iloc[0]

    # Initialize Prophet with the configured hyperparameters.
    # NOTE(review): daily_seasonality on daily-resolution data is presumably
    # a no-op at best -- confirm it is intended.
    m = Prophet(
        changepoint_prior_scale=changepoint_prior_scale,
        seasonality_mode=seasonality_mode,
        n_changepoints=n_changepoints,
        daily_seasonality=True,
    )
    m.add_country_holidays(country_name="GB")
    m.add_seasonality(name="weekly", period=7, fourier_order=3)
    # NOTE(review): this component is named "annual", not "yearly", so
    # Prophet's built-in yearly seasonality is presumably still auto-enabled
    # for long histories alongside it -- confirm that is intended.
    m.add_seasonality(name="annual", period=365, fourier_order=10)

    # Register the extra regressors that are actually present, unscaled.
    # The same list drives the future frame so model and input stay in sync.
    active_regressors = [reg for reg in selected_regressors if reg in pdf.columns]
    for reg in active_regressors:
        m.add_regressor(reg)

    # Fit on this feeder's history. A failure for one feeder must not abort
    # the whole Spark job, so it is logged and the feeder is skipped.
    try:
        m.fit(pdf)
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Prophet fit failed for feeder %s: %s", feeder_id, exc
        )
        return pd.DataFrame(columns=result_columns)

    # Future dates only (no history), at daily frequency.
    future = m.make_future_dataframe(
        periods=forecast_horizon, freq="D", include_history=False
    )

    # No future regressor values are available, so hold each regressor at its
    # most recent non-null observation for the whole horizon.
    for reg in active_regressors:
        future[reg] = pdf[reg].dropna().iloc[-1]

    forecast = m.predict(future)[["ds", "yhat"]]
    forecast["lv_feeder_unique_id"] = feeder_id

    return forecast[result_columns]

In [0]:
# Spark schema of the frames returned by the per-feeder forecast function:
# feeder id, forecast date and predicted value, all nullable.
output_schema = (
    T.StructType()
    .add("lv_feeder_unique_id", T.StringType(), True)
    .add("ds", T.DateType(), True)
    .add("yhat", T.DoubleType(), True)
)

In [0]:
# Run the forecast function once per feeder group; the frames it returns
# are combined into a single Spark DataFrame following output_schema.
results_df = (
    df_daily
    .groupBy("lv_feeder_unique_id")
    .applyInPandas(apply_forecast_daily_noscaling, schema=output_schema)
)

In [0]:
# Persist the forecasts for downstream analysis, replacing any existing
# table contents and schema.
(
    results_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(target_table_name)
)