In [1]:
import numpy as np
import pandas as pd
from tscv import GapRollForward
from tqdm.notebook import tqdm
from sklearn.ensemble import HistGradientBoostingRegressor
import seaborn as sns
# global seaborn theme for the notebook's plots
sns.set_theme(style='white', font='times', font_scale=2, palette='viridis')

# jurisdiction code; used to filter the holiday table and to build the data path
region = 'sa'

# read the holiday calendar as text, then turn 'Date' into datetime.date objects
holiday_df = pd.read_csv('../data/holidays2017_2024.csv', dtype='str').assign(
    Date=lambda frame: frame['Date'].astype('datetime64[ns]').dt.date
)
# keep only the rows for this region, with the holiday's date and name
in_region = holiday_df['Jurisdiction'].eq(region)
holidays = holiday_df.loc[in_region, ['Date', 'Holiday Name']]

# import and preprocess load and weather data
df = pd.read_csv(f'../data/{region}/merged.csv')
# spell out the unit: pandas >= 2.0 raises TypeError on the unit-less 'datetime64'
# dtype, and the holiday parsing above already uses 'datetime64[ns]'
df['datetime'] = df['datetime'].astype('datetime64[ns]')

# calendar features derived from the timestamp
dt = df['datetime'].dt
df['year'] = dt.year
df['month'] = dt.month
df['day'] = dt.day
df['hour'] = dt.hour
df['minute'] = dt.minute
df['dow'] = dt.day_of_week
df['week'] = dt.isocalendar().week

# compute universal X and y column indices
# X_cols is taken BEFORE 'holiday' is added, so it is the baseline feature set:
# every column except the timestamp and the two load columns, without the holiday flag
X_EXCLUDE = ['datetime', 'net_load', 'total_load']
X_cols = np.setdiff1d(df.columns.values, X_EXCLUDE)
# positional index of the target column, for use with .iloc
y_ind = df.columns.get_loc('net_load')

# add holiday encodings: 1 when the row's calendar date is in holidays['Date'], else 0
df['holiday'] = dt.date.isin(holidays['Date']).astype('int')

# obs[k] is a row count of k * 24 (i.e. k days if rows are hourly — TODO confirm)
obs = np.arange(1000) * 24

# create train/test window strategies
# min and max sizes are set equal for both the train and the test window
window_sizes = {
    'min_train_size': obs[365],
    'max_train_size': obs[365],
    'min_test_size': obs[7],
    'max_test_size': obs[7],
    'roll_size': obs[30],
}
tscv = GapRollForward(**window_sizes)

# count the splits the strategy yields for this frame
n_windows = sum(1 for _ in tscv.split(df))
print(n_windows, f' windows to be trained')

49  windows to be trained


In [2]:
prdfs = []

# materialise the splits once so tqdm receives a sized iterable and can show
# real progress (wrapping enumerate(generator) gives it no total)
splits = list(tscv.split(df))

# the two feature sets being compared: baseline, and baseline + holiday flag
feature_sets = [[], ['holiday']]

# execute train/test window strategies
for i, (train_ind, test_ind) in enumerate(tqdm(splits, desc='windows')):
    # the target does not depend on the feature set, so slice it once per window
    y_train, y_test = df.iloc[train_ind, y_ind], df.iloc[test_ind, y_ind]
    # calendar date of the last training row; looked up by column name rather
    # than assuming 'datetime' sits at position 0
    end = df['datetime'].iloc[train_ind[-1]].date()

    for include in feature_sets:
        X_inds = sorted(df.columns.get_indexer_for(X_cols.tolist() + include))
        X_train, X_test = df.iloc[train_ind, X_inds], df.iloc[test_ind, X_inds]

        # train model (seed pinned so a re-run cannot drift)
        model = HistGradientBoostingRegressor(random_state=0)
        model.fit(X_train, y_train)

        # predict
        prd = model.predict(X_test)
        # one row per test observation, tagged with the window number and with
        # 'include' = number of extra features (0 = baseline, 1 = with holiday)
        prdf = pd.DataFrame({'datetime': df['datetime'].iloc[test_ind],
                             'model': i,
                             'include': len(include),
                             'holiday': df['holiday'].iloc[test_ind],
                             'train_end': end,
                             'predicted': prd,
                             'net_load': y_test})
        prdfs.append(prdf)

# stack the per-window prediction frames, then derive the per-row error columns
predictions = pd.concat(prdfs)

# signed error: positive when the model over-predicts
residual = predictions['predicted'].sub(predictions['net_load'])
# NOTE(review): the denominator is the signed net_load; if it can be zero or
# negative this yields inf or a sign-flipped percent error — confirm against the data
pct_error = residual.div(predictions['net_load'])

predictions = predictions.assign(**{
    'Residual': residual,
    'Absolute Error': residual.abs(),
    'Percent Error': pct_error,
    'Absolute Percent Error': pct_error.abs(),
    'Squared Error': residual.pow(2),
})

0it [00:00, ?it/s]

In [4]:
def _root_mean(squared_errors):
    """Return the square root of the mean of the given squared errors."""
    return np.sqrt(squared_errors.mean())


# one row per (include, holiday) pair: row count plus MAE, MAPE and RMSE
grouped = predictions.groupby(['include', 'holiday'])
summary = grouped.agg(
    count=pd.NamedAgg(column='datetime', aggfunc='count'),
    MAE=pd.NamedAgg(column='Absolute Error', aggfunc='mean'),
    MAPE=pd.NamedAgg(column='Absolute Percent Error', aggfunc='mean'),
    RMSE=pd.NamedAgg(column='Squared Error', aggfunc=_root_mean),
)
# flatten the group keys back into columns and round for display
metrics = summary.reset_index().round(4)

metrics

Unnamed: 0,include,holiday,count,MAE,MAPE,RMSE
0,0,0,7886,78.6884,0.0656,114.4355
1,0,1,346,126.6982,0.1491,159.2306
2,1,0,7886,78.6376,0.0655,113.8731
3,1,1,346,78.5736,0.0978,104.8791
