In [86]:
import numpy as np
import pandas as pd
from tscv import GapRollForward
from tqdm.notebook import tqdm
from sklearn.ensemble import HistGradientBoostingRegressor

region = 'sa'

# extract holidays from file
holiday_df = pd.read_csv('../data/holidays2017_2024.csv', dtype='str')
holiday_df['Date'] = holiday_df['Date'].astype('datetime64[ns]').dt.date
holidays = holiday_df.loc[holiday_df['Jurisdiction'] == region, ['Date', 'Holiday Name']]
holidict = {name: i+1 for i, name in enumerate(holidays['Holiday Name'].unique())}

# import and preprocess load and weather data
df = pd.read_csv(f'../data/{region}/merged.csv')
df['datetime'] = df['datetime'].astype('datetime64')
dt = df['datetime'].dt
df['year'] = dt.year
df['month'] = dt.month
df['day'] = dt.day
df['hour'] = dt.hour
df['minute'] = dt.minute
df['dow'] = dt.day_of_week
df['week'] = dt.isocalendar().week

# compute universal X and y column indices
X_EXCLUDE = ['datetime', 'net_load', 'total_load']
X_cols = np.setdiff1d(df.columns.values, X_EXCLUDE)
y_ind = df.columns.get_loc('net_load')

# add holiday encodings
df['holiday_bin'] = dt.date.isin(holidays['Date']).astype('int')
merged = pd.merge(dt.date, holidays, left_on='datetime', right_on='Date', how='left')
df['holiday_cat'] = merged['Holiday Name'].replace(holidict).fillna(0).astype('int')
df['working_day'] = np.logical_not(np.logical_or(df['holiday_bin'], df['dow'] >= 5))

# for convenience below
obs = np.arange(1000) * 24

# create train/test window strategies
tscv_sliding = GapRollForward(
    min_train_size = obs[365], 
    max_train_size = obs[365],
    min_test_size = obs[7], 
    max_test_size = obs[7],
    roll_size = obs[30])

tscv_expanding = GapRollForward(
    min_train_size = obs[365], 
    max_train_size = np.inf,
    min_test_size = obs[7], 
    max_test_size = obs[7],
    roll_size = obs[30])

print(sum(1 for i in tscv_sliding.split(df)), f' sliding windows to be trained')
print(sum(1 for i in tscv_expanding.split(df)), f' expanding windows to be trained')

49  sliding windows to be trained
49  expanding windows to be trained


In [87]:
tscv = {'sliding': tscv_sliding, 'expanding': tscv_expanding}

prdfs = []
# execute train/test window strategies
for strategy, cv in tscv.items():
    for i, (train_ind, test_ind) in tqdm(enumerate(cv.split(df))):
        for include in [[], ['holiday_bin'], ['holiday_cat'], ['working_day']]:
            X_inds = sorted(df.columns.get_indexer_for(X_cols.tolist() + include))

            X_train, X_test = df.iloc[train_ind, X_inds], df.iloc[test_ind, X_inds]
            y_train, y_test = df.iloc[train_ind, y_ind], df.iloc[test_ind, y_ind]

            # train model
            begin, end = df.iloc[[train_ind[0], train_ind[-1]], 0].dt.date
            model = HistGradientBoostingRegressor()
            model.fit(X_train, y_train)

            # predict
            prd = model.predict(X_test)
            prdf = pd.DataFrame({'datetime': df.iloc[test_ind, 0],
                                'model': i,
                                'encoding': include[0] if include else 'no_holiday',
                                'holiday': df.iloc[test_ind, :]['holiday_bin'],
                                'window': strategy,
                                'train_end': end,
                                'predicted': prd,
                                'net_load': y_test})
            prdfs.append(prdf)

# concatenate predictions and compute discrete error metrics
predictions = pd.concat(prdfs)
predictions['Residual'] = predictions['predicted'] - predictions['net_load']
predictions['Absolute Error'] = predictions['Residual'].abs()
predictions['Percent Error'] = predictions['Residual'] / predictions['net_load']
predictions['Absolute Percent Error'] = predictions['Percent Error'].abs()
predictions['Squared Error'] = predictions['Residual'] ** 2

0it [00:00, ?it/s]

0it [00:00, ?it/s]

week ahead performance metrics

In [88]:
metrics = predictions.groupby(['window', 'encoding']).mean(numeric_only = True).reset_index()

metrics['RMSE'] = np.sqrt(metrics['Squared Error']).round(2)
metrics['MAE'] = metrics['Absolute Error'].round(2)
metrics['MAPE'] = np.round(metrics['Absolute Percent Error'] * 100, 2)

metrics[['window', 'encoding', 'MAE', 'MAPE', 'RMSE']].sort_values(['RMSE'])

Unnamed: 0,window,encoding,MAE,MAPE,RMSE
0,expanding,holiday_bin,76.44,6.57,110.03
1,expanding,holiday_cat,76.19,6.57,110.3
3,expanding,working_day,76.5,6.59,110.76
7,sliding,working_day,78.43,6.67,112.84
5,sliding,holiday_cat,78.17,6.69,113.42
4,sliding,holiday_bin,78.63,6.69,113.51
2,expanding,no_holiday,79.49,6.85,114.08
6,sliding,no_holiday,80.71,6.91,116.67


In [102]:
metrics = predictions.groupby(['window', 'encoding', 'holiday']).agg(
    RMSE = ('Squared Error', lambda x: np.sqrt(x.mean())),
    MAPE = ('Absolute Percent Error', lambda x: 100*np.mean(x)),
    MAE = ('Absolute Error', np.mean),
    count = ('datetime', 'count')
).reset_index().round(2)
#metrics['RMSE'] = np.sqrt(metrics['Squared Error']).round(2)

#metrics[['window', 'encoding', 'holiday', 'RMSE']].sort_values(['holiday', 'RMSE'])
metrics.pivot(index=['window', 'encoding'], columns=['holiday'], values=['MAE', 'MAPE', 'RMSE'])

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,MAE,MAPE,MAPE,RMSE,RMSE
Unnamed: 0_level_1,holiday,0,1,0,1,0,1
window,encoding,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
expanding,holiday_bin,76.54,74.18,6.45,9.23,110.23,105.37
expanding,holiday_cat,76.25,74.94,6.45,9.31,110.61,103.0
expanding,no_holiday,76.74,142.32,6.42,16.66,110.39,178.7
expanding,working_day,76.05,86.88,6.4,11.04,110.43,118.0
sliding,holiday_bin,78.64,78.57,6.55,9.78,113.87,104.88
sliding,holiday_cat,77.95,83.14,6.53,10.18,113.54,110.57
sliding,no_holiday,78.69,126.7,6.56,14.91,114.44,159.23
sliding,working_day,78.36,79.87,6.53,10.04,113.2,104.45


In [90]:
vals = metrics.pivot(index=['window', 'encoding'], columns=['holiday'], values=['MAE', 'MAPE', 'RMSE']).values
for row in vals:
    print('\t&\t'.join([str(x) for x in row]))

76.54	&	74.18	&	6.45	&	9.23	&	110.23	&	105.37
76.25	&	74.94	&	6.45	&	9.31	&	110.61	&	103.0
76.74	&	142.32	&	6.42	&	16.66	&	110.39	&	178.7
76.05	&	86.88	&	6.4	&	11.04	&	110.43	&	118.0
78.64	&	78.57	&	6.55	&	9.78	&	113.87	&	104.88
77.95	&	83.14	&	6.53	&	10.18	&	113.54	&	110.57
78.69	&	126.7	&	6.56	&	14.91	&	114.44	&	159.23
78.36	&	79.87	&	6.53	&	10.04	&	113.2	&	104.45


48 hours ahead performance metrics for comparison (not significantly different)

In [91]:
st_inds = (predictions['datetime'].dt.date - predictions['train_end']) <= pd.Timedelta('2d')
metrics = predictions.loc[st_inds,:].groupby(['window', 'encoding']).mean(numeric_only = True).reset_index()
metrics['RMSE'] = np.sqrt(metrics['Squared Error'])
metrics['MAE'] = metrics['Absolute Error']
metrics['MAPE'] = metrics['Absolute Percent Error']

metrics[['window', 'holiday', 'MAE', 'MAPE', 'RMSE']]

Unnamed: 0,window,holiday,MAE,MAPE,RMSE
0,expanding,0.032916,83.904651,0.073838,123.963393
1,expanding,0.032916,84.554559,0.075125,126.101528
2,expanding,0.032916,86.780334,0.076886,127.618932
3,expanding,0.032916,83.648653,0.073763,124.910321
4,sliding,0.032916,85.616412,0.074362,129.14448
5,sliding,0.032916,84.702865,0.074408,128.912368
6,sliding,0.032916,88.002291,0.076616,132.702151
7,sliding,0.032916,84.752013,0.073453,127.315858
