In [3]:
import numpy as np
import pandas as pd
from tscv import GapRollForward
from tqdm.notebook import tqdm
from sklearn.ensemble import HistGradientBoostingRegressor

region = 'sa'

# extract holidays from file
holiday_df = pd.read_csv('../data/holidays2017_2024.csv', dtype='str')
holiday_df['Date'] = holiday_df['Date'].astype('datetime64[ns]').dt.date
holidays = holiday_df.loc[holiday_df['Jurisdiction'] == region, ['Date', 'Holiday Name']]
holidict = {name: i+1 for i, name in enumerate(holidays['Holiday Name'].unique())}

# import and preprocess load and weather data
df = pd.read_csv(f'../data/{region}/merged.csv')
df['datetime'] = df['datetime'].astype('datetime64')
dt = df['datetime'].dt
df['year'] = dt.year
df['month'] = dt.month
df['day'] = dt.day
df['hour'] = dt.hour
df['minute'] = dt.minute
df['dow'] = dt.day_of_week
df['week'] = dt.isocalendar().week

# compute universal X and y column indices
X_EXCLUDE = ['datetime', 'net_load', 'total_load', 'pv_est']
X_cols = np.setdiff1d(df.columns.values, X_EXCLUDE)
y_ind = df.columns.get_loc('net_load')

# add holiday encodings
df['holiday_bin'] = dt.date.isin(holidays['Date']).astype('int')
merged = pd.merge(dt.date, holidays, left_on='datetime', right_on='Date', how='left')
df['holiday_cat'] = merged['Holiday Name'].replace(holidict).fillna(0).astype('int')
df['working_day'] = np.logical_not(np.logical_or(df['holiday_bin'], df['dow'] >= 5))

# for convenience below
obs = np.arange(1000) * 24

# create train/test window strategies
tscv_sliding = GapRollForward(
    min_train_size = obs[365], 
    max_train_size = obs[365],
    min_test_size = obs[7], 
    max_test_size = obs[7],
    roll_size = obs[30])

tscv_expanding = GapRollForward(
    min_train_size = obs[365], 
    max_train_size = np.inf,
    min_test_size = obs[7], 
    max_test_size = obs[7],
    roll_size = obs[30])

print(sum(1 for i in tscv_sliding.split(df)), f' sliding windows to be trained')
print(sum(1 for i in tscv_expanding.split(df)), f' expanding windows to be trained')
X_cols

49  sliding windows to be trained
49  expanding windows to be trained


array(['cloud8', 'day', 'dow', 'hour', 'humid', 'minute', 'month',
       'radkjm2', 'rainmm', 'tempc', 'wdir', 'week', 'windk', 'year'],
      dtype=object)

In [4]:
tscv = {'sliding': tscv_sliding, 'expanding': tscv_expanding}

prdfs = []
# execute train/test window strategies
for strategy, cv in tscv.items():
    for i, (train_ind, test_ind) in tqdm(enumerate(cv.split(df))):
        for include in [[], ['holiday_bin'], ['holiday_cat'], ['working_day']]:
            X_inds = sorted(df.columns.get_indexer_for(X_cols.tolist() + include))

            X_train, X_test = df.iloc[train_ind, X_inds], df.iloc[test_ind, X_inds]
            y_train, y_test = df.iloc[train_ind, y_ind], df.iloc[test_ind, y_ind]

            # train model
            begin, end = df.iloc[[train_ind[0], train_ind[-1]], 0].dt.date
            model = HistGradientBoostingRegressor()
            model.fit(X_train, y_train)

            # predict
            prd = model.predict(X_test)
            prdf = pd.DataFrame({'datetime': df.iloc[test_ind, 0],
                                'model': i,
                                'encoding': include[0] if include else 'no_holiday',
                                'holiday': df.iloc[test_ind, :]['holiday_bin'],
                                'window': strategy,
                                'train_end': end,
                                'predicted': prd,
                                'net_load': y_test})
            prdfs.append(prdf)

# concatenate predictions and compute discrete error metrics
predictions = pd.concat(prdfs)
predictions['Residual'] = predictions['predicted'] - predictions['net_load']
predictions['Absolute Error'] = predictions['Residual'].abs()
predictions['Percent Error'] = predictions['Residual'] / predictions['net_load']
predictions['Absolute Percent Error'] = predictions['Percent Error'].abs()
predictions['Squared Error'] = predictions['Residual'] ** 2

0it [00:00, ?it/s]

0it [00:00, ?it/s]

week ahead performance metrics

In [5]:
metrics = predictions.groupby(['window', 'encoding']).mean(numeric_only = True).reset_index()

metrics['RMSE'] = np.sqrt(metrics['Squared Error']).round(2)
metrics['MAE'] = metrics['Absolute Error'].round(2)
metrics['MAPE'] = np.round(metrics['Absolute Percent Error'] * 100, 2)

metrics[['window', 'encoding', 'MAE', 'MAPE', 'RMSE']].sort_values(['RMSE'])

Unnamed: 0,window,encoding,MAE,MAPE,RMSE
3,expanding,working_day,92.23,8.4,131.58
0,expanding,holiday_bin,93.42,8.56,133.62
1,expanding,holiday_cat,93.5,8.58,134.01
2,expanding,no_holiday,95.55,8.81,136.1
4,sliding,holiday_bin,98.71,9.2,140.98
7,sliding,working_day,98.19,9.14,141.09
5,sliding,holiday_cat,99.18,9.25,141.57
6,sliding,no_holiday,99.99,9.33,143.0


In [6]:
metrics = predictions.groupby(['window', 'encoding', 'holiday']).agg(
    RMSE = ('Squared Error', lambda x: np.sqrt(x.mean())),
    MAPE = ('Absolute Percent Error', lambda x: 100*np.mean(x)),
    MAE = ('Absolute Error', np.mean),
    count = ('datetime', 'count')
).reset_index().round(2)
#metrics['RMSE'] = np.sqrt(metrics['Squared Error']).round(2)

#metrics[['window', 'encoding', 'holiday', 'RMSE']].sort_values(['holiday', 'RMSE'])
metrics.pivot(index=['window', 'encoding'], columns=['holiday'], values=['MAE', 'MAPE', 'RMSE'])

Unnamed: 0_level_0,Unnamed: 1_level_0,MAE,MAE,MAPE,MAPE,RMSE,RMSE
Unnamed: 0_level_1,holiday,0,1,0,1,0,1
window,encoding,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
expanding,holiday_bin,93.78,85.17,8.46,11.02,134.3,117.17
expanding,holiday_cat,93.7,88.82,8.46,11.35,134.56,120.77
expanding,no_holiday,93.33,146.2,8.44,17.31,133.24,190.02
expanding,working_day,92.25,91.82,8.27,11.5,131.92,123.37
sliding,holiday_bin,98.84,95.64,9.08,11.98,141.39,131.29
sliding,holiday_cat,99.19,98.78,9.13,12.16,141.94,132.85
sliding,no_holiday,97.94,146.64,8.97,17.67,140.43,192.45
sliding,working_day,98.2,98.06,8.99,12.71,141.3,136.27


In [7]:
vals = metrics.pivot(index=['window', 'encoding'], columns=['holiday'], values=['MAE', 'MAPE', 'RMSE']).values
for row in vals:
    print('\t&\t'.join([str(x) for x in row]))

93.78	&	85.17	&	8.46	&	11.02	&	134.3	&	117.17
93.7	&	88.82	&	8.46	&	11.35	&	134.56	&	120.77
93.33	&	146.2	&	8.44	&	17.31	&	133.24	&	190.02
92.25	&	91.82	&	8.27	&	11.5	&	131.92	&	123.37
98.84	&	95.64	&	9.08	&	11.98	&	141.39	&	131.29
99.19	&	98.78	&	9.13	&	12.16	&	141.94	&	132.85
97.94	&	146.64	&	8.97	&	17.67	&	140.43	&	192.45
98.2	&	98.06	&	8.99	&	12.71	&	141.3	&	136.27
