In [22]:
import numpy as np
import pandas as pd
from tscv import GapRollForward
from tqdm.notebook import tqdm
from sklearn.ensemble import HistGradientBoostingRegressor

region = 'sa'

# extract holidays from file, keeping only the rows of the selected region
holiday_df = pd.read_csv('../data/holidays2017_2024.csv', dtype='str')
holiday_df['Date'] = holiday_df['Date'].astype('datetime64[ns]').dt.date
holidays = holiday_df.loc[holiday_df['Jurisdiction'] == region, ['Date', 'Holiday Name']]

# import and preprocess load and weather data
df = pd.read_csv(f'../data/{region}/merged.csv')
# the unit has to be spelled out: pandas >= 2.0 raises a TypeError for the
# unit-less 'datetime64' dtype
df['datetime'] = df['datetime'].astype('datetime64[ns]')

# calendar features derived from the timestamp
dt = df['datetime'].dt
df['year'] = dt.year
df['month'] = dt.month
df['day'] = dt.day
df['hour'] = dt.hour
df['minute'] = dt.minute
df['dow'] = dt.day_of_week
df['week'] = dt.isocalendar().week

# add holiday encodings (1 when the calendar date is a holiday in `region`)
df['holiday_bin'] = dt.date.isin(holidays['Date']).astype('int')

# compute X and y column indices
# every column except the timestamp and the two load columns is a feature
X_EXCLUDE = ['datetime', 'net_load', 'total_load']
X_cols = np.setdiff1d(df.columns.values, X_EXCLUDE)
y_ind = df.columns.get_loc('net_load')

# for convenience below: obs[d] == d * 24, i.e. a number of days expressed as
# a number of rows -- assumes 24 observations per day, TODO confirm
obs = np.arange(df.shape[0]) * 24

# create train/test window strategies

# alternative strategy (disabled): unbounded training window, 7-day test
# window, rolled forward 7 days at a time
# tscv = GapRollForward(
#     min_train_size = df.shape[0] - obs[200],
#     max_train_size = np.inf,
#     min_test_size = obs[7],
#     max_test_size = obs[7],
#     roll_size = obs[7])

# active strategy: fixed 4 * 365 day training window, 14-day test window,
# rolled forward 10 days at a time
tscv = GapRollForward(
    min_train_size = obs[365*4],
    max_train_size = obs[365*4],
    min_test_size = obs[14],
    max_test_size = obs[14],
    roll_size = obs[10])

# count the windows the strategy yields for this frame
print(sum(1 for _ in tscv.split(df)), ' windows to be trained')

36  windows to be trained


In [23]:
prdfs = []

# loop-invariant lookups, computed once instead of once per model
X_inds = sorted(df.columns.get_indexer_for(X_cols.tolist()))
# locate the timestamp column by name rather than assuming it is column 0
dt_ind = df.columns.get_loc('datetime')

# materialise the splits so the progress bar knows the number of windows
# (wrapping the bare generator leaves tqdm without a total)
windows = list(tscv.split(df))

# execute train/test window strategies
for i, (train_ind, test_ind) in enumerate(tqdm(windows)):
    # slices shared by all seven weekday models of this window
    X_train_all, X_test_all = df.iloc[train_ind, X_inds], df.iloc[test_ind, X_inds]
    y_train_all, y_test_all = df.iloc[train_ind, y_ind], df.iloc[test_ind, y_ind]
    # named test_dt so the `dt` accessor defined during preprocessing is not clobbered
    test_dt = df.iloc[test_ind, dt_ind]
    # calendar date of the last training observation of this window
    train_end = df.iloc[train_ind[-1], dt_ind].date()

    # one model per day of week, trained and evaluated on that weekday only
    for wday in range(7):
        train_wdays, test_wdays = X_train_all['dow'] == wday, X_test_all['dow'] == wday
        # nothing to fit or nothing to score for this weekday in this window
        if not (train_wdays.any() and test_wdays.any()):
            continue

        X_train, X_test = X_train_all[train_wdays], X_test_all[test_wdays]
        y_train, y_test = y_train_all[train_wdays], y_test_all[test_wdays]

        # train model
        # fixed seed so any randomness inside the estimator is repeatable
        model = HistGradientBoostingRegressor(random_state=0)
        model.fit(X_train, y_train)

        # predict
        prd = model.predict(X_test)
        prdf = pd.DataFrame({'datetime': test_dt[test_wdays],
                            'model': i,
                            'wday': wday,
                            'train_end': train_end,
                            'predicted': prd,
                            'net_load': y_test})
        prdfs.append(prdf)

# concatenate predictions and compute discrete error metrics
predictions = pd.concat(prdfs)
predictions['Residual'] = predictions['predicted'] - predictions['net_load']
predictions['Absolute Error'] = predictions['Residual'].abs()
# relative to the observed value
predictions['Percent Error'] = predictions['Residual'] / predictions['net_load']
predictions['Absolute Percent Error'] = predictions['Percent Error'].abs()
predictions['Squared Error'] = predictions['Residual'] ** 2

0it [00:00, ?it/s]

In [24]:
# mean of every numeric column for each weekday-specific model
metrics = (
    predictions
    .groupby(['wday'])
    .mean(numeric_only=True)
    .reset_index()
)

# headline metrics derived from the averaged error columns:
# RMSE is the root of the mean squared error, MAE and MAPE are the means themselves
metrics = metrics.assign(
    RMSE=lambda m: np.sqrt(m['Squared Error']),
    MAE=lambda m: m['Absolute Error'],
    MAPE=lambda m: m['Absolute Percent Error'],
)

# summary table, one row per weekday
metrics.loc[:, ['wday', 'MAE', 'MAPE', 'RMSE']]

Unnamed: 0,wday,MAE,MAPE,RMSE
0,0,83.533177,0.070835,116.368904
1,1,91.633439,0.069499,134.06172
2,2,85.295281,0.065626,118.484179
3,3,82.024835,0.079031,119.887489
4,4,87.744152,0.076541,141.800954
5,5,90.129276,0.092257,129.038225
6,6,87.41013,0.092383,127.432973
