In [93]:
import os
import numpy as np
import pandas as pd
from tscv import GapRollForward

from sklearn.ensemble import HistGradientBoostingRegressor

# --- input locations ---------------------------------------------------
DATA_PATH = '../data'
HOLIDAY_FILENAME = 'holidays2017_2024.csv'
REGION = 'sa'
DATA_FILENAME = 'merged2.csv'

# --- data layout -------------------------------------------------------
OBS_PER_DAY = 24
# columns that are never used as model features
X_EXCLUDE = ['datetime', 'net_load', 'total_load']

# lookup table: obs[d] == d * OBS_PER_DAY, i.e. the observation count for d days
obs = OBS_PER_DAY * np.arange(1000)

# --- train/test window methodology ---------------------------------------
TRAIN_BEGIN = '2018-03-07'
# both training bounds are equal (fixed-length window); change them for an
# expanding window (e.g. np.inf as the maximum)
TRAIN_MIN_SIZE = TRAIN_MAX_SIZE = obs[365]
TEST_MIN_SIZE = TEST_MAX_SIZE = obs[7]
# None -> rolling test window; n -> test on the final n observations
TEST_FINAL_N = None
ROLL_SIZE = obs[60]

# read the holiday table with every column as text, then convert only the
# 'Date' column to nanosecond-resolution timestamps
holiday_df = pd.read_csv(
    os.path.join(DATA_PATH, HOLIDAY_FILENAME),
    dtype='str',
).astype({'Date': 'datetime64[ns]'})

# import and preprocess data
full_data_path = os.path.join(DATA_PATH, REGION, DATA_FILENAME)
df = pd.read_csv(full_data_path)

# Convert with an explicit unit: the unit-less 'datetime64' dtype is rejected by
# pandas >= 2.0, and 'datetime64[ns]' matches how the holiday dates are parsed.
df['datetime'] = df['datetime'].astype('datetime64[ns]')

# calendar features derived from the timestamp column
dt = df['datetime'].dt
df['year'] = dt.year
df['month'] = dt.month
df['day'] = dt.day
df['hour'] = dt.hour
df['minute'] = dt.minute
df['dow'] = dt.day_of_week
df['week'] = dt.isocalendar().week   # ISO calendar week number

# holiday flag: 1 when the row's calendar date appears among the holidays whose
# 'Jurisdiction' equals REGION, otherwise 0
holidays = holiday_df.loc[holiday_df['Jurisdiction'] == REGION, ['Date', 'Holiday Name']]
df['holiday'] = dt.date.isin(holidays['Date'].dt.date).astype('int')

# keep only the rows dated on or after TRAIN_BEGIN
df_subset = df.loc[df['datetime'] >= TRAIN_BEGIN]

# feature columns are all columns not named in X_EXCLUDE; their positions are
# sorted so the features keep the left-to-right order they have in df
X_cols = np.setdiff1d(df.columns.to_numpy(), X_EXCLUDE)
X_inds = sorted(df.columns.get_indexer_for(X_cols))
y_ind = df.columns.get_loc('net_load')

# positional selection of predictors and target from the filtered frame
X, y = df_subset.iloc[:, X_inds], df_subset.iloc[:, y_ind]

# train/test window splitter, configured from the TRAIN_*, TEST_* and ROLL_SIZE
# settings defined with the other methodology parameters
tscv = GapRollForward(
    min_train_size=TRAIN_MIN_SIZE,
    max_train_size=TRAIN_MAX_SIZE,
    min_test_size=TEST_MIN_SIZE,
    max_test_size=TEST_MAX_SIZE,
    roll_size=ROLL_SIZE,
)

# report how many (train, test) splits the splitter yields for the filtered
# frame, then preview the feature matrix
print(len(list(tscv.split(df_subset))), 'CV folds')
X.head()

25 CV folds


Unnamed: 0,tempc,cloud8,windk,wdir,humid,rainmm,radkjm2,pv_est,year,month,day,hour,minute,dow,week,holiday
14,18.0,1.0,7.0,180.0,74.0,0.0,0.0,0.0,2018,3,7,0,0,2,10,0
15,19.0,1.0,12.0,130.0,64.0,0.0,0.0,0.0,2018,3,7,1,0,2,10,0
16,19.5,0.0,13.0,120.0,55.0,0.0,0.0,0.0,2018,3,7,2,0,2,10,0
17,19.4,1.0,11.0,120.0,55.0,0.0,0.0,0.0,2018,3,7,3,0,2,10,0
18,20.0,3.0,14.0,120.0,53.0,0.0,0.0,0.0,2018,3,7,4,0,2,10,0


In [94]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

# small hand-written parameter grids; the first entry is the all-defaults case
params = [
    dict(),
    dict(learning_rate=[0.5, 1]),
    dict(max_iter=[200, 400]),
    dict(max_leaf_nodes=[None, 50]),
    dict(interaction_cst=['pairwise']),
    dict(l2_regularization=[0.1, 0.25, 0.5]),
]

# search space for the randomised search: lists are discrete choices, the
# loguniform entries are continuous distributions over [0.01, 1].
# 'interaction_cst' (None / 'pairwise') is deliberately left out of the search.
# Key order matters: it sets the order of the 'param_*' columns reported later.
param_distributions = dict(
    max_iter=[100, 500, 1000, 2000],
    max_leaf_nodes=[2, 5, 10, 20, 50, 100],
    learning_rate=loguniform(0.01, 1),
    l2_regularization=loguniform(0.01, 1),
)

# Randomised hyper-parameter search over `param_distributions`, scored on the
# splits produced by `tscv`. Ten candidates are sampled; four metrics are
# recorded, and the candidate with the best MAPE is refit on all of X, y.
search = RandomizedSearchCV(
    # Seed the estimator as well as the sampler. HistGradientBoostingRegressor
    # uses random_state for binning subsampling and for the validation split
    # taken when early stopping is active, so without it the refit model can
    # differ from run to run even though the search itself is seeded.
    HistGradientBoostingRegressor(random_state=0),
    param_distributions=param_distributions,
    cv=tscv.split(df_subset),
    n_jobs=-1,
    n_iter=10,
    random_state=0,
    refit='neg_mean_absolute_percentage_error',
    scoring=[
        'neg_mean_absolute_percentage_error',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
        'r2',
    ],
)
search.fit(X, y)


In [95]:

# short labels for the mean test score of each metric; the corresponding
# std_test_* columns are not renamed and keep their original names
rename_d = dict(
    mean_test_neg_mean_absolute_error='MAE',
    mean_test_neg_mean_absolute_percentage_error='MAPE',
    mean_test_neg_root_mean_squared_error='RMSE',
    mean_test_r2='R2',
)

# the three error metrics are scored negated, so flip their sign; MAPE is
# additionally scaled by 100 to read as a percentage
results_df = (
    pd.DataFrame(search.cv_results_)
    .rename(columns=rename_d)
    .assign(
        MAPE=lambda scores: -scores['MAPE'] * 100,
        MAE=lambda scores: -scores['MAE'],
        RMSE=lambda scores: -scores['RMSE'],
    )
)

# sampled parameters plus the four metrics, lowest MAPE first
(
    results_df
    .loc[:, [f'param_{k}' for k in param_distributions] + list(rename_d.values())]
    .sort_values('MAPE')
    .round(2)
)

Unnamed: 0,param_max_iter,param_max_leaf_nodes,param_learning_rate,param_l2_regularization,MAE,MAPE,RMSE,R2
9,2000,10,0.033815,0.067503,64.73,5.77,83.84,0.91
1,500,10,0.070357,0.122961,65.34,5.81,84.11,0.91
0,2000,20,0.269388,0.125207,68.08,6.05,90.14,0.9
4,100,5,0.470065,0.061034,74.89,6.48,96.79,0.88
6,2000,100,0.396568,0.906226,72.17,6.56,94.67,0.89
2,100,50,0.6075,0.07502,80.03,7.09,106.63,0.86
8,2000,10,0.775064,0.019351,85.23,7.6,111.31,0.85
3,2000,2,0.383222,0.058463,101.96,8.47,126.88,0.79
5,500,2,0.054511,0.197854,116.55,10.37,144.35,0.74
7,100,5,0.01724,0.363964,155.16,15.36,197.43,0.56


In [96]:
# Print the results table as LaTeX tabular rows: cells separated by '&', each
# row terminated by a double backslash.
latek = results_df[['param_' + k for k in param_distributions] + list(rename_d.values())].sort_values('MAPE')
# cast the two continuous parameters to float so round(2) below treats them as
# numeric (presumably they can arrive as object dtype from cv_results_ — TODO confirm)
for p in ['param_learning_rate', 'param_l2_regularization']:
    latek[p] = latek[p].astype('float')
latek_cols = latek.columns.tolist()
print(latek_cols)

# Columns rendered with exactly two decimals. Plain str() drops trailing zeros
# ('0.9', '7.6'), which gives the table columns inconsistent precision.
two_dp_cols = {'param_learning_rate', 'param_l2_regularization', *rename_d.values()}

latek = latek.round(2).values
for row in latek:
    cells = [f'{x:.2f}' if c in two_dp_cols else str(x) for c, x in zip(latek_cols, row)]
    print('\t&\t'.join(cells) + ' \\\\')

['param_max_iter', 'param_max_leaf_nodes', 'param_learning_rate', 'param_l2_regularization', 'MAE', 'MAPE', 'RMSE', 'R2']
2000	&	10	&	0.03	&	0.07	&	64.73	&	5.77	&	83.84	&	0.91 \\
500	&	10	&	0.07	&	0.12	&	65.34	&	5.81	&	84.11	&	0.91 \\
2000	&	20	&	0.27	&	0.13	&	68.08	&	6.05	&	90.14	&	0.9 \\
100	&	5	&	0.47	&	0.06	&	74.89	&	6.48	&	96.79	&	0.88 \\
2000	&	100	&	0.4	&	0.91	&	72.17	&	6.56	&	94.67	&	0.89 \\
100	&	50	&	0.61	&	0.08	&	80.03	&	7.09	&	106.63	&	0.86 \\
2000	&	10	&	0.78	&	0.02	&	85.23	&	7.6	&	111.31	&	0.85 \\
2000	&	2	&	0.38	&	0.06	&	101.96	&	8.47	&	126.88	&	0.79 \\
500	&	2	&	0.05	&	0.2	&	116.55	&	10.37	&	144.35	&	0.74 \\
100	&	5	&	0.02	&	0.36	&	155.16	&	15.36	&	197.43	&	0.56 \\
