# Labour dataset

- use a simple base forecaster, the same formulation as the one in fable
- data sourced from [here](https://github.com/Nixtla/hierarchicalforecast/) (thanks!)

In [1]:
# Ignore every ConvergenceWarning from statsmodels; the filter is installed
# globally through the `warnings` module, so it applies to all later cells.
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

In [2]:
from datasetsforecast.hierarchical import HierarchicalData, HierarchicalInfo

from sktime.forecasting.sarimax import SARIMAX
from sktime.transformations.hierarchical.aggregate import Aggregator
from sktime.forecasting.reconcile import ReconcilerForecaster
from sktime.forecasting.model_selection import temporal_train_test_split

import pandas as pd
import numpy as np

The next few cells are also code from [here](https://github.com/Nixtla/hierarchicalforecast/) (thanks x3)

In [3]:
# Name of the dataset group to load.
group = 'Labour'

# Delimiter used to split `unique_id`, the column names given to the split
# pieces (in the order they appear), and the order wanted for the hierarchy index.
sep = ','
init_cols = ['Employment', 'Gender', 'Region']
hier_cols = ['Region', 'Employment', 'Gender']

# Load the series frame together with `S` and `tags`, passing 'data' as the
# directory argument.
# NOTE(review): `S` is presumably the summing matrix; later cells only rely on
# its columns being the bottom-level series ids.
y, S, tags = HierarchicalData.load('data', group)

# Number of distinct series ids in the loaded frame.
n_series = y['unique_id'].nunique()
n_series

57

* some info on the ts

In [4]:
# Metadata record registered for the selected dataset group.
meta_info_group = HierarchicalInfo[group]

# Pull out the forecast horizon, the frequency alias and the seasonality in one go.
h, freq, sp = (
    meta_info_group.horizon,
    meta_info_group.freq,
    meta_info_group.seasonality,
)

h, freq, sp

(8, 'MS', 12)

*  remove the aggregated levels

In [5]:
# Keep only the series whose id is one of the columns of `S` (this drops the
# aggregated levels). `.copy()` makes the filtered frame independent, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
y = y.query('unique_id in @S.columns').copy()

# `unique_id` packs the hierarchy labels into one string; split it on `sep`
# and store the pieces under the `init_cols` names (in that order).
y[init_cols] = y['unique_id'].str.split(sep, expand=True)

# Store the time stamps as monthly periods.
y['ds'] = pd.PeriodIndex(y['ds'], freq="M")

# Index by the hierarchy levels (in `hier_cols` order) followed by time, and
# keep only the observed values.
y = y.set_index(hier_cols + ['ds'])[['y']]

y


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,y
Region,Employment,Gender,ds,Unnamed: 4_level_1
'Australian Capital Territory'],['Employed full-time','Females',1978-02,28.743290
'Australian Capital Territory'],['Employed full-time','Females',1978-03,26.649367
'Australian Capital Territory'],['Employed full-time','Females',1978-04,25.784798
'Australian Capital Territory'],['Employed full-time','Females',1978-05,27.543975
'Australian Capital Territory'],['Employed full-time','Females',1978-06,27.784657
...,...,...,...,...
'Western Australia'],['Employed part-time','Males',2019-08,133.887697
'Western Australia'],['Employed part-time','Males',2019-09,125.901203
'Western Australia'],['Employed part-time','Males',2019-10,133.986953
'Western Australia'],['Employed part-time','Males',2019-11,133.947213


* remove unwanted characters from index

In [6]:
# Remove the list/quote residue (`['`, `']` and `'`) from the hierarchy labels.
# The patterns are raw strings so that `\[` and `\]` reach the regex engine as
# escapes; in a normal string literal they are invalid escape sequences and
# Python emits a DeprecationWarning/SyntaxWarning for them.
level_names = ["Region", "Employment", "Gender"]
for position, level_name in enumerate(level_names):
    # Write the cleaned labels to a column named after the level; the dirty
    # index levels are discarded when the index is rebuilt below.
    y[level_name] = (
        y.index.get_level_values(position)
        .str.replace(r"\['", "", regex=True)
        .str.replace(r"'\]", "", regex=True)
        .str.replace("'", "", regex=True)
    )

# Rebuild the index from the cleaned columns plus time, then sort it.
y = (
    y
    .reset_index('ds')
    .set_index(["Region", "Employment", "Gender", "ds"])
    .sort_index()
)
y

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,y
Region,Employment,Gender,ds,Unnamed: 4_level_1
Australian Capital Territory,Employed full-time,Females,1978-02,28.743290
Australian Capital Territory,Employed full-time,Females,1978-03,26.649367
Australian Capital Territory,Employed full-time,Females,1978-04,25.784798
Australian Capital Territory,Employed full-time,Females,1978-05,27.543975
Australian Capital Territory,Employed full-time,Females,1978-06,27.784657
...,...,...,...,...
Western Australia,Employed part-time,Males,2019-08,133.887697
Western Australia,Employed part-time,Males,2019-09,125.901203
Western Australia,Employed part-time,Males,2019-10,133.986953
Western Australia,Employed part-time,Males,2019-11,133.947213


Save data for fable

In [7]:
# Write the cleaned bottom-level series to disk so fable can read the same data.
y.to_csv("./data/hierarchical/labour_bottomlevels.csv")

# Count the distinct series: unique index combinations once time is dropped.
bottom_level_ids = y.index.droplevel('ds').unique()
len(bottom_level_ids)

32

* now aggregate for totals

In [8]:
# Run the bottom-level frame through sktime's Aggregator; the result also
# contains the aggregated series (labelled `__total` in the output).
agg = Aggregator()
y_with_totals = agg.fit_transform(y)

# Continue with the full hierarchy under the usual name.
y = y_with_totals
y

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,y
Region,Employment,Gender,ds,Unnamed: 4_level_1
Australian Capital Territory,Employed full-time,Females,1978-02,28.743290
Australian Capital Territory,Employed full-time,Females,1978-03,26.649367
Australian Capital Territory,Employed full-time,Females,1978-04,25.784798
Australian Capital Territory,Employed full-time,Females,1978-05,27.543975
Australian Capital Territory,Employed full-time,Females,1978-06,27.784657
...,...,...,...,...
__total,__total,__total,2019-08,12858.337240
__total,__total,__total,2019-09,12925.936198
__total,__total,__total,2019-10,12898.478058
__total,__total,__total,2019-11,12992.526833


Quick check on hierarchy with aggregated levels

* all good!

In [9]:
# After aggregation the frame should hold exactly as many distinct series as
# the originally loaded dataset (`n_series`).
series_ids = y.index.droplevel('ds').unique()
len(series_ids) == n_series

True

Train test split

* to be the same as [here](https://github.com/Nixtla/hierarchicalforecast/) I've left out the last 8 obs

In [10]:
# Hold out the last `h` time points as the test set. `h` is the dataset's
# forecast horizon (8 for Labour), so the size of the hold-out always matches
# the horizon that is forecast below instead of relying on a repeated literal.
y_train, y_test = temporal_train_test_split(y, test_size=h)
y_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,y
Region,Employment,Gender,ds,Unnamed: 4_level_1
Australian Capital Territory,Employed full-time,Females,1978-02,28.743290
Australian Capital Territory,Employed full-time,Females,1978-03,26.649367
Australian Capital Territory,Employed full-time,Females,1978-04,25.784798
Australian Capital Territory,Employed full-time,Females,1978-05,27.543975
Australian Capital Territory,Employed full-time,Females,1978-06,27.784657
...,...,...,...,...
__total,__total,__total,2018-12,12839.166618
__total,__total,__total,2019-01,12603.075613
__total,__total,__total,2019-02,12802.157755
__total,__total,__total,2019-03,12785.249118


Fit the base forecasts

* use a simple AR1 model
* no intercept seems to perform better

In [11]:

# Forecast steps 1..h ahead of the end of the training data.
fh = np.arange(1, h + 1)

# AR(1) specification with no trend term and stationarity not enforced.
base_forecaster = SARIMAX(order=(1, 0, 0), trend="n", enforce_stationarity=False)
base_forecaster.fit(y_train)

# Base (unreconciled) forecasts, stored under the column name 'base'.
base_prds = base_forecaster.predict(fh=fh)
prds = base_prds.rename(columns={'y': 'base'})
prds

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,base
Region,Employment,Gender,ds,Unnamed: 4_level_1
Australian Capital Territory,Employed full-time,Females,2019-05,70.487332
Australian Capital Territory,Employed full-time,Females,2019-06,70.569944
Australian Capital Territory,Employed full-time,Females,2019-07,70.652653
Australian Capital Territory,Employed full-time,Females,2019-08,70.735459
Australian Capital Territory,Employed full-time,Females,2019-09,70.818362
...,...,...,...,...
__total,__total,__total,2019-08,12909.480694
__total,__total,__total,2019-09,12929.003599
__total,__total,__total,2019-10,12948.556028
__total,__total,__total,2019-11,12968.138027


Now fit the hierarchical reconciler forecasters

* note that the base forecasters are retrained on each iteration of the loop
* we could do this a bit quicker for some reconciliation methods (Forecaster * Reconciler), but this is the easiest way

In [12]:
# Reconciliation methods offered by ReconcilerForecaster, in alphabetical order.
methods = sorted(ReconcilerForecaster.METHOD_LIST)

# Forecast steps 1..h ahead; built once rather than on every iteration.
fh = np.arange(1, h + 1)

# Collect one frame of reconciled forecasts per method and join them in a
# single concat afterwards, instead of growing `prds` inside the loop.
recon_frames = []
for method in methods:
    print(method)
    reconciler = ReconcilerForecaster(forecaster=base_forecaster, method=method)
    recon_frames.append(
        reconciler.fit_predict(y=y_train, fh=fh).rename(columns={'y': method})
    )

# Start from the 'base' column only, so re-running this cell replaces the
# reconciled columns instead of appending duplicates of them.
prds = pd.concat([prds[['base']], *recon_frames], axis=1)

prds

bu
mint_cov
mint_shrink
ols
td_fcst
wls_str
wls_var


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,base,bu,mint_cov,mint_shrink,ols,td_fcst,wls_str,wls_var
Region,Employment,Gender,ds,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Australian Capital Territory,Employed full-time,Females,2019-05,70.487332,70.487332,70.533325,70.636396,70.533518,70.527296,70.533114,70.525225
Australian Capital Territory,Employed full-time,Females,2019-06,70.569944,70.569944,70.661959,70.868021,70.662349,70.649977,70.661552,70.645768
Australian Capital Territory,Employed full-time,Females,2019-07,70.652653,70.652653,70.790722,71.099691,70.791310,70.772861,70.790131,70.766445
Australian Capital Territory,Employed full-time,Females,2019-08,70.735459,70.735459,70.919616,71.331405,70.920400,70.895949,70.918851,70.887257
Australian Capital Territory,Employed full-time,Females,2019-09,70.818362,70.818362,71.048639,71.563161,71.049620,71.019240,71.047713,71.008203
...,...,...,...,...,...,...,...,...,...,...,...
__total,__total,__total,2019-08,12909.480694,12899.606564,12908.851977,12909.462296,12908.851970,12909.480694,12909.436601,12909.480567
__total,__total,__total,2019-09,12929.003599,12916.660189,12928.217907,12928.980622,12928.217898,12929.003599,12928.948504,12929.003441
__total,__total,__total,2019-10,12948.556028,12933.743045,12947.613445,12948.528482,12947.613433,12948.556028,12948.489941,12948.555839
__total,__total,__total,2019-11,12968.138027,12950.855181,12967.038638,12968.105919,12967.038622,12968.138027,12968.060955,12968.137806


Calculate the RMSE

* this is much the same as the fable package
* there are some minor differences between the methods based on the residual covariance matrix

In [13]:
# Put forecasts and actuals side by side, then reshape to long form: one row per
# (series, time, method) holding the actual `y` and the forecast `value`.
# The frame is deliberately not called `eval`, which would shadow the builtin.
rmse_input = (
    pd.concat([prds, y_test], axis=1)
    .melt(id_vars='y', ignore_index=False)
)

# Squared forecast error per row.
squared_error = (rmse_input['y'] - rmse_input['value']) ** 2

# RMSE per method: mean squared error within each `variable`, square-rooted and
# rounded to 3 decimals. A plain grouped mean avoids `groupby.apply`, which
# warns about operating on the grouping column in recent pandas versions.
squared_error.groupby(rmse_input['variable']).mean().pow(0.5).round(3)

variable
base           17.523
bu             17.819
mint_cov       17.519
mint_shrink    17.580
ols            17.519
td_fcst        17.466
wls_str        17.505
wls_var        17.573
dtype: float64