In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Use seaborn's "darkgrid" theme for the figures drawn in this notebook.
sns.set_theme(style="darkgrid")

In [97]:
# Raw inputs, all read from the local `data/` directory.
# Meter load readings (reshaped to long format further down).
load_df = pd.read_csv("data/load.csv")
# Rows of (meter_id, mid_level, aggregate) describing the hierarchy.
hierarchy_df = pd.read_csv('data/hierarchy.csv')
# Weather covariates.
humidity_df = pd.read_csv('data/relative humidity.csv')
temperature_df = pd.read_csv('data/temperature.csv')

# Preparation of Hierarchical Time Series

In [98]:
from gluonts.dataset.hierarchical import HierarchicalTimeSeries

# Location of the tutorial's example data. Only the bottom-level series of the
# hierarchy are stored there.
ts_at_bottom_level_csv = (
    "https://gist.githubusercontent.com/rshyamsundar/"
    "39e57075743537c4100a716a7b7ec047/raw/"
    "f02f5aeadad73e3f3e9cf54478606d3507826492/example_bottom_ts.csv"
)

In [99]:
# Read the example series with the first CSV column parsed as a datetime index...
ts_at_bottom_level = pd.read_csv(ts_at_bottom_level_csv, index_col=0, parse_dates=True)
# ...then cast that index to a `PeriodIndex` explicitly.
ts_at_bottom_level = ts_at_bottom_level.to_period()

ts_at_bottom_level.head()

Unnamed: 0,0,1,2,3
2020-03-22 00:00,0.056962,0.099911,0.039827,0.489971
2020-03-22 01:00,0.246535,0.422727,0.763164,0.756702
2020-03-22 02:00,0.314393,0.26782,0.169645,0.400996
2020-03-22 03:00,0.609158,0.043981,0.235009,0.310741
2020-03-22 04:00,0.380788,0.297702,0.898429,0.492278


In [100]:
# OUR DATA

# Wide -> long: every column other than `meter_id` and `date` becomes one row
# with its column name in `hour` and its value in `load`.
hour_columns = load_df.columns.difference(["meter_id", "date"])
load_df = load_df.melt(
    id_vars=["meter_id", "date"],
    value_vars=hour_columns,
    var_name="hour",
    value_name="load",
)

# Column names carry an "h" marker around a 1-based hour number (presumably
# h1..h24 - verify against the CSV header); shift to a 0-based hour of the day.
hour_of_day = load_df["hour"].str.strip("h").astype(int) - 1
timestamp_text = load_df["date"] + " " + hour_of_day.astype(str) + ":00:00"

# Attach the parsed timestamp, force integer meter ids and drop the columns
# that the timestamp now replaces.
load_df = load_df.assign(
    timestamp=pd.to_datetime(timestamp_text, format="%m/%d/%Y %H:%M:%S"),
    meter_id=load_df["meter_id"].astype(int),
).drop(columns=["date", "hour"])
load_df

Unnamed: 0,meter_id,load,timestamp
0,1,3304.0,2005-01-01 00:00:00
1,1,2485.0,2005-01-02 00:00:00
2,1,2417.0,2005-01-03 00:00:00
3,1,2060.0,2005-01-04 00:00:00
4,1,1629.0,2005-01-05 00:00:00
...,...,...,...
9539107,499,878.0,2011-12-27 08:00:00
9539108,499,970.0,2011-12-28 08:00:00
9539109,499,1234.0,2011-12-29 08:00:00
9539110,499,1138.0,2011-12-30 08:00:00


In [101]:
# Long -> wide: one row per timestamp, one column per meter, cells hold the load.
df_pivoted = load_df.pivot(
    index="timestamp",
    columns="meter_id",
    values="load",
)
df_pivoted

meter_id,1,2,3,4,5,6,7,8,9,10,...,472,477,478,481,482,486,491,492,496,499
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01 00:00:00,3304.0,1278.0,3664.0,4221.0,2099.0,2214.0,2803.0,1070.0,945.0,3149.0,...,,,,,,,,,,
2005-01-01 01:00:00,3178.0,1263.0,3552.0,4011.0,2039.0,2140.0,2677.0,1044.0,897.0,3072.0,...,,,,,,,,,,
2005-01-01 02:00:00,2981.0,1190.0,3314.0,3871.0,1953.0,2046.0,2509.0,986.0,849.0,2870.0,...,,,,,,,,,,
2005-01-01 03:00:00,2944.0,1189.0,3261.0,3780.0,1939.0,2060.0,2446.0,990.0,851.0,2851.0,...,,,,,,,,,,
2005-01-01 04:00:00,2934.0,1170.0,3230.0,3808.0,1907.0,1999.0,2467.0,1007.0,851.0,2900.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-12-31 19:00:00,3416.0,1081.0,4067.0,,,1776.0,3374.0,1159.0,1034.0,3666.0,...,647.0,4057.0,0.0,5457.0,1781.0,,18396.0,38.0,3360.0,931.0
2011-12-31 20:00:00,3430.0,1036.0,4102.0,,,1675.0,3318.0,1181.0,1080.0,3764.0,...,616.0,4200.0,0.0,5356.0,1708.0,,17703.0,38.0,3408.0,869.0
2011-12-31 21:00:00,3388.0,1019.0,4025.0,,,1589.0,3304.0,1147.0,1066.0,3715.0,...,638.0,4318.0,0.0,5245.0,1658.0,,16947.0,38.0,3427.0,859.0
2011-12-31 22:00:00,3353.0,1008.0,3962.0,,,1538.0,3196.0,1159.0,1030.0,3724.0,...,652.0,4418.0,0.0,5095.0,1660.0,,16191.0,38.0,3370.0,864.0


In [102]:
# Number of missing (NaN) readings per meter column.
nan_values = df_pivoted.isna().sum(axis=0)
nan_values

meter_id
1          7
2          7
3          7
4      45506
5      23357
       ...  
486    59112
491    43082
492    43514
496    45242
499    55464
Length: 169, dtype: int64

In [103]:
# Meters missing more than this fraction of all timestamps are discarded.
MAX_NAN_FRACTION = 0.1

# Ids of the columns with more than 10% of nan values. The mask is computed once
# and reused for the drop (the original evaluated it twice and discarded the
# first result without displaying it).
sparse_meter_ids = nan_values[nan_values > MAX_NAN_FRACTION * len(df_pivoted)].index

# Hand-picked exclusions:
#   236     - has nan values in the test set
#   28, 453 - zeros and nans
# NOTE(review): the original comment for the last two named meters 13 and 144
# while the code dropped 28 and 453; the code is kept as-is - confirm the ids.
manually_excluded_meter_ids = [236, 28, 453]

# `drop` raises KeyError if a hand-picked id is not present any more, which also
# means this cell cannot be re-run on an already-filtered `df_pivoted`.
df_pivoted = (
    df_pivoted
    .drop(columns=sparse_meter_ids)
    .drop(columns=manually_excluded_meter_ids)
)
df_pivoted

meter_id,1,2,3,6,7,8,9,10,11,14,...,422,424,427,428,429,435,436,443,444,451
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01 00:00:00,3304.0,1278.0,3664.0,2214.0,2803.0,1070.0,945.0,3149.0,10577.0,52416.0,...,8018.0,659.0,21045.0,13358.0,3417.0,599.0,8727.0,1305.0,1699.0,2434.0
2005-01-01 01:00:00,3178.0,1263.0,3552.0,2140.0,2677.0,1044.0,897.0,3072.0,9854.0,49536.0,...,8016.0,619.0,19089.0,13027.0,3431.0,576.0,8394.0,1286.0,1677.0,2376.0
2005-01-01 02:00:00,2981.0,1190.0,3314.0,2046.0,2509.0,986.0,849.0,2870.0,9538.0,47419.0,...,7813.0,625.0,17577.0,12713.0,3359.0,563.0,7938.0,1307.0,1651.0,2266.0
2005-01-01 03:00:00,2944.0,1189.0,3261.0,2060.0,2446.0,990.0,851.0,2851.0,9600.0,46469.0,...,7527.0,653.0,16944.0,12569.0,3336.0,556.0,7860.0,1314.0,1629.0,2290.0
2005-01-01 04:00:00,2934.0,1170.0,3230.0,1999.0,2467.0,1007.0,851.0,2900.0,9775.0,46267.0,...,7529.0,635.0,16887.0,12886.0,3340.0,563.0,7985.0,1335.0,1668.0,2318.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-12-31 19:00:00,3416.0,1081.0,4067.0,1776.0,3374.0,1159.0,1034.0,3666.0,13423.0,68918.0,...,9545.0,1011.0,48085.0,14774.0,3499.0,1121.0,7291.0,1411.0,2256.0,3816.0
2011-12-31 20:00:00,3430.0,1036.0,4102.0,1675.0,3318.0,1181.0,1080.0,3764.0,13121.0,67493.0,...,9528.0,979.0,46100.0,14679.0,3480.0,1134.0,7121.0,1407.0,2198.0,3811.0
2011-12-31 21:00:00,3388.0,1019.0,4025.0,1589.0,3304.0,1147.0,1066.0,3715.0,12960.0,65664.0,...,9450.0,979.0,43880.0,14774.0,3514.0,1127.0,7088.0,1342.0,2138.0,3778.0
2011-12-31 22:00:00,3353.0,1008.0,3962.0,1538.0,3196.0,1159.0,1030.0,3724.0,12528.0,63648.0,...,9197.0,970.0,41801.0,14490.0,3494.0,1062.0,7024.0,1304.0,2138.0,3720.0


<span style="color:red"> TODO: check which series to exclude </span>


The aggregated time series are automatically constructed and should not be provided.

In [104]:
# Location of the tutorial's example aggregation matrix `S`.
S_csv = (
    "https://gist.githubusercontent.com/rshyamsundar/"
    "17084fd1f28021867bcf6f2d69d9b73a/raw/"
    "32780ca43f57a78f2d521a75e73b136b17f34a02/example_agg_mat.csv"
)
# Read it and keep only the underlying array.
S_example_frame = pd.read_csv(S_csv)
S = S_example_frame.values

S

array([[1, 1, 1, 1],
       [1, 1, 0, 0],
       [0, 0, 1, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1]])

In [105]:
# OUR DATA
#
# Build the aggregation matrix `S` for our meters together with `ts_mapping`,
# which maps every series name to its row index in `S`.
# Row order: "top", then one row per `aggregate` group, then one row per
# `mid_level` group, then one row per bottom-level meter.
# Each row has one entry per column of `df_pivoted`: 1 if that bottom-level
# series contributes to the row's series, 0 otherwise.

bottom_columns = df_pivoted.columns
num_bottom_ts = len(bottom_columns)
ts_mapping = {}
S_rows = []


def _register_series(name, row):
    """Append `row` to the matrix under `name` and record its row index."""
    ts_mapping[name] = len(S_rows)
    S_rows.append(row)


def _add_group_level(pairs):
    """Add one row per distinct group name and flag the meters it contains.

    `pairs` yields `(group_name, meter_id)`. Meters that are not columns of
    `df_pivoted` are ignored, so a group none of whose meters is present gets
    no row at all. A group name that is already registered is extended in place.
    """
    for group_name, meter_id in pairs:
        if meter_id not in bottom_columns:
            continue
        if group_name not in ts_mapping:
            _register_series(group_name, [0] * num_bottom_ts)
        S_rows[ts_mapping[group_name]][bottom_columns.get_loc(meter_id)] = 1


# add top time-series: the sum of every bottom-level series
_register_series("top", [1] * num_bottom_ts)

# `hierarchy_df` is read by position: (meter_id, mid_level, aggregate).
hierarchy_records = list(hierarchy_df.itertuples(index=False, name=None))

# add aggregated time-series
_add_group_level(
    (aggregate, meter_id) for meter_id, mid_level, aggregate in hierarchy_records
)
aggregate_end = len(S_rows)  # aggregate rows are S[1:aggregate_end]

# add mid_level time-series
_add_group_level(
    (mid_level, meter_id) for meter_id, mid_level, aggregate in hierarchy_records
)
mid_level_end = len(S_rows)  # mid_level rows are S[aggregate_end:mid_level_end]

# add bottom time-series: one identity row per column of df_pivoted
for position, name in enumerate(bottom_columns):
    if name not in ts_mapping:
        identity_row = [0] * num_bottom_ts
        identity_row[position] = 1
        _register_series(name, identity_row)

S = np.array(S_rows)

# Sanity checks: at each grouping level every bottom-level series must belong
# to exactly one group, i.e. the level's rows must add up to the "top" row.
# The boundaries recorded above are used instead of fixed row numbers so the
# checks stay valid when the set of kept meters (and hence groups) changes.
assert (S[1:aggregate_end].sum(axis=0) == S[0]).all()
assert (S[aggregate_end:mid_level_end].sum(axis=0) == S[0]).all()

print(S)

[[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 0 0]
 [0 0 0 ... 0 1 1]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]


In [106]:
# Inspect the mapping from series name to its row index in `S`.
ts_mapping

{'top': 0,
 'I003': 1,
 'I002': 2,
 'E001': 3,
 'E002': 4,
 'E004': 5,
 'E029': 6,
 'E009': 7,
 'E012': 8,
 'E014': 9,
 'E017': 10,
 'E018': 11,
 'E021': 12,
 'E022': 13,
 'E024': 14,
 'E025': 15,
 'E027': 16,
 'E028': 17,
 1: 18,
 2: 19,
 3: 20,
 6: 21,
 7: 22,
 8: 23,
 9: 24,
 10: 25,
 11: 26,
 14: 27,
 15: 28,
 23: 29,
 45: 30,
 46: 31,
 47: 32,
 48: 33,
 49: 34,
 51: 35,
 52: 36,
 53: 37,
 54: 38,
 55: 39,
 56: 40,
 57: 41,
 60: 42,
 61: 43,
 62: 44,
 63: 45,
 64: 46,
 65: 47,
 66: 48,
 67: 49,
 68: 50,
 69: 51,
 70: 52,
 71: 53,
 72: 54,
 73: 55,
 74: 56,
 77: 57,
 78: 58,
 79: 59,
 80: 60,
 81: 61,
 83: 62,
 85: 63,
 86: 64,
 88: 65,
 89: 66,
 102: 67,
 103: 68,
 104: 69,
 106: 70,
 112: 71,
 113: 72,
 114: 73,
 155: 74,
 156: 75,
 157: 76,
 158: 77,
 163: 78,
 164: 79,
 168: 80,
 187: 81,
 189: 82,
 190: 83,
 191: 84,
 192: 85,
 193: 86,
 195: 87,
 196: 88,
 197: 89,
 198: 90,
 199: 91,
 200: 92,
 202: 93,
 204: 94,
 205: 95,
 206: 96,
 207: 97,
 208: 98,
 209: 99,
 210: 100,
 2

In [107]:
# rename columns in df_pivoted to 1,2,3...
# NOTE: this relabels `df_pivoted` itself, so the meter ids are no longer its
# column labels afterwards. The cell that builds `S` looks meters up by id in
# `df_pivoted.columns` and therefore must not be re-run after this one without
# rebuilding `df_pivoted` first.
df_pivoted.columns = range(1, len(df_pivoted.columns) + 1)

# Make sure the dataframe has `PeriodIndex` by explicitly casting it to `PeriodIndex`.
df_pivoted2 = df_pivoted.to_period()

# Replace each NaN with the next valid observation of the same column.
# `.bfill()` is the supported spelling of the deprecated
# `fillna(method='backfill')` and fills exactly the same cells. NaNs at the very
# end of a column have no later observation and are left as NaN.
df_pivoted2 = df_pivoted2.bfill()


df_pivoted2

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,133,134,135,136,137,138,139,140,141,142
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01 00:00,3304.0,1278.0,3664.0,2214.0,2803.0,1070.0,945.0,3149.0,10577.0,52416.0,...,8018.0,659.0,21045.0,13358.0,3417.0,599.0,8727.0,1305.0,1699.0,2434.0
2005-01-01 01:00,3178.0,1263.0,3552.0,2140.0,2677.0,1044.0,897.0,3072.0,9854.0,49536.0,...,8016.0,619.0,19089.0,13027.0,3431.0,576.0,8394.0,1286.0,1677.0,2376.0
2005-01-01 02:00,2981.0,1190.0,3314.0,2046.0,2509.0,986.0,849.0,2870.0,9538.0,47419.0,...,7813.0,625.0,17577.0,12713.0,3359.0,563.0,7938.0,1307.0,1651.0,2266.0
2005-01-01 03:00,2944.0,1189.0,3261.0,2060.0,2446.0,990.0,851.0,2851.0,9600.0,46469.0,...,7527.0,653.0,16944.0,12569.0,3336.0,556.0,7860.0,1314.0,1629.0,2290.0
2005-01-01 04:00,2934.0,1170.0,3230.0,1999.0,2467.0,1007.0,851.0,2900.0,9775.0,46267.0,...,7529.0,635.0,16887.0,12886.0,3340.0,563.0,7985.0,1335.0,1668.0,2318.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-12-31 19:00,3416.0,1081.0,4067.0,1776.0,3374.0,1159.0,1034.0,3666.0,13423.0,68918.0,...,9545.0,1011.0,48085.0,14774.0,3499.0,1121.0,7291.0,1411.0,2256.0,3816.0
2011-12-31 20:00,3430.0,1036.0,4102.0,1675.0,3318.0,1181.0,1080.0,3764.0,13121.0,67493.0,...,9528.0,979.0,46100.0,14679.0,3480.0,1134.0,7121.0,1407.0,2198.0,3811.0
2011-12-31 21:00,3388.0,1019.0,4025.0,1589.0,3304.0,1147.0,1066.0,3715.0,12960.0,65664.0,...,9450.0,979.0,43880.0,14774.0,3514.0,1127.0,7088.0,1342.0,2138.0,3778.0
2011-12-31 22:00,3353.0,1008.0,3962.0,1538.0,3196.0,1159.0,1030.0,3724.0,12528.0,63648.0,...,9197.0,970.0,41801.0,14490.0,3494.0,1062.0,7024.0,1304.0,2138.0,3720.0


<span style="color:red"> TODO: check how NaN values are filled </span>


In [112]:
# `S` needs one row per series in the hierarchy and one column per bottom-level series.
n_series_rows, n_bottom_columns = S.shape
assert n_series_rows == len(ts_mapping)
assert n_bottom_columns == len(df_pivoted2.columns)

In [109]:
# Only the bottom-level series and `S` are supplied; the aggregated series are
# constructed from them.
hts = HierarchicalTimeSeries(ts_at_bottom_level=df_pivoted2, S=S)

In [110]:
# Preview the first five rows of the series at all levels of the hierarchy.
hts.ts_at_all_levels.head(5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,159
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-01 00:00,742959.0,111528.0,631431.0,15278.0,131815.0,10944.0,208877.0,23506.0,13702.0,12143.0,...,8018.0,659.0,21045.0,13358.0,3417.0,599.0,8727.0,1305.0,1699.0,2434.0
2005-01-01 01:00,710765.0,108101.0,602664.0,14751.0,124924.0,10512.0,201124.0,23003.0,13097.0,11286.0,...,8016.0,619.0,19089.0,13027.0,3431.0,576.0,8394.0,1286.0,1677.0,2376.0
2005-01-01 02:00,683586.0,103755.0,579831.0,13875.0,119974.0,9972.0,195041.0,22160.0,12485.0,10774.0,...,7813.0,625.0,17577.0,12713.0,3359.0,563.0,7938.0,1307.0,1651.0,2266.0
2005-01-01 03:00,674909.0,101552.0,573357.0,13741.0,117857.0,9972.0,193471.0,21539.0,12263.0,10671.0,...,7527.0,653.0,16944.0,12569.0,3336.0,556.0,7860.0,1314.0,1629.0,2290.0
2005-01-01 04:00,679809.0,100771.0,579038.0,13658.0,118095.0,10116.0,195394.0,21537.0,12291.0,10817.0,...,7529.0,635.0,16887.0,12886.0,3340.0,563.0,7985.0,1335.0,1668.0,2318.0


# Adding external dynamic features

In [None]:
# Location of the tutorial's example dynamic features.
dynamic_features_csv = (
    "https://gist.githubusercontent.com/rshyamsundar/"
    "d8e63bad43397c95a4f5daaa17e122f8/raw/"
    "a50657cf89f86d48cee41122f02cf5b1fcafdd2f/example_dynamic_features.csv"
)

# First CSV column becomes a parsed datetime index, which is then cast to a `PeriodIndex`.
dynamic_features_df = pd.read_csv(dynamic_features_csv, index_col=0, parse_dates=True)
dynamic_features_df = dynamic_features_df.to_period()

In [None]:
# Forecast horizon. It has to exist before the slice below: on a fresh kernel
# this cell runs before the estimator cell, which assigns the same value.
prediction_length = 24

# Training features: everything except the last `prediction_length` rows.
dynamic_features_df_train = dynamic_features_df.iloc[:-prediction_length, :]

In [None]:
# NOTE(review): `dynamic_features_df` is read from the tutorial's example gist,
# while `hts` is built from our own meters - confirm that the two cover the
# same time index before training on this dataset.
dataset_train = hts.to_dataset(
    feat_dynamic_real=dynamic_features_df_train,
)

# Model Training and Forecasting

In [None]:
from gluonts.mx.model.deepvar_hierarchical import DeepVARHierarchicalEstimator
from gluonts.mx.trainer import Trainer

# Forecast horizon passed to the estimator.
prediction_length = 24

# Short training run: two epochs only.
trainer = Trainer(epochs=2)

# The estimator receives the same aggregation matrix `S` that defines `hts`.
estimator = DeepVARHierarchicalEstimator(
    freq=hts.freq,
    prediction_length=prediction_length,
    trainer=trainer,
    S=S,
)
predictor = estimator.train(dataset_train)

In [None]:
# Prediction input uses the full (untruncated) dynamic features.
predictor_input = hts.to_dataset(
    feat_dynamic_real=dynamic_features_df,
)
forecast_it = predictor.predict(predictor_input)

# `forecast_it` holds a single element with the forecasts for every time series
# in the hierarchy, so one `next` call retrieves everything.
forecasts = next(forecast_it)

# Model Evaluation via Backtesting