## Hour

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import plotly.express as px


file_path = '/home/olga/ts_year_project/TimeSeriesMasters2023/eda_olga/energy_hourly_dataset_2012.csv'
df = pd.read_csv(file_path)
# df

In [2]:
def transform_to_custom_freq(df, freq):
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    # df.set_index('date', inplace=True)
    df_resampled = df.resample(freq).sum()
    # df_resampled.reset_index(inplace=True)
    
    return df_resampled

## week

In [3]:
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
selected_freq = 'W'
df_resampled_week = transform_to_custom_freq(df, selected_freq)
# df_resampled_week

## Daily

In [4]:
df.reset_index(inplace=True)
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
selected_freq = 'D'
df_resampled_day = transform_to_custom_freq(df, selected_freq)
# df_resampled_day

## Month

In [5]:
df.reset_index(inplace=True)
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
# df.reset_index(inplace=True)
selected_freq = 'M'
df_resampled_month = transform_to_custom_freq(df, selected_freq)
# df_resampled_month

In [6]:
from hierarchicalforecast.utils import aggregate
from hierarchicalforecast.methods import BottomUp, TopDown, MinTrace, ERM
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.evaluation import HierarchicalEvaluation

## Full aggregation

In [7]:
# df

In [8]:
# df_reduced = df.drop(columns=['index', 'id'])
df_reduced = df

In [9]:
df_reduced.reset_index(inplace=True)
df_reduced['date'] = pd.to_datetime(df_reduced['date'])
# Добавление столбцов 'hour', 'day' и 'week'
df_reduced['hour'] = df_reduced['date'].astype(str)
df_reduced['day'] = df_reduced['date'].dt.strftime('%Y-%m-%d %H:%M:%S').apply(lambda x: x.split(' ')[0]).astype(str)
df_reduced['week'] = df_reduced['date'].dt.to_period("W-SUN").apply(lambda x: x.start_time).astype(str)
df_reduced['month'] = df_reduced['date'].dt.strftime('%Y-%m').astype(str)

# Вывод результата
# df_reduced

In [10]:
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2184 entries, 0 to 2183
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2184 non-null   datetime64[ns]
 1   id      2184 non-null   int64         
 2   value   2184 non-null   float64       
 3   hour    2184 non-null   object        
 4   day     2184 non-null   object        
 5   week    2184 non-null   object        
 6   month   2184 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 119.6+ KB


In [11]:
df_reduced = df_reduced.rename(columns={'value': 'y', 'date': 'ds'})
# Create hierarchical structure and constraints
hierarchy_levels = [
    ['month'],
    ['month', 'week'],
    ['month', 'week', 'day'],
    ['month', 'week', 'day', 'hour']
]
Y_hier_df, S_df, tags = aggregate(df=df_reduced, spec=hierarchy_levels)
Y_hier_df = Y_hier_df.reset_index()
print('S_df.shape', S_df.shape)
print('Y_hier_df.shape', Y_hier_df.shape)
print("tags['month']", tags['month'])

S_df.shape (2294, 2184)
Y_hier_df.shape (8736, 3)
tags['month'] ['2012-01' '2012-02' '2012-03']




In [12]:
tags

{'month': array(['2012-01', '2012-02', '2012-03'], dtype=object),
 'month/week': array(['2012-01/2011-12-26', '2012-01/2012-01-02', '2012-01/2012-01-09',
        '2012-01/2012-01-16', '2012-01/2012-01-23', '2012-01/2012-01-30',
        '2012-02/2012-01-30', '2012-02/2012-02-06', '2012-02/2012-02-13',
        '2012-02/2012-02-20', '2012-02/2012-02-27', '2012-03/2012-02-27',
        '2012-03/2012-03-05', '2012-03/2012-03-12', '2012-03/2012-03-19',
        '2012-03/2012-03-26'], dtype=object),
 'month/week/day': array(['2012-01/2011-12-26/2012-01-01', '2012-01/2012-01-02/2012-01-02',
        '2012-01/2012-01-02/2012-01-03', '2012-01/2012-01-02/2012-01-04',
        '2012-01/2012-01-02/2012-01-05', '2012-01/2012-01-02/2012-01-06',
        '2012-01/2012-01-02/2012-01-07', '2012-01/2012-01-02/2012-01-08',
        '2012-01/2012-01-09/2012-01-09', '2012-01/2012-01-09/2012-01-10',
        '2012-01/2012-01-09/2012-01-11', '2012-01/2012-01-09/2012-01-12',
        '2012-01/2012-01-09/2012-01-13', '

In [13]:
# S_df

In [14]:
S_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2294 entries, 2012-01 to 2012-03/2012-03-26/2012-03-31/2012-03-31 23:00:01
Columns: 2184 entries, 2012-01/2011-12-26/2012-01-01/2012-01-01 00:00:01 to 2012-03/2012-03-26/2012-03-31/2012-03-31 23:00:01
dtypes: float32(2184)
memory usage: 19.1+ MB


In [15]:
# Y_hier_df

## We have the hierarchical setup ready, so let us build a set of baseline forecasts: we will use the last 7 days as test set.

In [16]:
FH=7

In [17]:
# Split train/test sets
Y_test_df  = Y_hier_df.groupby('unique_id').tail(FH)
Y_train_df = Y_hier_df.drop(Y_test_df.index)

Y_test_df = Y_test_df.set_index('unique_id')
Y_train_df = Y_train_df.set_index('unique_id')

Y_train_df.info(), Y_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5782 entries, 2012-01 to 2012-03/2012-03-26/2012-03-31
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ds      5782 non-null   datetime64[ns]
 1   y       5782 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 135.5+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 2954 entries, 2012-01 to 2012-03/2012-03-26/2012-03-31/2012-03-31 23:00:01
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ds      2954 non-null   datetime64[ns]
 1   y       2954 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 69.2+ KB


(None, None)

In [18]:
# Y_train_df

In [19]:
Y_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5782 entries, 2012-01 to 2012-03/2012-03-26/2012-03-31
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ds      5782 non-null   datetime64[ns]
 1   y       5782 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 135.5+ KB


In [20]:
# Создаем новый столбец со старым индексом
Y_test_df['unique_id'] = Y_test_df.index

# Сбрасываем индекс и заменяем его числовым индексом от 0 до длины датафрейма
Y_test_df.reset_index(drop=True, inplace=True)

# Устанавливаем новый индекс обратно на старый индекс
# Y_test_df.set_index('unique_id', inplace=True)
# Y_test_df

In [21]:
# Создаем новый столбец со старым индексом
Y_train_df['unique_id'] = Y_train_df.index

# Сбрасываем индекс и заменяем его числовым индексом от 0 до длины датафрейма
Y_train_df.reset_index(drop=True, inplace=True)
# Y_train_df

## попытка сделать предсказание без реконсиляции

In [22]:
metric = "mse"
stat_level = "month/week/day/hour"

In [23]:
Y_train_df.groupby("unique_id").tail(2), Y_test_df.groupby("unique_id").tail(2)

(                      ds      y                      unique_id
 735  2012-01-31 15:00:01  100.0                        2012-01
 736  2012-01-31 16:00:01  103.0                        2012-01
 1424 2012-02-29 15:00:01  106.0                        2012-02
 1425 2012-02-29 16:00:01  161.0                        2012-02
 2161 2012-03-31 15:00:01  114.0                        2012-03
 ...                  ...    ...                            ...
 5747 2012-03-29 16:00:01  107.0  2012-03/2012-03-26/2012-03-29
 5763 2012-03-30 15:00:01   99.0  2012-03/2012-03-26/2012-03-30
 5764 2012-03-30 16:00:01   99.0  2012-03/2012-03-26/2012-03-30
 5780 2012-03-31 15:00:01  114.0  2012-03/2012-03-26/2012-03-31
 5781 2012-03-31 16:00:01  103.0  2012-03/2012-03-26/2012-03-31
 
 [220 rows x 3 columns],
                       ds      y  \
 5    2012-01-31 22:00:01  105.0   
 6    2012-01-31 23:00:01   94.0   
 12   2012-02-29 22:00:01  109.0   
 13   2012-02-29 23:00:01   97.0   
 19   2012-03-31 22:00:01

In [24]:
import numpy as np
import pandas as pd

# compute base forecast no coherent
from statsforecast.models import (
    Naive,
    AutoARIMA,
    HoltWinters,
    CrostonClassic as Croston, 
    HistoricAverage,
    DynamicOptimizedTheta as DOT,
    SeasonalNaive,
    ETS,
    IMAPA,
    RandomWalkWithDrift,
    SeasonalExponentialSmoothing,
    SeasonalWindowAverage,
    SimpleExponentialSmoothing,
    TSB,
    WindowAverage,
    DynamicOptimizedTheta,
    AutoETS,
    AutoCES
)
from statsforecast.core import StatsForecast

#obtain hierarchical reconciliation methods and evaluation
from hierarchicalforecast.utils import aggregate
from hierarchicalforecast.methods import BottomUp, TopDown
from hierarchicalforecast.core import HierarchicalReconciliation

  from tqdm.autonotebook import tqdm


In [25]:
# Compute base Naive predictions
# Careful identifying correct data freq, this data quarterly 'Q'
# NG: Changed freq from Q to M since inut data is monthly
# NG: Changed model from Naive to ARIMA since Naive output is already reconciled without doing anything
SP = 5  
fcst = StatsForecast(df=Y_train_df,
                     models=[AutoARIMA(season_length=SP), AutoETS(season_length=SP)],
                     freq='H', n_jobs=-1)



In [26]:
Y_hat_df = fcst.forecast(h=FH, fitted=True)
Y_fitted_df = fcst.forecast_fitted_values()



In [27]:
Y_hat_df.groupby("unique_id").tail(2), Y_fitted_df.groupby("unique_id").tail(2)

(                                               ds   AutoARIMA     AutoETS
 unique_id                                                                
 2012-01                       2012-01-31 22:00:01  102.269241  102.999702
 2012-01                       2012-01-31 23:00:01  101.871391  102.999702
 2012-01/2011-12-26            2012-01-01 22:00:01   83.366928  109.000298
 2012-01/2011-12-26            2012-01-01 23:00:01   83.274742  109.000298
 2012-01/2011-12-26/2012-01-01 2012-01-01 22:00:01   83.366928  109.000298
 ...                                           ...         ...         ...
 2012-03/2012-03-26/2012-03-29 2012-03-29 23:00:01   96.397438  106.996399
 2012-03/2012-03-26/2012-03-30 2012-03-30 22:00:01   99.000000   99.000000
 2012-03/2012-03-26/2012-03-30 2012-03-30 23:00:01   99.000000   99.000000
 2012-03/2012-03-26/2012-03-31 2012-03-31 22:00:01   80.211235  103.001099
 2012-03/2012-03-26/2012-03-31 2012-03-31 23:00:01   79.289597  103.001099
 
 [220 rows x 3 columns]

In [28]:
# Y_hat_df

In [29]:
# Y_fitted_df

In [30]:
# You can select a reconciler from our collection
reconcilers = [BottomUp()] # MinTrace(method='mint_shrink')
hrec = HierarchicalReconciliation(reconcilers=reconcilers)

In [31]:
Y_rec_df = hrec.reconcile(Y_h=Y_hat_df, 
                          Y_df=Y_fitted_df,
                          S=S_df, tags=tags)
Y_rec_df.groupby('unique_id').head(2)

ValueError: Length of values (70070) does not match length of index (770)

In [32]:
# S_df

In [33]:
S_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2294 entries, 2012-01 to 2012-03/2012-03-26/2012-03-31/2012-03-31 23:00:01
Columns: 2184 entries, 2012-01/2011-12-26/2012-01-01/2012-01-01 00:00:01 to 2012-03/2012-03-26/2012-03-31/2012-03-31 23:00:01
dtypes: float32(2184)
memory usage: 19.2+ MB


In [34]:
# Y_hat_df

In [35]:
Y_hat_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 770 entries, 2012-01 to 2012-03/2012-03-26/2012-03-31
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   ds         770 non-null    datetime64[ns]
 1   AutoARIMA  770 non-null    float32       
 2   AutoETS    770 non-null    float32       
dtypes: datetime64[ns](1), float32(2)
memory usage: 34.2+ KB


In [36]:
# Y_fitted_df

In [37]:
from sklearn.metrics import mean_squared_error as mse

In [38]:
# Вычисляем MSE для AutoARIMA
mse_autoarima = mse(Y_fitted_df['y'], Y_fitted_df['AutoARIMA'])

# Вычисляем MSE для AutoETS
mse_autoets = mse(Y_fitted_df['y'], Y_fitted_df['AutoETS'])

print(f"MSE для AutoARIMA: {mse_autoarima}")
print(f"MSE для AutoETS: {mse_autoets}")

MSE для AutoARIMA: 177.22064208984375
MSE для AutoETS: 201.31861877441406


In [39]:
from sklearn.metrics import mean_absolute_error

# Фактические и прогнозные значения
actual_values = Y_fitted_df['y']
predicted_values = Y_fitted_df['AutoARIMA']  # или другой столбец с прогнозными значениями

# Вычисление MAE
mae = mean_absolute_error(actual_values, predicted_values)
print("MAE:", mae)

MAE: 7.881965


In [40]:
Y_fitted_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5782 entries, 2012-01 to 2012-03/2012-03-26/2012-03-31
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   ds         5782 non-null   datetime64[ns]
 1   y          5782 non-null   float32       
 2   AutoARIMA  5782 non-null   float32       
 3   AutoETS    5782 non-null   float32       
dtypes: datetime64[ns](1), float32(3)
memory usage: 287.1+ KB


In [41]:
tags

{'month': array(['2012-01', '2012-02', '2012-03'], dtype=object),
 'month/week': array(['2012-01/2011-12-26', '2012-01/2012-01-02', '2012-01/2012-01-09',
        '2012-01/2012-01-16', '2012-01/2012-01-23', '2012-01/2012-01-30',
        '2012-02/2012-01-30', '2012-02/2012-02-06', '2012-02/2012-02-13',
        '2012-02/2012-02-20', '2012-02/2012-02-27', '2012-03/2012-02-27',
        '2012-03/2012-03-05', '2012-03/2012-03-12', '2012-03/2012-03-19',
        '2012-03/2012-03-26'], dtype=object),
 'month/week/day': array(['2012-01/2011-12-26/2012-01-01', '2012-01/2012-01-02/2012-01-02',
        '2012-01/2012-01-02/2012-01-03', '2012-01/2012-01-02/2012-01-04',
        '2012-01/2012-01-02/2012-01-05', '2012-01/2012-01-02/2012-01-06',
        '2012-01/2012-01-02/2012-01-07', '2012-01/2012-01-02/2012-01-08',
        '2012-01/2012-01-09/2012-01-09', '2012-01/2012-01-09/2012-01-10',
        '2012-01/2012-01-09/2012-01-11', '2012-01/2012-01-09/2012-01-12',
        '2012-01/2012-01-09/2012-01-13', '

In [42]:
tags.info()

AttributeError: 'dict' object has no attribute 'info'

In [43]:
# You can select a reconciler from our collection
reconcilers = [
      BottomUp(),
      TopDown(method='forecast_proportions'),
      # TopDown(method='average_proportions'),
      # TopDown(method='proportion_averages'),
      MinTrace(method='ols'),
      # MinTrace(method='wls_var'),
      # MinTrace(method='mint_shrink'),
      # #ERM(method='reg_bu', lambda_reg=100) # Extremely inneficient
      ERM(method='closed')
]
hrec = HierarchicalReconciliation(reconcilers=reconcilers)

In [44]:
Y_rec_df = hrec.reconcile(Y_h=Y_hat_df, 
                          Y_df=Y_fitted_df,
                          S=S_df, tags=tags)
Y_rec_df.groupby('unique_id').head(FH)

ValueError: Length of values (70070) does not match length of index (770)

In [45]:
Y_hat_df.shape

(770, 3)

In [46]:
Y_hat_df.index.shape

(770,)