In [2]:
import pandas as pd 
from datetime import datetime, timedelta

In [13]:

# Load the ELIA export: semicolon-separated, first column used as the index
# and parsed as dates.
data = pd.read_csv(
    'ELIA_23August2024.csv',
    sep=';',
    index_col=0,
    parse_dates=True,
)

In [14]:
# Locate the first row (in file order) whose 'Total Load' is not NaN, then keep
# only the rows whose timestamp is strictly earlier than that one.
# NOTE(review): the strict `<` also discards the row at first_valid_index
# itself -- confirm that dropping this observation is intended.
first_valid_index = data['Total Load'].first_valid_index()
keep_mask = data.index < first_valid_index
data = data.loc[keep_mask]
data

Unnamed: 0_level_0,Resolution code,Total Load,Most recent forecast,Most recent P10,Most recent P90,Day-ahead 6PM forecast,Day-ahead 6PM P10,Day-ahead 6PM P90,Week-ahead forecast
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-08-23 04:15:00+02:00,PT15M,7378.27,7387.74,7206.74,7568.74,7530.12,7309.20,7751.03,
2024-08-23 04:00:00+02:00,PT15M,7361.58,7382.33,7201.47,7563.20,7502.57,7282.47,7722.68,
2024-08-23 03:45:00+02:00,PT15M,7215.68,7227.33,7046.09,7408.57,7485.72,7264.91,7706.53,
2024-08-23 03:30:00+02:00,PT15M,7273.88,7218.03,7037.02,7399.03,7478.08,7257.50,7698.66,
2024-08-23 03:15:00+02:00,PT15M,7407.23,7230.58,7049.28,7411.88,7486.94,7266.10,7707.79,
...,...,...,...,...,...,...,...,...,...
2015-01-01 01:00:00+01:00,PT15M,9755.00,9222.33,8880.23,9564.44,10130.36,9777.87,10482.86,9203.25
2015-01-01 00:45:00+01:00,PT15M,9821.78,9025.46,8432.07,9618.85,9216.09,8625.97,9806.20,9319.45
2015-01-01 00:30:00+01:00,PT15M,9952.87,9174.72,8571.51,9777.92,9367.66,8767.84,9967.47,9473.22
2015-01-01 00:15:00+01:00,PT15M,10051.28,9329.17,8715.80,9942.53,9526.54,8916.55,10136.53,9653.31


In [15]:
# Back-fill missing values: each NaN takes the next valid value further down
# the same column. DataFrame.bfill() is the supported spelling;
# fillna(method='bfill') is deprecated and emits a FutureWarning.
data = data.bfill()

  data = data.fillna(method='bfill')


To measure the performance of a persistence model, we create a new column that repeats the last available value, observed at minute 45 of the previous hour, for the next four steps. This way, the predictions for 05:00, 05:15, 05:30 and 05:45 will all be the value observed at 04:45.

In [16]:
# Persistence helper: look up 'Total Load' at minute 45 of the previous hour.
def get_previous_day_value(df, current_time):
    """Return the 'Total Load' recorded at minute 45 of the hour before ``current_time``.

    Despite its name, this function steps back one hour, not one day.

    Parameters
    ----------
    df : DataFrame with a 'Total Load' column, indexed by timestamp.
    current_time : timestamp for which the reference value is wanted.

    Returns
    -------
    The 'Total Load' entry at the reference timestamp, or None when that
    timestamp is not present in ``df.index``.
    """
    # Step back one hour, then pin the minute to 45 (e.g. 05:15 -> 04:15 -> 04:45).
    reference_time = (current_time - pd.DateOffset(hours=1)).replace(minute=45)
    if reference_time not in df.index:
        return None
    return df.loc[reference_time, 'Total Load']

# Build the persistence forecast column: each timestamp in the index is passed
# to get_previous_day_value and the returned load is stored next to the actual one.
timestamps = data.index.to_series()
data['Total Load Persistence'] = timestamps.apply(lambda ts: get_previous_day_value(data, ts))

# Sanity check: actual load and persistence forecast for the first ten rows.
print(data[['Total Load','Total Load Persistence']].head(10))

                           Total Load  Total Load Persistence
Datetime                                                     
2024-08-23 04:15:00+02:00     7378.27                 7215.68
2024-08-23 04:00:00+02:00     7361.58                 7215.68
2024-08-23 03:45:00+02:00     7215.68                 7331.02
2024-08-23 03:30:00+02:00     7273.88                 7331.02
2024-08-23 03:15:00+02:00     7407.23                 7331.02
2024-08-23 03:00:00+02:00     7386.33                 7331.02
2024-08-23 02:45:00+02:00     7331.02                 7186.31
2024-08-23 02:30:00+02:00     7311.89                 7186.31
2024-08-23 02:15:00+02:00     7233.88                 7186.31
2024-08-23 02:00:00+02:00     7282.41                 7186.31


In [17]:
# Discard every row that still has a missing value in any column.
# NOTE(review): this also drops rows whose only NaN is in a column other than
# 'Total Load' / 'Total Load Persistence' -- confirm that is intended.
data = data.dropna(axis=0, how='any')

In [20]:
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, root_mean_squared_error

# Earliest and latest timestamps present in the evaluation data.
starting_hour = data.index.min()
ending_hour = data.index.max()

# One column per forecast horizon, one row per error metric.
metrics = pd.DataFrame(
    index=['rmse','mse','mape','mae'],
    columns=['1-step-ahead','2-steps-ahead','3-steps-ahead','4-steps-ahead'],
)

# The persistence value is taken at minute 45, so the minute of each timestamp
# gives its horizon: :00 is one step ahead, :15 two, :30 three, :45 four.
minute_of_hour = data.index.map(lambda ts: ts.minute)
one_step = data.loc[minute_of_hour == 0]
two_steps = data.loc[minute_of_hour == 15]
three_steps = data.loc[minute_of_hour == 30]
four_steps = data.loc[minute_of_hour == 45]

for step, step_data in enumerate((one_step, two_steps, three_steps, four_steps)):
    y_true, y_hat = step_data['Total Load'], step_data['Total Load Persistence']

    mse = mean_squared_error(y_true, y_hat)
    rmse = root_mean_squared_error(y_true, y_hat)
    mae = mean_absolute_error(y_true, y_hat)
    mape = mean_absolute_percentage_error(y_true, y_hat)

    # Order must match the metrics index: rmse, mse, mape, mae.
    metrics.iloc[:, step] = [rmse, mse, mape, mae]


In [21]:
metrics

Unnamed: 0,1-step-ahead,2-steps-ahead,3-steps-ahead,4-steps-ahead
rmse,135.196172,226.247432,308.989001,385.832047
mse,18278.004867,51187.900338,95474.202756,148866.368247
mape,0.010833,0.018201,0.024934,0.031287
mae,102.548216,173.040749,237.5281,297.761971
