In [None]:
# Mount Google Drive at /content/drive so the dataset stored under MyDrive is
# readable from this runtime. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Load the merged Hungary dataset from the mounted Drive. The `datetime` column
# arrives as plain strings here and is parsed in a later cell.
data = pd.read_csv("/content/drive/MyDrive/dl_project/hungary_merged_dataset_2015_2024.csv")

In [None]:
# Preview the first rows to eyeball the columns; note that the lag / rolling
# features are NaN at the very start of the series.
data.head()

Unnamed: 0,datetime,load_MW,load_MW_lag_48h,load_MW_lag_72h,load_MW_lag_96h,load_MW_lag_120h,load_MW_lag_144h,load_MW_lag_168h,load_MW_rolling_mean_48h,load_MW_rolling_std_48h,...,datetime_local,date,year,month,day,day_of_week,day_name,day_of_year,hour_local,minute
0,2015-01-01 00:00:00+00:00,4164.73,,,,,,,,,...,2015-01-01 01:00:00+01:00,2015-01-01,2015,1,1,3,Thursday,1,1,0
1,2015-01-01 00:15:00+00:00,4106.2,,,,,,,,,...,2015-01-01 01:15:00+01:00,2015-01-01,2015,1,1,3,Thursday,1,1,15
2,2015-01-01 00:30:00+00:00,4053.31,,,,,,,,,...,2015-01-01 01:30:00+01:00,2015-01-01,2015,1,1,3,Thursday,1,1,30
3,2015-01-01 00:45:00+00:00,3952.49,,,,,,,,,...,2015-01-01 01:45:00+01:00,2015-01-01,2015,1,1,3,Thursday,1,1,45
4,2015-01-01 01:00:00+00:00,3863.72,,,,,,,,,...,2015-01-01 02:00:00+01:00,2015-01-01,2015,1,1,3,Thursday,1,2,0


In [None]:
# Parse the timestamp strings into a timezone-aware UTC column.
# utc=True guarantees a datetime64[ns, UTC] dtype even if the CSV ever contains
# naive or mixed-offset timestamps, so comparing the column against a tz-aware
# UTC pd.Timestamp cannot fail with a tz-naive/tz-aware mismatch. For inputs
# that already carry a +00:00 offset the result is unchanged.
data["datetime"] = pd.to_datetime(data["datetime"], utc=True)

In [None]:
# Rows at or after this instant are excluded from the pre-COVID dataset.
# The timestamp is tz-aware (UTC) so it can be compared with the UTC `datetime` column.
PANDEMIC_CUTOFF = pd.Timestamp("2020-03-01", tz="UTC")
# Number of days trimmed from the start of the series. The first rows have NaN
# lag / rolling features; presumably 14 days is enough for the longest of them
# to be populated - TODO confirm against the feature-engineering step.
WARMUP_DAYS = 14

In [None]:
# Keep only the observations strictly before the pandemic cutoff. The copy
# detaches the result from `data`, so later edits never write through to it.
df_pre_covid = data.loc[data["datetime"] < PANDEMIC_CUTOFF, :].copy()

In [None]:
# Drop the first WARMUP_DAYS of the pre-COVID period.
#
# Both the start date and the final frame are derived from `data`, never from
# `df_pre_covid` itself. The previous version rebuilt `df_pre_covid` from its
# own trimmed copy, so every re-run of this cell pushed `start_date` forward by
# another WARMUP_DAYS and silently discarded more rows. This form is idempotent
# and yields the same rows (and the same index) as a single run of the old one.
pre_covid_mask = data["datetime"] < PANDEMIC_CUTOFF
start_date = data.loc[pre_covid_mask, "datetime"].min() + pd.Timedelta(days=WARMUP_DAYS)

df_pre_covid = data[pre_covid_mask & (data["datetime"] >= start_date)].copy()

In [None]:
# Report the covered time span before and after trimming, and how many rows
# the pre-COVID / warm-up filters removed.
print("Original range:")
print(f'{data["datetime"].min()} → {data["datetime"].max()}')

print("\nPre-COVID range:")
print(f'{df_pre_covid["datetime"].min()} → {df_pre_covid["datetime"].max()}')

print(f"\nRows kept: {len(df_pre_covid)}")
print(f"Rows dropped: {len(data) - len(df_pre_covid)}")


Original range:
2015-01-01 00:00:00+00:00 → 2024-12-31 23:45:00+00:00

Pre-COVID range:
2015-01-15 00:00:00+00:00 → 2020-02-29 23:45:00+00:00

Rows kept: 179712
Rows dropped: 170976


In [None]:
# Number of consecutive rows in each input window. The data preview shows rows
# 15 minutes apart, so 24 rows would span 6 hours - confirm the spacing holds
# for the whole series.
WINDOW_SIZE = 24
# Column the model predicts.
TARGET_COL = "load_MW"
# Timestamp column used for filtering and chronological ordering.
TIME_COL = "datetime"
# Optional grouping column; None means the frame is handled as a single series.
COUNTRY_COL = None

In [None]:
# First pass at the feature list: every column except the timestamp, the
# target and (when it is an actual column) the grouping column.
exclude_cols = {TIME_COL, TARGET_COL}
exclude_cols |= {COUNTRY_COL} if COUNTRY_COL in df_pre_covid.columns else set()

feature_cols = list(filter(lambda name: name not in exclude_cols, df_pre_covid.columns))

print(f"Number of features: {len(feature_cols)}")
print(feature_cols[:10])


Number of features: 77
['load_MW_lag_48h', 'load_MW_lag_72h', 'load_MW_lag_96h', 'load_MW_lag_120h', 'load_MW_lag_144h', 'load_MW_lag_168h', 'load_MW_rolling_mean_48h', 'load_MW_rolling_std_48h', 'load_MW_rolling_min_48h', 'load_MW_rolling_max_48h']


In [None]:
import numpy as np

def build_sliding_windows(
    df,
    feature_cols,
    target_col,
    window_size,
    country_col=None,
    time_col="datetime",
):
    """Turn a time-ordered frame into (feature window, next-row target) pairs.

    Each group (or the whole frame when no grouping column applies) is sorted
    by ``time_col``. Window ``i`` contains the feature rows
    ``i .. i + window_size - 1`` and is paired with ``target_col`` at row
    ``i + window_size``, i.e. the row immediately after the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain ``time_col``, ``target_col`` and every
        column in ``feature_cols``.
    feature_cols : list of str
        Columns stacked into the last axis of ``X``. They must be castable
        to float32.
    target_col : str
        Column providing ``y``.
    window_size : int
        Number of consecutive rows per window; must be >= 1.
    country_col : str or None, optional
        When given and present in ``df``, windows are built independently
        per group so that no window spans two groups. Groups are emitted one
        after another, so the result is time-ordered only within a group.
    time_col : str, optional
        Column used to order the rows. It is an explicit parameter so the
        function does not depend on a notebook-level global.

    Returns
    -------
    X : numpy.ndarray
        float32, shape ``(n_windows, window_size, len(feature_cols))``.
    y : numpy.ndarray
        float32, shape ``(n_windows,)``.
        When no group has more than ``window_size`` rows, both arrays are
        empty but keep these ranks, so downstream shape-dependent code works.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        # A non-positive size would produce empty or wrapped-around windows.
        raise ValueError(f"window_size must be >= 1, got {window_size!r}")

    if country_col and country_col in df.columns:
        groups = df.groupby(country_col)
    else:
        groups = [(None, df)]

    X_parts, y_parts = [], []
    for _, gdf in groups:
        gdf = gdf.sort_values(time_col)

        n_windows = len(gdf) - window_size
        if n_windows <= 0:
            # Not enough rows for even one (window, target) pair.
            continue

        values_X = gdf[feature_cols].to_numpy()
        values_y = gdf[target_col].to_numpy()

        # Zero-copy view of every length-`window_size` run of rows. The window
        # axis is appended last: (n_rows - window_size + 1, n_features, window_size).
        windows = np.lib.stride_tricks.sliding_window_view(
            values_X, window_size, axis=0
        )
        # The final window has no following row to serve as target, so drop it,
        # then move the window axis next to the sample axis.
        windows = np.moveaxis(windows[:n_windows], -1, 1)

        # The cast materialises the view as a C-contiguous float32 array.
        X_parts.append(windows.astype(np.float32, order="C"))
        y_parts.append(values_y[window_size:].astype(np.float32))

    if not X_parts:
        return (
            np.empty((0, window_size, len(feature_cols)), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
        )

    if len(X_parts) == 1:
        # Single series: skip the concatenate so the large array is not copied twice.
        return X_parts[0], y_parts[0]

    return np.concatenate(X_parts), np.concatenate(y_parts)


In [None]:
# Timestamp / date columns that must never reach the model as features.
NON_NUMERIC_COLS = {"datetime", "datetime_local", "date"}

# Rebuild the exclusion set: ordering column, target, the columns above and
# the grouping column when it is an actual column of the frame.
exclude_cols = NON_NUMERIC_COLS.union({TIME_COL, TARGET_COL})
exclude_cols |= {COUNTRY_COL} if COUNTRY_COL in df_pre_covid.columns else set()

feature_cols = list(filter(lambda name: name not in exclude_cols, df_pre_covid.columns))


In [None]:
# Keep only the candidates whose dtype is a NumPy numeric type; anything else
# (e.g. string columns) cannot be cast into the float32 window tensor.
feature_cols = [
    name
    for name, col_dtype in df_pre_covid.dtypes.loc[feature_cols].items()
    if np.issubdtype(col_dtype, np.number)
]

print(f"Final feature count: {len(feature_cols)}")
print(feature_cols)

Final feature count: 66
['load_MW_lag_48h', 'load_MW_lag_72h', 'load_MW_lag_96h', 'load_MW_lag_120h', 'load_MW_lag_144h', 'load_MW_lag_168h', 'load_MW_rolling_mean_48h', 'load_MW_rolling_std_48h', 'load_MW_rolling_min_48h', 'load_MW_rolling_max_48h', 'load_MW_rolling_mean_72h', 'load_MW_rolling_std_72h', 'load_MW_rolling_min_72h', 'load_MW_rolling_max_72h', 'load_MW_rolling_mean_96h', 'load_MW_rolling_std_96h', 'load_MW_rolling_min_96h', 'load_MW_rolling_max_96h', 'load_MW_rolling_mean_120h', 'load_MW_rolling_std_120h', 'load_MW_rolling_min_120h', 'load_MW_rolling_max_120h', 'load_MW_rolling_mean_144h', 'load_MW_rolling_std_144h', 'load_MW_rolling_min_144h', 'load_MW_rolling_max_144h', 'load_MW_rolling_mean_168h', 'load_MW_rolling_std_168h', 'load_MW_rolling_min_168h', 'load_MW_rolling_max_168h', 'temperature_2m', 'hdd', 'cdd', 'hour_sin', 'hour_cos', 'q_of_h_sin', 'q_of_h_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'is_dst', 'is_holiday', 'is_we

In [None]:
# Window the whole pre-COVID frame: each sample is WINDOW_SIZE consecutive
# feature rows, labelled with the target of the row right after the window.
X_all, y_all = build_sliding_windows(
    df_pre_covid,
    feature_cols,
    TARGET_COL,
    WINDOW_SIZE,
    country_col=COUNTRY_COL,
)

print(f"X shape: {X_all.shape}")
print(f"y shape: {y_all.shape}")


X shape: (179688, 24, 66)
y shape: (179688,)


In [None]:
# Positional 70 / 15 / 15 split with no shuffling. With a single series the
# windows are in time order, so validation and test follow training in time.
N = len(X_all)
train_end, val_end = int(0.70 * N), int(0.85 * N)

# np.split cuts at the two boundaries and returns views, exactly like slicing.
X_train, X_val, X_test = np.split(X_all, [train_end, val_end])
y_train, y_val, y_test = np.split(y_all, [train_end, val_end])

print(f"Train: {X_train.shape}")
print(f"Val:   {X_val.shape}")
print(f"Test:  {X_test.shape}")


Train: (125781, 24, 66)
Val:   (26953, 24, 66)
Test:  (26954, 24, 66)


In [None]:
# `os` is imported here because no earlier cell imports it; without this line
# a fresh kernel (Restart & Run All) fails with NameError on os.makedirs.
import os

SAVE_DIR = "/content/drive/MyDrive/dl_project/data_processed_pre_covid"
os.makedirs(SAVE_DIR, exist_ok=True)

# One .npy file per array. The suffix is derived from WINDOW_SIZE so the file
# name can never disagree with the window length actually stored inside
# (for WINDOW_SIZE = 24 the names are the same "*_ws24.npy" as before).
for _name, _array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
    ("X_test", X_test),
    ("y_test", y_test),
):
    np.save(os.path.join(SAVE_DIR, f"{_name}_ws{WINDOW_SIZE}.npy"), _array)

print("Saved sliding windows to:", SAVE_DIR)

Saved sliding windows to: /content/drive/MyDrive/dl_project/data_processed_pre_covid


In [None]:
# Sanity-check every split, not only the training one: the validation and test
# arrays are saved and consumed downstream too, and a NaN in any of them would
# otherwise go unnoticed.
for _name, _split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"NaNs in {_name}:", np.isnan(_split).any())

# Range of the training target, as a quick plausibility check of the load values.
print("y_train stats:", y_train.min(), y_train.mean(), y_train.max())

NaNs in X_train: False
NaNs in y_train: False
y_train stats: 2921.46 4831.5635 6591.94
