In [1]:
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from scipy import stats

# Start pandarallel with 12 workers and progress bars enabled.
# NOTE(review): no pandarallel call (e.g. `parallel_apply`) is visible in the
# cells shown here -- confirm this initialization is still needed.
pandarallel.initialize(progress_bar=True, nb_workers=12)


def beta_dist_sampling(
    num_choices: int,
    alpha: float = 0.5,
    beta: float = 0.5,
):
    """Return sampling probabilities shaped like a Beta(alpha, beta) density.

    The Beta pdf is evaluated at the ``num_choices`` evenly spaced interior
    points ``k / (num_choices + 1)`` for ``k = 1 .. num_choices`` (the
    endpoints 0 and 1 are never evaluated), and the resulting densities are
    rescaled so that they sum to 1.

    Args:
        num_choices: Number of options that need a probability.
        alpha: First shape parameter forwarded to ``scipy.stats.beta.pdf``.
        beta: Second shape parameter forwarded to ``scipy.stats.beta.pdf``.

    Returns:
        A 1-D NumPy array of length ``num_choices`` whose entries sum to 1.
    """
    # Interior grid 1/(n+1), 2/(n+1), ..., n/(n+1).
    grid = np.arange(1, num_choices + 1) / (num_choices + 1)
    density = stats.beta.pdf(grid, alpha, beta)
    # Divide by the total so the densities form a valid probability vector.
    return density / density.sum()


def create_sliding_windows(
    df: pd.DataFrame, context_sizes: list[int] | None = None, pred_len: int = 64
) -> pd.DataFrame:
    if context_sizes is None:
        context_sizes = list(range(64, 1025, 64))

    windows = []
    curr_i = 0

    beta_dist_sampling_p = beta_dist_sampling(len(context_sizes), alpha=0.25, beta=0.01)
    while curr_i < len(df) - max(context_sizes) - pred_len:
        if len(context_sizes) > 1:
            context_len = np.random.choice(context_sizes, p=beta_dist_sampling_p)
        else:
            context_len = context_sizes[0]
        total_len = context_len + pred_len

        window_data = {"context_len": context_len}
        for col in df.columns:
            window_data[col] = df[col].iloc[curr_i : curr_i + total_len].to_numpy()

        windows.append(window_data)

        curr_i += total_len

    return pd.DataFrame(windows)


def split_dataset(
    df: pd.DataFrame,
    train_ratio: float = 0.70,
    eval_ratio: float = 0.20,
    pred_len: int = 64,
    context_len: int | None = None,
    seed: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Window a time-ordered frame and split the windows into train/eval/test.

    The frame is sorted by its ``timestamp`` column, cut into windows with
    ``create_sliding_windows``, the windows are shuffled, and the shuffled
    windows are divided by position into three consecutive groups.

    Args:
        df: Source frame; must contain a ``timestamp`` column. It is not
            modified.
        train_ratio: Fraction of windows assigned to the training split.
        eval_ratio: Fraction of windows assigned to the evaluation split; the
            remaining windows form the test split.
        pred_len: Prediction length forwarded to ``create_sliding_windows``.
        context_len: Fixed context length for every window. When ``None``,
            ``create_sliding_windows`` falls back to its default set of
            context sizes.
        seed: Seed for NumPy's global RNG and for the shuffle.

    Returns:
        ``(train, eval, test)`` DataFrames of windows.
    """
    # Seeds NumPy's *global* RNG (np.random.choice in create_sliding_windows
    # draws from it).
    np.random.seed(seed)
    # sort_values returns a new frame, so the result has to be rebound; calling
    # reset_index(inplace=True) on it would only modify a discarded temporary.
    df = df.sort_values("timestamp").reset_index(drop=True)

    # Wrapping None in a list would produce [None] and fail downstream, so
    # hand None through and let the window builder use its defaults.
    context_sizes = None if context_len is None else [context_len]

    windowed_df = (
        create_sliding_windows(df, pred_len=pred_len, context_sizes=context_sizes)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    n_samples = len(windowed_df)
    # int() truncates, so boundaries round toward zero.
    train_idx = int(n_samples * train_ratio)
    eval_idx = int(n_samples * (train_ratio + eval_ratio))

    # Copies detach the splits from windowed_df so callers can modify them
    # without touching (or getting warnings about) the parent frame.
    train_data = windowed_df.iloc[:train_idx].copy()
    eval_data = windowed_df.iloc[train_idx:eval_idx].copy()
    test_data = windowed_df.iloc[eval_idx:].copy()

    return train_data, eval_data, test_data


def save_splits(
    path: str,
    train: pd.DataFrame,
    eval: pd.DataFrame,
    test: pd.DataFrame,
):
    """Write the three splits to ``{path}/{train,eval,test}.parquet``.

    The ``context_len`` column is left out of the written files. The frames
    passed in are not modified, so the function can be called repeatedly on
    the same splits.

    Args:
        path: Directory the parquet files are written into.
        train: Training split; must contain a ``context_len`` column.
        eval: Evaluation split; must contain a ``context_len`` column.
        test: Test split; must contain a ``context_len`` column.

    Raises:
        KeyError: If a split has no ``context_len`` column.
    """
    for sample, name in zip([train, eval, test], ["train", "eval", "test"]):
        # Drop on a copy: an in-place drop would strip the column from the
        # caller's frames and make a second call fail with KeyError.
        to_save = sample.drop(columns=["context_len"])
        print(
            f"Saving {name} split - Examples: {to_save.shape[0]} Features: {to_save.shape[1]}"
        )
        to_save.to_parquet(
            f"{path}/{name}.parquet", index=False, engine="pyarrow", compression="zstd"
        )

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
def format_raw_electricity_load_data(
    raw_path: str = "LD2011_2014.txt",
    output_path: str = "power.parquet",
):
    """Convert the raw ElectricityLoadDiagrams20112014 text file to parquet.

    The raw file is ``;``-separated and its first, unnamed column is renamed
    to ``timestamp``. Text-typed value columns have ``,`` replaced by ``.``
    (decimal comma) before every non-timestamp column is cast to ``float64``.
    The result is written as a zstd-compressed parquet file.

    Args:
        raw_path: Path of the raw ``;``-separated text file.
        output_path: Path of the parquet file to write.

    Raises:
        ValueError: If the file has no column that ends up named
            ``timestamp``.
    """
    df = pd.read_csv(raw_path, sep=";", header=0, low_memory=False)
    # pandas names the header-less first column "Unnamed: 0".
    df = df.rename(columns={"Unnamed: 0": "timestamp"})

    columns = df.columns.tolist()
    columns.remove("timestamp")

    for column in columns:
        dtype = df[column].dtype
        # Text columns show up as `object` or as pandas' dedicated string
        # dtype depending on the pandas version/options; both need the
        # decimal comma fixed before they can be cast to float.
        if dtype == "object" or isinstance(dtype, pd.StringDtype):
            df[column] = df[column].str.replace(",", ".", regex=False)

    df = df.astype({col: "float64" for col in columns})
    df.to_parquet(output_path, index=False, engine="pyarrow", compression="zstd")


# Run the conversion so that "power.parquet" exists for the cell that reads it.
format_raw_electricity_load_data()

In [None]:
# Parquet file written by format_raw_electricity_load_data().
file = "power.parquet"
# Parameters used in the Powerformer paper for the electricity dataset.
context_len = 512
pred_len = 96

df = pd.read_parquet(file)
# Every column except "timestamp" is a value column.
columns = df.columns.tolist()
columns.remove("timestamp")
# Cast the value columns to pyarrow-backed float64.
df = df.astype({col: "float64[pyarrow]" for col in columns})
# Window the series with a fixed context length, shuffle the windows and split
# them using split_dataset's default ratios (0.70 train / 0.20 eval / rest test).
train_samples, eval_samples, test_samples = split_dataset(
    df, pred_len=pred_len, context_len=context_len, seed=42
)
# Writes train.parquet, eval.parquet and test.parquet into the current directory.
save_splits(".", train_samples, eval_samples, test_samples)

Saving train split - Examples: 161 Features: 371
Saving eval split - Examples: 45 Features: 371
Saving test split - Examples: 24 Features: 371
