In [1]:
import pandas as pd
import numpy as np
import pickle
import sklearn
from sklearn.base import BaseEstimator, RegressorMixin
import ray
import torch

In [2]:
# Read the raw CSV into a DataFrame (absolute path inside the dev container workspace).
data = pd.read_csv(
    filepath_or_buffer="/workspaces/comed-pricing/data/raw_data.csv",
)

In [3]:
def get_x_y_splits(data, columns, targets, n_steps_in, n_steps_out, gap, include_target_in_X=False):
    """Slice a dataframe into sliding-window X and y sequences for training.

    Window ``i`` uses rows ``[i, i + n_steps_in)`` of the feature columns as
    the input and rows ``[i + n_steps_in + gap, i + n_steps_in + gap + n_steps_out)``
    of the target columns as the output. Windows advance one row at a time.

    Args:
        data: DataFrame holding the feature and target columns.
        columns: List of column names used as input features.
        targets: List of column names to predict.
        n_steps_in: Number of consecutive rows in each input window.
        n_steps_out: Number of consecutive rows in each output window.
        gap: Number of rows skipped between the end of the input window and
            the start of the output window.
        include_target_in_X: If True, the target columns are appended to the
            feature columns. They are appended even when already listed in
            ``columns``, in which case that column appears twice in X.

    Returns:
        Tuple ``(X, y)`` of float32 tensors with shapes
        ``(n_windows, n_steps_in, n_features)`` and
        ``(n_windows, n_steps_out, n_targets)``.

    Raises:
        ValueError: If ``data`` has fewer rows than a single window needs
            (``n_steps_in + gap + n_steps_out``).
    """

    # Include target column
    if include_target_in_X:
        columns = columns + targets

    complete_x_array = data[columns].to_numpy()
    complete_y_array = data[targets].to_numpy()

    # Rows consumed by one (input, gap, output) window.
    window_span = n_steps_in + gap + n_steps_out

    # The last valid window ends exactly at len(data), hence the "+ 1";
    # without it the final window would be silently dropped.
    n_windows = len(data) - window_span + 1
    if n_windows <= 0:
        raise ValueError(
            f"Not enough rows to build a window: got {len(data)}, "
            f"need at least {window_span} "
            f"(n_steps_in + gap + n_steps_out)."
        )

    # Pre-allocate arrays for performance
    X_shape = (n_windows, n_steps_in, complete_x_array.shape[1])
    y_shape = (n_windows, n_steps_out, complete_y_array.shape[1])

    X_arrays = np.empty(X_shape, dtype=np.float32)
    y_arrays = np.empty(y_shape, dtype=np.float32)

    for index in range(n_windows):
        starting_X_index = index
        ending_X_index = starting_X_index + n_steps_in
        starting_y_index = ending_X_index + gap
        ending_y_index = starting_y_index + n_steps_out

        # Assignment casts the slice to float32 to match the pre-allocated buffers.
        X_arrays[index] = complete_x_array[starting_X_index: ending_X_index]
        y_arrays[index] = complete_y_array[starting_y_index: ending_y_index]

    return torch.tensor(X_arrays, dtype=torch.float32), torch.tensor(y_arrays, dtype=torch.float32)


In [6]:
def preprocess(data, columns, targets, n_steps_in, n_steps_out, gap, include_target_in_X=False, resample_units=None):
    """Prepare a raw frame and convert it into X/y window tensors.

    Steps: parse the 'millisUTC' column as datetimes and use it as the index,
    optionally resample (mean per bin, bins labelled by their right edge),
    forward-fill missing 'price' values, drop the time index, and delegate
    the windowing to ``get_x_y_splits``.

    The input frame is not modified; all work happens on a copy, so the
    function can be called repeatedly on the same DataFrame.

    Args:
        data: DataFrame containing at least 'millisUTC' and 'price' columns.
        columns: List of feature column names passed to ``get_x_y_splits``.
        targets: List of target column names passed to ``get_x_y_splits``.
        n_steps_in: Rows per input window (counted after resampling).
        n_steps_out: Rows per output window (counted after resampling).
        gap: Rows skipped between input and output windows (after resampling).
        include_target_in_X: Forwarded to ``get_x_y_splits``.
        resample_units: Optional pandas offset string for resampling; when
            None the rows are used at their original spacing.

    Returns:
        The ``(X, y)`` pair produced by ``get_x_y_splits``.
    """
    # Copy first: the steps below would otherwise rewrite the caller's frame
    # (moving 'millisUTC' into the index), making a second call fail.
    frame = data.copy()

    # Convert the 'millisUTC' column to datetime format and set it as the index.
    # NOTE(review): no `unit` is given, so integer values would be interpreted
    # as nanoseconds; if the CSV stores epoch milliseconds as integers this
    # needs unit="ms" -- confirm against the raw data.
    frame['millisUTC'] = pd.to_datetime(frame['millisUTC'])
    frame = frame.set_index('millisUTC')

    # Resample dataset
    if resample_units is not None:
        frame = frame.resample(resample_units, label="right").mean()

    # Need a better way to handle missing values
    frame['price'] = frame['price'].ffill()

    # The windowing works on row positions only, so the time index is dropped.
    frame = frame.reset_index(drop=True)

    X, y = get_x_y_splits(
        frame,
        columns=columns,
        targets=targets,
        n_steps_in=n_steps_in,
        n_steps_out=n_steps_out,
        gap=gap,
        include_target_in_X=include_target_in_X,
    )

    return X, y

In [8]:
# Read the raw CSV fresh for this preprocessing run.
data = pd.read_csv("/workspaces/comed-pricing/data/raw_data.csv")

# Window sizes and the gap are counted in resampled rows (60-minute bins here).
X, y = preprocess(
    data,
    resample_units="60min",  # Resample values by 60 minutes ("T" alias is deprecated in pandas >= 2.2)
    columns=['price'],
    targets=['price'],
    n_steps_in=5,
    n_steps_out=10,
    gap=60,
    include_target_in_X=True,
)