In [1]:
import os
import random
import torch
import pandas as pd
import numpy as np
from ray.data.preprocessor import Preprocessor
import ray


# Location of the raw pricing CSV; read by load_data() through ray.data.read_csv.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
DATASET_LOC = "/workspaces/comed-pricing/data/raw_data.csv"

In [2]:
def get_x_y_splits(data, columns, targets, n_steps_in, n_steps_out, gap, include_target_in_X=False):
    """Slice a dataframe into sliding (X, y) windows for sequence training.

    Window ``i`` takes rows ``[i, i + n_steps_in)`` of the feature columns as input and
    rows ``[i + n_steps_in + gap, i + n_steps_in + gap + n_steps_out)`` of the target
    columns as output. Consecutive windows are shifted by one row.

    Args:
        data: DataFrame holding every column named in ``columns`` and ``targets``.
        columns: List of feature column names used to build X.
        targets: List of target column names used to build y.
        n_steps_in: Number of rows in each input window.
        n_steps_out: Number of rows in each output window.
        gap: Number of rows skipped between the end of the input window and the
            start of the output window.
        include_target_in_X: When True, the target columns are appended to the
            feature columns of X.

    Returns:
        A ``(X, y)`` tuple of float32 tensors shaped
        ``(n_windows, n_steps_in, n_features)`` and ``(n_windows, n_steps_out, n_targets)``.
        ``n_windows`` is 0 when ``data`` is shorter than one full window.
    """
    feature_columns = list(columns)
    if include_target_in_X:
        # Targets are appended as-is: a target that is already listed in `columns`
        # shows up twice in X (the model's input size has to account for that).
        feature_columns = feature_columns + list(targets)

    complete_x_array = data[feature_columns].to_numpy()
    complete_y_array = data[list(targets)].to_numpy()

    window_length = n_steps_in + gap + n_steps_out
    # A frame with exactly `window_length` rows still holds one complete window, hence
    # the +1; clamp at 0 so a too-short frame yields empty tensors instead of an error.
    n_windows = max(len(data) - window_length + 1, 0)

    # Pre-allocate arrays for performance
    X_arrays = np.empty((n_windows, n_steps_in, complete_x_array.shape[1]), dtype=np.float32)
    y_arrays = np.empty((n_windows, n_steps_out, complete_y_array.shape[1]), dtype=np.float32)

    for index in range(n_windows):
        ending_X_index = index + n_steps_in
        starting_y_index = ending_X_index + gap
        ending_y_index = starting_y_index + n_steps_out

        X_arrays[index] = complete_x_array[index:ending_X_index]
        y_arrays[index] = complete_y_array[starting_y_index:ending_y_index]

    return torch.tensor(X_arrays, dtype=torch.float32), torch.tensor(y_arrays, dtype=torch.float32)


def preprocess(data, columns, targets, n_steps_in, n_steps_out, gap, include_target_in_X=False, resample_units=None):
    """Turn a raw price frame into windowed (X, y) tensors.

    Steps: index the rows by the 'millisUTC' timestamp, optionally resample to a
    coarser frequency (mean per bucket, labelled by the bucket's right edge),
    forward-fill gaps in 'price', then build sliding windows with get_x_y_splits.

    Args:
        data: DataFrame with a 'millisUTC' timestamp column and a 'price' column.
            It is not modified; all work happens on a copy.
        columns: List of feature column names passed to get_x_y_splits.
        targets: List of target column names passed to get_x_y_splits.
        n_steps_in: Rows per input window.
        n_steps_out: Rows per output window.
        gap: Rows skipped between input and output window.
        include_target_in_X: Whether the target columns are appended to X.
        resample_units: Optional pandas offset alias (e.g. "60min"); when None the
            rows are used at their original frequency.

    Returns:
        The ``(X, y)`` tensors produced by get_x_y_splits.
    """
    # Work on a copy: the caller's frame keeps its 'millisUTC' column, so the same
    # frame can be preprocessed more than once.
    frame = data.copy()

    timestamps = frame['millisUTC']
    if pd.api.types.is_numeric_dtype(timestamps):
        # Numbers are taken to be epoch milliseconds, as the column name suggests;
        # without `unit` pandas would read integers as nanoseconds.
        # NOTE(review): confirm against the actual contents of the raw CSV.
        frame['millisUTC'] = pd.to_datetime(timestamps, unit="ms")
    else:
        frame['millisUTC'] = pd.to_datetime(timestamps)
    frame = frame.set_index('millisUTC')

    # Resample dataset
    if resample_units is not None:
        frame = frame.resample(resample_units, label="right").mean()

    # TODO: need a better way to handle missing values. Forward-fill leaves any
    # leading NaNs (before the first observed price) in place.
    frame['price'] = frame['price'].ffill()
    frame = frame.reset_index(drop=True)

    return get_x_y_splits(
        frame,
        columns=columns,
        targets=targets,
        n_steps_in=n_steps_in,
        n_steps_out=n_steps_out,
        gap=gap,
        include_target_in_X=include_target_in_X,
    )



In [3]:
def set_seeds(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and current CUDA
    device), switches cuDNN to deterministic mode with benchmarking disabled, and
    exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Plain attribute assignment; no need to go through eval()/setattr strings.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Only affects interpreters started after this point; the hash seed of the
    # running process was fixed at startup.
    os.environ["PYTHONHASHSEED"] = str(seed)

In [4]:
def load_data(num_samples=None):
    """Read the raw CSV into a shuffled Ray dataset, optionally truncated.

    Args:
        num_samples: When truthy, only this many rows (taken after shuffling) are
            kept; otherwise the full shuffled dataset is returned.

    Returns:
        A Ray dataset built from DATASET_LOC.
    """
    # NOTE(review): rows are shuffled with a fixed seed, so their original order
    # is not preserved in the returned dataset.
    shuffled = ray.data.read_csv(DATASET_LOC).random_shuffle(seed=1234)
    if not num_samples:
        return shuffled
    return ray.data.from_items(shuffled.take(num_samples))

In [5]:
class CustomPreprocessor(Preprocessor):
    """Ray preprocessor that turns raw price rows into windowed training samples.

    There is nothing to learn from the data, so fitting is a no-op. Each pandas
    batch is run through preprocess() and returned as a DataFrame with one row
    per window: column "X" holds the input window and column "targets" the
    output window.

    NOTE(review): windows are built independently inside every batch Ray hands to
    `_transform_pandas`, so no window spans a batch boundary — confirm the batch
    size used by the transform is large enough for the chosen window settings.

    Args:
        resample_units: pandas offset alias used to resample the rows.
        columns: Feature column names.
        targets: Target column names.
        n_steps_in: Rows per input window.
        n_steps_out: Rows per output window.
        gap: Rows skipped between input and output window.
        include_target_in_X: Whether the target columns are appended to X.
    """

    def __init__(
        self,
        resample_units="60min",  # Resample values by 60 minutes
        columns=("price",),
        targets=("price",),
        n_steps_in=5,
        n_steps_out=10,
        gap=60,
        include_target_in_X=True,
    ):
        self.resample_units = resample_units
        self.columns = list(columns)
        self.targets = list(targets)
        self.n_steps_in = n_steps_in
        self.n_steps_out = n_steps_out
        self.gap = gap
        self.include_target_in_X = include_target_in_X

    def _fit(self, ds):
        """Nothing to fit; return the preprocessor unchanged."""
        return self

    def _transform_pandas(self, batch):
        """Window one pandas batch and return it as a DataFrame of samples."""
        X, y = preprocess(
            batch.copy(),  # never let preprocessing touch the batch Ray passed in
            resample_units=self.resample_units,
            columns=self.columns,
            targets=self.targets,
            n_steps_in=self.n_steps_in,
            n_steps_out=self.n_steps_out,
            gap=self.gap,
            include_target_in_X=self.include_target_in_X,
        )
        # Ray expects a DataFrame back from `_transform_pandas`, not a tuple of
        # tensors; each cell holds the ndarray for one window.
        return pd.DataFrame({"X": list(X.numpy()), "targets": list(y.numpy())})

In [6]:
import torch.nn as nn

In [7]:
class LSTM(torch.nn.Module):
    """Single-layer LSTM followed by a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size,  output_size=10):
        super().__init__()

        self.lstm = torch.nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True
        )
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, X, h=None):
        """Run the network on a batch of sequences.

        Args:
            X: Tensor shaped (batch, sequence, input_size), since the LSTM is
                built with ``batch_first=True``.
            h: Optional initial ``(h_0, c_0)`` state forwarded to the LSTM.

        Returns:
            ``(output, hidden_state)`` where ``output`` is (batch, output_size) and
            ``hidden_state`` is the LSTM's final ``(h_n, c_n)`` tuple.
        """
        output, hidden_state = self.lstm(X, h)
        # Only the last time step feeds the linear head
        last_hidden_state = output[:, -1]
        output = self.linear(last_hidden_state)
        return output, hidden_state

    @torch.inference_mode()
    def predict(self, batch):
        """Return predictions for ``batch`` without tracking gradients.

        Switches the module to eval mode and returns only the (batch, output_size)
        output of forward(); the hidden state is discarded.
        """
        self.eval()
        # forward() already applies the linear head and returns (output, state)
        output, _ = self(batch)
        return output


In [8]:
from ray.air import Checkpoint, session
from ray.air.config import CheckpointConfig, DatasetConfig, RunConfig, ScalingConfig
import ray.train as train
from ray.train.torch import TorchCheckpoint, TorchTrainer
import torch.nn.functional as F


In [9]:
def train_step(ds, batch_size, model, loss_fn, optimizer, inputs_key="X", targets_key="targets"):
    """Run one training epoch and return the mean batch loss.

    Args:
        ds: Dataset exposing ``iter_torch_batches(batch_size=...)``; each batch is
            indexed by column name.
        batch_size: Number of rows per batch.
        model: Module to train. It may return either the predictions or a tuple
            whose first element is the predictions (as LSTM.forward does).
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer updating ``model``'s parameters.
        inputs_key: Batch key holding the model inputs.
            NOTE(review): must match the dataset's input column — confirm.
        targets_key: Batch key holding the targets.

    Returns:
        Running mean of the per-batch loss as a float (0.0 for an empty dataset).
    """
    model.train()
    loss = 0.0
    ds_generator = ds.iter_torch_batches(batch_size=batch_size)
    for i, batch in enumerate(ds_generator):
        inputs, targets = batch[inputs_key], batch[targets_key]
        optimizer.zero_grad()  # reset gradients
        z = model(inputs)  # forward pass
        if isinstance(z, tuple):
            z = z[0]  # drop the recurrent state, keep the predictions
        if targets.shape != z.shape and targets.numel() == z.numel():
            # e.g. targets (batch, steps, 1) vs predictions (batch, steps): align
            # them so the loss does not silently broadcast
            targets = targets.reshape(z.shape)
        J = loss_fn(z, targets)  # define loss
        J.backward()  # backward pass
        optimizer.step()  # update weights
        loss += (J.detach().item() - loss) / (i + 1)  # cumulative loss
    return loss

In [10]:
def eval_step(ds, batch_size, model, num_classes, loss_fn, collate_fn=None, inputs_key="X", targets_key="targets"):
    """Evaluate the model on a dataset without updating it.

    Args:
        ds: Dataset exposing ``iter_torch_batches(batch_size=..., collate_fn=...)``;
            each batch is indexed by column name.
        batch_size: Number of rows per batch.
        model: Module to evaluate. It may return either the predictions or a tuple
            whose first element is the predictions (as LSTM.forward does).
        num_classes: When truthy, predictions are reduced to class indices with
            ``argmax`` over dim 1; when None/0 the raw model outputs are
            returned (regression).
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        collate_fn: Optional collate function forwarded to ``iter_torch_batches``.
        inputs_key: Batch key holding the model inputs.
            NOTE(review): must match the dataset's input column — confirm.
        targets_key: Batch key holding the targets.

    Returns:
        ``(loss, y_trues, y_preds)``: the running mean of the per-batch loss and
        the stacked targets and predictions as NumPy arrays.
    """
    model.eval()
    loss = 0.0
    y_trues, y_preds = [], []
    ds_generator = ds.iter_torch_batches(batch_size=batch_size, collate_fn=collate_fn)
    with torch.inference_mode():
        for i, batch in enumerate(ds_generator):
            inputs, targets = batch[inputs_key], batch[targets_key]
            z = model(inputs)
            if isinstance(z, tuple):
                z = z[0]  # drop the recurrent state, keep the predictions
            if targets.shape != z.shape and targets.numel() == z.numel():
                # e.g. targets (batch, steps, 1) vs predictions (batch, steps)
                targets = targets.reshape(z.shape)
            # Compare predictions with the targets (not with themselves)
            J = loss_fn(z, targets).item()
            loss += (J - loss) / (i + 1)
            predictions = torch.argmax(z, dim=1) if num_classes else z
            y_trues.extend(targets.cpu().numpy())
            y_preds.extend(predictions.cpu().numpy())
    return loss, np.vstack(y_trues), np.vstack(y_preds)

In [11]:
# Training loop
def train_loop_per_worker(config):
    """Training loop executed on every Ray Train worker.

    Builds the LSTM, trains it for ``num_epochs`` epochs on the "train" dataset
    shard, evaluates on the "val" shard after each epoch, and reports metrics
    plus a model checkpoint to the Ray session.

    Args:
        config: Dict with the keys "lr", "lr_factor", "lr_patience",
            "num_epochs", "batch_size", "input_size" and "hidden_size".
            Other keys (e.g. "dropout_p") are ignored: the LSTM built here has no
            dropout argument.
    """
    # Hyperparameters
    lr = config["lr"]
    lr_factor = config["lr_factor"]
    lr_patience = config["lr_patience"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    input_size = config["input_size"]
    hidden_size = config["hidden_size"]

    # Get datasets
    set_seeds()
    train_ds = session.get_dataset_shard("train")
    val_ds = session.get_dataset_shard("val")

    # Model
    model = LSTM(input_size=input_size, hidden_size=hidden_size)
    model = train.torch.prepare_model(model)

    # Training components
    # The targets are real-valued prices, so use a regression loss; BCE-with-logits
    # is only meaningful for targets in [0, 1].
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)

    # Training
    # Keep at least one row per batch even when there are more workers than rows per batch
    batch_size_per_worker = max(1, batch_size // session.get_world_size())
    for epoch in range(num_epochs):
        # Step
        train_loss = train_step(train_ds, batch_size_per_worker, model, loss_fn, optimizer)
        # Regression task: there are no classes, so pass None for `num_classes`
        val_loss, _, _ = eval_step(val_ds, batch_size_per_worker, model, None, loss_fn)
        scheduler.step(val_loss)

        # Checkpoint
        metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
        checkpoint = TorchCheckpoint.from_model(model=model)
        session.report(metrics, checkpoint=checkpoint)

In [13]:
# Hyperparameters handed to `train_loop_per_worker` through its `config` argument.
# `input_size` is the number of feature columns in X and `hidden_size` the LSTM state size.
train_loop_config = dict(
    dropout_p=0.5,
    lr=1e-4,
    lr_factor=0.8,
    lr_patience=3,
    num_epochs=10,
    batch_size=256,
    input_size=2,
    hidden_size=32,
)

# prefer to do a few less than total available CPU (1 for head node + 1 for background tasks)
num_workers = 1
resources_per_worker = dict(CPU=1, GPU=0)

# Cluster resources for the trainer; GPUs are requested only when the per-worker GPU count is non-zero
scaling_config = ScalingConfig(
    num_workers=num_workers,
    use_gpu=bool(resources_per_worker["GPU"]),
    resources_per_worker=resources_per_worker,
    _max_cpu_fraction_per_node=0.8,
)

In [14]:
def split_dataset(data, train_fraction=0.8):
    """Split a frame by row position into a leading and a trailing part.

    Rows are not shuffled: the first ``train_fraction`` of the rows form the
    first part and the remaining rows form the second part.

    Args:
        data: DataFrame to split.
        train_fraction: Share of rows (between 0 and 1) assigned to the first part.

    Returns:
        ``(train_data, held_out_data)`` as positional slices of ``data``.

    Raises:
        ValueError: If ``train_fraction`` is outside [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction!r}")

    train_size = int(train_fraction * len(data))
    return data.iloc[:train_size], data.iloc[train_size:]

In [16]:
# The CSV is read with pandas (not Ray) so it can be split by row order first;
# the Ray datasets are created from the two pandas splits afterwards.
# Reuse DATASET_LOC so the path is defined in exactly one place.
data = pd.read_csv(DATASET_LOC)
train_data, val_data = split_dataset(data)

In [18]:
# Wrap the pandas splits as Ray datasets
train_ds = ray.data.from_pandas(train_data)
val_ds = ray.data.from_pandas(val_data)

2023-09-05 19:23:59,834	INFO worker.py:1621 -- Started a local Ray instance.


In [None]:
# Preprocess: fit on the training split, then apply the same transform to the
# validation split so both datasets share one schema.
preprocessor = CustomPreprocessor()
train_ds = preprocessor.fit_transform(train_ds)
val_ds = preprocessor.transform(val_ds)


In [None]:
# Sanity-check the windowing directly on the pandas training split (no Ray involved)
X_train, y_train = preprocess(
    train_data.copy(),  # keep `train_data` untouched so this cell can be re-run
    resample_units="60min",  # Resample values by 60 minutes
    columns=['price'],
    targets=['price'],
    n_steps_in=5,
    n_steps_out=10,
    gap=60,
    include_target_in_X=True,
)
# Show the shapes rather than dumping the full tensors
X_train.shape, y_train.shape