In [1]:
import pandas as pd
import numpy as np
import pickle
import sklearn
from sklearn.base import BaseEstimator, RegressorMixin
import ray

In [15]:
def preprocess(data):
    # Convert the 'timestamp' column to datetime format and set it as the index
    data['millisUTC'] = pd.to_datetime(data['millisUTC'])
    
    data.set_index('millisUTC', inplace=True)
    
    # Resample dataset to 30 minutes
    data = data.resample("30T", label="right").mean()

    data = data.reset_index(drop=True)
    
    # Need a better way to handle missing values
    data['price'] = data['price'].ffill()

    return data

In [3]:
def split_dataset(data):
    # Split the data into training and testing sets
    train_size = int(0.8 * len(data))
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]

    return train_data, test_data


In [None]:
class NaiveModel(BaseEstimator, RegressorMixin):
    def __init__(self, value=3.083333):
        self.value = value

    def fit(self, X=data, y=None):
        return self

    def predict(self, data):
        return np.array([self.value] * len(data))

naive_model = NaiveModel()
naive_model = naive_model.fit()

In [None]:
def save_model(model, file_name, file_dir="/workspaces/comed-pricing/models/"):
    with open(f"{file_dir}/{file_name}", "wb") as file:
        pickle.dump(model, file)

def load_model(file_name, file_dir="/workspaces/comed-pricing/models/"):
    with open(f"{file_dir}/{file_name}", 'rb') as file:
        return pickle.load(file)

# Ray

In [26]:
data = pd.read_csv("/workspaces/comed-pricing/data/raw_data.csv")

# This is a bit strange to do
# We're reading the dataset using pandas, then splitting it, and then creating distributed datasets from pandas
train_data, test_data = split_dataset(data)

In [27]:
train_ds = ray.data.from_pandas(train_data)
test_ds = ray.data.from_pandas(test_data)

In [25]:
sample_ds = train_ds.map_batches(
    preprocess,
    batch_format="pandas"
)
sample_ds.show(1)

2023-09-05 17:13:31,662	INFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(preprocess)]
2023-09-05 17:13:31,663	INFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-09-05 17:13:31,664	INFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


{'price': 2.0500000000000003}


In [None]:
def load_data(num_samples=None):
    ds 