In [None]:
#!pip install -q transformers

In [None]:
#!pip install -q datasets

In [None]:
#!pip install -q evaluate

In [None]:
#!pip install -q accelerate

In [None]:
#!pip install -q gluonts ujson

In [None]:
#!pip install -q lightning lightning[extra]

In [None]:
#!pip install -q scipy

In [None]:
#!pip install -q matplotlib

In [None]:
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error

from gluonts.dataset.pandas import PandasDataset
from gluonts.transform import (
    AddAgeFeature,
    AddObservedValuesIndicator,
    AddTimeFeatures,
    AsNumpyArray,
    Chain,
    ExpectedNumInstanceSampler,
    InstanceSplitter,
    RemoveFields,
    SelectFields,
    SetField,
    TestSplitSampler,
    Transformation,
    ValidationSplitSampler,
    VstackFeatures,
    RenameFields,
)
from gluonts.dataset.split import split, InputDataset, LabelDataset
from gluonts.transform.sampler import InstanceSampler

from gluonts.torch import SimpleFeedForwardEstimator, TemporalFusionTransformerEstimator
# from gluonts.mx import SimpleFeedForwardEstimator, Trainer

from gluonts.evaluation import make_evaluation_predictions

from lightning.pytorch.callbacks import Callback

In [None]:
# Load the training table. Every column named below is parsed as float32;
# columns that are not listed keep whatever dtype pandas infers.
df = pd.read_csv(
    "optiver-trading-at-the-close/train.csv",
    dtype=dict.fromkeys(
        [
            "seconds_in_bucket",
            "imbalance_size",
            "imbalance_buy_sell_flag",
            "reference_price",
            "matched_size",
            "far_price",
            "near_price",
            "bid_price",
            "bid_size",
            "ask_price",
            "ask_size",
            "wap",
            "target",
            "time_id",
        ],
        np.float32,
    ),
)
# NOTE: plain assignment, not a copy -- `raw_df` and `df` are the same object,
# so columns and the index set on `df` in later cells show up in `raw_df` too.
raw_df = df

In [None]:
# Preview the freshly loaded frame.
df

In [None]:
# `time_id` is loaded as float32, so its maximum is a NumPy float. Cast it to a
# plain int: `pd.date_range` expects an integer `periods` (newer pandas releases
# deprecate non-integer values), and the lookup below gets integer labels.
max_time_id = int(df["time_id"].max())
# One synthetic, evenly spaced minute per time_id in 0..max_time_id. The start
# date is arbitrary; only the regular spacing is used downstream.
dti_by_time_id = pd.date_range("2018-01-01", periods=max_time_id + 1, freq="min")
print(dti_by_time_id)
# Convert the timestamps to one-minute periods.
dti_by_time_id = dti_by_time_id.to_period("1min")
# Lookup series: label = time_id, value = the period assigned to that time_id.
dti_by_time_id_series = dti_by_time_id.to_series(index=np.arange(max_time_id + 1))
print(dti_by_time_id_series)

In [None]:
# Translate each row's time_id into its synthetic one-minute period via the lookup series.
df["timestamp_by_time_id"] = df["time_id"].map(dti_by_time_id_series)

In [None]:
# Duplicate the period column under the name "timestamp" ...
df["timestamp"] = df["timestamp_by_time_id"]
# ... and use it as the row index (the column itself is kept as well).
df.index = df["timestamp"]

In [None]:
# Preview the frame again, now carrying the timestamp columns and index.
df

In [None]:
# Group rows per stock; each group is turned into one series in a later cell.
df_grouped = df.groupby("stock_id")
print(df_grouped.ngroups)  # number of distinct stock_id groups
print(df_grouped.size())  # row count per stock_id

In [None]:
# One frame per stock, reindexed onto the full period grid so that every series
# spans the same timestamps (periods a stock has no row for become NaN rows).
# The `stock_id` column is dropped because the dict key already carries it.
dfs_dict = {
    stock_key: stock_frame.reindex(dti_by_time_id_series).drop(columns="stock_id")
    for stock_key, stock_frame in df_grouped
}

In [None]:
# Frequency string handed to PandasDataset; matches the one-minute period grid built above.
freq = "1min"

In [None]:
# Columns passed to PandasDataset as dynamic real-valued features.
feat_dynamic_real = ["imbalance_size", "reference_price", "matched_size"]
print(len(feat_dynamic_real), feat_dynamic_real)

In [None]:
# Zero-fill gaps in every column except the target, in place on each per-stock
# frame; missing target values are deliberately left as NaN.
for gdf in dfs_dict.values():
    non_target_columns = gdf.columns.difference(["target"])
    gdf[non_target_columns] = gdf[non_target_columns].fillna(0.0)

In [None]:
# Wrap the per-stock frames as a GluonTS dataset: one entry per dict key, with
# "target" as the forecast variable and the selected dynamic features.
dataset = PandasDataset(
    dfs_dict,
    target="target",
    feat_dynamic_real=feat_dynamic_real,
    freq=freq,
    assume_sorted=False,
)
print(dataset, len(dataset), sep="\n")

In [None]:
# Forecast horizon in time steps; also reused as min_future for the training sampler.
prediction_length = 1

In [None]:
# Number of trailing steps of every series that are held out for back-testing.
validation_length = 55 * 20
# validation_length = 10
# One window per `prediction_length` steps of the hold-out span, so the windows
# are not meant to overlap (relies on generate_instances' default spacing -- confirm).
validation_window_size = validation_length // prediction_length

# Training part: everything before the hold-out span. `test_gen` produces the
# (input, label) windows that cover the hold-out span.
training_data, test_gen = split(dataset, offset=-validation_length)
test_data = test_gen.generate_instances(
    prediction_length=prediction_length,
    windows=validation_window_size,
)

# Validation data for training: each series split at offset -1; the second
# element of the returned pair is not needed here.
val_data, _ = split(dataset, offset=-1)

In [None]:
# Alias for the training split, plus separate input / label views of the test windows.
train_dataset = training_data
test_data_input_dataset = InputDataset(test_data)
test_data_label_dataset = LabelDataset(test_data)
# Sanity check: sizes and reprs of the three datasets.
print(len(train_dataset), len(test_data_input_dataset), len(test_data_label_dataset))
print(train_dataset)
print(test_data_input_dataset)
print(test_data_label_dataset)

# NOTE: iterators are single-pass. Any cell that consumes one of these cannot be
# re-run on its own afterwards without re-running this cell first.
train_dataset_iter = iter(training_data)
test_data_input_dataset_iter = iter(test_data_input_dataset)
test_data_label_dataset_iter = iter(test_data_label_dataset)

In [None]:
# Inspect the validation split: number of series, repr, and the target shape of the first entry.
print(len(val_data))
print(val_data)
print(next(iter(val_data))["target"].shape)

In [None]:
# Show stock 0's target for the four rows straddling the position `validation_length` rows from the end.
raw_df[raw_df["stock_id"] == 0].iloc[-validation_length - 2:-validation_length + 2][["target"]]

In [None]:
class CustomBackTestSampler(InstanceSampler):
    """Sampler that selects the trailing positions of a series for back-testing.

    For a series whose last axis has length ``n`` the sampler returns the
    indices ``n - back_test_size + 1, ..., n - 1`` (``back_test_size - 1``
    positions). If the series is shorter than that, the range is clamped to
    start at 0 so that no negative index is ever produced.

    Parameters
    ----------
    back_test_size
        Size of the back-test span; must be positive.
    """

    # Annotated so the attribute is an explicitly typed field.
    # NOTE(review): InstanceSampler appears to be a pydantic model in GluonTS -- confirm.
    back_test_size: int = 0

    def __init__(self, back_test_size):
        """Create the sampler; ``back_test_size`` must be greater than zero."""
        super().__init__()
        assert back_test_size > 0, "back_test_size must be positive"
        self.back_test_size = back_test_size

    def __call__(self, ts: np.ndarray) -> np.ndarray:
        """Return the sampled indices along the last axis of ``ts``."""
        data_size = ts.shape[-1]
        # TODO: do not use split offset -1 above
        # Clamp at 0: a negative start would yield negative indices, which
        # index from the end of the series instead of marking a split point.
        first_index = max(data_size - self.back_test_size + 1, 0)
        return np.arange(first_index, data_size)

In [None]:
# Sampler used to draw training windows; min_future keeps a full forecast
# horizon available after every sampled position.
# NOTE(review): num_instances=1.0 presumably means one window per series per
# pass on average -- confirm against the GluonTS documentation.
train_sampler = ExpectedNumInstanceSampler(
    num_instances=1.0, min_future=prediction_length
)

In [None]:
# Validation sampler covering the trailing `validation_length` span of each series.
custom_back_test_sampler = CustomBackTestSampler(validation_length)
print(type(custom_back_test_sampler), custom_back_test_sampler)

In [None]:
# Number of past steps passed to the estimator as context.
# NOTE(review): 55 presumably is the number of time steps per trading day (so 2 days) -- confirm.
context_length = 55 * 2

In [None]:
# Training budget: values forwarded to the estimator / Lightning trainer below.
max_epochs = 1
batch_size = 256
num_batches_per_epoch = 100

In [None]:
class MAEPyTorchCallback(Callback):
    """Placeholder Lightning callback; its only hook is currently a no-op."""

    def on_train_epoch_end(self, trainer, pl_module):
        """No-op override of the ``on_train_epoch_end`` hook."""
        return None

In [None]:
# Feed-forward estimator configured with the horizon, context window, samplers
# and training budget defined in the cells above.
estimator = SimpleFeedForwardEstimator(
    prediction_length=prediction_length,
    context_length=context_length,
    trainer_kwargs={"max_epochs": max_epochs},  # forwarded to the Lightning trainer
    train_sampler=train_sampler,
    validation_sampler=custom_back_test_sampler,
    batch_size=batch_size,
    num_batches_per_epoch=num_batches_per_epoch,
)

In [None]:
# Train with `train_model` rather than `train`: the returned object exposes both
# the predictor and the underlying trainer, which later cells inspect.
train_output = estimator.train_model(
    training_data,
    validation_data=val_data,
    cache_data=True,
)
predictor = train_output.predictor

In [None]:
# Pull the trainer, its model and its validation dataloaders out of the
# training output for inspection.
trainer = train_output.trainer
trainer_model = trainer.model
trainer_val_dataloaders = trainer.val_dataloaders
print(trainer_model)
# Count the validation batches without keeping them all in memory.
print(sum(1 for _batch in trainer_val_dataloaders))

In [None]:
# Re-run validation on the trained model using the trainer's own validation dataloaders.
results = trainer.validate(model=trainer_model, dataloaders=trainer_val_dataloaders)

In [None]:
# Display what trainer.validate returned.
results

In [None]:
# Forecast from val_data (each series split at offset -1). NOTE: `forecasts` is
# reassigned by the test-window prediction cell further down.
forecasts = list(predictor.predict(val_data))
print(len(forecasts), forecasts[0])

In [None]:
# Forecast every back-test window; these forecasts are the ones scored below.
forecasts = list(predictor.predict(test_data_input_dataset))
print(len(forecasts), forecasts[0])

In [None]:
# Reduce each forecast to its median and flatten everything into one 1-D array.
forecasts_median = np.array([forecast.median for forecast in forecasts]).flatten()
print(forecasts_median.shape)

In [None]:
# Collect the observed targets of every back-test window into one 1-D array.
# Iterate the label dataset itself rather than a previously created one-shot
# iterator: an exhausted iterator would make a re-run of this cell silently
# produce an empty array.
ground_truth = np.array([d["target"] for d in test_data_label_dataset]).flatten()
print(ground_truth.shape)

In [None]:
# Score the model on the back-test windows and plot forecast vs. truth.
mae = mean_absolute_error(ground_truth, forecasts_median)
plt.scatter(forecasts_median, ground_truth)
# Label the axes so the figure is readable on its own.
plt.xlabel("forecast median")
plt.ylabel("ground truth")
plt.title(f"MAE: {mae}")

In [None]:
def calc_baseline_all_zero(ground_truth):
    """Score and plot the trivial baseline that predicts zero everywhere.

    Draws a scatter of the all-zero predictions against ``ground_truth`` on the
    current matplotlib figure and puts the mean absolute error (and the
    prediction shape) in the title.

    Parameters
    ----------
    ground_truth
        Array-like of observed target values.

    Returns
    -------
    None
    """
    # Accept lists/tuples as well as arrays; a no-op for ndarray input.
    ground_truth = np.asarray(ground_truth)
    predictions = np.zeros(ground_truth.shape)
    mae = mean_absolute_error(ground_truth, predictions)
    plt.scatter(predictions, ground_truth)
    # Label the axes so the figure is readable on its own.
    plt.xlabel("prediction (all zeros)")
    plt.ylabel("ground truth")
    plt.title(f"MAE: {mae}, shape: {predictions.shape}")

In [None]:
# All-zero baseline on the back-test ground truth, for comparison with the model MAE above.
calc_baseline_all_zero(ground_truth)

In [None]:
# Materialise the validation entries and gather the trailing targets of every
# series into one flat array, timing the whole step.
prev_time = time.time()
val_set_parsed_data = list(val_data)
# TODO: do not use offset split -1
# Last `validation_length - 1` target values of each series, stacked row-wise
# and then flattened.
val_set_ground_truth = np.vstack(
    [entry["target"][-validation_length + 1:] for entry in val_set_parsed_data]
).flatten()
curr_time = time.time()
print(val_set_ground_truth.shape, curr_time - prev_time)

In [None]:
# All-zero baseline on the validation-span ground truth.
calc_baseline_all_zero(val_set_ground_truth)

In [None]:
# Lagged baseline: the same trailing window as the validation ground truth, but
# shifted 6 steps earlier, so each step is "predicted" by the target observed
# 6 steps before it.
baseline_prev_predictions = np.vstack(
    [entry["target"][-validation_length + 1 - 6:-6] for entry in val_set_parsed_data]
).flatten()
print(baseline_prev_predictions.shape)

In [None]:
# Score the lagged baseline against the validation-span ground truth and plot it.
baseline_prev_mae = mean_absolute_error(val_set_ground_truth, baseline_prev_predictions)
plt.scatter(baseline_prev_predictions, val_set_ground_truth)
# Label the axes so the figure is readable on its own.
plt.xlabel("lagged-target prediction")
plt.ylabel("ground truth")
plt.title(f"MAE: {baseline_prev_mae}")