In [1]:
import datetime
import os

import dill
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import TensorDataset, DataLoader

from PriceFNN import PriceFNN
from shap_flow_util import read_csv_between

In [None]:
# Version tag that is interpolated into the data, scaler and model paths below.
version = 'v5'

# Local calendar date of this run, formatted as YYYY-MM-DD.
date = datetime.date.today().isoformat()

# (start_date, end_date) pairs; the training loop below loads the data
# between each pair and trains a separate family of models for it.
periods = [
    ('2015-01-08', '2021-09-30'),
    ('2021-10-01', '2023-12-31'),
    ('2015-01-08', '2023-12-31'),
]

# For every date range in `periods`:
#   1. load features X and targets y restricted to that range,
#   2. split them block-wise into train / validation / test sets and save the
#      unscaled splits as CSV,
#   3. fit min-max scalers on the training split and pickle them,
#   4. train one PriceFNN per entry of `hidden_layer_sizes` and save its weights.

RANDOM_SEED = 42   # used for the block split and for seeding torch before each model
block_size = "4D"  # rows are grouped into 4-day bins; a whole bin goes to a single split

# (l1_size, l2_size) combinations of the two hidden layers to train.
hidden_layer_sizes = [(15, 15),
                      (10, 15),
                      (30, 15),
                      (10, 10),
                      (10, 5),
                      (15, 10),
                      (15, 5)]

for start_date, end_date in periods:
    model_name = 'pytorch_start_{}_end_{}'.format(start_date, end_date)
    X = read_csv_between('./data/{}/X_full.csv'.format(version), start_date, end_date)
    y = read_csv_between('./data/{}/y_full.csv'.format(version), start_date, end_date)

    # Create the output folders up front so that a missing directory cannot
    # abort the run after the (expensive) training has already finished.
    scaler_dir = './credit_flow/{}'.format(version)
    weights_dir = './models/{}/pytorch'.format(version)
    os.makedirs(scaler_dir, exist_ok=True)
    os.makedirs(weights_dir, exist_ok=True)

    # One Series of index labels per time bin; splitting this list (rather than
    # the individual rows) keeps all rows of a bin in the same split.
    masker = [pd.Series(block.index) for _, block in X.groupby(pd.Grouper(freq=block_size))]
    train_mask, test_mask = train_test_split(
        masker, test_size=0.2, random_state=RANDOM_SEED)
    # split training data further into training and validation:
    train_mask, val_mask = train_test_split(
        train_mask, test_size=0.2, random_state=RANDOM_SEED)

    train_index = pd.concat(train_mask)
    val_index = pd.concat(val_mask)
    test_index = pd.concat(test_mask)

    # Explicit copies: the X frames are overwritten with scaled values below,
    # and that must neither touch `X` nor trigger SettingWithCopyWarning.
    X_train = X.loc[train_index].copy()
    X_val = X.loc[val_index].copy()
    X_test = X.loc[test_index].copy()

    # NOTE(review): assumes y carries the same index labels as X - confirm.
    y_train = y.loc[train_index]
    y_val = y.loc[val_index]
    y_test = y.loc[test_index]

    # Persist the unscaled splits (index included).
    X_train.to_csv('./data/{}/X_train_{}.csv'.format(version, model_name), sep=',', index=True)
    X_val.to_csv('./data/{}/X_val_{}.csv'.format(version, model_name), sep=',', index=True)
    X_test.to_csv('./data/{}/X_test_{}.csv'.format(version, model_name), sep=',', index=True)
    y_train.to_csv('./data/{}/y_train_{}.csv'.format(version, model_name), sep=',', index=True)
    y_val.to_csv('./data/{}/y_val_{}.csv'.format(version, model_name), sep=',', index=True)
    y_test.to_csv('./data/{}/y_test_{}.csv'.format(version, model_name), sep=',', index=True)

    # scale inputs and targets; the scalers are fitted on the training split
    # only and then applied unchanged to the validation and test splits
    X_scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()

    X_col = X_train.columns
    X_train[X_col] = X_scaler.fit_transform(X_train[X_col])
    X_test[X_col] = X_scaler.transform(X_test[X_col])
    X_val[X_col] = X_scaler.transform(X_val[X_col])

    # From here on y_train / y_test / y_val are NumPy arrays, not DataFrames.
    y_col = y_train.columns
    y_train = y_scaler.fit_transform(y_train[y_col])
    y_test = y_scaler.transform(y_test[y_col])
    y_val = y_scaler.transform(y_val[y_col])

    # Store the fitted scalers next to each other, one pair per period.
    with open('{}/X_scaler_{}.pkl'.format(scaler_dir, model_name), 'wb') as file:
        dill.dump(X_scaler, file)
    with open('{}/y_scaler_{}.pkl'.format(scaler_dir, model_name), 'wb') as file:
        dill.dump(y_scaler, file)

    # Training data: shuffled mini-batches of 128.
    inputs = torch.tensor(X_train[X_col].values, dtype=torch.float32)
    labels = torch.tensor(y_train, dtype=torch.float32)

    dataset = TensorDataset(inputs, labels)
    dataloader = DataLoader(dataset, num_workers=2, batch_size=128, shuffle=True)

    # Validation data: fixed order, batches of 64.
    inputs_val = torch.tensor(X_val[X_col].values, dtype=torch.float32)
    labels_val = torch.tensor(y_val, dtype=torch.float32)

    dataset_val = TensorDataset(inputs_val, labels_val)
    dataloader_val = DataLoader(dataset_val, num_workers=2, batch_size=64)

    # --------------------------------

    for l1, l2 in hidden_layer_sizes:
        modelname = "FNN"
        logger = TensorBoardLogger("model_logs", name='{}'.format(modelname))
        config = {
                "input_size": len(X.columns), # automatically set to number of features
                "l1_size": l1,
                "l2_size": l2,
                "output_size": 1,
                "learning_rate": 0.001,
                "do": 0 # no dropout
        }

        # Seed python / numpy / torch (and the DataLoader workers) before the
        # model is built so that every architecture starts from a reproducible
        # initialisation and sees a reproducible shuffle order.
        pl.seed_everything(RANDOM_SEED, workers=True)
        model = PriceFNN(config)

        # Stop once val_loss has not improved for 30 consecutive epochs.
        early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(
            monitor="val_loss", min_delta=0.00, patience=30, mode="min")
        trainer = pl.Trainer(max_epochs=700,
                        callbacks=[early_stop_callback],
                        logger=logger,
                        enable_progress_bar=True)

        # Run the learning-rate range test and adopt its suggestion.
        tuner = pl.tuner.Tuner(trainer)
        lr_finder = tuner.lr_find(model=model,
                                train_dataloaders=dataloader,
                                val_dataloaders=dataloader_val)

        suggested_lr = lr_finder.suggestion()
        print("Suggested learning_rate={}".format(suggested_lr))
        # suggestion() yields None when no usable point was found; in that case
        # keep the learning rate the model already has instead of replacing it
        # with None.
        if suggested_lr is not None:
            model.learning_rate = suggested_lr

        # fix progress bar: https://github.com/Lightning-AI/pytorch-lightning/issues/15283
        trainer.fit(model, dataloader, dataloader_val)
        # NOTE(review): this stores the weights the model holds when fit()
        # returns; presumably that is the last epoch rather than the epoch with
        # the best val_loss - confirm this is intended.
        torch.save(model.state_dict(),
                   '{}/model_{}_l1_{}_l2_{}.pkl'.format(weights_dir, model_name, l1, l2))