In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow as tf
import time
import os 

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor

from src.utils.time_series_procs import split_sequences

In [2]:
class StandardScaler3DShape:
    """Apply a scikit-learn ``StandardScaler`` to arrays with extra leading axes.

    The wrapped scaler only sees 2-D input: every axis except the last is
    merged into one row axis, so each position on the last axis is treated as
    one feature column. Transformed output is reshaped back to the shape of
    the array that was passed in.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    @staticmethod
    def _as_rows(X):
        """Merge all leading axes of ``X`` so it becomes ``(rows, n_features)``."""
        return X.reshape(-1, X.shape[-1])

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the flattened ``X`` and return ``self``.

        ``y`` is accepted for API symmetry and ignored.
        """
        self.scaler.fit(self._as_rows(X))
        return self

    def transform(self, X, y=None):
        """Scale ``X`` with the already-fitted scaler, preserving its shape."""
        scaled_rows = self.scaler.transform(self._as_rows(X))
        return scaled_rows.reshape(X.shape)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the scaled array in the original shape."""
        scaled_rows = self.scaler.fit_transform(self._as_rows(X))
        return scaled_rows.reshape(X.shape)

In [3]:
def get_baseline_model(n_units=64, learning_rate=0.01):
    """Build and compile the baseline network: one LSTM layer and a Dense(1) head.

    Args:
        n_units: Number of units in the LSTM layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model that minimises MSE and
        reports a root-mean-squared-error metric named ``"RMSE"``.
    """
    baseline = tf.keras.Sequential(
        [
            tf.keras.layers.LSTM(units=n_units),
            tf.keras.layers.Dense(1),
        ]
    )
    baseline.compile(
        loss="mse",
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=[tf.keras.metrics.RootMeanSquaredError(name="RMSE")],
    )
    return baseline

In [4]:
def create_keras_model(
    activation="tanh", dropout_rate=0.0, n_units=64, n_layers=1, learning_rate=1e-3
):
    """Build and compile a stacked-LSTM regressor with a single-unit output.

    Args:
        activation: Activation passed to every LSTM layer.
        dropout_rate: When greater than 0, one Dropout layer with this rate is
            inserted between the last LSTM layer and the output layer.
        n_units: Number of units in each LSTM layer.
        n_layers: Number of LSTM layers to stack.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model that minimises MSE and
        reports a root-mean-squared-error metric named ``"RMSE"``.
    """
    network = tf.keras.Sequential()

    for depth in range(1, n_layers + 1):
        # Every LSTM except the last one emits the full sequence so the next
        # LSTM in the stack has a sequence to consume.
        network.add(
            tf.keras.layers.LSTM(
                units=n_units,
                activation=activation,
                return_sequences=depth < n_layers,
            )
        )

    if dropout_rate > 0.0:
        network.add(tf.keras.layers.Dropout(dropout_rate))
    network.add(tf.keras.layers.Dense(1))

    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="RMSE")
    network.compile(optimizer=adam, loss="mse", metrics=[rmse_metric])
    return network

In [5]:
def _resolve_hyperparameters(grid, params):
    """Collect the model/training hyperparameters from ``grid`` or ``params``."""
    names = (
        "activation",
        "dropout_rate",
        "n_units",
        "n_layers",
        "learning_rate",
        "batch_size",
        "epochs",
    )
    if grid:
        # Grid-search results are namespaced with a "model__" prefix.
        return {name: grid.best_params_["model__" + name] for name in names}
    if params is None:
        raise ValueError("Either `grid` or `params` must be provided.")
    return {name: params[name] for name in names}


def make_repeated_time_series_k_fold(
    x,
    y,
    train_period,
    test_period,
    grid=None,
    repeats=10,
    show_individual_results=True,
    show_final_result=True,
    params=None,
):
    """Run a repeated, blocked time-series cross-validation of the LSTM model.

    The samples are cut into ``x.shape[0] // train_period`` consecutive
    training windows of ``train_period`` rows; each one is evaluated on the
    ``test_period`` rows that immediately follow it. The fold start advances
    by ``train_period`` only, so the test rows of one fold are also the first
    training rows of the next fold. A fold whose test window would be empty
    (no rows left after the training window) is skipped instead of being
    passed to the scaler and the model.

    Features are standardised per fold with a scaler fitted on that fold's
    training window only; ``y`` is used unscaled.

    Args:
        x: Feature array indexed by sample on axis 0 and by feature on the
            last axis.
        y: Targets aligned with ``x`` on axis 0.
        train_period: Number of samples in each training window.
        test_period: Number of samples in each test window.
        grid: Optional object exposing ``best_params_`` with keys prefixed by
            ``"model__"``; takes precedence over ``params`` when truthy.
        repeats: How many times the whole set of folds is re-run.
        show_individual_results: Print mean/std RMSE after every repeat.
        show_final_result: Print mean/std RMSE over all repeats at the end.
        params: Dict with keys ``activation``, ``dropout_rate``, ``n_units``,
            ``n_layers``, ``learning_rate``, ``batch_size`` and ``epochs``;
            used when ``grid`` is not given.

    Returns:
        A list with one entry per repeat; each entry is the list of per-fold
        RMSE values of that repeat.

    Raises:
        ValueError: If neither ``grid`` nor ``params`` is provided.
        KeyError: If a required hyperparameter is missing.
    """
    model_kwargs = _resolve_hyperparameters(grid, params)
    epochs = model_kwargs.pop("epochs")
    batch_size = model_kwargs.pop("batch_size")

    max_samples = x.shape[0]
    n_folds = max_samples // train_period
    results = []

    for _ in range(repeats):
        scores = []
        for fold in range(n_folds):
            train_start = fold * train_period
            test_start = train_start + train_period
            test_stop = test_start + test_period

            x_train, y_train = x[train_start:test_start], y[train_start:test_start]
            x_test, y_test = x[test_start:test_stop], y[test_start:test_stop]
            if len(x_test) == 0:
                # Nothing left to evaluate on; every later fold is empty too.
                break

            # A fresh scaler per fold keeps the function independent of any
            # module-level state and fits on training data only.
            fold_scaler = StandardScaler3DShape()
            x_train = fold_scaler.fit_transform(x_train)
            x_test = fold_scaler.transform(x_test)

            # Many models are built in this loop; drop the Keras global state
            # left by the previous one so memory does not keep growing.
            tf.keras.backend.clear_session()
            model = create_keras_model(**model_kwargs)
            model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

            # evaluate() returns [loss, RMSE] for the compiled model.
            evaluation = model.evaluate(x_test, y_test, verbose=0)
            scores.append(evaluation[1])
        results.append(scores)
        if show_individual_results:
            print("RMSE: %.3f (%.3f)" % (np.mean(scores), np.std(scores)))
    if show_final_result:
        print("\nRMSE: %.3f (%.3f)" % (np.mean(results), np.std(results)))
    return results

In [6]:
def make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=None,
    grid=None,
    repeats=10,
    timesteps_list=[3],
    show_results=True,
):
    """Repeat the blocked time-series cross-validation for several timesteps.

    For every value in ``timesteps_list`` the raw ``df.values`` array is
    converted with ``split_sequences`` (project helper, not defined in this
    notebook; presumably it windows the rows into sequences of that length)
    and handed to ``make_repeated_time_series_k_fold`` with its own printing
    switched off.

    Args:
        df: Object exposing ``.values`` (the DataFrame holding the series).
        train_period: Training-window size forwarded to the inner routine.
        test_period: Test-window size forwarded to the inner routine.
        params: Hyperparameter dict forwarded to the inner routine.
        grid: Optional grid-search result forwarded to the inner routine.
        repeats: Number of repetitions forwarded to the inner routine.
        timesteps_list: Iterable of timesteps values to evaluate.
        show_results: Print one mean/std RMSE summary line per timesteps value.

    Returns:
        Dict mapping each timesteps value to the scores returned by
        ``make_repeated_time_series_k_fold`` for it.
    """
    scores_by_timesteps = {}

    for n_steps in timesteps_list:
        features, targets = split_sequences(df.values, n_steps)
        fold_scores = make_repeated_time_series_k_fold(
            features,
            targets,
            train_period,
            test_period,
            grid=grid,
            repeats=repeats,
            show_individual_results=False,
            show_final_result=False,
            params=params,
        )
        scores_by_timesteps[n_steps] = fold_scores

        if not show_results:
            continue
        summary = (n_steps, np.mean(fold_scores), np.std(fold_scores))
        print("TIMESTEPS: %d RMSE: %.3f (%.3f)" % summary)

    return scores_by_timesteps

<h3>Disable TensorFlow Warnings</h3>

In [7]:
# Raise the minimum level of TensorFlow's native (C++) logging: "3" filters out
# INFO, WARNING and ERROR messages.
# NOTE(review): this cell runs after `import tensorflow` in the first cell; the
# variable is normally expected to be set before TensorFlow is imported —
# confirm it still takes effect here, or move it above the import.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [8]:
# Load the interim partner_iii CSV into a DataFrame. The path is relative, so it
# resolves against the kernel's current working directory.
df = pd.read_csv("../../../../data/interim/partner_iii/Dati CADD 2020-2022_with_fillna.csv")

In [9]:
seed = 47
# Apply the seed to the NumPy and TensorFlow global generators; without this the
# value above is never used and every run starts from different random weights.
# NOTE: some GPU kernels may still be nondeterministic.
np.random.seed(seed)
tf.random.set_seed(seed)

# Module-level StandardScaler instance made available to later cells.
scaler = StandardScaler()

# Neural Network Long Short Term Memory - LSTM

# Configs:

1. Train Period: [01-01-2019, 2021-05-02)

2. Test Period:  (2021-05-02, 2021-11-31]

3. TIMESTEPS: [1, 3, 5, 7, 10, 15, 20]

4. Epochs: 100

5. Batch size: 32

6. LSTM Cells: 64

7. LSTM Layers: 1

8. Optimization Algorithm: Adam
    
    8.1. Learning Rate: 1e-2
    
<h3>Repeats: 10</h3>
<h3>K: 10</h3>

# Time Series Repeated KFold Cross Validation - different Timesteps values

Here we run a Time Series KFold cross-validation using the Blocking Time Series Split method. We repeat it for several different numbers of timesteps.

In [10]:
# Model and training hyperparameters shared by every fold of the experiment.
params = {
    "activation": "tanh",
    "dropout_rate": 0.0,
    "n_units": 64,
    "n_layers": 1,
    "learning_rate": 1e-2,
    "batch_size": 32,
    "epochs": 100,
}

# Timesteps values to compare.
timesteps_list = [1, 3, 5, 7, 10, 15, 20]

# Number of times the whole cross-validation is repeated.
repeats = 10

<h3>Time Series Repeated KFold Cross Validation with varying timesteps</h3>

<h4>Experiment 1</h4>
    
<b>Train on 80% of the data and test on 20%</b>

In [11]:
# Window sizes, counted in samples (rows): each fold trains on `train_period`
# consecutive samples and is evaluated on the `test_period` samples after them.
train_period = 200 
test_period = 30

In [12]:
# Run the repeated blocked cross-validation once per timesteps value, using the
# fixed hyperparameters in `params` (no grid-search result is supplied).
results = make_timesteps_repeated_time_series_k_fold(
    df,
    train_period,
    test_period,
    params=params,
    grid=None,
    repeats=repeats,
    timesteps_list=timesteps_list,
)

TIMESTEPS: 1 RMSE: 5.762 (0.595)
TIMESTEPS: 3 RMSE: 5.782 (0.907)
TIMESTEPS: 5 RMSE: 6.360 (0.705)
TIMESTEPS: 7 RMSE: 6.528 (0.688)
TIMESTEPS: 10 RMSE: 6.434 (0.639)
TIMESTEPS: 15 RMSE: 5.553 (0.540)
TIMESTEPS: 20 RMSE: 4.837 (0.713)
