In [None]:
from parameters import get_parameters
from utils_lstm import *

from pathlib import Path
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.losses import Huber
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam


# Project-wide settings, unpacked into notebook-level names used by the cells below.
params = get_parameters()
target, source_conformed, source_results = (
    params["target"],
    params["source_conformed"],
    params["source_results"],
)
start_year, validation_year, test_year = (
    params["start_year"],
    params["validation_year"],
    params["test_year"],
)

# Parent of the directory the notebook is executed from; every data and
# results path below is built relative to it.
PROJECT_ROOT = Path().resolve().parent
print(f"Project root: {PROJECT_ROOT}")

## Config

In [None]:
# Run-level switch: forwarded to format_df as its `exog` option and used to
# pick the results sub-folder ("LSTM_exog" when True, "LSTM" otherwise).
exog = False

In [None]:
# Results sub-folder name depends on whether the exogenous variant is being run.
folder = "LSTM_exog" if exog else "LSTM"
print(folder)

In [None]:
# --- Search space handed to grid_search -------------------------------------
# Layer-width pairs. In build_model, item 0 sizes the LSTM layer and item 1
# the hidden Dense layer (presumably forwarded as `lstm_units` — grid_search
# is defined elsewhere).
lstm_units_tuples = [[128, 12], [256, 24]]

# Dropout-rate pairs, one rate per Dropout layer in build_model.
dropout_rates_tuples = [[0.2, 0.2], [0.3, 0.2]]

# Candidate learning rates.
learning_rates = [0.001, 0.0001]

# Candidate sequence lengths; the largest one also sizes the block of
# training rows prepended to the validation split.
timesteps_values = [18, 52]

# --- Preprocessing variants iterated by the outer validation loop ------------
use_log_values = [False]
target_mode_values = ["absolute", "diff1"]

# --- Training budget passed through to grid_search / train_final_model -------
epochs = 200
batch_size = 32

In [None]:
def build_model(input_shape, lstm_units=(256, 12), dropout_rates=(0.3, 0.3), learning_rate=0.001):
    """Build and compile a single-LSTM-layer regression network.

    The stack is LSTM -> Dropout -> Dense(relu) -> Dropout -> Dense(1, linear),
    compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        input_shape: Shape of one input sample, passed to the LSTM layer as
            its ``input_shape`` argument.
        lstm_units: Two-item sequence (list or tuple). Item 0 is the width of
            the LSTM layer, item 1 the width of the hidden Dense layer.
        dropout_rates: Two-item sequence (list or tuple). Item 0 is the rate
            of the Dropout after the LSTM layer, item 1 the rate of the
            Dropout after the hidden Dense layer.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        IndexError: If ``lstm_units`` or ``dropout_rates`` has fewer than two
            items.
    """
    # Defaults are tuples rather than lists: default values are created once
    # at definition time and shared by every call, so they should be immutable.
    # The body only indexes them, so callers passing lists are unaffected.
    model = Sequential()
    model.add(LSTM(lstm_units[0], input_shape=input_shape))
    model.add(Dropout(dropout_rates[0]))
    model.add(Dense(lstm_units[1], activation='relu'))
    model.add(Dropout(dropout_rates[1]))
    # Single linear unit: one scalar regression output per input sequence.
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

## Grid search - Validation

In [None]:
# Grid search over preprocessing variants (use_log x target_mode). For each
# variant the data is re-formatted, split, scaled, and handed to grid_search;
# the variant with the lowest validation RMSE is remembered in the best_* names.
best_rmse = np.inf
best_model = None
best_config = None
best_history = None
best_y_true = None
best_y_pred = None
best_params = None
best_split = None  # (X_train, X_test, val_df) belonging to the best-scoring variant
total_results = []

# Differencing order stored in the results table for each target mode.
# Modes without an entry (e.g. "ratio", "relative") get d=None, so the
# reordering step after the loop cannot fail with a KeyError.
diff_order_by_mode = {"absolute": 0, "diff1": 1, "diff2": 2}

# Loop-invariant values, computed once.
data_path = f"{PROJECT_ROOT}/{source_conformed}/data.pkl"
max_timesteps = max(timesteps_values)

for use_log, target_mode in itertools.product(use_log_values, target_mode_values):
    # NOTE(review): this rebinds the notebook-level `params` (the
    # get_parameters() dict from the first cell) to the format_df options.
    params = {
        "exog": exog,
        "add_outlier": True,
        "use_log": use_log,
        "use_boxcox": False,
        "target_mode": target_mode,   # "absolute" | "diff1" | "diff2" | "ratio" | "relative"
    }

    # Re-read the pickle on every pass so each variant starts from an
    # untouched frame (format_df is defined elsewhere and may modify its
    # input — TODO confirm before hoisting the read out of the loop).
    df = pd.read_pickle(data_path)
    df = format_df(df, **params)
    print(df.head())

    train_val_df, train_df, val_df, test_df = split_by_date(df, validation_year, test_year)
    # Prepend the last `max_timesteps` training rows to the validation split.
    val_df = pd.concat([train_df.tail(max_timesteps), val_df], ignore_index=False)

    # NOTE(review): the dropped column is hard-coded as "CASES" while the label
    # column comes from `target` — confirm the two always refer to the same column.
    X_train = train_df.drop(columns=["CASES"])
    y_train = train_df[[target]]
    X_test = val_df.drop(columns=["CASES"])
    y_test = val_df[[target]]

    # Scalers are fitted on the training rows only and reused for validation.
    scaler_x = MinMaxScaler()
    X_train = scaler_x.fit_transform(X_train)
    X_test  = scaler_x.transform(X_test)

    scaler_y = MinMaxScaler()
    y_train = scaler_y.fit_transform(y_train)
    y_test  = scaler_y.transform(y_test)

    rmse, model, config, history, y_true, y_pred, results = grid_search(
        params,
        build_model,
        X_train,
        y_train,
        X_test,
        y_test,
        train_df,
        scaler_y,
        lstm_units_tuples,
        dropout_rates_tuples,
        learning_rates,
        timesteps_values,
        epochs,
        batch_size,
    )

    # Tag every result record (in place, as before) with the variant it came from.
    for rec in results:
        rec["log"] = use_log
        if target_mode in diff_order_by_mode:
            rec["d"] = diff_order_by_mode[target_mode]
        else:
            rec.setdefault("d", None)

    total_results.extend(results)

    # `<=` means a later variant wins ties.
    if rmse <= best_rmse:
        best_rmse = rmse
        best_model = model
        best_config = config
        best_history = history
        best_y_true = y_true
        best_y_pred = y_pred
        best_params = params
        best_split = (X_train, X_test, val_df)

# Rebuild each record so that "log" and "d" are the first two keys.
total_results = [
    {"log": rec["log"], "d": rec["d"], **{k: v for k, v in rec.items() if k not in ["log", "d"]}}
    for rec in total_results
]

# The loop leaves X_train / X_test / val_df bound to the LAST variant, but the
# following cells combine them with the best_* outputs. Rebind them to the
# best variant's split so the saved validation artefacts are consistent.
if best_split is not None:
    X_train, X_test, val_df = best_split

In [None]:
# Index of val_df after skipping its first max(timesteps_values) rows;
# both series are labelled with it.
val_index = val_df[max(timesteps_values):].index
best_y_true, best_y_pred = (
    pd.Series(best_y_true, index=val_index),
    pd.Series(best_y_pred, index=val_index),
)

In [None]:
# Persist the validation-stage artefacts under <results>/<folder>/validation.
val_dir = f"{PROJECT_ROOT}/{source_results}/{folder}/validation"
save_val_data(
    val_dir,
    X_train,
    X_test,
    val_df,
    best_config,
    best_history,
    best_y_true,
    best_y_pred,
    total_results,
)

## Test

In [None]:
# Show the format_df options of the best-scoring validation variant; the test
# stage below re-formats the data with exactly these options.
print(best_params)

In [None]:
# Reload the source frame and re-apply the options of the best validation variant.
df = format_df(
    pd.read_pickle(f"{PROJECT_ROOT}/{source_conformed}/data.pkl"),
    **best_params,
)
print(df.head())

train_val_df, train_df, val_df, test_df = split_by_date(df, validation_year, test_year)

# Prepend the last `timesteps` train+validation rows to the test split.
context_rows = train_val_df.tail(best_config["timesteps"])
test_df = pd.concat([context_rows, test_df])

In [None]:
# Final fit uses train+validation rows; evaluation uses the padded test split.
# Features are every column except "CASES"; the label is the `target` column.
# NOTE(review): "CASES" is hard-coded while the label uses `target` — confirm
# they always refer to the same column.
X_train, y_train = train_val_df.drop(columns=["CASES"]), train_val_df[[target]]
X_test, y_test = test_df.drop(columns=["CASES"]), test_df[[target]]

In [None]:
# Fit each scaler on the training arrays only, then apply it to both splits.
scaler_x = MinMaxScaler().fit(X_train)
X_train, X_test = scaler_x.transform(X_train), scaler_x.transform(X_test)

# scaler_y is kept by name because it is passed on to train_final_model.
scaler_y = MinMaxScaler().fit(y_train)
y_train, y_test = scaler_y.transform(y_train), scaler_y.transform(y_test)

In [None]:
# Train the final model with the best validation configuration and predict on
# the test split.
model, history, y_true, y_pred = train_final_model(
    best_params,
    build_model,
    X_train,
    y_train,
    X_test,
    y_test,
    train_val_df,
    test_df,
    scaler_y,
    best_rmse,
    best_config,
    epochs,
    batch_size,
)

In [None]:
# Index of test_df after skipping its first `timesteps` rows (the context rows
# prepended earlier); both series are labelled with it.
test_index = test_df[best_config["timesteps"]:].index
y_true, y_pred = (
    pd.Series(y_true, index=test_index),
    pd.Series(y_pred, index=test_index),
)

In [None]:
# Persist the test-stage artefacts under <results>/<folder>/test; the test
# frame is passed without its leading `timesteps` context rows.
test_dir = f"{PROJECT_ROOT}/{source_results}/{folder}/test"
save_test_data(
    test_dir,
    X_train,
    X_test,
    test_df[best_config["timesteps"]:],
    history,
    y_true,
    y_pred,
)