In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from nowcast_lstm.LSTM import LSTM
from nowcast_lstm.model_selection import select_model, variable_selection
import torch
import dill

In [2]:
def load_data(dataset_end_date = False):
    """
    Load the nowcasting dataset and prepare it for modelling.

    Reads the 'Nowcasting Dataset' sheet of the workbook, renames
    GDP_QNA_RG to GDP, drops the remaining GDP_QNA_* columns and fills
    the gaps in GDP and LIBOR_3mth by linear interpolation.

    Parameters
    ----------
    dataset_end_date : str, datetime-like or False
        Last date (inclusive) to keep. Any falsy value keeps the whole
        sample. See INTERVALS for the values used per sub-sample.

    Returns
    -------
    pandas.DataFrame
        A frame that is independent of any intermediate object, so the
        caller may add or overwrite columns on it.
    """

    # Load and rename data
    data = pd.read_excel(
        '../230315 Nowcasting Dataset.xlsx',
        sheet_name='Nowcasting Dataset',
        parse_dates=['Date'],
    )
    data = data.rename(columns={"GDP_QNA_RG": "GDP"})

    # Drop unnecessary GDP variables
    data = data.drop(
        ["GDP_QNA_PCT", "GDP_QNA_LVL_LD", "GDP_QNA_LVL"], axis=1)

    # Fill in missing NA values using linear interpolation.
    # Interpolation runs on the full sample, i.e. before the sub-sample
    # cut below.
    data["GDP"] = data["GDP"].interpolate()
    data["LIBOR_3mth"] = data["LIBOR_3mth"].interpolate()

    # Select sub-sample, see INTERVALS
    if dataset_end_date:
        # .copy() detaches the subset from the full frame; without it,
        # adding columns to the result triggers SettingWithCopyWarning.
        data = data[data['Date'] <= pd.to_datetime(dataset_end_date)].copy()

    return data

In [3]:
def get_intervals():
    """
    Build the per-sample settings table.

    Returns a dict keyed by sample year (2010, 2019, 2022). Each value is
    a dict with the keys:

    - "dataset_end_date": cut-off date string, or False for no cut-off
    - "train_end_date":   date string used as the end of the training span
    - "test_start_date":  date string used as the start of the test span
    - "initial_window":   integer window size (not read in this part of
                          the notebook)
    - "break_points":     list of integer row positions of the
                          structural breaks

    A fresh structure is built on every call, so callers may modify the
    result without affecting later calls.
    """
    field_names = (
        "dataset_end_date",
        "train_end_date",
        "test_start_date",
        "initial_window",
        "break_points",
    )

    # One row per sample: (year, then one value per field above).
    rows = (
        (2010, "2010-12-01", "2005-12-01", "2006-01-01", 200, [217]),
        (2019, "2019-12-01", "2015-12-01", "2016-01-01", 200, [217]),
        (2022, False, "2015-12-01", "2016-01-01", 310, [217, 361]),
    )

    return {year: dict(zip(field_names, values)) for year, *values in rows}

# When True, load_selection_results() returns every column of the dataset
# instead of the selected subset of variables.
ALL_VARIABLES = False
# When True, load_selection_results() uses the hyperparameters written out
# inside that function instead of those stored in selection_results.csv.
OUR_OWN_HYPERPARAMETERS = True
# When True, the structural-break dummy variables are appended to the
# selected variables, and the output file is tagged "w_str_b" rather than
# "wout_str_b".
STR_BREAKS = True

In [4]:
def load_selection_results(dummy_variables = False):
    """
    Return the hyperparameters and the list of columns used by the LSTM.

    selection_results.csv is output from nowcast_lstm/select_model().
    That function trained roughly 1000 models with varying parameters
    and variables; the process took approx. 4 hrs. Its output is the
    8 models that performed best.

    However, we find that LSTM performs better with variables
    selected by Elastic Net / Ridge. Lasso omits most of them.
    The list of variables from EN / Ridge is under 'varImp_results'.

    Finally, this function also allows running LSTM with all variables.
    To do this, just change ALL_VARIABLES from False to True.

    Parameters
    ----------
    dummy_variables : list of str or False
        Names of structural-break dummy columns to append to the
        variable list. Falsy means none.

    Returns
    -------
    tuple
        (hyperparameters dict, list of column names). In the default
        (not ALL_VARIABLES) mode the list contains each name once, in
        first-seen order.

    Notes
    -----
    Reads the module-level flags ALL_VARIABLES, OUR_OWN_HYPERPARAMETERS
    and STR_BREAKS, and reads selection_results.csv on every call.
    """

    # Select best performing model
    selections = pd.read_csv('selection_results.csv')
    # NOTE(review): this keeps the row with the LARGEST `performance`.
    # If that column holds an error metric (e.g. RMSE, lower is better),
    # this is the worst of the stored models - confirm against how
    # selection_results.csv was produced.
    best_selection = selections[
        selections.performance == selections.performance.max()]

    # Extract best hyperparameters
    if not OUR_OWN_HYPERPARAMETERS:
        # NOTE(review): eval() executes whatever text is in the CSV cell.
        # Only use with a selection_results.csv you produced yourself.
        best_hyperparameters = eval(best_selection['hyperparameters'].values[0])
    else:
        best_hyperparameters = {
            "n_timesteps": 2,
            "train_episodes": 500,
            "batch_size": 64,
            "decay": 0.98,
            "n_hidden": 20,
            "n_layers": 2,
            "dropout": 0,
            "n_models": 10
        }

    if ALL_VARIABLES:
        all_columns = load_data().columns.values.tolist()
        if dummy_variables:
            all_columns.extend(dummy_variables)
        return (best_hyperparameters, all_columns)

    # Extract best variables (same eval() caveat as above). Parsed only
    # here because the ALL_VARIABLES branch never uses them.
    best_variables = list(eval(best_selection['variables'].values[0]))

    # Date and GDP are required in the dataset.
    best_variables.extend(['Date', 'GDP'])

    # Elastic Net / Ridge suggested variables
    varImp_results = [
        'CPI_ALL',
        'RPI_GOOD',
        'TOT_WEEK_HRS',
        'EMP',
        'M2',
        'RETAIL_TRADE_INDEX'
        ]

    best_variables.extend(varImp_results)

    if STR_BREAKS and dummy_variables:
        best_variables.extend(dummy_variables)

    # The three sources above can overlap (e.g. a variable picked by both
    # select_model() and Elastic Net). A repeated name would select the
    # same column twice downstream, so keep the first occurrence only.
    best_variables = list(dict.fromkeys(best_variables))

    return (best_hyperparameters, best_variables)


In [5]:
def lag_data(data, lags, dummy_variables: list):
    """
    Build a frame of the target plus lagged explanatory variables.

    i.e. lags = 2 would be

    Dependent variable at t+1   Explanatory vars at t and t-1
    GDP                         L1GDP   L2GDP   ...
    0.9                         NA      NA      ...
    0.4                         0.9     NA      ...
    0.2                         0.4     0.9     ...
    0.5                         0.2     0.4     ...
    ...                         ...     ...     ...

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date', 'GDP' and every column named by
        load_selection_results(dummy_variables).
    lags : int
        Number of lags to create. If lags < 1, `data` is returned
        unchanged.
    dummy_variables : list
        Passed through to load_selection_results().

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'GDP', then 'L<k><name>' for k = 1..lags and each
        selected column other than 'Date'. Rows containing any NA are
        dropped and the index is reset. `data` itself is not modified.
    """
    if lags < 1:
        return data

    # The variable list does not depend on the lag, so fetch it once
    # instead of once per lag (each call re-reads the selection file).
    _, best_variables = load_selection_results(dummy_variables)

    # 'Date' is the merge key and must not be shifted or renamed.
    lag_columns = [name for name in best_variables if name != 'Date']

    # Keep date and t+1 GDP
    lagged = data[['Date', 'GDP']]

    # Add t, t-1, ..., t-lag explanatory variables
    for lag in range(1, lags + 1):
        # Shift by <lag> rows and tag every column with 'L<lag>'
        shifted = data[lag_columns].shift(lag).add_prefix(f'L{lag}')

        # Re-attach the unshifted dates so the merge lines rows up
        shifted['Date'] = data['Date']

        lagged = pd.merge(lagged, shifted, on='Date')

    # Lagging leaves NA in the leading rows; drop every row that has any NA
    return lagged.dropna().reset_index(drop=True)

In [6]:
INTERVALS = get_intervals()

# Set no. of lags of each explanatory variable
lags = 2

# Fixed seed so that re-running the cell reproduces the same forecasts, as
# far as the library draws from the numpy / torch global generators.
SEED = 42

# Load hyperparameters once; they do not depend on the sub-sample.
best_hyperparameters, _ = load_selection_results()

for year in [2010, 2019, 2022]:

    # Load year-specific parameters
    dataset_end_date = INTERVALS[year]["dataset_end_date"]
    train_end_date = INTERVALS[year]["train_end_date"]
    test_start_date = INTERVALS[year]["test_start_date"]
    structural_breakpoints = INTERVALS[year]["break_points"]

    # Work on an independent copy with a 0..n-1 index: the dummies below
    # are written into this frame, and the break points are row positions.
    original_data = load_data(dataset_end_date).copy().reset_index(drop=True)

    # Add dummy variables: Break_i is 0 before break point i and 1 from it on
    dummy_variables = []
    for i, break_point in enumerate(structural_breakpoints, start=1):
        col_name = f"Break_{i}"
        original_data[col_name] = 0
        original_data.loc[break_point:, col_name] = 1
        dummy_variables.append(col_name)

    # Lag explanatory variables; keeps Date and GDP(t+1)
    data = lag_data(original_data, lags, dummy_variables)

    # Partition data into training and test.
    # test_set is not used for fitting; it is kept so the split can be
    # inspected after the loop.
    train_set = data[data['Date'] <= pd.to_datetime(train_end_date)]
    test_set = data[data['Date'] >= pd.to_datetime(test_start_date)]

    # Same starting random state for every sub-sample
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # Initiate LSTM object with hyperparameters
    model = LSTM(
        train_set,
        'GDP',
        best_hyperparameters["n_timesteps"],
        n_models=best_hyperparameters["n_models"],
        train_episodes=best_hyperparameters["train_episodes"],
        batch_size=best_hyperparameters["batch_size"],
        decay=best_hyperparameters["decay"],
        n_hidden=best_hyperparameters["n_hidden"],
        n_layers=best_hyperparameters["n_layers"],
        dropout=best_hyperparameters["dropout"],
    )

    # Train LSTM object (the network)
    model.train()

    # Predict over the full lagged dataset, then keep only the rows dated
    # strictly after the last training observation. (The previous filter,
    # date >= train_end_date, also kept the final in-sample row.)
    predictions = model.predict(data, only_actuals_obs=False).loc[
        lambda x: x.date > pd.to_datetime(train_end_date)
    ]

    # Rename columns
    predictions = predictions.rename(
        columns={
            'date': 'Date',
            'actuals': 'GDP',
            'predictions': 'LSTM Predictions'
        }
    )

    # Store all predictions
    predictions.to_csv(f'../output/LSTM_{year}_{"w_str_b" if STR_BREAKS else "wout_str_b"}.csv', index=False)

Training model 1
step :  0 loss :  0.460578590631485
step :  1 loss :  0.27860143780708313
step :  2 loss :  0.20083078742027283
step :  3 loss :  0.23598957061767578
step :  4 loss :  0.18507328629493713
step :  5 loss :  0.19248689711093903
step :  6 loss :  0.19058726727962494
step :  7 loss :  0.16001971065998077
step :  8 loss :  0.1447630077600479
step :  9 loss :  0.13168324530124664
step :  10 loss :  0.1408414989709854
step :  11 loss :  0.14921943843364716
step :  12 loss :  0.13182145357131958
step :  13 loss :  0.13480211794376373
step :  14 loss :  0.14817292988300323
step :  15 loss :  0.13784421980381012
step :  16 loss :  0.1215885728597641
step :  17 loss :  0.11416423320770264
step :  18 loss :  0.11390405893325806
step :  19 loss :  0.11125227808952332
step :  20 loss :  0.108932264149189
step :  21 loss :  0.10797641426324844
step :  22 loss :  0.10679000616073608
step :  23 loss :  0.10655854642391205
step :  24 loss :  0.10693492740392685
step :  25 loss :  0.1039