In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from nowcast_lstm.LSTM import LSTM
from nowcast_lstm.model_selection import select_model, variable_selection
import torch
import dill

In [2]:
def load_data(dataset_end_date):
    """Read the nowcasting workbook and return it cleaned and optionally truncated.

    The quarterly growth column ``GDP_QNA_RG`` is renamed to ``GDP``, the three
    other ``GDP_QNA_*`` columns are dropped, and gaps in ``GDP`` and
    ``LIBOR_3mth`` are filled with pandas' default interpolation.

    Parameters
    ----------
    dataset_end_date
        Anything ``pd.to_datetime`` accepts. Rows with ``Date`` after it are
        removed. A falsy value (e.g. ``False``) keeps every row.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the original row index is kept.
    """
    workbook = pd.read_excel(
        '../230315 Nowcasting Dataset.xlsx',
        sheet_name='Nowcasting Dataset',
        parse_dates=['Date'],
    )

    frame = workbook.rename(columns={"GDP_QNA_RG": "GDP"}).drop(
        columns=["GDP_QNA_PCT", "GDP_QNA_LVL_LD", "GDP_QNA_LVL"]
    )

    # NOTE(review): interpolation runs on the full sample *before* the cut-off
    # is applied, so the last rows of a truncated vintage may be filled using
    # observations dated after `dataset_end_date` — confirm this is intended.
    for column in ("GDP", "LIBOR_3mth"):
        frame[column] = frame[column].interpolate()

    if not dataset_end_date:
        return frame

    cutoff = pd.to_datetime(dataset_end_date)
    return frame[frame['Date'] <= cutoff]

In [3]:
# Per-vintage experiment settings, keyed by the year label used in the
# training loop and in the output file name.
#   dataset_end_date : passed to load_data(); a falsy value keeps the full sample.
#   train_end_date   : inclusive upper bound on 'Date' for the training rows.
#   test_start_date  : inclusive lower bound on 'Date' for the test rows.
#   break_points     : row labels from which the Break_<i> dummy columns are set to 1.
#   initial_window   : not read by any cell visible in this notebook section
#                      (NOTE(review): confirm whether it is still needed).
INTERVALS = {
    2010: {
        "dataset_end_date": "2010-12-01",
        "train_end_date": "2005-12-01",
        "test_start_date": "2006-01-01",
        "initial_window": 200,
        "break_points": [217]
    },
    2019: {
        "dataset_end_date": "2019-12-01",
        "train_end_date": "2015-12-01",
        "test_start_date": "2016-01-01",
        "initial_window": 200,
        "break_points": [217]
    },
    2022: {
        # False -> load_data() skips truncation and returns every row.
        "dataset_end_date": False,
        "train_end_date": "2015-12-01",
        "test_start_date": "2016-01-01",
        "initial_window": 310,
        "break_points": [217, 361]
    },
}

# Month-start date range from 2006-01 to 2022-09.
# NOTE(review): start_date / end_date / dates are not referenced by the cells
# visible here — presumably used further down; verify before removing.
start_date = pd.to_datetime("2006-01-01")
end_date = pd.to_datetime("2022-09-01")
dates = pd.date_range(start=start_date, end=end_date, freq='MS')

In [4]:
# for year in [2022]:
#     dataset_end_date = INTERVALS[year]["dataset_end_date"]
#     train_end_date = INTERVALS[year]["train_end_date"]
#     test_start_date = INTERVALS[year]["test_start_date"]
#     structural_breakpoints = INTERVALS[year]["break_points"]
    
#     data = load_data(dataset_end_date)
#     columns = ['Date']
#     for i in range(1, len(structural_breakpoints)+1):
#         data[f"Break_{i}"] = 0
#         data.loc[structural_breakpoints[i-1]:,f"Break_{i}"] = 1
#         columns.append(f"Break_{i}")
    
#     data = data.iloc[:, columns]
#     lags = 3
#     for lag in range(1, lags+1):
#         data_to_lag = load_data(dataset_end_date)
#         lagged_data = data_to_lag
#         lagged_data.iloc[:, 1:] = data_to_lag.iloc[:, 1:].shift(lag)
        
#         for col_name in lagged_data.columns:
#             if col_name != 'Date':
#                 lagged_data = lagged_data.rename(columns={col_name: f'L{lag}{col_name}'})
#         data = pd.merge(data, lagged_data, on='Date')
    
#     data = data.dropna().reset_index(drop=True)

# train_set = data[
#             (data['Date'] <= pd.to_datetime(train_end_date))
#             ]
# test_set = data[
#             (data['Date'] >= pd.to_datetime(test_start_date))
#             ]

In [5]:
def load_selection_results(dummy_variables = False):
    """Load the winning row of ``selection_results.csv``.

    Parameters
    ----------
    dummy_variables
        Optional iterable of extra column names (e.g. structural-break
        dummies) to append to the returned column list. Any falsy value
        (``False``, ``None``, ``[]``) appends nothing.

    Returns
    -------
    tuple
        ``(best_hyperparameters, best_variables)`` where ``best_hyperparameters``
        is the evaluated ``hyperparameters`` cell of the selected row and
        ``best_variables`` is the selected variable list followed by ``'Date'``,
        ``'GDP'``, a fixed set of variable-importance picks and finally
        ``dummy_variables``. Each name appears once, in first-seen order.

    Raises
    ------
    ValueError
        If the CSV has no row with a usable ``performance`` value.
    """
    selections = pd.read_csv('selection_results.csv')

    # NOTE(review): this keeps the row with the LARGEST `performance`. If that
    # column holds an error metric (lower is better) this picks the worst
    # candidate — confirm how selection_results.csv was produced.
    best_selection = selections[
        selections.performance == selections.performance.max()]
    if best_selection.empty:
        # Empty file, or every performance value is NaN (NaN == NaN is False).
        raise ValueError(
            "selection_results.csv has no row with a valid 'performance' value")

    # SECURITY: eval() executes whatever text is stored in the CSV. Only use
    # this with a selection_results.csv you produced yourself. If the cells
    # are plain literals, ast.literal_eval is the safe drop-in replacement.
    first_best = best_selection.iloc[0]
    best_hyperparameters = eval(first_best['hyperparameters'])
    best_variables = list(eval(first_best['variables']))

    best_variables.extend(['Date', 'GDP'])

    varImp_results = [
        'CPI_ALL',
        'RPI_GOOD',
        'TOT_WEEK_HRS',
        'EMP',
        'M2',
        'RETAIL_TRADE_INDEX'
        ]
    best_variables.extend(varImp_results)

    if dummy_variables:
        best_variables.extend(dummy_variables)

    # The selection may already contain 'GDP' or one of the variable-importance
    # picks; selecting the same column twice would yield duplicate column
    # names downstream, so keep only the first occurrence of each name.
    best_variables = list(dict.fromkeys(best_variables))

    return (best_hyperparameters, best_variables)


In [6]:
def lag_data(data, lags, dummy_variables: list):
    """Pair each row's ``GDP`` with lagged copies of the selected variables.

    With ``lags = 2`` the result looks like:

        GDP (t)    L1GDP (t-1)   L2GDP (t-2)   ...
        0.9        NA            NA            ...
        0.4        0.9           NA            ...
        0.2        0.4           0.9           ...
        0.5        0.2           0.4           ...

    Rows containing any NA (the first ``lags`` rows, plus any row where a
    selected variable is missing) are dropped before returning.

    Parameters
    ----------
    data
        Frame holding ``'Date'``, ``'GDP'`` and every column named by
        ``load_selection_results(dummy_variables)``.
    lags
        Number of lags to create. Values below 1 return ``data`` unchanged.
    dummy_variables
        Extra column names forwarded to ``load_selection_results`` so that
        they are lagged along with the selected variables.

    Returns
    -------
    pandas.DataFrame
        ``Date``, ``GDP`` and ``L<k><column>`` for k = 1..lags, with a fresh
        0-based index.
    """
    if lags < 1:
        return data

    original_data = data

    # The column list comes from selection_results.csv and does not depend on
    # the lag, so read it once rather than on every loop iteration.
    _, best_variables = load_selection_results(dummy_variables)

    # 'Date' is the merge key and must not be shifted.
    columns_to_lag = [name for name in best_variables if name != 'Date']

    # Keep the date and the unlagged target.
    data = original_data[['Date', 'GDP']]

    for lag in range(1, lags + 1):
        # Shift by <lag> rows and tag every column with its lag, e.g. 'L1GDP'.
        lagged_data = original_data[columns_to_lag].shift(lag).add_prefix(f'L{lag}')

        # Re-attach the unshifted dates so the block can be merged on them.
        lagged_data['Date'] = original_data['Date']

        data = pd.merge(data, lagged_data, on='Date')

    # Shifting leaves NA in the first <lags> rows; drop every incomplete row.
    data = data.dropna().reset_index(drop=True)
    return data

In [14]:
# The selected hyperparameters and the lag depth are the same for every
# vintage, so resolve them once instead of re-reading (and re-evaluating)
# selection_results.csv on each iteration.
best_hyperparameters, _ = load_selection_results()
lags = 2

# NOTE(review): no random seed is set before training, so the exported
# predictions will differ between runs — consider torch.manual_seed(...).
for year in [2010, 2019, 2022]:

    # Load year-specific parameters
    dataset_end_date = INTERVALS[year]["dataset_end_date"]
    train_end_date = INTERVALS[year]["train_end_date"]
    test_start_date = INTERVALS[year]["test_start_date"]
    structural_breakpoints = INTERVALS[year]["break_points"]

    # Work on an explicit copy: load_data() may return a filtered slice, and
    # the dummy columns below are written onto the frame.
    original_data = load_data(dataset_end_date).copy()

    # Add one step dummy per structural break: 0 before the break row label,
    # 1 from that row label onwards.
    dummy_variables = []
    for i, break_row in enumerate(structural_breakpoints, start=1):
        col_name = f"Break_{i}"
        original_data[col_name] = 0
        original_data.loc[break_row:, col_name] = 1
        dummy_variables.append(col_name)

    # Current GDP plus lagged explanatory variables and lagged dummies
    data = lag_data(original_data, lags, dummy_variables)

    train_set = data[
            (data['Date'] <= pd.to_datetime(train_end_date))
            ]
    # Not used below; kept because it is a cell-level name that other cells
    # may read.
    test_set = data[
            (data['Date'] >= pd.to_datetime(test_start_date))
            ]

    model = LSTM(
        train_set,
        'GDP',
        best_hyperparameters["n_timesteps"],
        n_models=best_hyperparameters["n_models"],
        train_episodes=best_hyperparameters["train_episodes"],
        batch_size=best_hyperparameters["batch_size"],
        decay=best_hyperparameters["decay"],
        n_hidden=best_hyperparameters["n_hidden"],
        n_layers=best_hyperparameters["n_layers"],
        dropout=best_hyperparameters["dropout"],
    )
    model.train()

    # NOTE(review): the filter starts at train_end_date, so the first exported
    # row is the last *training* observation. If only out-of-sample rows are
    # wanted this should probably compare against test_start_date — confirm.
    predictions = model.predict(data, only_actuals_obs=False).loc[
        lambda x: x.date >= train_end_date
    ]

    predictions = predictions.rename(
        columns={
            'date': 'Date',
            'actuals': 'GDP',
            'predictions': 'LSTM Predictions'
        }
    )

    predictions.to_csv(f'../output/LSTM_{year}.csv', index=False)

Training model 1
step :  0 loss :  0.28080472350120544
step :  1 loss :  0.2529962360858917
step :  2 loss :  0.2535656690597534
step :  3 loss :  0.2536745071411133
step :  4 loss :  0.24953334033489227
step :  5 loss :  0.24723933637142181
step :  6 loss :  0.23464441299438477
step :  7 loss :  0.19620050489902496
step :  8 loss :  0.15043862164020538
step :  9 loss :  0.14442381262779236
step :  10 loss :  0.13372115790843964
step :  11 loss :  0.14010094106197357
step :  12 loss :  0.1244029849767685
step :  13 loss :  0.13234828412532806
step :  14 loss :  0.11982563138008118
step :  15 loss :  0.13203972578048706
step :  16 loss :  0.11374818533658981
step :  17 loss :  0.14963708817958832
step :  18 loss :  0.10614823549985886
step :  19 loss :  0.0961732342839241
step :  20 loss :  0.1156221330165863
step :  21 loss :  0.0897098109126091
step :  22 loss :  0.11635266244411469
step :  23 loss :  0.08892334252595901
step :  24 loss :  0.08154778927564621
step :  25 loss :  0.0784