In [1]:
# NOTE(review): star import -- pulls every public name of dateutil.relativedelta
# into the notebook namespace; which of them are actually used is not visible
# in this part of the notebook.
from dateutil.relativedelta import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Default size of every figure created below (matplotlib's figure.figsize setting).
plt.rcParams['figure.figsize'] = [14, 7]
import statsmodels.api as sm

import warnings
# Silences every Python warning for the rest of the session, including
# deprecation and chained-assignment warnings raised by pandas.
warnings.filterwarnings('ignore')

In [2]:
# Catalog of variables; the functions below read its `code`,
# `publication_lag` and `octave_*` columns.
catalog = pd.read_csv("../helper/catalog.csv")

# Database of series, with the `date` column parsed to datetimes.
data = pd.read_csv(
    "../output/2020-06-01_database_tf.csv",
    parse_dates=["date"],
)

# Names of the target series.
ys = [
    "x_world.sa",
    "x_vol_world2.sa",
    "x_servs_world.sa",
]

In [3]:
# generating vintage dataset
def offset_series(series, offset):
    """Return a copy of `series` whose last `offset` observations are NaN.

    Parameters
    ----------
    series : pandas.Series
        Series to mask. It is not modified; a copy is returned.
    offset : int
        Number of trailing observations to blank out. Zero (or a negative
        value) leaves the series untouched.

    Returns
    -------
    pandas.Series
        Copy of `series` with the trailing `offset` positions set to NaN.
    """
    masked = series.copy()
    # Guard against offset == 0: the slice `[-0:]` is the *whole* series,
    # so an unguarded assignment would wipe out every observation.
    if offset > 0:
        masked.iloc[-offset:] = np.nan
    return masked
    
def gen_vintage(data, catalog, start_date, end_date):
    """Build a "vintage" of `data`: the date window with ragged trailing ends.

    Rows with `start_date <= date <= end_date` are kept. For every column
    after the first, the last `publication_lag` rows of the window are set
    to NaN, where the lag is read from `catalog`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a `date` column; the first column is skipped when masking
        (presumably it is `date` -- confirm against the input file).
    catalog : pandas.DataFrame
        Must have `code` and `publication_lag` columns. A data column is
        matched on its full name first and, failing that, on its name
        without the last three characters (e.g. a ".sa" suffix).
    start_date, end_date
        Inclusive bounds compared against `data.date`.

    Returns
    -------
    pandas.DataFrame
        A new frame; `data` itself is left untouched.

    Raises
    ------
    IndexError
        If a column has no matching `code` in `catalog`.
    """
    def _publication_lag(column):
        # Exact code match first, then the code without its 3-char suffix.
        exact = catalog.loc[catalog.code == column, "publication_lag"]
        if len(exact) > 0:
            return exact.values[0]
        return catalog.loc[catalog.code == column[:-3], "publication_lag"].values[0]

    in_window = (data.date >= start_date) & (data.date <= end_date)
    # Explicit copy: the masking below must never write through to `data`.
    vintage = data.loc[in_window, :].copy()

    for position, column in enumerate(vintage.columns[1:], start=1):
        # int() tolerates lags that were read from CSV as floats (e.g. 2.0).
        lag = int(_publication_lag(column))
        # A lag of 0 means nothing is missing yet; `iloc[-0:]` would
        # otherwise select -- and erase -- the entire column.
        if lag > 0:
            vintage.iloc[-lag:, position] = np.nan
    return vintage

def interpolate(data, method):
    """Fill missing values in `data` according to `method`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to fill.
    method : str
        "none"   -- return `data` unchanged (the same object, not a copy);
        "linear" -- `DataFrame.interpolate()` with its defaults;
        "mean"   -- fill NaNs in numeric columns with that column's mean.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If `method` is not one of the three supported names (previously the
        function silently returned None in that case).
    """
    if method == "none":
        return data
    if method == "linear":
        return data.interpolate()
    if method == "mean":
        # numeric_only=True keeps non-numeric columns (dates, strings) out of
        # the mean, which would otherwise raise on recent pandas versions.
        return data.fillna(data.mean(numeric_only=True))
    raise ValueError(
        "unknown interpolation method %r; expected 'none', 'linear' or 'mean'" % (method,)
    )

def gen_target_data(data, catalog, target, start_date, end_date, interp_method="none", ragged_ends=True):
    """Assemble the dataset used for one target series.

    Steps:
      1. build the vintage of `data` for the date window (`gen_vintage`);
      2. keep `date` plus the columns listed for this target in `catalog`;
      3. fill missing values with `interpolate(..., interp_method)`;
      4. if `ragged_ends`, re-apply `gen_vintage` so the trailing
         publication-lag gaps that step 3 may have filled are blanked again;
      5. index by `date`, drop rows that are entirely NaN, and keep only
         rows dated after 2002-01-01.

    Parameters
    ----------
    data : pandas.DataFrame
        Full database with a `date` column.
    catalog : pandas.DataFrame
        Variable catalog; the `octave_value`, `octave_volume` or
        `octave_services` column lists the data columns to keep for the
        corresponding target.
    target : str
        One of "x_world.sa", "x_vol_world2.sa", "x_servs_world.sa".
    start_date, end_date
        Inclusive window passed to `gen_vintage`.
    interp_method : str
        Passed to `interpolate`.
    ragged_ends : bool
        Whether to restore the ragged trailing edge after interpolation.

    Returns
    -------
    pandas.DataFrame
        Indexed by `date`.

    Raises
    ------
    ValueError
        If `target` is not one of the supported names.
    """
    catalog_cols = {
        "x_world.sa": "octave_value",
        "x_vol_world2.sa": "octave_volume",
        "x_servs_world.sa": "octave_services",
    }
    if target not in catalog_cols:
        raise ValueError(
            "unknown target %r; expected one of %s" % (target, sorted(catalog_cols))
        )
    catalog_col = catalog_cols[target]

    vintage = gen_vintage(data, catalog, start_date, end_date)

    # Columns listed for this target, de-duplicated while preserving order.
    selected = catalog.loc[catalog[catalog_col].notna(), catalog_col].to_list()
    keep = list(dict.fromkeys(["date"] + selected))

    # Interpolate the column-selected vintage, not the raw `data`: using
    # `data` here would discard both the date window and the column selection.
    tmp = interpolate(vintage.loc[:, keep], interp_method)

    if ragged_ends:
        tmp = gen_vintage(tmp, catalog, start_date, end_date)
    tmp = tmp.set_index("date").dropna(how="all")

    # NOTE(review): this cutoff is hard-coded rather than derived from
    # `start_date`; kept as-is so results for other start dates do not change.
    return tmp.loc[tmp.index > "2002-01-01", :]

# LSTM

In [5]:
# Parameters for the example dataset built below.
start_date = "2002-01-01"
global_end_date = "2020-06-01"
target = "x_world.sa"

# NOTE(review): `end_date` is not assigned in any cell visible here (only
# `global_end_date` is), so this call depends on state from another cell and
# fails with NameError on a fresh kernel unless that cell runs first -- confirm
# the intended value and define it in this cell.
x = gen_target_data(
    data=data,
    catalog=catalog,
    target=target,
    start_date=start_date,
    end_date=end_date,
    interp_method="none",
    ragged_ends=True,
)

In [6]:
# Display the dataset returned by gen_target_data above.
x

Unnamed: 0_level_0,x_cn.sa,x_us.sa,x_de.sa,x_jp.sa,x_nl.sa,x_kr.sa,x_fr.sa,x_it.sa,x_uk.sa,x_in.sa,...,tourists_seasia,tourists_world,container_hk.sa,transit_pa_canal.sa,transit_suez_canal.sa,air_freight_mem.sa,container_exports_la.sa,container_total_la.sa,x_servs_world.sa,x_world.sa
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-02-01,-0.018771,-0.003969,0.005755,0.009547,0.010611,0.007966,-0.006775,-0.006747,0.003458,-0.144847,...,,,0.035289,-0.015393,,,0.125083,0.223764,,
2002-03-01,-0.004273,0.003733,0.031766,0.046668,0.017382,0.083882,0.013695,-0.004742,0.020142,0.010228,...,,,0.029375,0.014661,,,0.010691,-0.056676,,
2002-04-01,0.061018,0.036352,0.007278,0.008106,-0.006057,-0.008464,0.055294,-0.007274,-0.014353,0.144986,...,,,0.052911,0.018911,,,-0.032867,0.006005,,
2002-05-01,-0.014122,0.003865,-0.011658,0.047498,0.053012,0.069513,0.007092,0.068378,0.118999,-0.015973,...,,,-0.045336,-0.014889,,,0.066421,0.021283,,
2002-06-01,0.038516,0.010141,0.130196,0.041509,0.054391,-0.069458,0.032787,0.045639,-0.071101,0.026890,...,,,0.063280,-0.003603,,,0.014788,0.046550,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-01,0.001607,0.002875,0.052599,0.036064,0.026391,-0.022330,0.027369,0.044171,-0.006086,0.040344,...,,,-0.029585,-0.044890,0.002669,0.006741,0.044552,0.031898,,
2017-09-01,0.001443,0.013778,0.005742,-0.006844,0.016104,0.140376,0.022117,0.013979,0.052094,0.152261,...,,,0.073232,0.003148,0.024273,-0.009536,-0.053111,-0.065569,0.030259,0.045609
2017-10-01,-0.001858,0.005324,-0.039406,-0.016356,-0.027894,-0.216023,-0.016290,-0.025371,-0.017394,-0.152876,...,,,-0.060861,,,0.029421,-0.032246,-0.035124,,
2017-11-01,0.062628,0.022953,,0.024355,,0.105096,,,0.025610,0.189576,...,,,0.033298,,,,0.299084,0.268703,,
