https://www.statsmodels.org/dev/generated/statsmodels.tsa.statespace.dynamic_factor_mq.DynamicFactorMQ.html
<br>https://medium.com/@philippetousignant/dynamic-factor-models-in-python-58d2d5252640
<br>https://www.ecb.europa.eu/pub/pdf/scpwps/ecbwp1564.pdf

In [None]:
import datetime
import requests
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import statsmodels.tsa.api as sm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse
import warnings
from statsmodels.tsa.arima.model import ARIMA

Static data

In [None]:
# CSV catalogue of data sources; its Name, Dependent, Frequency and API columns are read below.
source_file = 'Data Sources.csv'

In [None]:
# Proportion of rows (taken from the start of the sample) used for training.
train_ratio = 0.8

# Upper bounds of the (number of factors, factor lag order) search grid.
max_factors = 5
max_lags = 2

# Bound on the AR order tried for the baseline model.
max_ar = 9

# Number of steps requested from each forecast call.
forecast_steps = 1

Classes and functions

In [None]:
class Variables:
    """Variable bookkeeping derived from the data-sources table.

    Attributes:
        dep: names of the rows whose 'Dependent' column equals 'Y'.
        indep: names of every other row, excluding any name that contains
            'business expectations' (case-insensitive).
        freq: mapping of each 'Name' to its 'Frequency' value (all rows).
        non_stat: list that starts empty; names are appended to it by the
            stationarity check later in the notebook.
    """

    def __init__(self, df):
        dependent_rows = df.loc[df['Dependent'] == 'Y', 'Name']
        other_rows = df.loc[df['Dependent'] != 'Y', 'Name']

        self.dep = dependent_rows.tolist()

        # Keep the original row order while filtering out the excluded names.
        self.indep = []
        for name in other_rows.tolist():
            if 'business expectations' in name.lower():
                continue
            self.indep.append(name)

        self.freq = {name: frequency
                     for name, frequency in zip(df['Name'], df['Frequency'])}

        self.non_stat = []

In [None]:
def get_dos(url):
    """Download a series from ``url`` and return it as ``{Key: float(Value)}``.

    The JSON response is iterated directly; each item must provide a 'Key'
    and a 'Value' entry.
    """
    payload = requests.get(url=url).json()
    series = {}
    for record in payload:
        series[record['Key']] = float(record['Value'])
    return series

In [None]:
def get_sgx(url):
    """Download a series from ``url`` and return it as ``{date: float}``.

    Each entry of the response's 'data' list contributes one item: the key is
    the ``YYYYMMDD`` part of 'trading_time' (text before the first underscore)
    parsed into a ``datetime.date``, the value is the 'lp' field as a float
    (presumably the last price -- confirm against the API).
    """
    payload = requests.get(url=url).json()
    prices = {}
    for row in payload['data']:
        stamp = row['trading_time'].split('_')[0]
        trading_day = datetime.datetime.strptime(stamp, '%Y%m%d').date()
        prices[trading_day] = float(row['lp'])
    return prices

In [None]:
def get_mas(url, params=None):
    """Download a complete paginated series from ``url``.

    Args:
        url: endpoint to query.
        params: query parameters for the request. Defaults to
            ``{'fields': 'end_of_month,m1', 'limit': 100, 'offset': 0}``.
            The records are always read through their 'end_of_month' and
            'm1' keys, so custom params must still request those fields.

    Returns:
        dict mapping each record's 'end_of_month' value to ``float(m1)``.
    """
    # Work on a private copy. The previous implementation mutated a shared
    # default dict, so a second call started at the prior call's last offset.
    if params is None:
        query = {'fields': 'end_of_month,m1', 'limit': 100, 'offset': 0}
    else:
        query = dict(params)

    result = requests.get(url=url, params=query).json()['result']

    no_records = int(result['total'])
    pagesize = int(result['limit'])
    # Ceiling division: one extra page when there is a partial last page.
    pages = no_records // pagesize + (no_records % pagesize > 0)

    # parse first page
    data = {rec['end_of_month']: float(rec['m1']) for rec in result['records']}

    # parse rest of data to get complete series; step by the page size the
    # API reported rather than a hard-coded 100
    for page in range(1, pages):
        query['offset'] = page * pagesize
        result = requests.get(url=url, params=query).json()['result']
        data.update({rec['end_of_month']: float(rec['m1'])
                     for rec in result['records']})

    return data

In [None]:
def run_df(data, no_factors, f_lags):
    """Fit a ``DynamicFactorMQ`` model, returning ``None`` if a UserWarning fires.

    Args:
        data: endog passed to the model.
        no_factors: value passed as ``factors``.
        f_lags: value passed as ``factor_orders``.

    Returns:
        The fitted results object, or ``None`` when building or fitting the
        model emitted a ``UserWarning`` (the notebook uses this to detect
        models that do not converge).

    The number of monthly series is taken from the global ``variables.freq``.
    """
    n_monthly = len([k for k, v in variables.freq.items() if v == 'M'])

    try:
        # Escalate UserWarning locally so the except clause works even when
        # the notebook-wide warnings filter has not been installed yet.
        with warnings.catch_warnings():
            warnings.simplefilter('error', category=UserWarning)
            model = sm.DynamicFactorMQ(endog=data,
                                       k_endog_monthly=n_monthly,
                                       factors=no_factors, factor_orders=f_lags,
                                       idiosyncratic_ar1=True)
            return model.fit(maxiter=100, disp=False)

    except UserWarning:
        return None

In [None]:
def find_min_ic(models_dict):
    """Find, for each information criterion in ``ics``, the lowest-scoring entry.

    Args:
        models_dict: mapping of model specification -> dict that holds one
            value per criterion name.

    Returns:
        dict mapping criterion name -> ``(specification, value)``. A criterion
        for which no entry scores below infinity maps to ``(None, inf)``.
    """
    best = {}
    for ic in ics:
        best_spec, best_value = None, float('inf')
        for spec, entry in models_dict.items():
            # Strict comparison: on ties the first specification seen wins.
            if entry[ic] < best_value:
                best_spec, best_value = spec, entry[ic]
        best[ic] = (best_spec, best_value)
    return best

In [None]:
def compare(actual_df, predict_df):
    """Line up observed and predicted values of the dependent variable.

    Args:
        actual_df: frame holding the observed dependent-variable column.
        predict_df: frame holding the predicted dependent-variable column.

    Returns:
        DataFrame indexed by the dates with a non-missing observation, with
        an 'Actual' column and a left-merged 'Predicted' column.
    """
    observed = actual_df[variables.dep].dropna()
    actual = pd.DataFrame(observed.values, index=observed.index, columns=['Actual'])

    predicted = predict_df[variables.dep].rename(columns={variables.dep[0]: 'Predicted'})

    return actual.merge(predicted, how='left', left_index=True, right_index=True)

In [None]:
def split_train_test(data, train_prop):
    """Split ``data`` positionally into leading (train) and trailing (test) rows.

    Args:
        data: object supporting ``len()`` and ``.iloc`` slicing.
        train_prop: proportion of rows assigned to the training part; the
            row count is ``round(train_prop * len(data))``.

    Returns:
        Tuple ``(train, test)``.
    """
    cut = round(train_prop * len(data))
    return data.iloc[:cut], data.iloc[cut:]

In [None]:
def run_train_test(full_data, train_data, forecast_steps, factor, lag):
    """Fit on ``train_data``, then forecast step by step through the rest of ``full_data``.

    Args:
        full_data: the complete dataset.
        train_data: the leading rows of ``full_data`` used for fitting.
        forecast_steps: horizon of each forecast and stride between origins.
        factor: number of factors passed to ``run_df``.
        lag: factor order passed to ``run_df``.

    Returns:
        ``None`` when ``run_df`` yields no model; otherwise a dict with the
        last applied 'model', the concatenated 'forecasts', and one entry per
        name in ``ics`` read from that model.
    """
    model = run_df(train_data, factor, lag)
    if not model:
        return None  # exit func if model does not converge

    holdout_rows = len(full_data) - len(train_data)
    fc_df = pd.DataFrame()

    # Negative stop positions walk the end of the sample forward towards the last row.
    for stop in range(-holdout_rows, 0, forecast_steps):
        model = model.apply(endog=full_data.iloc[:stop])
        fc_df = pd.concat([fc_df, model.forecast(steps=forecast_steps)])

    export = {'model': model, 'forecasts': fc_df}
    export.update({ic: getattr(model, ic) for ic in ics})

    return export

Load raw data

In [None]:
# Load the data-source catalogue and derive the variable lists / frequency map from it.
sources = pd.read_csv(source_file, encoding='utf-8')
variables = Variables(sources)

In [None]:
# Show which attributes the Variables instance holds.
variables.__dict__.keys()

In [None]:
# Dispatch table: provider key -> function that downloads a series from that provider.
requests_funcs = {'mas':get_mas, 'sgx':get_sgx, 'singstat':get_dos}

In [None]:
# Group the sources by provider: a source is assigned to provider k when k occurs
# as a substring of its API value (presumably the endpoint URL -- confirm).
apis = {'mas':[], 'sgx':[], 'singstat':[]}
for k in apis.keys():
    for i in zip(sources['Name'], sources['Frequency'], sources['API']):
        # i is a (Name, Frequency, API) tuple
        if k in i[-1]:
            apis[k].append({'Name':i[0], 'Frequency':i[1], 'API':i[2]})

In [None]:
# pull data
# ts_data maps each source Name to the dict returned by its provider's download function
ts_data = {}
for k,v in apis.items():
    for dim in v:
        ts_data[dim['Name']] = requests_funcs[k](dim['API'])

In [None]:
# pass data into pandas series
# only the three (frequency, provider) combinations handled below create an entry in ts_pd
ts_pd = {}
for k,v in apis.items():
    for dim in v:

        if dim['Frequency']=='Q' and k=='singstat':
            # period label = first token + reversed last token of each key
            # (a key like '2020 1Q' would become '2020Q1' -- assumed key format, confirm against the API)
            periods = [p.split()[0]+p.split()[-1][::-1] for p in list(ts_data[dim['Name']])]
            periods = pd.PeriodIndex(periods, freq=dim['Frequency'])
            ts_pd[dim['Name']] = pd.Series(ts_data[dim['Name']].values(), index=periods)

        elif dim['Frequency']=='M':
            # roll every timestamp forward to its month end
            periods = pd.to_datetime(list(ts_data[dim['Name']])) + pd.tseries.offsets.MonthEnd(0)
            ts_pd[dim['Name']] = pd.Series(ts_data[dim['Name']].values(), index=periods)

        elif dim['Frequency']=='D' and k=='sgx':
            # collapse daily observations to monthly by keeping the last one in each month
            periods = pd.to_datetime(list(ts_data[dim['Name']]))
            ts_pd[dim['Name']] = pd.Series(ts_data[dim['Name']].values(), index=periods).resample('M').last()
            variables.freq[dim['Name']] = 'M' # update to 'M' since resampled

In [None]:
# Names of all downloaded series.
list(ts_data)

In [None]:
# Inspect the downloaded dict of one series.
ts_data['Business Expectations Of The Manufacturing Sector - Forecast By Industry']

Check and ensure series' stationarity

In [None]:
# if p-value >0.05, var is non-stationary
# (element [1] of the adfuller result is compared against 0.05)
for i in variables.indep:

    # if unit root, take % yoy growth (which also removes seasonality)
    if adfuller(ts_pd[i])[1]>0.05:

        # year-on-year change: 12 periods back for monthly, 4 for quarterly series
        if variables.freq[i]=='M':
            ts_pd[i] = ts_pd[i].pct_change(periods=12) * 100
        elif variables.freq[i]=='Q':
            ts_pd[i] = ts_pd[i].pct_change(periods=4) * 100
        # a series with any other frequency is left untransformed but is still reported and recorded
        print(f'[Non-stationary] {i}')
        variables.non_stat.append(i)

    else:
        print(f'[Stationary] {i}')

Resample all series to monthly frequency and combine them into one dataframe. The column order must follow the layout required by the <i>statsmodels</i> documentation:
- dependent variable leftmost
- monthly data in the first columns
- quarterly data in the last columns

In [None]:
# resample all series to monthly and start from first valid index
for series, freq in zip(sources['Name'], sources['Frequency']):

    if freq=='Q':
        # put the quarterly series on a monthly grid (asfreq fills nothing in), then
        # rebuild the index as timestamps from its formatted dates
        ts_pd[series] = ts_pd[series].resample('M', convention='end').asfreq()
        ts_pd[series].index = pd.to_datetime(ts_pd[series].index.strftime('%Y-%m-%d'))

    # trim leading missing values so each series begins at its first valid observation
    if ts_pd[series].index[0]!=ts_pd[series].first_valid_index():
        ts_pd[series] = ts_pd[series][ts_pd[series].first_valid_index():]

In [None]:
# pass all series into dataframe, start dataframe from first year of GDP growth data
ts_df = pd.DataFrame(ts_pd)
# keep rows whose year is >= the year of the dependent variable's first valid observation
ts_df = ts_df.loc[ts_df.index.year>=ts_df.loc[:,variables.dep[0]].first_valid_index().year]

# rearrange columns in correct order for factor modelling as explained in markdown above
# order: dependent variable(s), then the other 'M' series, then the other 'Q' series
ts_df = ts_df[variables.dep+\
              [k for k,v in variables.freq.items() if v=='M' and k!=variables.dep[0]]+\
              [k for k,v in variables.freq.items() if v=='Q' and k!=variables.dep[0]]]
ts_df

Explore data

In [None]:
# Summary statistics per column, rounded to 3 decimals.
ts_df.describe().round(3)

In [None]:
# Draw one figure per column of ts_df.
for i in ts_df.columns:
    plt.figure()
    plt.title(i)
    # Forward-fill because quarterly data has blanks when freq=monthly.
    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
    plt.plot(ts_df[i].ffill())
    plt.tight_layout()

Instantiate and fit DF model

In [None]:
# to catch warnings for models that do not converge
# from here on every UserWarning is raised as an exception; run_df catches it and returns None
warnings.filterwarnings('error', category=UserWarning)

In [None]:
# Names of the information-criterion attributes read from each fitted model.
ics = ['aic','bic','hqic']

In [None]:
# Grid search over (number of factors, factor lag order), fitted on the full dataset.
# models maps (factor, lag) -> {'model': results, 'aic': ..., 'bic': ..., 'hqic': ...}
models = {}
for factor in range(1, max_factors+1):
    for lag in range(1, max_lags+1):
        print(f'({factor}, {lag})', end=' ')
        models[(factor,lag)] = {'model':run_df(ts_df, factor, lag)}

        # if model converges, extract information criteria. if not, remove model from dict.
        if models[(factor,lag)]['model']:
            for ic in ics:
                models[(factor,lag)][ic] = getattr(models[(factor, lag)]['model'], ic)
        else:
            models.pop((factor,lag))
        print('done')

In [None]:
# (factor, lag) specifications that were kept, i.e. for which run_df returned a model.
models.keys()

Find model with lowest information criterion

In [None]:
# Per criterion: (specification, value) of the lowest-scoring model.
models_min_ic = find_min_ic(models)
# Criterion used to choose the model for the comparison below.
mod_ic = 'bic'
use_model = models[models_min_ic[mod_ic][0]]

Compare actual vs predicted

In [None]:
# Predictions from the selected model, lined up against the observed dependent variable.
pred = use_model['model'].predict()
compare_df = compare(ts_df, pred)
# Plot the last 40 rows of the comparison (4*10, presumably ten years of quarterly observations).
compare_df.iloc[-(4*10):].plot(title=f"{variables.dep[0]}\nDynamic factor model\n\
(Factors: {models_min_ic[mod_ic][0][0]}, Order: {models_min_ic[mod_ic][0][1]})")

In [None]:
# Mean squared error between the 'Actual' and 'Predicted' columns of compare_df.
mean_sq_err = mse(compare_df['Actual'], compare_df['Predicted'])
print(f'In-sample MSE of GDP DF forecast: {round(mean_sq_err,3)}')

Pseudo out-of-sample forecasts, i.e., train on the first x% of the dataset and test on the remaining (100-x)%

In [None]:
# Leading train_ratio share of the rows for training, the remainder for testing.
train_ts, test_ts = split_train_test(ts_df, train_ratio)

In [None]:
# Same (factor, lag) grid as above, but fitted on the training rows and forecast over the rest.
oos_models = {}
# numpy divide-by-zero warnings are ignored while the models run
with np.errstate(divide='ignore'):
    for factor in range(1, max_factors+1):
        for lag in range(1, max_lags+1):
            print(f'({factor}, {lag})', end=' ')
            oos_models[(factor,lag)] = run_train_test(ts_df, train_ts, forecast_steps, factor, lag)

            # if model does not converge, remove from dict
            if not oos_models[(factor,lag)]:
                oos_models.pop((factor,lag))
            print('done')

In [None]:
# (factor, lag) specifications for which run_train_test returned a result.
oos_models.keys()

In [None]:
# Per criterion: (specification, value) of the lowest-scoring train/test model.
oos_models_min_ic = find_min_ic(oos_models)
# Criterion used to choose the model for the comparison below.
oos_ic = 'bic'
use_oos_model = oos_models[oos_models_min_ic[oos_ic][0]]

In [None]:
# Score the held-out forecasts collected by run_train_test. The model's own
# predict() output also spans the training rows, so it is not an
# out-of-sample measure.
pred_oos = use_oos_model['forecasts']
# compare() left-merges onto every observed date; dates without a forecast
# (the training window) come back as NaN and are dropped here.
compare_oos = compare(ts_df, pred_oos).dropna()
compare_oos.iloc[-(4*10):].plot(title=f"{variables.dep[0]}\nDynamic factor model\n\
(Factors: {oos_models_min_ic[oos_ic][0][0]}, Order: {oos_models_min_ic[oos_ic][0][1]})")

In [None]:
# Mean squared error between the 'Actual' and 'Predicted' columns of compare_oos.
mean_sq_err_oos = mse(compare_oos['Actual'], compare_oos['Predicted'])
print(f'Out-of-sample MSE of GDP DF forecast: {round(mean_sq_err_oos,3)}')

Run an AR model as a baseline vs DF

In [None]:
# Dependent variable only, restricted to the dates where it is observed.
ar_df = ts_df[variables.dep].dropna()

In [None]:
# Same 0.05 p-value rule as for the independent variables; this cell only reports, it does not transform.
if adfuller(ar_df[variables.dep[0]])[1]>0.05:
    print(f'[Non-stationary] {variables.dep[0]}')
else:
    print(f'[Stationary] {variables.dep[0]}')

In [None]:
# Candidate ARIMA orders (p, 0, 0) for p = 1..max_ar. The bound is inclusive,
# consistent with how max_factors and max_lags are used in the factor-model grid.
ar_models = {(p,0,0):None for p in range(1, max_ar+1)}

In [None]:
# Fit one ARIMA model per candidate order and store its information criteria next to it.
for i in ar_models:
    print(f'{i}', end=' ')
    ar_models[i] = {'model':ARIMA(ar_df[variables.dep[0]], order=i).fit()}
    for ic in ics:
        ar_models[i][ic] = getattr(ar_models[i]['model'], ic)
    print('done')

In [None]:
# Per criterion: (order, value) of the lowest-scoring AR model.
ar_models_min_ic = find_min_ic(ar_models)
# Criterion used to choose the baseline (note: 'aic' here, while the factor-model cells use 'bic').
ar_ic = 'aic'
use_ar_model = ar_models[ar_models_min_ic[ar_ic][0]]

In [None]:
# Wrap the predictions in a frame named after the dependent variable so compare() can select the column.
pred_ar = pd.DataFrame(use_ar_model['model'].predict())
pred_ar.columns = variables.dep
compare_ar = compare(ts_df, pred_ar)
# Plot the last 40 rows of the comparison.
compare_ar.iloc[-(4*10):].plot(title=f"{variables.dep[0]}\nARIMA\n\
(p: {ar_models_min_ic[ar_ic][0][0]}, d: {ar_models_min_ic[ar_ic][0][1]}, q: {ar_models_min_ic[ar_ic][0][2]})")

In [None]:
# Mean squared error between the 'Actual' and 'Predicted' columns of compare_ar.
mean_sq_err_ar = mse(compare_ar['Actual'], compare_ar['Predicted'])
print(f'In-sample MSE of GDP AR forecast: {round(mean_sq_err_ar,3)}')