In [1]:
import argparse
import pandas as pd
import os

from forecast.forecast_funcs import run_forecast
from metadata.etfs import etfs_large, etfs_small
from utils.conn_data import save_pickle
from utils.parsers import str_2_bool

from statsmodels.tsa.api import VAR
from models.ModelClasses import LassoWrapper, LinearRegressionWrapper, RandomForestWrapper, SVMWrapper
from utils.parsers import add_and_keep_lags_only
from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

from copy import copy

parser = argparse.ArgumentParser(description="Run forecast.")

estimation_window=12 * 7
p=-1
correl_window=100000
beta_threshold=0.4
pval_threshold=0.05
fix_start=True
incercept=True
fs_method="rfecv-svm"
opt_k_method="no"
clustering_method="no"
n_clusters=0
intra_cluster_selection="no"
data_name="monetary-policy-processed"
inputs_path=os.path.join(os.getcwd(), "data", "inputs")
outputs_path=os.path.join(os.getcwd(), "data", "outputs")
target="ldEXME"

  from .autonotebook import tqdm as notebook_tqdm


rpy2 package not installed. You wont be able to run seqICP model.


In [2]:
fix_start = str_2_bool(fix_start)
incercept = str_2_bool(incercept)

data = pd.read_csv(os.path.join(inputs_path, f'{data_name}.csv'))

# fix columns
if "Unnamed: 0" in data.columns:
    data = data.drop(["Unnamed: 0"], axis=1)

# fix dates
data["date"] = pd.to_datetime(data["date"])
data = data.set_index("date")

etfs_large = etfs_large.copy()

target = target

if target != "ldEXME":
    # select etfs to remove
    removed_etfs = [etf for etf in etfs_large if etf != target]

    # delete etfs
    selected_data = data.drop(removed_etfs, axis=1)
else:
    selected_data = data.copy()

In [3]:
data=selected_data
target=target
fix_start=fix_start
estimation_window=estimation_window
correl_window=correl_window
p=p
beta_threshold=beta_threshold
pval_threshold=pval_threshold
incercept=incercept
fs_method=fs_method
opt_k_method=opt_k_method
clustering_method=clustering_method
n_clusters=n_clusters
intra_cluster_selection=intra_cluster_selection

In [4]:
step = 90

if fix_start or (step == 0):
    start = 0
else:
    start += 1

train_df = data.iloc[start:(estimation_window + step), :]
test_df = data.iloc[start:(estimation_window + step + 1), :]

# compute within c1luster correlation
if clustering_method != "no":
    if rolling_cluster:
        labelled_clusters = clusters_series[[str(step)]]
        labelled_clusters.columns = ["cluster"]
        labelled_clusters.reset_index(inplace = True)
    else:
        clusters = cm.compute_clusters(data=data, target=target, n_clusters=n_clusters, clustering_method=clustering_method)  
        labelled_clusters = cm.add_cluster_description(clusters=clusters)
    
    if intra_cluster_selection == "rank":
        ranks = cm.compute_within_cluster_corr_rank(data=train_df,
                                                    target=target,
                                                    labelled_clusters=labelled_clusters,
                                                    correl_window=correl_window)
        # select features and time window
        last_row = pd.DataFrame(ranks.iloc[-1])
        selected_columns = list(last_row[last_row == 1].dropna().index)
    elif intra_cluster_selection == "pca":
        train_pcs_df = cm.compute_within_cluster_pca(data=train_df,
                                                        labelled_clusters=labelled_clusters,
                                                        n_pcs=1)
        
        test_pcs_df = cm.compute_within_cluster_pca(data=test_df,
                                                    labelled_clusters=labelled_clusters,
                                                    n_pcs=1)

        train_df = pd.concat([train_df, train_pcs_df], axis=1)
        test_df = pd.concat([test_df, test_pcs_df], axis=1)
        selected_columns = list(train_pcs_df.columns)
    else:
        raise Exception(f"intra cluster selection method not registered: {intra_cluster_selection}")
else:
    labelled_clusters = pd.DataFrame([{"fred": target, "cluster": 1, "description": target}])
    selected_columns = list(train_df.drop([target], axis=1).columns)

train_df = train_df[[target] + selected_columns]

# zscore of train data
mean = train_df.mean()
std = train_df.std()

train_df = (train_df - mean) / std

# select optimal lag
if p == -1:
    var_select_model = VAR(train_df)
    selected_p = var_select_model.select_order(maxlags=6)
    selected_p = selected_p.selected_orders["aic"]
    if selected_p == 0:
        selected_p = 1
else:
    selected_p = p

test_df = test_df[[target] + selected_columns].iloc[(estimation_window + step - selected_p):(estimation_window + step + 1), :]

# zscore of test data
test_df = (test_df - mean) / std

# subset data into train and test
Xt_train = train_df.drop([target], axis=1)
yt_train = train_df[[target]]

Xt_test = test_df.drop([target], axis=1)
yt_test = test_df[[target]]

  self._init_dates(dates, freq)


In [5]:
if '-lin' in fs_method:
    model_wrapper = LinearRegressionWrapper(model_params={'fit_intercept': True})
elif '-rf'in fs_method:
    model_wrapper = RandomForestWrapper()
elif '-svm' in fs_method:
    model_wrapper = SVMWrapper()
else:
    raise Exception(f'Feature Selection Model not recognized: {fs_method}')

rfe = RFE(estimator=model_wrapper.ModelClass, step=1)

pipeline = Pipeline([
    ('feature_selector', rfe),
    ('model', model_wrapper.ModelClass)
])

search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=model_wrapper.param_grid,
    n_iter=10,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42
)

Xt_train = pd.concat([yt_train, Xt_train], axis=1)
Xt_test = pd.concat([yt_test, Xt_test], axis=1)

# create lags of Xt variables
Xt_train = add_and_keep_lags_only(data=Xt_train, lags=selected_p)
Xt_test = add_and_keep_lags_only(data=Xt_test, lags=selected_p)

Xt_train = Xt_train.dropna()
yt_train = yt_train.loc[Xt_train.index]

search_output = search.fit(Xt_train, yt_train.values.ravel())

selected_indices = search.best_estimator_.named_steps['feature_selector'].get_support()
selected_variables = Xt_train.columns[selected_indices]

Traceback (most recent call last):
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/base.py", line 702, i

Traceback (most recent call last):
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/danielco/miniconda3/envs/cml-fin/lib/python3.8/site-packages/sklearn/base.py", line 702, i

ValueError: when `importance_getter=='auto'`, the underlying estimator SVR should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [14]:
search.best_params_

{'model__max_depth': 2,
 'model__max_features': 'log2',
 'model__n_estimators': 300}

In [15]:
selected_indices

array([ True,  True,  True,  True,  True, False, False,  True,  True,
       False,  True,  True, False,  True, False,  True, False,  True,
       False, False,  True, False, False,  True,  True, False, False,
       False,  True,  True, False,  True,  True, False, False, False,
        True, False, False, False])

In [16]:
selected_variables

Index(['ldEXME(t-1)', 'ldEXME(t-2)', 'ldEXME(t-3)', 'ldEXME(t-4)', 'dCMR(t-1)',
       'dCMR(t-4)', 'ldFCIr(t-1)', 'ldFCIr(t-3)', 'ldFCIr(t-4)',
       'ldRIMFr(t-2)', 'ldRIMFr(t-4)', 'ldMAr(t-2)', 'ldCHFr(t-1)',
       'ldCHFr(t-4)', 'ldtotOAr(t-1)', 'ldGDPch(t-1)', 'ldGDPch(t-2)',
       'ldGDPch(t-4)', 'ldGDPeu(t-1)', 'dCPI(t-1)'],
      dtype='object')