In [19]:
import argparse
import pandas as pd
import os
import numpy as np

from metadata.etfs import etfs_large
from utils.parsers import str_2_bool

from statsmodels.tsa.api import VAR
import lingam
import statsmodels.api as sm

from utils.parsers import add_and_keep_lags_only

# Run configuration for a single forecast experiment. These module-level names are
# read by the cells below.
# NOTE(review): no arguments are added to `parser` and it is never parsed in the cells
# shown here — presumably a leftover from the command-line version of this script.
parser = argparse.ArgumentParser(description="Run forecast.")

# Number of rows in the first estimation (training) window.
# NOTE(review): 12 * 7 presumably means 7 years of monthly observations — confirm.
estimation_window=12 * 7
# VAR lag order; -1 makes a later cell pick the order by AIC instead.
p=1
# Passed as `correl_window` to the within-cluster correlation ranking (only used when
# clustering is enabled).
correl_window=100000
# Minimum absolute adjacency coefficient for a variable to count as a parent of the target.
beta_threshold=0.4
# Not referenced by the cells shown here.
pval_threshold=0.05
# True -> every window starts at row 0; False -> the window start moves with the step.
fix_start=True
# Whether a constant column is added to the OLS regressors. The spelling "incercept" is
# used consistently by the later cells, so it is kept as-is here.
incercept=True
# Method labels; `fs_method` and `opt_k_method` are not read by the cells shown here.
fs_method="var-lingam"
opt_k_method="no"
# "no" disables the clustering branch entirely.
clustering_method="no"
n_clusters=0
# "rank" or "pca" when clustering is enabled; ignored while clustering_method == "no".
intra_cluster_selection="no"
# Base name of the CSV file read from `inputs_path`.
data_name="etfs_macro_large"
# Both paths are resolved relative to the current working directory of the kernel.
inputs_path=os.path.join(os.getcwd(), "data", "inputs")
outputs_path=os.path.join(os.getcwd(), "data", "outputs")
# Column to forecast.
target="SPY"

In [20]:
# Route the boolean flags through the project's parser helper.
fix_start = str_2_bool(fix_start)
incercept = str_2_bool(incercept)

# Read the raw input table for the configured dataset.
data = pd.read_csv(os.path.join(inputs_path, f'{data_name}.csv'))

# Discard the stray positional-index column if the CSV carries one.
if "Unnamed: 0" in data.columns:
    data = data.drop(columns=["Unnamed: 0"])

# Parse the date column and use it as the row index.
data["date"] = pd.to_datetime(data["date"])
data = data.set_index("date")

# Work on a private copy of the imported ETF list.
etfs_large = etfs_large.copy()

if target == "ldEXME":
    # This target keeps every column of the input table.
    selected_data = data.copy()
else:
    # Keep the target and drop every other ETF listed in `etfs_large`.
    removed_etfs = [etf for etf in etfs_large if etf != target]
    selected_data = data.drop(columns=removed_etfs)

In [21]:
# Mirrors a keyword-argument call: only the first line changes anything — from here on
# `data` refers to the column-filtered frame instead of the raw table. Every other line
# assigns a name to itself and leaves the kernel state unchanged.
data=selected_data
target=target
fix_start=fix_start
estimation_window=estimation_window
correl_window=correl_window
p=p
beta_threshold=beta_threshold
pval_threshold=pval_threshold
incercept=incercept
fs_method=fs_method
opt_k_method=opt_k_method
clustering_method=clustering_method
n_clusters=n_clusters
intra_cluster_selection=intra_cluster_selection

In [45]:
# Walk-forward step replayed by this cell.
step = 38

# An expanding window (fix_start) always begins at row 0; otherwise the window start
# moves forward one row per step. The offset is derived from `step` instead of being
# incremented in place, so the cell does not need a pre-existing `start` on a fresh
# kernel and does not drift when it is re-run.
if fix_start or (step == 0):
    start = 0
else:
    start = step

# The test slice is the training slice plus the single row being forecast.
train_df = data.iloc[start:(estimation_window + step), :]
test_df = data.iloc[start:(estimation_window + step + 1), :]

# compute within cluster correlation
# NOTE(review): `rolling_cluster`, `clusters_series` and `cm` are not defined in the
# cells shown here; this branch is only reachable when clustering_method != "no".
if clustering_method != "no":
    if rolling_cluster:
        labelled_clusters = clusters_series[[str(step)]]
        labelled_clusters.columns = ["cluster"]
        labelled_clusters.reset_index(inplace = True)
    else:
        clusters = cm.compute_clusters(data=data, target=target, n_clusters=n_clusters, clustering_method=clustering_method)
        labelled_clusters = cm.add_cluster_description(clusters=clusters)

    if intra_cluster_selection == "rank":
        ranks = cm.compute_within_cluster_corr_rank(data=train_df,
                                                    target=target,
                                                    labelled_clusters=labelled_clusters,
                                                    correl_window=correl_window)
        # select features and time window: keep the columns whose rank is 1 in the last row
        last_row = pd.DataFrame(ranks.iloc[-1])
        selected_columns = list(last_row[last_row == 1].dropna().index)
    elif intra_cluster_selection == "pca":
        train_pcs_df = cm.compute_within_cluster_pca(data=train_df,
                                                     labelled_clusters=labelled_clusters,
                                                     n_pcs=1)

        test_pcs_df = cm.compute_within_cluster_pca(data=test_df,
                                                    labelled_clusters=labelled_clusters,
                                                    n_pcs=1)

        # append the principal components and use them as the candidate features
        train_df = pd.concat([train_df, train_pcs_df], axis=1)
        test_df = pd.concat([test_df, test_pcs_df], axis=1)
        selected_columns = list(train_pcs_df.columns)
    else:
        raise Exception(f"intra cluster selection method not registered: {intra_cluster_selection}")
else:
    # no clustering: a single placeholder cluster for the target, all other columns are candidates
    labelled_clusters = pd.DataFrame([{"fred": target, "cluster": 1, "description": target}])
    selected_columns = list(train_df.drop([target], axis=1).columns)

train_df = train_df[[target] + selected_columns]

# zscore of train data
mean = train_df.mean()
std = train_df.std()

train_df = (train_df - mean) / std

# select optimal lag: p == -1 asks for AIC-based selection (at least one lag is kept)
if p == -1:
    var_select_model = VAR(train_df)
    selected_p = var_select_model.select_order(maxlags=6)
    selected_p = selected_p.selected_orders["aic"]
    if selected_p == 0:
        selected_p = 1
else:
    selected_p = p

# Keep the last `selected_p` training rows plus the forecast row. Positions are taken
# relative to `start`, because `test_df` was itself sliced from `start`; with
# start == 0 this is exactly the window obtained by indexing with absolute positions.
test_end = estimation_window + step - start
test_df = test_df[[target] + selected_columns].iloc[(test_end - selected_p):(test_end + 1), :]

# zscore of test data, using the statistics of the training window
test_df = (test_df - mean) / std

# subset data into train and test
Xt_train = train_df.drop([target], axis=1)
yt_train = train_df[[target]]

Xt_test = test_df.drop([target], axis=1)
yt_test = test_df[[target]]

In [48]:
# Put the target first, followed by the candidate features, then show the pairwise
# correlation matrix as the cell output.
data_train = pd.concat([yt_train, Xt_train], axis="columns")
data_train.corr()

Unnamed: 0,SPY,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,...,DDURRG3M086SBEA,DNDGRG3M086SBEA,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,DTCOLNVHFNM,DTCTHFNM,INVEST
SPY,1.000000,0.142770,0.103121,0.130010,0.304004,0.228121,0.177073,0.223957,0.154349,0.137758,...,0.029958,0.217598,-0.062363,0.046977,0.072587,0.005625,0.331569,-0.042136,-0.031067,0.170683
RPI,0.142770,1.000000,0.700497,0.177957,0.095723,0.072420,0.158523,0.161066,0.153914,0.057759,...,-0.052165,-0.168500,-0.052257,0.109515,-0.016895,0.028417,0.074530,0.030869,-0.065077,-0.019175
W875RX1,0.103121,0.700497,1.000000,0.193139,0.156865,0.061374,0.282955,0.281456,0.251500,0.135722,...,-0.104118,-0.257470,-0.159964,0.077760,0.010441,-0.003972,0.115000,0.013992,-0.145309,-0.090076
DPCERA3M086SBEA,0.130010,0.177957,0.193139,1.000000,0.502641,0.773954,0.243841,0.190236,0.168984,0.105630,...,0.155342,-0.019522,0.453605,-0.014261,0.112786,-0.195473,0.100408,0.181452,0.005529,0.003641
CMRMTSPLx,0.304004,0.095723,0.156865,0.502641,1.000000,0.532670,0.473347,0.449688,0.411016,0.325884,...,0.051208,-0.036873,0.206899,0.081710,-0.044913,0.073750,-0.047709,0.105416,0.081111,0.013101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CES3000000008,0.005625,0.028417,-0.003972,-0.195473,0.073750,-0.167977,0.049695,0.049606,0.054038,-0.009293,...,-0.075734,-0.061284,-0.152853,0.531485,-0.236047,1.000000,-0.090383,-0.074853,0.169523,0.093827
UMCSENTx,0.331569,0.074530,0.115000,0.100408,-0.047709,0.045877,-0.101191,-0.083237,-0.090791,-0.065559,...,0.075095,-0.001604,0.057299,-0.045283,0.059161,-0.090383,1.000000,0.038717,-0.106998,-0.054975
DTCOLNVHFNM,-0.042136,0.030869,0.013992,0.181452,0.105416,0.251633,0.078052,0.101862,0.132685,0.096671,...,0.139067,-0.076151,0.288018,0.026515,0.040895,-0.074853,0.038717,1.000000,0.270325,-0.204533
DTCTHFNM,-0.031067,-0.065077,-0.145309,0.005529,0.081111,0.039565,0.054081,0.096503,0.137666,0.134144,...,-0.103690,-0.022914,0.024984,0.082175,-0.056794,0.169523,-0.106998,0.270325,1.000000,-0.174247


In [50]:
# Copy the full training correlation matrix to the system clipboard.
# NOTE(review): this needs a clipboard backend, so it presumably fails on a headless
# kernel — confirm before keeping it in a notebook meant for Restart & Run All.
data_train.corr().to_clipboard()

In [76]:
# Pairwise correlations of the training frame, reduced to the lower triangle
# (diagonal included) so that every pair appears only once.
corr = data_train.corr()
# `DataFrame.where` returns a new masked frame. Assigning through `corr.values[...]`
# only works while `.values` is a writable view of the frame, which is not the case
# when pandas copy-on-write is active.
corr = corr.where(np.tril(np.ones(corr.shape, dtype=bool)))

# melt the matrix to get it in stacked form (one row per remaining pair)
melted_corr = corr.reset_index().melt(id_vars='index', var_name='feature2', value_name='corr').dropna()
melted_corr.columns = ['feature1', 'feature2', 'corr']  # Rename columns

# reset index for a cleaner look
melted_corr = melted_corr.reset_index(drop=True)

# create abs value corr
melted_corr['corr_abs'] = melted_corr['corr'].abs()

# exclude main diagonal: compare the two labels instead of testing corr == 1, so that
# perfectly correlated *distinct* features are kept and float rounding on the diagonal
# cannot let a self-pair through
melted_corr = melted_corr.loc[melted_corr['feature1'] != melted_corr['feature2']]

In [87]:
from sklearn.linear_model import LassoCV

# Grid of candidate penalties. It is kept small on purpose: the regularization should
# stay mild so that var-lingam does most of the selection work.
alphas = np.linspace(start=0.0001, stop=0.05, num=100)

# define and fit Lasso with 5-fold cross-validation over the alpha grid
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=1000000, random_state=42)
lasso_cv = lasso_cv.fit(Xt_train, yt_train.to_numpy().ravel())

# label the fitted coefficients with their feature names and keep the non-zero ones
lasso_coefficients = pd.Series(data=lasso_cv.coef_, index=Xt_train.columns)
selected_features = lasso_coefficients.loc[lasso_coefficients.ne(0)].index.tolist()

In [88]:
# Pairwise correlations among the Lasso-selected features, reduced to the lower
# triangle (diagonal included) so that every pair appears only once.
corr = data_train[selected_features].corr()
# `DataFrame.where` returns a new masked frame. Assigning through `corr.values[...]`
# only works while `.values` is a writable view of the frame, which is not the case
# when pandas copy-on-write is active.
corr = corr.where(np.tril(np.ones(corr.shape, dtype=bool)))

# melt the matrix to get it in stacked form (one row per remaining pair)
melted_corr = corr.reset_index().melt(id_vars='index', var_name='feature2', value_name='corr').dropna()
melted_corr.columns = ['feature1', 'feature2', 'corr']  # Rename columns

# reset index for a cleaner look
melted_corr = melted_corr.reset_index(drop=True)

# create abs value corr
melted_corr['corr_abs'] = melted_corr['corr'].abs()

# exclude main diagonal: compare the two labels instead of testing corr == 1, so that
# perfectly correlated *distinct* features are kept and float rounding on the diagonal
# cannot let a self-pair through
melted_corr = melted_corr.loc[melted_corr['feature1'] != melted_corr['feature2']]

In [89]:
# Show the feature pairs from the strongest to the weakest absolute correlation.
melted_corr.sort_values(by='corr_abs', ascending=False)

Unnamed: 0,feature1,feature2,corr,corr_abs
12,CONSPI,RPI,-0.791223,0.791223
316,BAA,AAA,0.784665,0.784665
109,AWHMAN,CES1021000001,0.708830,0.708830
185,AAAFFM,AWHMAN,-0.674246,0.674246
56,UEMP27OV,UEMPMEAN,0.657671,0.657671
...,...,...,...,...
166,EXJPUSx,AWOTMAN,-0.001880,0.001880
144,EXJPUSx,USTRADE,0.001701,0.001701
348,CES2000000008,AAAFFM,-0.001487,0.001487
247,WPSID62,TOTRESNS,-0.001424,0.001424


In [35]:
# Containers for the results of this forecast step.
dags = {}
parents_of_target = []
predictions = []

# Target first, then the candidate features.
# NOTE(review): this cell reads `Xt_train` / `Xt_test` at the top and rebinds both to
# their lagged versions at the bottom, so re-running it without re-running the split
# cell works on different inputs.
data_train = pd.concat([yt_train, Xt_train], axis=1)
data_test = pd.concat([yt_test, Xt_test], axis=1)

# run VAR-LiNGAM
var_lingam = lingam.VARLiNGAM(lags=selected_p, criterion="none")
var_lingam_fit = var_lingam.fit(data_train)

# build labels and collect every variable whose coefficient on the target row exceeds
# the threshold in absolute value
# NOTE(review): the loop visits matrices 0 .. selected_p - 1; VARLiNGAM presumably
# exposes selected_p + 1 matrices (instantaneous plus one per lag), in which case the
# last lag is never inspected — confirm whether that is intended.
labels = {}
selected_variables = []
target_row = "{target}(t)".format(target=target)
for i in range(selected_p):

    suffix = "(t)" if i == 0 else f"(t-{i})"
    var_names = [f"{colname}{suffix}" for colname in data_train.columns]
    labels[f'labels{i}'] = var_names

    # public accessor for the fitted adjacency matrices
    B = var_lingam_fit.adjacency_matrices_[i]
    B_df = pd.DataFrame(B, columns=labels[f'labels{i}'], index=labels['labels0'])
    target_coefs = B_df.loc[target_row]
    tmp_selected_variables = list(target_coefs[np.abs(target_coefs) > beta_threshold].index)

    # strip the "(t...)" suffix to recover the plain column name
    selected_variables += [name.split("(")[0] for name in tmp_selected_variables]

# De-duplicate while keeping first-seen order: a plain set would make the regressor
# order depend on string hashing, which differs from one kernel session to the next.
selected_variables = list(dict.fromkeys(selected_variables))
selected_variables = [f"{var}(t-{i})" for var in selected_variables for i in range(1, selected_p+1)]

# save dags, keyed by the last training date
dict_ = {Xt_train.index[-1].strftime("%Y%m%d"): {
    "dag": var_lingam_fit.adjacency_matrices_,
    "threshold": beta_threshold,
    "labels": labels,
    }
}
dags.update(dict_)

# create lags of Xt variables
data_train = add_and_keep_lags_only(data=data_train, lags=selected_p)
data_test = add_and_keep_lags_only(data=data_test, lags=selected_p)

# drop the rows that became incomplete after lagging
Xt_train = data_train.dropna()
Xt_test = data_test.dropna()



In [None]:
# One-row indicator frame: the forecast date as index, one column per selected regressor.
selected_variables_df = pd.DataFrame(1, index=selected_variables, columns=[Xt_test.index[-1]]).T

# Keep the standardized target and rebuild the target in its original units from the
# training mean and standard deviation.
# NOTE(review): `yt_test` is rescaled in place, so re-running this cell without
# re-running the split cell applies the rescaling twice.
yt_test_zscore = yt_test.copy()
yt_test_zscore.index = pd.to_datetime(yt_test_zscore.index)
yt_test = yt_test * std[yt_test.columns[0]] + mean[yt_test.columns[0]]
yt_test.index = pd.to_datetime(yt_test.index)

if len(selected_variables) != 0:

    # add clusters to parents
    melted_selected_variables_df = selected_variables_df.reset_index().melt("index").rename(columns={"index": "date"})
    melted_selected_variables_df["fred"] = [varname.split("(")[0] for varname in melted_selected_variables_df["variable"]]
    melted_selected_variables_df = pd.merge(melted_selected_variables_df, labelled_clusters[["fred", "cluster"]], how="left", on="fred")
    parents_of_target.append(melted_selected_variables_df)

    # target plus the selected lagged regressors; incomplete rows are dropped
    Xt_selected_train = pd.concat([yt_train, Xt_train[selected_variables]], axis=1)
    Xt_selected_train = Xt_selected_train.dropna()

    Xt_selected_test = Xt_test[selected_variables].dropna()

    if incercept:
        Xt_selected_train["const"] = 1
        Xt_selected_test["const"] = 1

    # linear regression estimate and prediction
    model = sm.OLS(endog=Xt_selected_train[target], exog=Xt_selected_train.drop([target], axis=1))
    model_fit = model.fit()
    ypred = model_fit.predict(exog=Xt_selected_test)

    # Values are read by explicit position (.iloc): an integer key on a label-indexed
    # Series relies on a deprecated positional fallback.
    pred_date = ypred.index[0]
    pred_zscore = ypred.iloc[0]
    pred = pd.DataFrame([{
        "date": pred_date,
        "prediction_zscore": pred_zscore,
        "true_zscore": yt_test_zscore.loc[pred_date].iloc[0],
        "prediction": pred_zscore * std[yt_test.columns[0]] + mean[yt_test.columns[0]],
        "true": yt_test.loc[pred_date].iloc[0],
        }])
else:
    # No parent was selected: record a zero forecast for the last test date.
    # NOTE(review): a z-score of 0 maps to the training mean in original units, while
    # "prediction" is stored as 0 — confirm which of the two is intended.
    pred = pd.DataFrame([{
        "date": yt_test.index[-1],
        "prediction_zscore": 0,
        "true_zscore": yt_test_zscore.iloc[-1].iloc[0],
        "prediction": 0,
        "true": yt_test.iloc[-1].iloc[0],
        }])
predictions.append(pred)