In [1]:
import os
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np

from utils.conn_data import load_pickle

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides warnings raised by pandas and
# other libraries; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Render all figures with matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')

In [22]:
# --- Paths -------------------------------------------------------------------
# Everything is resolved relative to "<parent of the current working dir>/src".
src_dir = os.path.join(os.path.dirname(os.getcwd()), "src")
outputs_path = os.path.join(src_dir, "data", "backup")

# Semicolon-separated description table read from data/utils.
des = pd.read_csv(
    os.path.join(src_dir, "data", "utils", "fredmd_description.csv"),
    sep=";",
)

# --- Analysis options --------------------------------------------------------
# Each entry is the name of a sub-directory of `outputs_path` that is scanned
# for result pickles.
fs_methods = ["dynotears_rollingcluster_k5", "var-lingam_rollingcluster_k5"]

# The options below are not referenced by the cells visible in this part of the
# notebook; presumably they are consumed further down -- TODO confirm.
metric_names = ["stability", "mse", "mae"]
etf_focus = None
stability_threshold = 0.8
plot_ts = False

In [23]:
# Build one long table with the "parents_of_target" frame of every result
# pickle, tagged with the ETF and the feature-selection method it belongs to.

# Methods whose "cluster" column may contain NaNs that need a dedicated id.
rollingcluster_methods = ("dynotears_rollingcluster_k5", "var-lingam_rollingcluster_k5")
# Methods without clustering: every distinct "fred" value becomes its own cluster.
nocluster_methods = ("lasso1_nocluster", "pairwise-granger_nocluster", "multivariate-granger_nocluster")

all_parents = []
for fs in fs_methods:
    # glob returns matches in a filesystem-dependent order; sort so that the
    # row order of the concatenated table is reproducible across runs.
    results_files = sorted(glob.glob(os.path.join(outputs_path, fs, "etfs_macro_large", "*.pickle")))

    for file in results_files:
        # The ETF ticker is the part of the file name before the first
        # underscore. basename() works with any path separator and does not
        # break when the file name itself contains the method string.
        etf = os.path.basename(file).split("_")[0]

        # NOTE(review): load_pickle presumably unpickles the file -- only point
        # this at trusted result files.
        obj = load_pickle(path=file)

        # Work on a copy so the frame held inside `obj` is not mutated.
        parents_of_target_df = obj["parents_of_target"].copy()

        # add tags
        parents_of_target_df["etf"] = etf
        parents_of_target_df["fs"] = fs

        # fill na of clusters column
        if fs in rollingcluster_methods:
            # Rows without a cluster get a new id one above this file's largest id.
            parents_of_target_df["cluster"] = parents_of_target_df["cluster"].fillna(
                parents_of_target_df["cluster"].max() + 1
            )
        elif fs in nocluster_methods:
            parents_of_target_df["cluster"] = pd.factorize(parents_of_target_df["fred"])[0]

        all_parents.append(parents_of_target_df)

if not all_parents:
    # pd.concat([]) would raise the same exception type with an opaque message.
    raise ValueError(
        f"No result pickles found under {outputs_path!r} for methods {fs_methods!r}"
    )
all_parents_df = pd.concat(all_parents)

## Cluster membership check for a single ETF and method

In [24]:
# Pick one ETF / feature-selection method pair to inspect.
etf, fs = "SPY", "dynotears_rollingcluster_k5"

# Keep only the rows of that pair; the tag columns are constant afterwards, so drop them.
pair_mask = (all_parents_df["etf"] == etf) & (all_parents_df["fs"] == fs)
check_df = all_parents_df.loc[pair_mask].drop(columns=["etf", "fs"])

# Normalise every date to a 'YYYY-MM-DD' string, one element at a time.
check_df["date"] = list(
    map(lambda raw_date: pd.to_datetime(raw_date).strftime("%Y-%m-%d"), check_df["date"])
)

# Map each cluster id to the distinct "fred" values that appear in it.
cluster_variables = {
    cluster_id: check_df.loc[check_df["cluster"] == cluster_id, "fred"].unique()
    for cluster_id in check_df["cluster"].unique()
}

In [25]:
# Display the mapping from cluster id to the distinct "fred" values in that
# cluster for the ETF / method pair selected in the previous cell.
cluster_variables

{4.0: array(['RPI', 'S&P: indust', 'INVEST', 'AMDMUOx', 'CMRMTSPLx', 'UEMP27OV',
        'RETAILx', 'CUSR0000SAS', 'UEMPLT5', 'ACOGNO'], dtype=object),
 1.0: array(['S&P: indust', 'INVEST', 'AMDMUOx', 'EXJPUSx', 'IPCONGD', 'PPICMM',
        'UEMP27OV', 'RETAILx', 'CUSR0000SAS'], dtype=object),
 2.0: array(['EXJPUSx', 'RPI', 'S&P: indust', 'AMDMUOx', 'CMRMTSPLx', 'IPNMAT',
        'PPICMM', 'RETAILx', 'CUSR0000SAS', 'UEMPLT5'], dtype=object),
 0.0: array(['S&P: indust', 'RPI', 'EXJPUSx', 'INVEST', 'AMDMUOx', 'HOUSTW',
        'RETAILx', 'CUSR0000SAS', 'ACOGNO'], dtype=object),
 3.0: array(['EXJPUSx', 'AMDMUOx', 'S&P: indust', 'UMCSENTx', 'UEMP27OV',
        'RETAILx', 'CUSR0000SAS', 'UEMPLT5'], dtype=object),
 5.0: array(['SPY'], dtype=object)}

In [21]:
# NOTE(review): this repeats the display in the preceding cell, and its
# execution count (In [21]) is lower than the cells before it -- it looks like
# a leftover from out-of-order execution; consider removing the cell.
cluster_variables

{4.0: array(['RPI', 'S&P: indust', 'INVEST', 'AMDMUOx', 'CMRMTSPLx', 'UEMP27OV',
        'RETAILx', 'CUSR0000SAS', 'UEMPLT5', 'ACOGNO'], dtype=object),
 1.0: array(['S&P: indust', 'INVEST', 'AMDMUOx', 'EXJPUSx', 'IPCONGD', 'PPICMM',
        'UEMP27OV', 'RETAILx', 'CUSR0000SAS'], dtype=object),
 2.0: array(['EXJPUSx', 'RPI', 'S&P: indust', 'AMDMUOx', 'CMRMTSPLx', 'IPNMAT',
        'PPICMM', 'RETAILx', 'CUSR0000SAS', 'UEMPLT5'], dtype=object),
 0.0: array(['S&P: indust', 'RPI', 'EXJPUSx', 'INVEST', 'AMDMUOx', 'HOUSTW',
        'RETAILx', 'CUSR0000SAS', 'ACOGNO'], dtype=object),
 3.0: array(['EXJPUSx', 'AMDMUOx', 'S&P: indust', 'UMCSENTx', 'UEMP27OV',
        'RETAILx', 'CUSR0000SAS', 'UEMPLT5'], dtype=object),
 5.0: array(['SPY'], dtype=object)}