In [1]:
import os
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np

from utils.conn_data import load_pickle

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Apply matplotlib's "ggplot" style sheet to all figures drawn afterwards.
plt.style.use('ggplot')

In [2]:
# All data paths are resolved relative to the PARENT of the current working
# directory, i.e. the notebook must be started from a direct subfolder of the
# directory that contains `src/`.
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, "src", "data")

# Folder holding one sub-directory of results per method.
outputs_path = os.path.join(data_dir, "outputs")

# Semicolon-separated description table shipped under src/data/utils.
description_csv = os.path.join(data_dir, "utils", "fredmd_description.csv")
des = pd.read_csv(description_csv, sep=";")

def _rolling_cluster_methods(prefix, auto_k_clusterers=("spectral", "kmeans")):
    """Build the method-name list for one algorithm.

    Names follow ``<prefix>_rolling<clusterer>_<k>_<reduction>``. The fixed-k
    variants (k5/k10 x pca/rank x kmeans/spectral) come first, followed by one
    ``kauto_eigen_rank`` variant per entry of ``auto_k_clusterers``.
    """
    fixed_k = [
        f"{prefix}_rolling{clusterer}_{k}_{reduction}"
        for k in ("k5", "k10")
        for reduction in ("pca", "rank")
        for clusterer in ("kmeans", "spectral")
    ]
    auto_k = [
        f"{prefix}_rolling{clusterer}_kauto_eigen_rank"
        for clusterer in auto_k_clusterers
    ]
    return fixed_k + auto_k


# --- method names, grouped by algorithm ---
lasso_methods = ["lasso1_nocluster", "lasso2_nocluster"]
varlingam_methods = _rolling_cluster_methods("var-lingam")
pairwisegranger_methods = _rolling_cluster_methods("pairwise-granger")
# NOTE(review): the variable name is misspelled ("mulivariate"); kept as-is
# because other cells may reference it.
mulivariategranger_methods = _rolling_cluster_methods("multivariate-granger")
# NOTE(review): unlike the other algorithms, dynotears has no
# "rollingkmeans_kauto_eigen_rank" entry — confirm this omission is intended.
dynotears_methods = _rolling_cluster_methods("dynotears", auto_k_clusterers=("spectral",))

# --- analysis switches ---
fs_methods = lasso_methods + dynotears_methods  # methods loaded by the cells below
metric_names = ["stability", "mse", "mae"]
etf_focus = None
stability_threshold = 0.8
plot_ts = False

In [3]:
# Collect the "parents_of_target" table from every pickled result of every
# method in `fs_methods`, tag each table with its ETF and method name, and
# stack everything into a single frame (`all_parents_df`).
all_parents = []
for fs in fs_methods:
    # glob returns files in arbitrary order; sort so the row order of the
    # concatenated frame is the same on every run.
    results_files = sorted(glob.glob(os.path.join(outputs_path, fs, "etfs_macro_large", "*.pickle")))

    for file in results_files:
        # The ETF tag is the part of the file name before the first underscore.
        # basename() applies the platform's separator rules instead of assuming "/".
        etf = os.path.basename(file).split("_")[0]
        # NOTE(review): load_pickle presumably unpickles the file — only point
        # this at trusted result files.
        obj = load_pickle(path=file)
        parents_of_target_df = obj["parents_of_target"]

        # add tags
        parents_of_target_df["etf"] = etf
        parents_of_target_df["fs"] = fs

        # fill na of clusters column
        if fs.endswith("_nocluster"):
            # No clustering was run: give every distinct variable its own
            # cluster id. Matching on the suffix covers every "*_nocluster"
            # method (including "lasso2_nocluster", which is in `fs_methods`).
            parents_of_target_df["cluster"] = pd.factorize(parents_of_target_df["fred"])[0]
        else:
            # Rows without a cluster are placed in one extra cluster numbered
            # after the largest existing id.
            next_cluster_id = parents_of_target_df["cluster"].max() + 1
            parents_of_target_df["cluster"] = parents_of_target_df["cluster"].fillna(next_cluster_id)

        all_parents.append(parents_of_target_df)

if not all_parents:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(
        f"No result pickles found under {outputs_path!r} for methods {fs_methods}; "
        "check the working directory the notebook was started from."
    )
all_parents_df = pd.concat(all_parents)

In [9]:
# Inspect a single (ETF, method) combination.
etf = "SPY"
fs = "dynotears_rollingkmeans_k5_rank"

# Keep only the rows of that combination; the tag columns are then redundant.
is_selected = (all_parents_df["etf"] == etf) & (all_parents_df["fs"] == fs)
check_df = all_parents_df.loc[is_selected].drop(columns=["etf", "fs"])

# Normalise dates to plain 'YYYY-MM-DD' strings.
check_df["date"] = [pd.to_datetime(raw_date).strftime("%Y-%m-%d") for raw_date in check_df["date"]]

# Map each cluster id to the distinct variables ("fred" column) ever assigned to it.
cluster_variables = {
    cluster_id: check_df.loc[check_df["cluster"] == cluster_id, "fred"].unique()
    for cluster_id in check_df["cluster"].unique()
}

In [11]:
# Show the cluster id -> unique "fred" variable names mapping for the selected ETF/method.
cluster_variables

{4.0: array(['RPI', 'AMDMUOx', 'UEMP27OV', 'UEMPLT5'], dtype=object),
 1.0: array(['S&P: indust', 'IPCONGD', 'CMRMTSPLx', 'UMCSENTx'], dtype=object),
 2.0: array(['EXJPUSx', 'INVEST', 'S&P: indust'], dtype=object),
 5.0: array(['SPY'], dtype=object),
 3.0: array(['HOUSTW', 'AMDMUOx', 'IPNMAT', 'CUSR0000SAS'], dtype=object),
 0.0: array(['PPICMM', 'RETAILx', 'ACOGNO'], dtype=object)}

In [14]:
# Report every pair of distinct clusters that share at least one variable.
# Each unordered pair is visited exactly once (first key before second key, in
# dict order), so no intersection is built for self-pairs or mirrored pairs and
# the intersection is computed a single time per pair.
cluster_ids = list(cluster_variables)
for i, cluster1 in enumerate(cluster_ids):
    for cluster2 in cluster_ids[i + 1:]:
        inter = set(cluster_variables[cluster1]).intersection(cluster_variables[cluster2])
        if not inter:
            continue
        print(f"Cluster {cluster1} vs Cluster {cluster2}")
        print(f"Common variables: {inter}")
        print("\n")

Cluster 4.0 vs Cluster 3.0
Common variables: {'AMDMUOx'}


Cluster 1.0 vs Cluster 2.0
Common variables: {'S&P: indust'}


