In [3]:
import os
import pandas as pd
from scipy import stats
import lingam
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from lingam.utils import make_dot
from tqdm import tqdm

In [10]:
# Input files live under <parent of the current working directory>/data.
# NOTE(review): the location depends on where the kernel was started.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

# ETF + macro panel, indexed by the 'date' column.
df = pd.read_csv(os.path.join(data_dir, 'inputs', 'etfs_macro_large.csv')).set_index('date')

# Semicolon-separated description table; later cells read its
# 'fred' and 'description' columns.
des = pd.read_csv(os.path.join(data_dir, 'utils', 'fredmd_description.csv'), sep=';')

df.head()

Unnamed: 0_level_0,SPY,XLI,XLE,XLK,XLV,XLU,XLF,XLY,XLP,XLB,...,CUSR0000SAC,CUSR0000SAD,CUSR0000SAS,CPIULFSL,CUSR0000SA0L2,CUSR0000SA0L5,PCEPI,DDURRG3M086SBEA,DNDGRG3M086SBEA,DSERRG3M086SBEA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-02-01,-0.021282,-0.079459,0.001764,-0.012479,0.026966,-0.015484,-0.023499,-0.092305,-0.002751,-0.122759,...,0.006116,-0.001597,0.002601,0.004109,0.004902,0.003639,0.003131,-0.002778,0.007243,0.002975
2000-03-01,-0.01701,-0.076506,-0.027588,0.096418,-0.090493,-0.053417,-0.09694,-0.1045,-0.104976,-0.134285,...,0.010111,0.003192,0.003112,0.006423,0.006701,0.006637,0.004292,0.00194,0.012709,0.001775
2000-05-01,-0.032612,0.055099,0.042422,-0.145227,-0.019337,0.079906,-0.00456,0.04325,0.107819,-0.007696,...,0.0,0.003185,0.002581,0.001164,0.001215,0.001805,0.000836,0.000103,-0.001753,0.001923
2000-06-01,-0.00472,-0.003146,0.071931,-0.062907,-0.024226,0.004376,0.075222,-0.073864,0.04256,-0.021432,...,0.00738,-0.002388,0.004628,0.006957,0.006657,0.005992,0.00339,-0.003126,0.009124,0.002706
2000-08-01,-0.008866,0.01963,-0.040169,-0.056302,0.007473,0.005542,0.05811,0.020346,0.015038,-0.036369,...,-0.004013,-0.000798,0.003061,-0.000577,-0.001203,-0.000596,-6.4e-05,-0.004124,-0.005135,0.002607


In [11]:
# ---------------------------------------------------------------------------
# Rolling VAR-LiNGAM parent discovery.
#
# For every ETF in `targets`:
#   1. cluster the non-ETF columns on their full-sample correlation matrix,
#   2. rank the members of each cluster by rolling correlation with the target,
#   3. on an expanding window, fit VARLiNGAM on the target plus the rank-1
#      member of each cluster and record which lagged variables have an
#      absolute coefficient on the target above `beta_threshold`.
#
# Result: `all_parents_of_targets_df`, long format with columns
# index (last date of the window), variable, value (1.0 / NaN), target.
# ---------------------------------------------------------------------------

# --- settings ---------------------------------------------------------------
start_window = 12 * 8   # rows in the first estimation window
p = 1                   # `lags` argument handed to VARLiNGAM
correl_window = 1000    # rolling-correlation window, in rows
min_periods = 12        # minimum rows before a rolling correlation is produced
beta_threshold = 0.05   # |lagged coefficient| above which a variable is a parent
n_clusters = 20         # number of KMeans clusters
start = 0
end = start + start_window

etfs = ["SPY" ,"XLI" ,"XLE" ,"XLK" ,"XLV", "XLU" ,"XLF" ,"XLY" ,"XLP" ,"XLB"]
targets = ["SPY", "XLE", "XLK"]

all_parents_of_targets = []
for target in targets:
    # keep the target ETF and drop every other ETF column
    copy_etfs = [var for var in etfs if var != target]
    target_df = df.drop(copy_etfs, axis=1)
    features_df = target_df.drop([target], axis=1)

    # compute correlation matrix (whole sample)
    corr = features_df.corr()

    # compute forward looking cluster of the correlation matrix
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit(corr)

    # label clusters
    clusters_df = pd.DataFrame({"fred": features_df.columns, "cluster": kmeans.labels_})
    # The original call discarded the sorted frame; keep it. A stable sort
    # preserves the within-cluster column order.
    clusters_df = clusters_df.sort_values(by="cluster", kind="stable")
    clusters_df = pd.merge(clusters_df, des[["fred", "description"]], on='fred')

    # compute rolling correlation for each cluster and rank them
    correl_dict = {}
    rank_list = []
    for c in np.unique(kmeans.labels_):
        clustes_variables = clusters_df.loc[clusters_df['cluster'] == c]['fred'].values

        clusters_features_df = target_df[[target] + list(clustes_variables)]

        # compute rolling correlation
        rolling_corr_df = clusters_features_df.rolling(window=correl_window, min_periods=min_periods).corr()

        # keep only the correlation of each cluster member with the target;
        # "level_1" is the name reset_index gives the unnamed variable level
        rolling_corr_df = rolling_corr_df[[target]].reset_index()
        rolling_corr_df = rolling_corr_df.loc[rolling_corr_df["level_1"] != target]
        rolling_corr_df = rolling_corr_df.pivot_table(index=["date"], columns=["level_1"])
        rolling_corr_df.columns = rolling_corr_df.columns.droplevel()

        # save correl
        correl_dict[c] = rolling_corr_df

        # rank within the cluster: rank 1 = highest correlation with the target
        rank_df = rolling_corr_df.rank(axis=1, ascending=False)

        # save rank
        rank_list.append(rank_df)
    final_rank_df = pd.concat(rank_list, axis=1)

    # skip the first rows, for which no rolling correlation is available yet
    target_subset_df = target_df.iloc[(min_periods-1):, :]

    parents_of_target = []
    n_steps = len(target_subset_df) - start_window
    for step in tqdm(range(0, n_steps, 1), total=n_steps, desc="rolling VAR-LiNGAM: {target}".format(target=target)):

        # select features and time window
        # NOTE(review): the rank row is taken at position (end + step), one
        # past the last row of the estimation window below. If final_rank_df
        # and target_subset_df share the same row alignment this is a one-step
        # look-ahead in feature selection -- confirm this is intended.
        rank_row = final_rank_df.iloc[(end + step)]
        selected_columns = list(rank_row.index[(rank_row == 1)])
        Xt = target_subset_df.iloc[start:(end + step), :][[target] + selected_columns]

        # run VARLiNGAM
        var_lingam = lingam.VARLiNGAM(lags=p)
        var_lingam_fit = var_lingam.fit(Xt)

        # build labels for however many lag matrices the fit returned.
        # adjacency_matrices_[0] is the instantaneous matrix, the rest are the
        # lagged ones (the original code relied on exactly one being present).
        adjacency_matrices = var_lingam_fit.adjacency_matrices_
        n_fitted_lags = len(adjacency_matrices) - 1
        labels0 = ["{}(t)".format(colname) for colname in Xt.columns]
        labels1 = [
            "{}(t-{})".format(colname, i)
            for i in range(1, n_fitted_lags + 1)
            for colname in Xt.columns
        ]

        B0 = adjacency_matrices[0]
        # lagged matrices side by side, in the same order as labels1;
        # with a single lag this is exactly adjacency_matrices_[1]
        B1 = np.concatenate(list(adjacency_matrices[1:]), axis=1)

        B0_df = pd.DataFrame(B0, columns=labels0, index=labels0)
        B1_df = pd.DataFrame(B1, columns=labels1, index=labels0)

        # lagged variables whose coefficient on the target exceeds the threshold
        target_row = B1_df.loc["{target}(t)".format(target=target)]
        selected_variables = list(target_row[np.abs(target_row) > beta_threshold].index)
        # one-row frame: index = last date of the window, one column per parent
        parents_of_target.append(pd.DataFrame(1, index=selected_variables, columns=[Xt.index[-1]]).T)

    # stack the per-window rows; variables not selected in a window become NaN
    parents_of_target_df = pd.concat(parents_of_target, axis=0)
    parents_of_target_df = parents_of_target_df.reset_index().melt("index")
    parents_of_target_df["target"] = target

    all_parents_of_targets.append(parents_of_target_df)
all_parents_of_targets_df = pd.concat(all_parents_of_targets, axis=0)

rolling VAR-LiNGAM: SPY: 100%|██████████| 86/86 [00:57<00:00,  1.51it/s]
rolling VAR-LiNGAM: XLE: 100%|██████████| 86/86 [00:58<00:00,  1.46it/s]
rolling VAR-LiNGAM: XLK: 100%|██████████| 86/86 [00:57<00:00,  1.50it/s]


In [14]:
# Show the rolling-parents rows for SPY. Filter all_parents_of_targets_df
# directly: tmp_parents_of_target_df is only created further down the
# notebook, so referencing it here fails on a fresh top-to-bottom run.
all_parents_of_targets_df.loc[all_parents_of_targets_df["target"] == "SPY"]

Unnamed: 0,index,variable,value,target
0,2012-06-01,SPY(t-1),1.0,SPY
1,2012-08-01,SPY(t-1),1.0,SPY
2,2012-10-01,SPY(t-1),1.0,SPY
3,2012-11-01,SPY(t-1),1.0,SPY
4,2013-01-01,SPY(t-1),1.0,SPY
...,...,...,...,...
2059,2022-03-01,CPITRNSL(t-1),,SPY
2060,2022-04-01,CPITRNSL(t-1),1.0,SPY
2061,2022-06-01,CPITRNSL(t-1),,SPY
2062,2022-07-01,CPITRNSL(t-1),,SPY


In [12]:
target = "SPY"

# rows of the rolling-parents table that belong to this target
tmp_parents_of_target_df = all_parents_of_targets_df.loc[all_parents_of_targets_df["target"] == target]

# How often each lagged variable was selected (NaN = not selected = 0),
# joined with the cluster each underlying series belongs to.
summary_parents_of_target_df = (
    tmp_parents_of_target_df
    .fillna(0)
    .groupby("variable")
    .sum(numeric_only=True)[["value"]]
    .reset_index()
    .rename(columns={"variable": "feature_name", "value": "count"})
    # "XYZ(t-1)" -> "XYZ"
    .assign(fred=lambda frame: frame["feature_name"].map(lambda name: name.split("(t")[0]))
    [["fred", "feature_name", "count"]]
    .merge(clusters_df, on="fred")
    .sort_values(by="cluster", ascending=False)
)

# Same rows as above, with the lag suffix stripped so they can be joined
# to the cluster labels.
test_df = (
    tmp_parents_of_target_df
    .drop(["target"], axis=1)
    .rename(columns={"variable": "fred"})
    .assign(fred=lambda frame: frame["fred"].map(lambda name: name.split("(t")[0]))
)
merge_test_cluster_df = test_df.merge(clusters_df, on=["fred"])

In [13]:
# date x cluster table of the selection flag (built once, reused below)
cluster_pivot_df = merge_test_cluster_df.pivot_table(index=["index"], columns=["cluster"], values=["value"])
tot = cluster_pivot_df.shape[0]

# share of dates on which each cluster contributed a selected variable
cluster_pivot_df.fillna(0).sum() / tot

       cluster
value  1          1.000000
       2          0.837209
       3          0.802326
       5          0.906977
       9          0.988372
       10         0.686047
       11         1.000000
       12         0.872093
       14         1.000000
       15         0.244186
       17         0.918605
       18         0.267442
       19         0.953488
dtype: float64

In [14]:
# Display the per-date selection flags for the current target
# (columns: index, fred, value; NaN in `value` means "not selected").
test_df

Unnamed: 0,index,fred,value
0,2012-06-01,SPY,1.0
1,2012-08-01,SPY,1.0
2,2012-10-01,SPY,1.0
3,2012-11-01,SPY,1.0
4,2013-01-01,SPY,1.0
...,...,...,...
2059,2022-03-01,CPITRNSL,
2060,2022-04-01,CPITRNSL,1.0
2061,2022-06-01,CPITRNSL,
2062,2022-07-01,CPITRNSL,


In [14]:
# For every target: share of dates on which each cluster supplied a
# selected lagged variable ("stability"), collected into one long frame.
all_summary_stability = []
for target in targets:
    # rows of the rolling-parents table that belong to this target
    tmp_parents_of_target_df = all_parents_of_targets_df.loc[all_parents_of_targets_df["target"] == target]

    # selection count per lagged variable, joined with its cluster
    summary_parents_of_target_df = (
        tmp_parents_of_target_df
        .fillna(0)
        .groupby("variable")
        .sum(numeric_only=True)[["value"]]
        .reset_index()
        .rename(columns={"variable": "feature_name", "value": "count"})
        # "XYZ(t-1)" -> "XYZ"
        .assign(fred=lambda frame: frame["feature_name"].map(lambda name: name.split("(t")[0]))
        [["fred", "feature_name", "count"]]
        .merge(clusters_df, on="fred")
        .sort_values(by="cluster", ascending=False)
    )

    # per-date selection flags keyed by the bare series name
    test_df = (
        tmp_parents_of_target_df
        .drop(["target"], axis=1)
        .rename(columns={"variable": "fred"})
        .assign(fred=lambda frame: frame["fred"].map(lambda name: name.split("(t")[0]))
    )

    merge_test_cluster_df = test_df.merge(clusters_df, on=["fred"])

    # date x cluster table; a missing entry means nothing was selected
    stability_df = merge_test_cluster_df.pivot_table(index=["index"], columns=["cluster"], values=["value"]).fillna(0)
    tot = stability_df.shape[0]

    # The column index has two levels (value, cluster): drop the outer one
    # and turn the remaining series into a two-column frame.
    summary_stability_df = (
        (stability_df.sum() / tot)
        .sort_values(ascending=False)
        .droplevel(0)
        .rename("stability")
        .reset_index()
        .assign(target=target)
    )

    all_summary_stability.append(summary_stability_df)

all_summary_stability_df = pd.concat(all_summary_stability, axis=0)

In [15]:
# Display the stability table indexed by (target, cluster); set_index
# returns a new frame, all_summary_stability_df itself is left unchanged.
all_summary_stability_df.set_index(["target", "cluster"])

Unnamed: 0_level_0,Unnamed: 1_level_0,stability
target,cluster,Unnamed: 2_level_1
SPY,1,1.0
SPY,11,1.0
SPY,14,1.0
SPY,9,0.988372
SPY,19,0.953488
SPY,17,0.918605
SPY,5,0.906977
SPY,12,0.872093
SPY,2,0.837209
SPY,3,0.802326
