In [16]:
import os
import pandas as pd
import warnings
from causalnex.structure.notears import from_pandas
from causalnex.structure.dynotears import from_pandas_dynamic

warnings.filterwarnings("ignore")  # silence warnings

In [17]:
inputs_path = os.path.join(os.path.dirname(os.getcwd()), "data", "inputs")
data_name = "etfs_macro_large"
deleted_tickers = ['XLI', 'XLE', 'XLK', 'XLV', 'XLU', 'XLF', 'XLY', 'XLP', 'XLB']

data = pd.read_csv(os.path.join(inputs_path, f'{data_name}.csv'))
data['date'] = pd.to_datetime(data['date']) 
data.set_index("date", inplace=True)

target_data = data.drop(deleted_tickers, axis=1)
dates = target_data.index
target_data = target_data.reset_index(drop=True)

target_data.tail()

Unnamed: 0,SPY,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,...,CUSR0000SAC,CUSR0000SAD,CUSR0000SAS,CPIULFSL,CUSR0000SA0L2,CUSR0000SA0L5,PCEPI,DDURRG3M086SBEA,DNDGRG3M086SBEA,DSERRG3M086SBEA
270,-0.038692,0.002957,0.004051,0.004784,0.016057,0.006697,-0.001079,0.001196,0.003203,0.00133,...,-0.004755,0.003005,0.006746,0.001421,0.000255,0.002013,0.002679,0.004809,-0.008099,0.005841
271,-0.101102,0.004929,0.001923,0.003318,-0.001031,0.010616,0.000179,0.00462,0.007262,0.007908,...,0.005658,-0.006425,0.00433,0.004571,0.003748,0.005603,0.004115,-0.003914,0.008315,0.004255
272,0.062557,0.004929,0.001923,0.003318,-0.001031,0.010616,0.000179,0.00462,0.007262,0.007908,...,0.005658,-0.006425,0.00433,0.004571,0.003748,0.005603,0.004115,-0.003914,0.008315,0.004255
273,0.054101,0.001898,0.001797,-0.003194,-0.009608,-0.010802,-0.005831,-0.005445,-0.00661,-0.005735,...,-0.001741,-0.008484,0.004287,0.001492,-6.9e-05,0.002563,0.001659,-0.005663,-0.000514,0.003734
274,-0.063937,-0.000347,0.000936,0.011287,0.015158,0.029199,0.000261,-0.000201,-0.001112,-0.006839,...,0.003584,-0.000563,0.006279,0.005165,0.004168,0.005929,0.006175,0.002532,0.007575,0.006401


In [3]:
dsm = from_pandas_dynamic(target_data, p=1)

In [8]:
dsm.edges()

OutEdgeView([('SPY_lag0', 'HWI_lag0'), ('SPY_lag1', 'HWI_lag0'), ('RPI_lag0', 'HWI_lag0'), ('RPI_lag1', 'HWI_lag0'), ('W875RX1_lag0', 'HWI_lag0'), ('RETAILx_lag0', 'HWI_lag0'), ('RETAILx_lag1', 'HWI_lag0'), ('IPCONGD_lag0', 'HWI_lag0'), ('IPNCONGD_lag0', 'HWI_lag0'), ('IPB51222S_lag0', 'HWI_lag0'), ('IPB51222S_lag1', 'HWI_lag0'), ('IPFUELS_lag0', 'HWI_lag0'), ('CUMFNS_lag0', 'HWI_lag0'), ('CUMFNS_lag0', 'UNRATE_lag0'), ('CUMFNS_lag0', 'UEMPMEAN_lag0'), ('CUMFNS_lag0', 'CES0600000007_lag0'), ('CUMFNS_lag0', 'AWHMAN_lag0'), ('CUMFNS_lag0', 'BAAFFM_lag0'), ('CUMFNS_lag0', 'VIXCLSx_lag0'), ('CUMFNS_lag1', 'HWI_lag0'), ('CUMFNS_lag1', 'BAAFFM_lag0'), ('CUMFNS_lag1', 'VIXCLSx_lag0'), ('CUMFNS_lag1', 'WPSFD49502_lag0'), ('HWI_lag0', 'SPY_lag0'), ('HWI_lag0', 'W875RX1_lag0'), ('HWI_lag0', 'DPCERA3M086SBEA_lag0'), ('HWI_lag0', 'CMRMTSPLx_lag0'), ('HWI_lag0', 'RETAILx_lag0'), ('HWI_lag0', 'INDPRO_lag0'), ('HWI_lag0', 'IPFPNSS_lag0'), ('HWI_lag0', 'IPFINAL_lag0'), ('HWI_lag0', 'IPCONGD_lag0'), ('

In [9]:
edges_df = pd.DataFrame(dsm.edges(), columns=['to', 'from'])[['from', 'to']]
edges_df["from_lag"] = edges_df["from"].apply(lambda x: int(x.split("_")[1][-1]))
edges_df["to_lag"] = edges_df["to"].apply(lambda x: int(x.split("_")[1][-1]))

edges_df["new_from"] = edges_df["from"].apply(lambda x: x.split("_")[0])
edges_df["new_to"] = edges_df["to"].apply(lambda x: x.split("_")[0])

edges_df = edges_df.loc[(edges_df["from_lag"] == 0) & (edges_df["to_lag"] != 0)]

edges_df.tail()

Unnamed: 0,from,to,from_lag,to_lag,new_from,new_to
2006,CPITRNSL_lag0,VIXCLSx_lag1,0,1,CPITRNSL,VIXCLSx
2007,CUSR0000SAC_lag0,VIXCLSx_lag1,0,1,CUSR0000SAC,VIXCLSx
2011,HWI_lag0,WPSID62_lag1,0,1,HWI,WPSID62
2013,HWI_lag0,OILPRICEx_lag1,0,1,HWI,OILPRICEx
2015,HWI_lag0,PPICMM_lag1,0,1,HWI,PPICMM


In [19]:
adj = []
for from_node in edges_df.loc[edges_df["from_lag"] == 0]["new_from"].unique():
    tmp_from = edges_df.loc[edges_df["new_from"] == from_node]
    col_names = []
    for idx, row in tmp_from.iterrows():
        col_names.append(f"{row['new_to']}(t-{row['to_lag']})")
    row_name = from_node

    tmp_adj = pd.DataFrame(1, columns=col_names, index=[f"{row_name}(t)"])
    adj.append(tmp_adj)
adj_df = pd.concat(adj).fillna(0)

In [20]:
adj_df

Unnamed: 0,SPY(t-1),RPI(t-1),RETAILx(t-1),IPB51222S(t-1),CUMFNS(t-1),HWI(t-1),HWIURATIO(t-1),UNRATE(t-1),UEMPMEAN(t-1),UEMPLT5(t-1),...,BAAFFM(t-1),EXSZUSx(t-1),EXUSUKx(t-1),UMCSENTx(t-1),DTCOLNVHFNM(t-1),DTCTHFNM(t-1),VIXCLSx(t-1),WPSID62(t-1),OILPRICEx(t-1),PPICMM(t-1)
HWI(t),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
BAAFFM(t),0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
VIXCLSx(t),0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
WPSFD49502(t),0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
SPY(t),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CUSR0000SA0L5(t),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DDURRG3M086SBEA(t),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DNDGRG3M086SBEA(t),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IPFPNSS(t),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [105]:
target = "RETAILx"

# select non-zero coefficients of the target
list(adj_df[adj_df.index == f"{target}(t)"][adj_df[adj_df.index == f"{target}(t)"] != 0].dropna(axis=1).columns)

['CES0600000007(t-1)', 'AWHMAN(t-1)']