In [1]:
import pandas as pd
import os
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri
from statsmodels.tsa.api import VAR

from metadata.etfs import etfs_large, etfs_small

pandas2ri.activate()

inputs_path = os.path.join(os.getcwd(), "data", "inputs")
data_name = "monetary-policy-processed"
target = "ldEXME"
p = -1
pval_threshold = 0.05

In [2]:
data = pd.read_csv(os.path.join(inputs_path, f'{data_name}.csv'))

# fix columns
if "Unnamed: 0" in data.columns:
    data = data.drop(["Unnamed: 0"], axis=1)

# fix dates
data["date"] = pd.to_datetime(data["date"])
data = data.set_index("date")

etfs_large = etfs_large.copy()

if target != "ldEXME":
    # select etfs to remove
    removed_etfs = [etf for etf in etfs_large if etf != target]

    # delete etfs
    selected_data = data.drop(removed_etfs, axis=1)
else:
    selected_data = data.copy()

In [3]:
selected_data = selected_data # .iloc[1:100,:]

if p == -1:
    var_select_model = VAR(selected_data)
    selected_p = var_select_model.select_order(maxlags=6)
    selected_p = selected_p.selected_orders["aic"]
    if selected_p == 0:
        selected_p = 1
else:
    selected_p = p

X_train = selected_data.drop(target, axis=1).values
y_train = selected_data[target]

X_train_r = numpy2ri.numpy2rpy(X_train)
y_train_r = numpy2ri.numpy2rpy(y_train)
selected_p_r =  robjects.vectors.IntVector([selected_p])
pval_threshold_r = robjects.vectors.IntVector([pval_threshold])

# pass inputs to global variables
robjects.globalenv['Xmatrix'] = X_train_r
robjects.globalenv['Y'] = y_train_r
robjects.globalenv["selected_p"] = selected_p_r
robjects.globalenv["pval_threshold"] = pval_threshold_r

robjects.r(f'''
    library(seqICP)

    seqICP_result <- seqICP(Xmatrix,
                            Y,
                            test="smooth.variance",
                            par.test=list(alpha=pval_threshold,B=1000),
                            model="ar",
                            par.model=list(pknown=TRUE,p=selected_p),
                            stopIfEmpty=FALSE,
                            silent=TRUE)
    seqICP_summary <- summary(seqICP_result)
    parent_set <- seqICP_result$parent.set
    p_values <- seqICP_result$p.values


''')

# retrieve results from seqICP
parent_set = robjects.r['parent_set']
p_values = robjects.r['p_values']

  self._init_dates(dates, freq)



 Invariant Linear Causal Regression at level 0
 No variable shows a significant causal effect
 
           coefficient lower bound upper bound  p-value   
intercept        0.00        -Inf         Inf       NA   
X1[t]            0.00    0.000000           0    0.068 . 
X2[t]            0.00    0.000000           0    0.016 * 
X3[t]            0.00    0.000000           0    0.053 . 
X4[t]            0.00    0.000000           0    0.045 * 
X5[t]            0.00    0.000000           0    0.068 . 
X6[t]            0.00    0.000000           0    0.068 . 
X7[t]            0.00    0.000000           0    0.006 **
X8[t]            0.00    0.000000           0    0.068 . 
X9[t]            0.00    0.000000           0    0.068 . 
Y0[t-1]          0.20        -Inf         Inf       NA   
X1[t-1]         -0.23        -Inf         Inf       NA   
X2[t-1]          0.23        -Inf         Inf       NA   
X3[t-1]          0.11        -Inf         Inf       NA   
X4[t-1]         -0.04        -In

In [18]:
# retrieve results from seqICP
p_values = robjects.r['p_values']

selected_variables_df = pd.DataFrame({
    "variables": selected_data.drop(target, axis=1).columns,
    "pval": robjects.r['p_values']
})

selected_variables_df = selected_variables_df.loc[selected_variables_df["pval"] <= pval_threshold]

if selected_variables_df.shape[0] > 0:
    selected_variables = []
    for feature in selected_variables_df["variables"]:
        for i in range(1, selected_p+1):
            selected_variables.append(f"{feature}(t-{i})")
else:
    selected_variables = []

In [19]:
selected_variables

['ldFCIr(t-1)',
 'ldFCIr(t-2)',
 'ldFCIr(t-3)',
 'ldFCIr(t-4)',
 'ldMAr(t-1)',
 'ldMAr(t-2)',
 'ldMAr(t-3)',
 'ldMAr(t-4)',
 'ldGDPch(t-1)',
 'ldGDPch(t-2)',
 'ldGDPch(t-3)',
 'ldGDPch(t-4)']