### Imports

In [None]:
import os
import string
import itertools
import numpy as np
import pandas as pd
from pathlib import Path

import sys
sys.path.append("..")

import torch

from utils import custom_binary_metrics
from cd_methods.DynoTears.utils import estimate_with_DYNOTEARS

from simulation.simulation_tools import get_optimal_sim_XY

from CausalTime.tools import generate_CT

import warnings
warnings.filterwarnings("ignore")

# Column labels used when renaming data frames: the 26 upper-case letters
# first, then every ordered pair of two distinct letters ("AB", "AC", ..., "ZY").
COL_NAMES = [
    *string.ascii_uppercase,
    *map("".join, itertools.permutations(string.ascii_uppercase, 2)),
]

### Simulation

In [None]:
# Name of the dataset collection handled by this cell.
FN = "cp_style"

# Directory two levels above the current working directory, as a POSIX string.
par_dir = Path.cwd().parents[1].as_posix()

# Root folder under which the simulated data sets are stored.
save_dir = par_dir + "/data/results/cd_efficacy"


# Data structure is as such for convenient comparison with CausalTime:
# one entry per input file, keyed by the file name with its ".csv" suffix
# removed. `data_path`, `data_type` and `task` are the values later handed to
# `generate_CT`; `straight_path` is the full path read with pandas.
DATA_DICT = {

    # # NOTE: uncomment for MvTS real data
    # filename.split(".csv")[0]: {
    #     'data_path': f"{par_dir}/data/MvTS/{FN}/",
    #     'data_type': 'fmri',
    #     'task': filename, 
    #     'straight_path': f"{par_dir}/data/MvTS/{FN}/" + f"{filename}"
    # } for filename in sorted(os.listdir(f"{par_dir}/data/MvTS/{FN}/"))

    # # NOTE: uncomment for the rest data
    # `os.listdir` yields entries in arbitrary order; sorting makes the dict
    # order (and therefore the "first ten" subset taken by the simulation
    # loop) reproducible across machines and runs.
    filename.split(".csv")[0]: {
        'data_path': f"{par_dir}/data/{FN}/increasing_edges_cp_1/data",
        'data_type': 'fmri',
        'task': filename,
        'straight_path': f"{par_dir}/data/{FN}/increasing_edges_cp_1/data/" + f"{filename}"
    } for filename in sorted(os.listdir(f"{par_dir}/data/{FN}/increasing_edges_cp_1/data"))
}

# CausalTime parameters; every entry is forwarded to `generate_CT` as the
# keyword argument of the same name.
PARAMS = dict(
    # network / training settings
    batch_size=32,
    hidden_size=128,
    num_layers=2,
    dropout=0.1,
    seq_length=20,
    test_size=0.2,
    learning_rate=0.0001,
    n_epochs=1,
    flow_length=4,
    # generation settings
    gen_n=20,
    n=2000,
    arch_type="MLP",
    # output locations
    save_path="outputs/",
    log_dir="log/",
)


# Simulate the first ten data sets of DATA_DICT with both TCS and CausalTime
# and store the generated series as CSV files under `save_dir`.

# Output folders do not depend on the data set, so create them once.
tcs_dir = f"{save_dir}/simulated_tcs/{FN}/"
ct_dir = f"{save_dir}/simulated_ct/{FN}/"
os.makedirs(tcs_dir, exist_ok=True)
os.makedirs(ct_dir, exist_ok=True)

for v in itertools.islice(DATA_DICT.values(), 10):

    # info (resolved outside the `try` so the error message can name the file)
    filename = v['task']

    try:
        print(f" \n------------- {filename} ---------------\n ")

        # data
        true_data = pd.read_csv(v["straight_path"])
        true_data = true_data.rename(columns=dict(zip(true_data.columns, COL_NAMES[:true_data.shape[1]])))

        # adjust timesteps for computation time
        print(f"true data length: {true_data.shape[0]}")

        # shorten true data to a random contiguous window of 2000 rows;
        # an integer start with positional slicing gives exactly 2000 rows
        # (label-based `.loc` includes the end label)
        if true_data.shape[0] > 2000:
            anchor = np.random.randint(low=0, high=true_data.shape[0] - 2000 + 1)
            true_data = true_data.iloc[anchor : anchor + 2000, :]
            print(f"true data length (adjusted): {true_data.shape[0]}")

        # \epsilon added to avoid computation errors w/ PCMCI:
        # every exact zero is replaced by a small random positive value
        # (vectorised; also avoids writing into a slice of the original frame)
        noise = pd.DataFrame(
            np.random.uniform(low=0.0001, high=0.001, size=true_data.shape),
            index=true_data.index,
            columns=true_data.columns,
        )
        true_data = true_data.mask(true_data == 0, noise)

        # ____________________________________ Simulate w/ TCS ____________________________________

        results_tcs = get_optimal_sim_XY(true_data=true_data)
        tcs_data = results_tcs["optimal_data"]

        print("""\n ____________________________________ Simulate w/ CausalTime ____________________________________ \n""")

        # only the last of the four returned frames is stored
        _, _, _, ct_data = generate_CT(
            **PARAMS,
            data_path=v["data_path"],
            data_type=v["data_type"],
            task=v["task"],
        )

        # Store (only reached when both simulations succeeded)
        tcs_data.to_csv(f"{tcs_dir}{filename}", index=False)
        ct_data.to_csv(f"{ct_dir}{filename}", index=False)

    except Exception as err:
        # Best effort: report the failing file and its cause, then move on.
        # `Exception` (not a bare `except`) lets Ctrl-C interrupt the run.
        print(
            f"LOG: CD Efficacy: Error occured when simulating from {FN}. "
            f"({filename}: {type(err).__name__}: {err})"
        )

### CD Efficacy

In [None]:
# "data" folder located two levels above the current working directory.
data_root = Path(os.getcwd()).parents[1] / "data"
mvts_root = data_root / "MvTS"

# Folder holding the original files of each data set collection.
ori_paths = {
    'air_quality_mini': mvts_root / "air_quality_mini",
    'AirQualityUCI': mvts_root / "AirQualityUCI",
    'bike-usage': mvts_root / "bike-usage",
    'cp_style': data_root / "cp_style" / "increasing_edges_cp_1" / "data",
    'outdoor': mvts_root / "outdoor",
    'ETTh1': mvts_root / "ETTh1",
    'ETTm1': mvts_root / "ETTm1",
    'fMRI': data_root / "fMRI" / "timeseries",
    'WTH': mvts_root / "WTH",
}

# Folder holding the simulated counterparts.
sim_path = data_root / "results" / "cd_efficacy"

# Result containers, filled per collection by the evaluation loop.
res_ct, res_tcs, res_both = {}, {}, {}

# For every collection, run DYNOTEARS on the original data and on both
# simulated versions, score each simulated graph against the graph estimated
# from the original data, and summarise the AUCs per collection.
for FN in ori_paths:

    res_ct[FN] = {}
    res_tcs[FN] = {}
    res_both[FN] = {}

    print(FN)
    for k in os.listdir(ori_paths[FN]):
        try:
            true_data = pd.read_csv(ori_paths[FN] / k)
            true_data = true_data.rename(columns=dict(zip(true_data.columns, COL_NAMES[:true_data.shape[1]])))
            tcs_data = pd.read_csv(sim_path / "simulated_tcs" / FN / k)
            ct_data = pd.read_csv(sim_path / "simulated_ct" / FN / k)

            # Explicit check instead of `assert`, which is stripped under -O.
            if ct_data.shape != tcs_data.shape:
                raise ValueError("Different data shape for TCS and CausalTime.")

            # Fix potential length mismatches: trim all three frames to the
            # shorter of the original and the simulated length.
            n_rows = min(true_data.shape[0], tcs_data.shape[0])
            true_data = true_data.iloc[:n_rows]
            tcs_data = tcs_data.iloc[:n_rows]
            ct_data = ct_data.iloc[:n_rows]

            print(f"- {k}")
            # 1 when both simulations are element-wise identical, 0 otherwise.
            # pandas raises ValueError here if the two frames are not
            # identically labelled; the file is then reported and skipped.
            print(f"    - {(tcs_data == ct_data).prod().prod()}")

            # Only the first returned adjacency structure is used for scoring.
            adj_cp_true, _ = estimate_with_DYNOTEARS(true_data=true_data)
            adj_cp_tcs, _ = estimate_with_DYNOTEARS(true_data=tcs_data)
            adj_cp_ct, _ = estimate_with_DYNOTEARS(true_data=ct_data)

            _, _, _, _, auc_tcs = custom_binary_metrics(torch.tensor(adj_cp_tcs), torch.tensor(adj_cp_true), verbose=False)
            _, _, _, _, auc_ct = custom_binary_metrics(torch.tensor(adj_cp_ct), torch.tensor(adj_cp_true), verbose=False)

            # Record both scores together so the two summaries below are
            # always computed over the same set of files.
            res_tcs[FN][k] = auc_tcs.item()
            res_ct[FN][k] = auc_ct.item()

        except Exception as err:
            # Best effort: report why the file was skipped instead of hiding
            # it; `Exception` (not a bare `except`) lets Ctrl-C interrupt.
            print(f"    - skipped {k}: {type(err).__name__}: {err}")
            continue

    # Mean and variance of the AUCs per method (NaN when nothing was scored).
    tcs_scores = np.array(list(res_tcs[FN].values()))
    ct_scores = np.array(list(res_ct[FN].values()))
    res_both[FN]["TCS_mean"] = tcs_scores.mean().round(2)
    res_both[FN]["CT_mean"] = ct_scores.mean().round(2)
    res_both[FN]["TCS_var"] = tcs_scores.var().round(2)
    res_both[FN]["CT_var"] = ct_scores.var().round(2)

# One row per collection, one column per summary statistic.
pd.DataFrame(data=res_both).T