In [None]:
#TODO: da riscrivere, il codice va in src 


In [1]:
import os
import sys
import numpy as np 
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import roc_curve, auc

sys.path.append('..')
from src.benchmark.var import VAR
from src.benchmark.d2c_wrapper import D2C
from src.benchmark.metrics import make_plots, compute_roc_auc_curves
from src.descriptors.d2c_past_gen import DescriptorsGenerator


import pandas as pd 
import os 
import pickle
from tqdm import tqdm
from multiprocessing import Pool
import statsmodels.tsa.api as tsa


In [2]:

def create_lagged_multiple_ts(observations, maxlags):
    #create lagged observations for all the available time series
    lagged_observations = []
    for obs in observations:
        lagged = obs.copy()
        for i in range(1,maxlags+1):
            lagged = pd.concat([lagged, obs.shift(i)], axis=1)
        lagged.columns = [i for i in range(len(lagged.columns))]
        lagged_observations.append(lagged.dropna())
    return lagged_observations

def infer( single_ts, maxlags):
    model = tsa.var.var_model.VAR(single_ts.values)
    results = model.fit(maxlags=maxlags)
    return results

def build_causal_df(results, n_variables):
    pvalues = results.pvalues
    values = results.coefs

    #initialization
    pairs = [(source, effect) for source in range(n_variables) for effect in range(n_variables)]
    multi_index = pd.MultiIndex.from_tuples(pairs, names=['source', 'target'])
    causal_dataframe = pd.DataFrame(index=multi_index, columns=['is_causal', 'value', 'pvalue'])

    
    for source in range(n_variables):
        for effect in range(n_variables):
            current_pvalue = pvalues[source, effect]
            current_value = values[0][effect][source]

            is_causal = 0 if current_pvalue > 0.05 else 0 if abs(current_value) < 0.1 else 1
            causal_dataframe.loc[(source, effect)] = is_causal, current_value, current_pvalue

    return causal_dataframe

def process_time_series(args):
    generative_process_idx, internal_idx, ts, n_variables, maxlags, loaded_observations, descriptors = args
    lenght_data_generative_process = len(loaded_observations[generative_process_idx])
    corresponding_graph_id = (generative_process_idx - 1) * lenght_data_generative_process + internal_idx
    results = infer(ts, maxlags=1)
    causal_df = build_causal_df(results, n_variables * (maxlags + 1))
    descriptors_chunk = descriptors.loc[descriptors.graph_id == corresponding_graph_id]
    joined_table = pd.merge(descriptors_chunk, causal_df.drop(columns='is_causal'), how='inner', left_on=['edge_source', 'edge_dest'], right_on=['source', 'target'])
    return joined_table



In [4]:
descriptors = {}
for n_variables in [3,5,10,20]:
        descriptors[n_variables] = {}
        for noise_std in [0.5,0.75]:
            maxlags = 3
            output_folder = f'data/synthetic/data_N{n_variables}_std{noise_std}/'
            file_name = 'descriptors_Ridgefamily-0-1-2-3-4-5.pkl'

            descriptors = pd.read_pickle('../'+output_folder + file_name)

            data_path = output_folder
            loaded_observations = {}
            loaded_dags = {}
            loaded_causal_dfs = {}
            for file in os.listdir('../'+data_path):
                if file.startswith('data'):
                    index = file.split('_')[1].split('.')[0]
                    with open('../'+data_path+file, 'rb') as f:
                        loaded_observations[int(index)], loaded_dags[int(index)], loaded_causal_dfs[int(index)], _ = pickle.load(f)

            list_results = []
            for generative_process_idx in tqdm(range(1, 21)):
                lagged_time_series = create_lagged_multiple_ts(loaded_observations[generative_process_idx], maxlags)
                n_jobs = 30
                if n_jobs == 1:
                    results = []
                    for internal_idx, ts in enumerate(lagged_time_series):
                        results.append(process_time_series((generative_process_idx, internal_idx, ts, n_variables, maxlags, loaded_observations, descriptors)))
                else:
                    with Pool(n_jobs) as pool:
                        args = [(generative_process_idx, internal_idx, ts, n_variables, maxlags, loaded_observations, descriptors) for internal_idx, ts in enumerate(lagged_time_series)]
                        results = pool.map(process_time_series, args)
                list_results.append(results)

            list_flat = [item for sublist in list_results for item in sublist]
            descriptors_var = pd.concat(list_flat, axis=0)
            descriptors_var = descriptors_var[[c for c in descriptors_var if c not in ['is_causal']] + ['is_causal']]
            descriptors_var.to_pickle('../'+data_path+'descriptors_var.pkl')

100%|██████████| 20/20 [02:28<00:00,  7.43s/it]
100%|██████████| 20/20 [02:31<00:00,  7.60s/it]
  descriptors[n_variables] = {}
100%|██████████| 20/20 [02:54<00:00,  8.74s/it]
100%|██████████| 20/20 [02:54<00:00,  8.73s/it]
  descriptors[n_variables] = {}
100%|██████████| 20/20 [03:24<00:00, 10.22s/it]
100%|██████████| 20/20 [03:36<00:00, 10.83s/it]
  descriptors[n_variables] = {}
100%|██████████| 20/20 [04:43<00:00, 14.19s/it]
100%|██████████| 20/20 [04:53<00:00, 14.66s/it]
