In [88]:
import pandas as pd 
import os 
import pickle
from tqdm import tqdm

In [81]:
# Pre-computed descriptors table, stored as a pickle.
descriptors_path = 'causeme/2024.01.20.pkl'
descriptors = pd.read_pickle(descriptors_path)

In [94]:
# Sanity check: (rows, columns) of the loaded descriptors table.
descriptors.shape

(61200, 92)

In [82]:
# Load every pickle under ``data_path`` whose name starts with 'data'.
# The integer between the first '_' and the following '.' in the filename is
# used as the dictionary key. Each pickle unpacks into four items; the first
# three are stored and the fourth is discarded.
data_path = '2024.01.20/'
loaded_observations = {}
loaded_dags = {}
loaded_causal_dfs = {}
for file in os.listdir(data_path):
    if not file.startswith('data'):
        continue
    raw_index = file.split('_')[1].split('.')[0]
    with open(data_path+file, 'rb') as f:
        observations, dags, causal_dfs, _ = pickle.load(f)
    process_id = int(raw_index)
    loaded_observations[process_id] = observations
    loaded_dags[process_id] = dags
    loaded_causal_dfs[process_id] = causal_dfs


In [83]:
import statsmodels.tsa.api as tsa
def create_lagged_multiple_ts(observations, maxlags):
    """Widen each series with shifted copies of itself for lags 1..maxlags.

    Parameters
    ----------
    observations : iterable of pandas.DataFrame
        One frame per time series.
    maxlags : int
        Largest shift to append.

    Returns
    -------
    list of pandas.DataFrame
        For every input frame: the original columns followed by the columns
        shifted by 1, 2, ..., ``maxlags`` rows, relabelled 0..n-1, with every
        row containing a missing value (e.g. the first ``maxlags`` rows)
        dropped.
    """
    def _with_lags(obs):
        # Original block first, then one shifted block per lag, side by side.
        blocks = [obs.copy()] + [obs.shift(lag) for lag in range(1, maxlags + 1)]
        wide = pd.concat(blocks, axis=1)
        wide.columns = list(range(len(wide.columns)))
        return wide.dropna()

    return [_with_lags(obs) for obs in observations]

def infer(single_ts, maxlags):
    """Fit a statsmodels VAR on one time series.

    The model is built from ``single_ts.values`` and ``maxlags`` is passed
    straight through to ``fit``. Returns the fitted results object.
    """
    return tsa.var.var_model.VAR(single_ts.values).fit(maxlags=maxlags)

def build_causal_df(results, n_variables, pvalue_threshold=0.05, min_abs_coef=0.1):
    """Tabulate the lag-1 VAR coefficient and p-value for every variable pair.

    Parameters
    ----------
    results : fitted VAR results
        Must expose ``coefs`` indexed as ``[lag][target][source]`` and
        ``pvalues`` indexed as ``[parameter_row, target]``.
    n_variables : int
        Number of variables; pairs are built over ``range(n_variables)``.
    pvalue_threshold : float, default 0.05
        A link is rejected when its p-value is greater than this.
    min_abs_coef : float, default 0.1
        A link is rejected when the absolute coefficient is below this.

    Returns
    -------
    pandas.DataFrame
        Indexed by a ``(source, target)`` MultiIndex with columns
        ``is_causal`` (0/1), ``value`` (lag-1 coefficient) and ``pvalue``.

    Raises
    ------
    ValueError
        If ``pvalues`` has fewer rows than there are lag coefficients.
    """
    pvalues = results.pvalues
    coefs = results.coefs

    # ``pvalues`` has one row per fitted parameter. statsmodels stacks the
    # deterministic terms (intercept/trend) ahead of the lag coefficients, so
    # with the default constant term row 0 is the intercept, not variable 0.
    # Skip however many leading rows are not lag coefficients.
    n_lag_rows = len(coefs) * len(coefs[0])
    lag_row_offset = len(pvalues) - n_lag_rows
    if lag_row_offset < 0:
        raise ValueError("pvalues has fewer rows than there are lag coefficients")

    pairs = [(source, effect) for source in range(n_variables) for effect in range(n_variables)]
    multi_index = pd.MultiIndex.from_tuples(pairs, names=['source', 'target'])

    # Collect the rows first and build the frame once; writing cell by cell
    # through .loc is slow and leaves object-dtype columns.
    rows = []
    for source, effect in pairs:
        current_pvalue = pvalues[lag_row_offset + source, effect]
        current_value = coefs[0][effect][source]

        # Causal only if significant and large enough in magnitude.
        if current_pvalue > pvalue_threshold or abs(current_value) < min_abs_coef:
            is_causal = 0
        else:
            is_causal = 1
        rows.append((is_causal, current_value, current_pvalue))

    return pd.DataFrame(rows, index=multi_index, columns=['is_causal', 'value', 'pvalue'])


In [91]:
# Sequential pass: fit a lag-1 VAR on every lagged series and attach the
# resulting coefficient / p-value to the descriptor rows of the same graph.
n_variables = 20
maxlags = 5
joined = []
for generative_process_idx in range(1, 18):
    length_data_generative_process = len(loaded_observations[generative_process_idx])
    lagged_time_series = create_lagged_multiple_ts(loaded_observations[generative_process_idx], maxlags)
    # Wrap the list (not the enumerate object) so tqdm knows the total.
    for internal_idx, ts in enumerate(tqdm(lagged_time_series)):
        # graph_id is computed as (process index - 1) * series in this process + position.
        corresponding_graph_id = (generative_process_idx - 1) * length_data_generative_process + internal_idx
        results = infer(ts, maxlags=1)
        causal_df = build_causal_df(results, n_variables * (maxlags + 1))
        descriptors_chunk = descriptors.loc[descriptors.graph_id == corresponding_graph_id]
        joined_table = pd.merge(descriptors_chunk, causal_df.drop(columns='is_causal'), how='inner', left_on=['edge_source', 'edge_dest'], right_on=['source', 'target'])
        joined.append(joined_table)

# Concatenate every per-graph table; concatenating only ``joined_table`` would
# keep just the last graph processed.
descriptors_plus_var = pd.concat(joined, axis=0)
# Move the 'is_causal' column to the end.
descriptors_plus_var = descriptors_plus_var[[c for c in descriptors_plus_var if c not in ['is_causal']] + ['is_causal']]

60it [01:26,  1.45s/it]
51it [01:14,  1.45s/it]


KeyboardInterrupt: 

In [None]:
from multiprocessing import Pool

def process_time_series(args):
    """Worker: fit a lag-1 VAR on one lagged series and join it to its descriptors.

    ``args`` is a 3-tuple ``(generative_process_idx, internal_idx, ts)``. The
    graph id is computed as ``(generative_process_idx - 1) * n + internal_idx``,
    where ``n`` is the number of series loaded for that generative process.
    Reads the notebook globals ``loaded_observations``, ``descriptors``,
    ``n_variables`` and ``maxlags``.

    Returns the inner merge of the matching descriptor rows with the VAR
    value/p-value table on (edge_source, edge_dest) == (source, target).
    """
    process_idx, series_idx, lagged_ts = args
    series_per_process = len(loaded_observations[process_idx])
    graph_id = (process_idx - 1) * series_per_process + series_idx

    var_fit = infer(lagged_ts, maxlags=1)
    edge_stats = build_causal_df(var_fit, n_variables * (maxlags + 1))
    graph_descriptors = descriptors.loc[descriptors.graph_id == graph_id]

    return pd.merge(
        graph_descriptors,
        edge_stats.drop(columns='is_causal'),
        how='inner',
        left_on=['edge_source', 'edge_dest'],
        right_on=['source', 'target'],
    )

# Parallel pass: one list of joined tables per generative process.
list_results = []
# Create the worker pool once and reuse it for every generative process;
# spawning and tearing down 60 workers on each iteration is wasted setup.
with Pool(60) as pool:
    for generative_process_idx in range(1, 18):
        lagged_time_series = create_lagged_multiple_ts(loaded_observations[generative_process_idx], maxlags)
        args = [(generative_process_idx, internal_idx, ts) for internal_idx, ts in enumerate(lagged_time_series)]
        # pool.map returns results in the same order as ``args``.
        list_results.append(pool.map(process_time_series, args))


In [106]:
# Flatten the per-process lists of tables into a single list.
list_flat = []
for sublist in list_results:
    list_flat.extend(sublist)
len(list_flat)


1020

In [108]:
# Stack all per-graph tables, move 'is_causal' to the last column and save.
descriptors_var = pd.concat(list_flat, axis=0)
leading_columns = [c for c in descriptors_var if c not in ['is_causal']]
descriptors_var = descriptors_var[leading_columns + ['is_causal']]
descriptors_var.to_pickle(data_path+'descriptors_var.pkl')

In [3]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
# Toy data: six samples alternating between [1, 2] and [3, 4], labelled 1..6.
X = np.tile(np.array([[1, 2], [3, 4]]), (3, 1))
y = np.arange(1, 7)

In [4]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
# Time-ordered cross-validation splitter with 5 splits; printed to show its settings.
tscv = TimeSeriesSplit(n_splits=5)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)


In [None]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Classification metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# One row of metrics per cross-validation fold, indexed by fold number.
results = pd.DataFrame(columns=['accuracy', 'f1', 'precision', 'recall', 'roc_auc'])
for i, (train_index, test_index) in enumerate(tscv.split(X)):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh, seeded forest is trained for each fold.
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # NOTE(review): f1/precision/recall are called with their default
    # averaging, and the placeholder `y` holds six distinct labels; these
    # calls presumably need binary labels (or an explicit `average=`) -- confirm.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # NOTE(review): ROC AUC is computed from hard predictions rather than
    # scores/probabilities, and is likely undefined when a test fold contains
    # a single class -- confirm before running on real data.
    roc_auc = roc_auc_score(y_test, y_pred)

    results.loc[i] = [accuracy, f1, precision, recall, roc_auc]


    # evaluate classification

