In [1]:
import os
import sys
import json
import random
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from causal_gen.outlier_injection import inject_n_outliers

In [2]:
def get_dag_edges(adjacency_rep, 
                  child_col_index = 0, 
                  parent_col_index = 1,
                  self_causality = False):
    """Turn a (child, parent) edge table into a vertex-name map and an edge list.

    Parameters
    ----------
    adjacency_rep : pandas.DataFrame
        One row per edge. The columns at ``child_col_index`` and
        ``parent_col_index`` hold the vertex ids of the edge endpoints.
    child_col_index : int, default 0
        Positional index of the column holding the child (edge head).
    parent_col_index : int, default 1
        Positional index of the column holding the parent (edge tail).
    self_causality : bool, default False
        When false, rows whose child equals their parent are skipped.

    Returns
    -------
    vertex_map : dict[str, str]
        Maps ``str(vertex_id)`` to ``'X<vertex_id>'``, in sorted vertex order.
    edges : list[tuple[str, str]]
        ``(parent_name, child_name)`` pairs in row order.
    """
    children = adjacency_rep.iloc[:, child_col_index].tolist()
    parents = adjacency_rep.iloc[:, parent_col_index].tolist()

    # Collect vertices from BOTH endpoint columns. Reading only column 0 (as
    # the previous version did) ignores the configured indices and drops any
    # vertex that only ever appears as a parent, which made the edge lookup
    # below fail with KeyError for root nodes.
    vertices = sorted(set(children) | set(parents))
    vertex_map = {f'{vertex}' : f'X{vertex}' for vertex in vertices}

    edges = [
        (vertex_map[str(parent)], vertex_map[str(child)])
        for child, parent in zip(children, parents)
        # keep every row when self loops are allowed, otherwise drop child == parent
        if self_causality or child != parent
    ]
    return vertex_map, edges

## Populate causal graph metadata

In [3]:
# Folder holding the raw series and their ground-truth edge tables.
ori_folder = 'fMRI_from_TCDF'
# Simulations are numbered 1..28 inclusive.
datasets = list(range(1, 29))
metadata = {}
## note timeseries 4 has 50 vertices
for dataset in datasets:
    # The ground-truth file has no header row.
    adjacency_rep = pd.read_csv(
        f'{ori_folder}/sim{dataset}_gt_processed.csv',
        header=None,
    )
    # Column 0 is read as the child and column 1 as the parent; rows whose
    # child equals their parent are dropped.
    vertex_map, edges = get_dag_edges(
        adjacency_rep,
        child_col_index=0,
        parent_col_index=1,
        self_causality=False,
    )
    graph_info = {'vertex_map': vertex_map, 'edges': edges}
    metadata[f'timeseries{dataset}'] = graph_info


## Inject outliers into datasets

In [4]:
# Resolve the output folder against the current working directory and create
# it if it is not there yet.
cwd = os.getcwd()
dest_folder = os.path.join(cwd, 'fMRI_with_outliers')
if not os.path.exists(dest_folder):
    os.makedirs(dest_folder)

# Dataset groupings. Only set_2 is consulted below; the rest are kept as
# defined for reference.
set_1 = (
    1, 5, 7, 8, 9,
    10, 15, 18, 19,
    20, 21, 22, 23,
    24, 25, 26, 27, 28,
)
set_2 = (2, 6, 11, 12, 17)
set_3 = (3,)
set_4 = (4,)
set_5 = (13,)
set_6 = (16,)
outlier_percentage = 0.03

for dataset in datasets:
    # Seed per dataset so every run draws the same values from `random`.
    random.seed(dataset)
    ## timeseries 13 and 14 get no outliers: they have no root
    ## (every vertex has a parent)
    if dataset in (13, 14):
        continue

    filename = f'timeseries{dataset}.csv'
    df = pd.read_csv(f'{ori_folder}/{filename}')

    graph_info = metadata[f'timeseries{dataset}']
    vertex_map = graph_info['vertex_map']
    edges = graph_info['edges']

    # Rename the CSV columns to the X<id> names used by the graph.
    df = df.rename(columns=vertex_map)

    G = nx.DiGraph()
    G.add_nodes_from(vertex_map.values())
    G.add_edges_from(edges)

    # Target vertex: X9 for set_2, X14 for dataset 3, X49 for dataset 4,
    # X4 for everything else.
    using_root = True
    if dataset in set_2:
        target_node = 'X9'
    else:
        target_node = {3: 'X14', 4: 'X49'}.get(dataset, 'X4')

    # NOTE(review): df is written to disk right after this call, so the helper
    # presumably modifies df in place -- confirm against causal_gen.
    target_outlier_positions, root_causes = inject_n_outliers(
        df,
        causal_graph=G,
        target_node=target_node,
        n_outliers=outlier_percentage,
        multiplier=5,
        lag=1,
        using_root=using_root,
    )
    print(f'dataset {dataset} \n{target_outlier_positions}\n{root_causes}')

    # Record what was injected alongside the graph description.
    graph_info.update({
        'target_node': target_node,
        'target_outlier_positions': target_outlier_positions,
        'root_causes': root_causes,
    })

    filepath = f'{dest_folder}/{filename}'
    df.to_csv(filepath, sep=',', index=False)

# Persist the metadata for all datasets (13 and 14 carry graph info only).
metadata_filepath = f'{dest_folder}/metadata.json'
with open(metadata_filepath, "w") as outfile:
    json.dump(metadata, outfile, indent = 4)

dataset 1 
(21, 35, 39, 70, 131, 150)
['X2', 'X2', 'X1', 'X2', 'X1', 'X2']
dataset 2 
(24, 31, 33, 53, 102, 198)
['X7', 'X2', 'X2', 'X8', 'X7', 'X8']
dataset 3 
(48, 75, 109, 154, 166, 169)
['X12', 'X1', 'X12', 'X2', 'X12', 'X13']
dataset 4 
(76, 89, 110, 127, 151, 172)
['X2', 'X47', 'X37', 'X1', 'X42', 'X27']
dataset 5 
(8, 15, 31, 64, 111, 152, 213, 236, 264, 275, 289, 326, 331, 344, 345, 377, 433, 446, 448, 509, 515, 528, 577, 597, 647, 739, 766, 784, 802, 840, 916, 958, 965, 1090, 1118, 1180)
['X1', 'X3', 'X1', 'X3', 'X2', 'X2', 'X1', 'X2', 'X3', 'X3', 'X2', 'X2', 'X2', 'X1', 'X3', 'X2', 'X1', 'X2', 'X1', 'X2', 'X3', 'X2', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X1', 'X2', 'X1', 'X1', 'X3', 'X1', 'X3', 'X2']
dataset 6 
(10, 54, 85, 103, 175, 190, 200, 202, 209, 308, 405, 413, 415, 421, 522, 545, 553, 568, 606, 664, 697, 748, 751, 774, 849, 857, 879, 921, 973, 1003, 1011, 1112, 1114, 1143, 1163, 1185)
['X8', 'X2', 'X1', 'X2', 'X7', 'X1', 'X2', 'X7', 'X2', 'X2', 'X2', 'X1', 'X2', 'X7', 

In [5]:
import networkx as nx

# Toy example: list every simple path between two nodes of a small DAG.
# Note that this rebinds the notebook-level names G and target_node.
G = nx.DiGraph()

# Nodes 1..6
G.add_nodes_from(range(1, 7))

# Two branches (1->2->4 and 1->3->4) that merge, then a chain 4->5->6
G.add_edges_from([
    (1, 2), (1, 3),
    (2, 4), (3, 4),
    (4, 5),
    (5, 6),
])

# Endpoints of the search
source_node, target_node = 1, 6

# Collect every path produced by all_simple_paths into a list
all_paths = [
    path
    for path in nx.all_simple_paths(G, source=source_node, target=target_node)
]

print("All possible paths:", all_paths)


All possible paths: [[1, 2, 4, 5, 6], [1, 3, 4, 5, 6]]
