In [1]:
import numpy as np
import pandas as pd
from scipy.stats import truncexpon, halfnorm
import networkx as nx
import datetime

from causal_gen.basic_ts import find_root_children_nodes
from causal_gen.basic_ts import generate_root_data
from causal_gen.basic_ts import generate_child_data
from causal_gen.basic_ts import generate_data
from causal_gen.basic_ts import merge_node_data

In [2]:
def generate_root_data_with_outlier(node, 
                                    node_data, 
                                    start_ts, 
                                    n_data, 
                                    time_propagation, 
                                    root_cause=False, 
                                    outlier_position=(),
                                    outlier_multiplier=3):
    """
    Generating data for the root nodes, optionally injecting outliers
    Inputs:
        node : string of node name
        node_data: dictionary, updated in place with the entry of `node`
        start_ts: float of timestamp of the first sample
        n_data: int, number of data to generate 
        time_propagation: float, time needed to propagate value from an upstream node to downstream node (in seconds);
                          also used as the spacing between consecutive samples
        root_cause: bool, when False the work is delegated to generate_root_data and no outlier is injected
        outlier_position: iterable of int, 0-based row indices that receive an outlier
                          (sorted and de-duplicated before use)
        outlier_multiplier: number, factor applied to a fresh truncexpon draw to build each outlier value
    Outputs
        None when root_cause is False.
        Otherwise the list of per-segment arrays (one per outlier, plus a trailing normal segment when
        rows remain); each array has the columns value, timestamp, root-cause flag.
        node_data[node] is set to {'data': DataFrame of all rows, 'start_ts': timestamp of the first row}
    Raises
        ValueError when an outlier position is outside [0, n_data)
    """
    if not root_cause:
        generate_root_data(node, node_data, start_ts, n_data, time_propagation)
        return None

    positions = sorted(set(outlier_position))
    if positions and (positions[0] < 0 or positions[-1] >= n_data):
        raise ValueError(
            f'outlier_position must lie in [0, {n_data}), got {tuple(outlier_position)}')

    def _draw(size):
        # Column vector of `size` samples from the distribution used for normal root values.
        return truncexpon.rvs(size=size, b=3, scale=0.2).reshape(-1, 1)

    def _timestamps(first_idx, stop_idx):
        # Timestamps are derived from the global row index, so the series stays evenly
        # spaced across segments and the row count never depends on float rounding
        # (np.arange with a float step can return one element too many).
        return (start_ts + time_propagation * np.arange(first_idx, stop_idx)).reshape(-1, 1)

    datas = []
    last_pos = 0
    for pos in positions:
        n_normal = pos - last_pos
        values = np.vstack((_draw(n_normal), outlier_multiplier * _draw(1)))
        ts = _timestamps(last_pos, pos + 1)
        flags = np.zeros((n_normal + 1, 1))
        flags[-1, 0] = 1  # only the last row of the segment is the outlier
        data = np.hstack((values, ts, flags))
        datas.append(data)
        last_pos = pos + 1
        print(f'at outlier position {pos} , data.shape {data.shape}, ts.shape {ts.shape}')

    if last_pos < n_data:
        # Trailing run of normal rows after the last outlier.
        n_normal = n_data - last_pos
        datas.append(np.hstack((_draw(n_normal),
                                _timestamps(last_pos, n_data),
                                np.zeros((n_normal, 1)))))

    all_data = np.vstack(datas) if datas else np.empty((0, 3))

    # start_ts is never reassigned above, so the stored value is the timestamp of the
    # first row of the series (downstream nodes derive their own start from it).
    node_data[node] = {'data' : pd.DataFrame(all_data, columns=[node, 'ts', f'{node}_root_cause']), 
                       'start_ts' : start_ts,}
    return datas

In [3]:
# Exercise the root generator: 10 samples for X1 with outliers requested at rows 2 and 8,
# then report the shape of every returned segment and of the stored DataFrame.
features = ('X1', 'X2', 'X3', 'X4', 'X5')
# Chain X1 -> X2 -> X3 -> X4 -> X5, built from consecutive feature pairs.
causal_graph = nx.DiGraph(list(zip(features, features[1:])))
time_propagation = 1  # in seconds
n_data = 10
node = 'X1'
node_data = {}
start_ts = datetime.datetime.now().timestamp()
datas = generate_root_data_with_outlier(
    node=node,
    node_data=node_data,
    start_ts=start_ts,
    n_data=n_data,
    time_propagation=time_propagation,
    root_cause=True,
    outlier_position=(2, 8),
    outlier_multiplier=3,
)
print(f'len(datas) {len(datas)}')
print(f'datas[0] {datas[0].shape}')
print(f'datas[1] {datas[1].shape}')
print(f'datas[2] {datas[2].shape}')
print(f'node_data[node]["data"].shape {node_data[node]["data"].shape}')

at outlier position 2 , data.shape (3, 3), ts.shape (3, 1)
at outlier position 8 , data.shape (6, 3), ts.shape (6, 1)
len(datas) 3
datas[0] (3, 3)
datas[1] (6, 3)
datas[2] (1, 3)
node_data[node]["data"].shape (10, 3)


In [4]:
# Same check with a single outlier requested at row 5: one segment ending in the
# outlier plus one trailing normal segment are expected back.
features = ('X1', 'X2', 'X3', 'X4', 'X5')
# Chain X1 -> X2 -> X3 -> X4 -> X5, built from consecutive feature pairs.
causal_graph = nx.DiGraph(list(zip(features, features[1:])))
time_propagation = 1  # in seconds
n_data = 10
node = 'X1'
node_data = {}
start_ts = datetime.datetime.now().timestamp()
datas = generate_root_data_with_outlier(
    node=node,
    node_data=node_data,
    start_ts=start_ts,
    n_data=n_data,
    time_propagation=time_propagation,
    root_cause=True,
    outlier_position=(5,),
    outlier_multiplier=3,
)
print(f'len(datas) {len(datas)}')
print(f'datas[0] {datas[0].shape}')
print(f'datas[1] {datas[1].shape}')

at outlier position 5 , data.shape (6, 3), ts.shape (6, 1)
len(datas) 2
datas[0] (6, 3)
datas[1] (4, 3)


In [5]:
def generate_child_normal_data(n_normal, 
                               parents, 
                               start_ts, 
                               stop_ts, 
                               time_propagation):
    """
    Generating one run of normal (non-outlier) data for a child node
    Inputs:
        n_normal : int, number of samples to generate
        parents : not used by this function; kept so the signature stays unchanged
        start_ts : float, timestamp of the first sample
        stop_ts : float, exclusive end of the run; kept for backward compatibility, the
                  timestamps are derived from n_normal so they always match the data length
        time_propagation : float, spacing between consecutive timestamps (in seconds)
    Outputs:
        data_normal : array of shape (n_normal, 1), draws from halfnorm(loc=0.5, scale=0.2)
        ts_normal : array of shape (n_normal,), start_ts + i * time_propagation
        root_cause : array of shape (n_normal,), all zeros
    """
    data_normal = halfnorm.rvs(size=n_normal, loc=0.5, scale=0.2).reshape(-1, 1)
    # np.arange(start_ts, stop_ts, step) sizes its result as ceil((stop - start) / step);
    # with a float step that can round up to n_normal + 1. Scaling an integer range
    # guarantees exactly n_normal timestamps.
    ts_normal = start_ts + time_propagation * np.arange(n_normal)
    root_cause = np.zeros(n_normal)
    return data_normal, ts_normal, root_cause

In [6]:
def generate_child_data_with_outlier(node, 
                                     parents, 
                                     node_data, 
                                     n_data, 
                                     time_propagation, 
                                     root_cause=False, 
                                     outlier_position=(),
                                     outlier_multiplier=3):
    """
    Generating data for the child nodes, optionally injecting outliers
    Inputs:
        node : string of node name
        parents : iterable of parent node names; in the outlier branch every parent must already
                  have an entry in node_data.
                  NOTE(review): the same object is forwarded unchanged to generate_child_data when
                  root_cause is False -- confirm both expect the same structure.
        node_data: dictionary, updated in place with the entry of `node`
        n_data: int, number of data to generate 
        time_propagation: float, time needed to propagate value from an upstream node to downstream node (in seconds);
                          also used as the spacing between consecutive samples
        root_cause: bool, when False the data comes from generate_child_data and an all-zero
                    root-cause column is added; when True outliers are injected
        outlier_position: iterable of int, 0-based row indices that receive an outlier
                          (sorted and de-duplicated before use)
        outlier_multiplier: number, factor applied to a fresh halfnorm draw to build each outlier value
    Outputs:
        updated node_data; node_data[node] is {'data': DataFrame, 'start_ts': timestamp of the first row}
    Raises:
        KeyError when a parent has no entry in node_data (outlier branch)
        ValueError when an outlier position is outside [0, n_data)
    """
    if not root_cause:
        generate_child_data(node, parents, node_data, n_data, time_propagation)
        node_data[node]['data'][f'{node}_root_cause'] = np.zeros(n_data)
        return

    positions = sorted(set(outlier_position))
    if positions and (positions[0] < 0 or positions[-1] >= n_data):
        raise ValueError(
            f'outlier_position must lie in [0, {n_data}), got {tuple(outlier_position)}')

    # The child series starts one propagation step after its latest-starting parent.
    first_ts = max(node_data[parent]['start_ts'] for parent in parents) + time_propagation

    def _normal_segment(first_idx, n_normal):
        # Rows [first_idx, first_idx + n_normal) as an (n_normal, 3) array:
        # value, timestamp, root-cause flag.
        seg_start = first_ts + first_idx * time_propagation
        seg_stop = seg_start + n_normal * time_propagation
        values, ts, flags = generate_child_normal_data(n_normal,
                                                       parents,
                                                       seg_start,
                                                       seg_stop,
                                                       time_propagation)
        # The helper returns 1-D timestamps and flags; hstack needs every input as a column.
        return np.hstack((np.reshape(values, (-1, 1)),
                          np.reshape(ts, (-1, 1)),
                          np.reshape(flags, (-1, 1))))

    segments = []
    last_pos = 0
    for pos in positions:
        outlier_value = outlier_multiplier * halfnorm.rvs(size=1, loc=0.5, scale=0.2)[0]
        # The outlier occupies the slot right after the normal run, so spacing stays even.
        outlier_row = np.array([[outlier_value, first_ts + pos * time_propagation, 1.0]])
        segments.append(np.vstack((_normal_segment(last_pos, pos - last_pos), outlier_row)))
        last_pos = pos + 1

    if last_pos < n_data:
        # Trailing run of normal rows after the last outlier.
        segments.append(_normal_segment(last_pos, n_data - last_pos))

    all_data = np.vstack(segments) if segments else np.empty((0, 3))

    # Add the parents' values to the value column only; the timestamp and the
    # root-cause flag columns must not be shifted by the parent data.
    for parent in parents:
        all_data[:, 0] += np.reshape(node_data[parent]['data'][parent].values, -1)

    node_data[node] = {'data' : pd.DataFrame(all_data, columns=[node, 'ts', f'{node}_root_cause']), 
                       'start_ts' : first_ts,}

In [7]:
# Rebuild node_data from scratch with an X1 root series holding one outlier at row 5,
# and report the shapes of the two returned segments.
features = ('X1', 'X2', 'X3', 'X4', 'X5')
# Chain X1 -> X2 -> X3 -> X4 -> X5, built from consecutive feature pairs.
causal_graph = nx.DiGraph(list(zip(features, features[1:])))
time_propagation = 1  # in seconds
n_data = 10
node = 'X1'
node_data = {}
start_ts = datetime.datetime.now().timestamp()
datas = generate_root_data_with_outlier(
    node=node,
    node_data=node_data,
    start_ts=start_ts,
    n_data=n_data,
    time_propagation=time_propagation,
    root_cause=True,
    outlier_position=(5,),
    outlier_multiplier=3,
)
print(f'len(datas) {len(datas)}')
print(f'datas[0] {datas[0].shape}')
print(f'datas[1] {datas[1].shape}')

at outlier position 5 , data.shape (6, 3), ts.shape (6, 1)
len(datas) 2
datas[0] (6, 3)
datas[1] (4, 3)


In [9]:
# List every simple path from the first to the last node of the chain graph.
features = ('X1', 'X2', 'X3', 'X4', 'X5')
# Chain X1 -> X2 -> X3 -> X4 -> X5, built from consecutive feature pairs.
causal_graph = nx.DiGraph(list(zip(features, features[1:])))
for path in nx.all_simple_paths(causal_graph, 'X1', 'X5'):
    print(path)

['X1', 'X2', 'X3', 'X4', 'X5']


In [10]:
# Simple paths that start at the intermediate node X2 and end at X5.
for path in nx.all_simple_paths(causal_graph, 'X2', 'X5'):
    print(path)

['X2', 'X3', 'X4', 'X5']


In [15]:
# Simple paths from X5 to itself, materialised as a tuple so the cell displays the result.
tuple(nx.all_simple_paths(causal_graph, 'X5', 'X5'))

()