In [1]:
import unittest
import numpy as np
import pandas as pd

from dowhy import gcm
from dowhy.gcm import MedianCDFQuantileScorer

from ocular.causal_model import dag

from ocular.outlier_score import compute_it_score
from ocular.outlier_score import _relative_frequency
from ocular.outlier_score import node_outlier_contribution_scores

from ocular.noise_data_generation import data_dict_to_data_df

from ocular.noise_data_generation import generate_noisedf_from_data

from ocular.noise_data_generation import generate_noise_and_node_samples

from ocular.noise_data_generation import get_target_data_from_noise_arr

from ocular.model_generation import noise_model_fitting

## Data Generation Process

In [2]:
# End-to-end walkthrough: fit per-node noise models on inlier data, draw noise/node
# samples, fit an outlier scorer on the target node, then attribute the outlier
# score of the observations in outlier.csv to the noise of each upstream node.
gcm.config.disable_progress_bars()

# Chain-shaped graph over five features; each tuple is one edge
# (presumably (parent, child), i.e. X1 -> X2 -> X3 -> X4 -> X5 -- matches the
# topological order printed below).
nodes = [('X1', 'X2'), ('X2', 'X3'), ('X3', 'X4'), ('X4', 'X5')]
features = ['X1', 'X2', 'X3', 'X4', 'X5']
causal_graph = dag.CausalGraph(nodes, features)
target_node = 'X5'

data = pd.read_csv('inlier.csv', sep=',')
m_samples = 1

# Build a *new* set containing the target's ancestors plus the target itself.
# The previous code called .update() directly on causal_graph.ancestors[target_node],
# which silently modified the graph's own ancestor record (making X5 an "ancestor"
# of itself) for every later consumer of causal_graph.
all_ancestors_of_node = set(causal_graph.ancestors[target_node]) | {target_node}

# Keep the graph's node ordering, restricted to nodes that can influence the target.
sorted_nodes = [node for node in causal_graph.sorted_nodes if node in all_ancestors_of_node]
print(f'sorted_nodes is {sorted_nodes}')

# Step 1: fit the noise models on the inlier data.
noise_models = noise_model_fitting(
    data,
    causal_graph,
    m_samples,
    target_node,
    sorted_nodes,
)

# Step 2: generate noise samples and node samples from the fitted noise models.
num_noise_samples = 1500
noise_samples, node_samples = generate_noise_and_node_samples(
    noise_models,
    causal_graph,
    target_node,
    sorted_nodes,
    num_noise_samples,
)

# Step 3: fit the outlier scorer on the sampled values of the target node.
outlier_scorer = MedianCDFQuantileScorer()
outlier_scorer.fit(node_samples[target_node])

# Step 4: load the observation(s) to explain (the example file is expected to
# hold a single outlier).
outliers = pd.read_csv('outlier.csv', sep=',')

# outlier_noises can have fewer columns than outliers, since only nodes that
# have a path to target_node are relevant.
outlier_noises = generate_noisedf_from_data(
    outliers,
    noise_models,
    causal_graph,
    sorted_nodes,
    target_node,
)
out_noises_arr = outlier_noises.to_numpy()

# Step 5: compute each node's contribution to the target's outlier score.
results = node_outlier_contribution_scores(
    outlier_noises=out_noises_arr,
    noise_samples=noise_samples,
    outlier_scorer=outlier_scorer,
    attribute_mean_deviation=False,
    noise_models=noise_models,
    causal_graph=causal_graph,
    sorted_nodes=sorted_nodes,
    target_node=target_node,
    shapley_config=None,
)


sorted_nodes is ['X1', 'X2', 'X3', 'X4', 'X5']
Fitting emperical model
Fitting emperical model
Fitting emperical model
Fitting emperical model
subset_to_result_map
((0, 0, 0, 0, 0)) --> [-8.00670085]
((0, 0, 0, 0, 1)) --> [-8.00670085]
((0, 0, 0, 1, 0)) --> [-8.00670085]
((0, 0, 0, 1, 1)) --> [-8.00670085]
((0, 0, 1, 0, 0)) --> [-8.00670085]
((0, 0, 1, 0, 1)) --> [-8.00670085]
((0, 0, 1, 1, 0)) --> [-8.00670085]
((0, 0, 1, 1, 1)) --> [-8.00670085]
((0, 1, 0, 0, 0)) --> [-8.00670085]
((0, 1, 0, 0, 1)) --> [-8.00670085]
((0, 1, 0, 1, 0)) --> [-8.00670085]
((0, 1, 0, 1, 1)) --> [-8.00670085]
((0, 1, 1, 0, 0)) --> [-8.00670085]
((0, 1, 1, 0, 1)) --> [-8.00670085]
((0, 1, 1, 1, 0)) --> [-8.00670085]
((0, 1, 1, 1, 1)) --> [-8.00670085]
((1, 0, 0, 0, 0)) --> [-8.00670085]
((1, 0, 0, 0, 1)) --> [-8.00670085]
((1, 0, 0, 1, 0)) --> [-8.00670085]
((1, 0, 0, 1, 1)) --> [-8.00670085]
((1, 0, 1, 0, 0)) --> [-8.00670085]
((1, 0, 1, 0, 1)) --> [-8.00670085]
((1, 0, 1, 1, 0)) --> [-8.00670085]
((1, 0, 

In [3]:
# Inspect the noise samples generated for node X1 (NumPy abbreviates the long array in its repr).
noise_samples['X1']

array([[0.28497835],
       [0.29370961],
       [0.28375998],
       ...,
       [0.3190708 ],
       [0.14045941],
       [0.13753506]])