# Show discrepancy between experiment results with preferred semantics

Consider a 3-node graph and generate a random set of dependence and independence facts. Run the original Causal ABA implementation and the new one with PR' semantics, then compare their outputs.
Investigate the discrepancies.

In [1]:
import sys
sys.path.insert(0, '../../')
sys.path.insert(0, '../../ArgCausalDisco/')
sys.path.insert(0, '../../notears/')

from pathlib import Path

from ArgCausalDisco.utils.helpers import random_stability
from ArgCausalDisco.abapc import ABAPC

from src.abapc import get_arrow_sets
from src.utils.bn_utils import get_dataset

from ArgCausalDisco.causalaba import CausalABA
from itertools import combinations
from src.utils.utils import powerset
from src.utils.enums import Fact, RelationEnum
import numpy as np
from src.utils.utils import get_arrows_from_model
from src.causal_aba.factory import ABASPSolverFactory
from ArgCausalDisco.utils.graph_utils import initial_strength, set_of_models_to_set_of_graphs

from ArgCausalDisco.utils.helpers import random_stability, logger_setup
logger_setup(f'./log.log')


  from .autonotebook import tqdm as notebook_tqdm
INFO:root:You can use `os.environ['CASTLE_BACKEND'] = backend` to set the backend(`pytorch` or `mindspore`).
INFO:root:You are using ``pytorch`` as the backend.


## For Random Facts

In [2]:
def all_possible_facts(n_nodes):
    """Enumerate every candidate dependence and independence fact.

    For each unordered pair of nodes (X, Y) drawn from ``range(n_nodes)`` and
    each conditioning set S yielded by ``powerset`` over the remaining nodes,
    one ``dep`` fact and one ``indep`` fact is created. All scores are set to
    the placeholder value 0.

    Returns:
        A ``(dep_facts, indep_facts)`` pair of lists. Both lists follow the
        same (X, Y, S) order, so position ``i`` in one list refers to the same
        triple as position ``i`` in the other.
    """
    nodes = set(range(n_nodes))
    triples = [
        (X, Y, S)
        for X, Y in combinations(range(n_nodes), 2)
        for S in powerset(nodes - {X, Y})
    ]

    def _make(relation, X, Y, S):
        # score=0 is only a placeholder; callers overwrite it later.
        return Fact(relation=relation, node1=X, node2=Y, node_set=S, score=0)

    dep_facts = [_make(RelationEnum.dep, X, Y, S) for X, Y, S in triples]
    indep_facts = [_make(RelationEnum.indep, X, Y, S) for X, Y, S in triples]
    return dep_facts, indep_facts


    

In [3]:

def generate_fact_tuple(fact):
    """Convert a fact object into the 6-tuple used when writing fact files.

    The returned tuple is
    ``(node1, node_set, node2, relation_value, statement, score)`` where
    ``statement`` has the form ``"<relation>(<node1>,<node2>,<set>)."``.
    The conditioning set is rendered as ``"empty"`` when it has no elements,
    otherwise as ``"s"`` followed by its members joined with ``"y"``.
    """
    relation = fact.relation.value
    cond_set = fact.node_set

    if len(cond_set) == 0:
        s_str = 'empty'
    else:
        s_str = 's' + 'y'.join(map(str, cond_set))

    statement = f"{relation}({fact.node1},{fact.node2},{s_str})."
    return (fact.node1, cond_set, fact.node2, relation, statement, fact.score)

def get_fact_location(facts, base_location='./facts.lp'):
    """Write the fact files for a CausalABA run and return the main path.

    Three files are written, all derived from ``base_location``:

    * ``base_location``: one ``#external ext_<statement>`` line per fact.
    * ``*_wc.lp``: one weak constraint per fact, weighted by
      ``-int(score * 1000)``.
    * ``*_I.lp``: one ``<statement> I=<score>, NA`` line per fact.

    The sibling file names are obtained with ``str.replace('.lp', ...)``.
    NOTE(review): presumably CausalABA locates the ``_wc``/``_I`` files with
    the same rule, so the derivation is left unchanged - confirm before
    altering it.

    Args:
        facts: iterable of fact objects accepted by ``generate_fact_tuple``.
        base_location: path of the main facts file.

    Returns:
        ``base_location`` unchanged.
    """
    facts_location = base_location
    facts_location_wc = base_location.replace('.lp', '_wc.lp')
    facts_location_I = base_location.replace('.lp', '_I.lp')

    # Index 4 of each tuple is the statement string, index 5 is the score.
    fact_tuples = [generate_fact_tuple(fact) for fact in facts]

    ### Save external statements
    with open(facts_location, "w") as f:
        for s in fact_tuples:
            f.write(f"#external ext_{s[4]}\n")
    ### Save weak constraints
    with open(facts_location_wc, "w") as f:
        for s in fact_tuples:
            f.write(f":~ {s[4]} [-{int(s[5]*1000)}]\n")
    ### Save inner strengths
    with open(facts_location_I, "w") as f:
        for s in fact_tuples:
            f.write(f"{s[4]} I={s[5]}, NA\n")

    return facts_location

In [4]:
def get_random_facts(n_nodes):
    """Draw a random, non-contradictory subset of scored facts.

    Every candidate fact from ``all_possible_facts`` receives a uniform random
    score in [0, 1). A random number of dependence and independence facts is
    then selected. Both selections are taken from a single index sample drawn
    without replacement, so no (X, Y, S) triple is selected as both dependent
    and independent.

    Randomness comes from the global ``np.random`` state; seed it beforehand
    for reproducible output.

    Args:
        n_nodes: number of nodes in the graph.

    Returns:
        List of selected facts, dependence facts first.
    """
    dep_facts, indep_facts = all_possible_facts(n_nodes)

    # Draw order (indep scores, then dep scores) is kept so that seeded runs
    # reproduce the same facts.
    indep_scores = np.random.uniform(0, 1, len(indep_facts))
    dep_scores = np.random.uniform(0, 1, len(dep_facts))

    for fact, score in zip(indep_facts, indep_scores):
        fact.score = score

    for fact, score in zip(dep_facts, dep_scores):
        fact.score = score

    # randint requires low < high; with very few candidate facts (e.g.
    # n_nodes=2) len // 2 is 0, so clamp the exclusive upper bound to 1.
    # For larger graphs the bound, and therefore the draw, is unchanged.
    num_dep = np.random.randint(0, max(1, len(dep_facts) // 2))
    num_indep = np.random.randint(0, max(1, len(indep_facts) // 2))

    # One shared sample of positions: the first num_dep go to dependence
    # facts, the rest to independence facts.
    indexes = np.random.choice(len(dep_facts), num_dep + num_indep, replace=False)
    chosen_dep = [dep_facts[i] for i in indexes[:num_dep]]
    chosen_indep = [indep_facts[i] for i in indexes[num_dep:]]

    return chosen_dep + chosen_indep

In [5]:
def get_new_impl_models(facts, n_nodes, semantics='ST'):
    """Enumerate extensions for the largest usable prefix of the facts.

    Facts are sorted by descending score. Starting from the full list, the
    lowest-scored fact is dropped one at a time until the solver returns at
    least one extension under ``semantics``.

    Args:
        facts: iterable of fact objects with a ``score`` attribute.
        n_nodes: number of nodes passed to ``ABASPSolverFactory``.
        semantics: semantics name forwarded to ``enumerate_extensions``.

    Returns:
        Tuple ``(arrow_sets, models, kept_facts, all_paths, solver)``.
        ``arrow_sets`` holds ``get_arrows_from_model`` applied to each model,
        and ``kept_facts`` is the prefix of the sorted facts that was used.
        If no prefix (not even the empty one) yields an extension,
        ``arrow_sets``, ``models`` and ``kept_facts`` are empty.
    """
    sorted_facts = sorted(facts, key=lambda x: x.score, reverse=True)

    #  TODO: replace by logging
    print(f"All facts extracted: {sorted_facts}")

    factory = ABASPSolverFactory(n_nodes=n_nodes)

    # Linear search from the full fact list down to the empty list for the
    # largest prefix that admits at least one extension.
    for fact_idx in range(len(sorted_facts), -1, -1):
        solver, all_paths = factory._create_solver(sorted_facts[:fact_idx])
        models = solver.enumerate_extensions(semantics)
        if models is not None and len(models) > 0:
            break
    else:
        # Nothing worked: report zero facts kept and no models, instead of
        # slicing with -1 or iterating over None.
        fact_idx = 0
        models = []

    # TODO: replace by logging
    print(f"Number of facts kept: {fact_idx}")
    arrow_sets = [get_arrows_from_model(model) for model in models]
    return arrow_sets, models, sorted_facts[:fact_idx], all_paths, solver

In [6]:
def compare(n_nodes, seed, semantics='ST', max_facts=4):
    """Run both implementations on the same random facts and compare graphs.

    Steps:
      1. Seed ``np.random`` and draw random facts with ``get_random_facts``.
      2. Write all drawn facts, highest score first, to
         ``./facts_initial.txt``.
      3. Keep the ``max_facts`` highest-scored facts.
      4. Run the original ``CausalABA`` on fact files written to
         ``./facts.lp`` (and its derived sibling files).
      5. Run the new solver through ``get_new_impl_models``.

    Args:
        n_nodes: number of graph nodes.
        seed: seed for the global NumPy random state.
        semantics: semantics name used by the new implementation.
        max_facts: number of top-scored facts to keep (default 4, the value
            previously hard-coded).

    Returns:
        Tuple ``(ans, old_models, new_models, new_models_full, facts,
        all_paths, solver)`` where ``ans`` is True when both implementations
        produced equal graph sets.
    """
    np.random.seed(seed)

    # Sort once; the same ordering is used for the dump and the truncation.
    all_fact_choices = sorted(get_random_facts(n_nodes), key=lambda x: x.score, reverse=True)
    with open('./facts_initial.txt', 'w') as f:
        for x in all_fact_choices:
            f.write(f"{x}\n")
    all_fact_choices = all_fact_choices[:max_facts]

    facts_location = get_fact_location(all_fact_choices, base_location='./facts.lp')
    model_sets, _multiple_solutions = CausalABA(n_nodes, facts_location, weak_constraints=True, skeleton_rules_reduction=True,
                                                fact_pct=1.0, search_for_models='first',
                                                opt_mode='optN', print_models=False, set_indep_facts=False)
    old_models, _mec = set_of_models_to_set_of_graphs(model_sets, n_nodes, False)

    new_models, new_models_full, facts, all_paths, solver = get_new_impl_models(all_fact_choices, n_nodes, semantics)
    # Freeze each arrow set so the graphs can be stored in a set and compared.
    new_models = {frozenset(model) for model in new_models}

    print(seed)
    print('old: ', old_models)
    print('new: ', new_models)

    ans = bool(old_models == new_models)

    return ans, old_models, new_models, new_models_full, facts, all_paths, solver
    

In [None]:
SEMANTICS = 'PR'
N_NODES = 3

# Scan seeds 0..9 and stop at the first one for which the two implementations
# disagree. The variables of that run are left in the notebook namespace for
# the inspection cells that follow.
problem = False
for seed in range(0, 10):
    ans, old_models, new_models, new_models_full, facts, all_paths, solver = compare(N_NODES, seed, SEMANTICS)
    if not ans:
        print(f"Discrepancy found for seed {seed}!")
        problem = True
        break

In [25]:
# True if the seed scan stopped on a discrepancy between the two implementations.
problem

True

In [26]:
# Graphs returned only by the new implementation.
new_models - old_models

{frozenset({(1, 2)}), frozenset({(0, 2)})}

In [27]:
# Graphs returned only by the original CausalABA run.
old_models - new_models

set()

In [28]:
# Facts the new implementation kept for the last compared seed.
facts

[Fact(relation=<RelationEnum.dep: 'dep'>, node1=0, node2=1, node_set=(2,), score=np.float64(0.6192709663506637)),
 Fact(relation=<RelationEnum.dep: 'dep'>, node1=0, node2=1, node_set=(), score=np.float64(0.2046486340378425))]

In [29]:
# Arrow set of every full model, in the same order as new_models_full, so a
# graph can be traced back to the extension that produced it.
arrow_sets = [get_arrows_from_model(model) for model in new_models_full]

# For each graph found only by the new implementation, print the position and
# the full model of the first extension with that arrow set.
for m in new_models - old_models:
    index = arrow_sets.index(m)
    print(index, new_models_full[index])

6 {arr_1_2, blocked_path_0_1__0__, noe_0_1, blocked_path_0_1__0__2} |= {-noe_1_2, dpath_1_2, -indep_0_1__, -arr_2_1, -arr_1_0, -arr_0_1, -indep_0_1__2}
8 {arr_0_2, blocked_path_0_1__0__, noe_0_1, blocked_path_0_1__0__2} |= {-arr_2_0, -indep_0_1__, -noe_0_2, -arr_1_0, dpath_0_2, -arr_0_1, -indep_0_1__2}


In [30]:
# Paths returned by the solver factory together with the solver.
all_paths

{(0, 1): [(0, 1), (0, 2, 1)]}

In [31]:
# Assumptions exposed by the solver object.
solver.assumptions

{'arr_0_1',
 'arr_0_2',
 'arr_1_0',
 'arr_1_2',
 'arr_2_0',
 'arr_2_1',
 'blocked_path_0_1__0__',
 'blocked_path_0_1__0__2',
 'blocked_path_0_1__1__',
 'blocked_path_0_1__1__2',
 'indep_0_1__',
 'indep_0_1__2',
 'noe_0_1',
 'noe_0_2',
 'noe_1_2'}

Focusing on the  following extension:  
```
{arr_1_2, blocked_path_0_1__0__, noe_0_1, blocked_path_0_1__0__2} |= {-noe_1_2, dpath_1_2, -indep_0_1__, -arr_2_1, -arr_1_0, -arr_0_1, -indep_0_1__2} 

```

With facts:

```
[Fact(relation=<RelationEnum.dep: 'dep'>, node1=0, node2=1, node_set=(2,), score=np.float64(0.6192709663506637)),
 Fact(relation=<RelationEnum.dep: 'dep'>, node1=0, node2=1, node_set=(), score=np.float64(0.2046486340378425))]
 ```

 With paths:

 ```
 {(0, 1): [(0, 1), (0, 2, 1)]}
 ```


So the facts are that nodes 0 and 1 are dependent when conditioned on node 2, and they are also dependent unconditionally.  

However we get a "preferred" extension where there is only an arrow from 1 to 2 and that's it. This means that the dependency facts are not satisfied.  



Q: So why isn't the independency of 0 and 1 included to invalidate this extension due to a conflict?  
A: Because it is attacked by the empty set, and no extension can defend it against that attack.   





Q: Then why aren't `blocked_path_0_1__1__` or `blocked_path_0_1__1__2` included? They are clearly true given the circumstances. If they were included, independence would be derived from the extension and would conflict with the dependency fact, and everything would have worked out.  


A: They are not included because they are not defended. They are not defended because of the following:

These blocked paths correspond to the path (0, 2, 1), according to their path id of 1. These blocked path assumptions can be attacked by arrows forming an active collider tree. These arrows in this case are `arr_1_2` and `arr_2_0`. The `blocked_path_0_1__1__` is attacked by {`arr_1_2`, `arr_2_0`}. Both `arr_1_2` and `arr_2_0` are not attacked by the accepted assumption set: `arr_1_2` is included in it and `arr_2_0` is not attacked. So why isn't `arr_2_0` included then? Because it is attacked by `noe_0_2` and is not defended from it. Why isn't `noe_0_2` included? Because it is attacked by `arr_2_0` and is not defended from it.

We arrived at an example of a COMPLETE extension in Causal ABA framework that is not faithful to the d-separations in the graph. These d-separations are in this case dependence relations encoded as facts in the Causal ABAF.

Maybe the maximisation didn't work properly? Let's check that as well.

In [42]:
# Maximality check: look for two extensions with different assumption sets
# where one set is contained in the other. Stop at the first such pair.
for model1, model2 in combinations(new_models_full, 2):
    assums1 = set(model1.assumptions)
    assums2 = set(model2.assumptions)
    nested = assums1 <= assums2 or assums2 <= assums1
    if nested and assums1 != assums2:
        print("Subset found:")
        print("Model 1:", assums1)
        print("Model 2:", assums2)
        break
else:
    # Reached only when no pair triggered the break above.
    print("No subset found")
        
        

No subset found


In [45]:
# List the assumption set of every extension next to its position.
for idx, extension in enumerate(new_models_full):
    print(idx, extension.assumptions)

0 {'arr_1_0', 'arr_1_2', 'blocked_path_0_1__1__', 'arr_0_2'}
1 {'arr_1_0', 'arr_1_2', 'blocked_path_0_1__1__2', 'arr_2_0'}
2 {'arr_2_0', 'arr_0_1', 'blocked_path_0_1__1__2', 'arr_2_1'}
3 {'arr_0_1', 'arr_1_2', 'blocked_path_0_1__1__', 'arr_0_2'}
4 {'arr_1_2', 'blocked_path_0_1__1__2', 'noe_0_2', 'blocked_path_0_1__1__', 'arr_0_1'}
5 {'blocked_path_0_1__1__2', 'arr_2_0', 'blocked_path_0_1__1__', 'arr_0_1', 'noe_1_2'}
6 {'arr_1_2', 'blocked_path_0_1__0__', 'noe_0_1', 'blocked_path_0_1__0__2'}
7 {'arr_1_0', 'arr_1_2', 'blocked_path_0_1__1__2', 'noe_0_2', 'blocked_path_0_1__1__'}
8 {'arr_0_2', 'blocked_path_0_1__0__', 'noe_0_1', 'blocked_path_0_1__0__2'}
9 {'arr_1_0', 'blocked_path_0_1__1__2', 'blocked_path_0_1__1__', 'noe_1_2', 'arr_0_2'}
10 {'arr_0_1', 'arr_0_2', 'blocked_path_0_1__1__2', 'arr_2_1'}
11 {'blocked_path_0_1__1__2', 'blocked_path_0_1__1__', 'noe_1_2', 'arr_0_1', 'arr_0_2'}
12 {'arr_1_0', 'blocked_path_0_1__1__2', 'arr_2_0', 'blocked_path_0_1__1__', 'noe_1_2'}
13 {'arr_1_0', 

# Conclusion

We found a MAXIMALLY COMPLETE (or PREFERRED') extension that is not faithful to the d-separations in the graph.