# Comparison
Goal of this is to compare the original DAG with the graph computed by D2C and by CausalNex (NOTEARS algorithm)

In [9]:
from d2c.simulatedDAGs import SimulatedDAGs
from d2c.D2C import D2C
from causalnex.structure.notears import from_pandas
from causalnex.structure import StructureModel
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import networkx as nx

In [2]:
n_dags = 5
n_observations = 100
n_nodes = 4

In [3]:
simulated_dags = SimulatedDAGs(n_dags, n_observations, n_nodes)
simulated_dags.generate_dags()

In [4]:
#generate the observations 
simulated_dags.simulate_observations()

In [5]:
#keep just the last graph for test 
all_DAGs = simulated_dags.get_dags()
training_DAGs = all_DAGs[:-1]
testing_DAGs = all_DAGs[-1]

all_observations = simulated_dags.get_observations()
training_observations = all_observations[:-1]
testing_observations = all_observations[-1]


## CausalNex (NOTEAR Algo)

In [15]:
#using causalnex for structure learning
sm = from_pandas(testing_observations)
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

html = viz.generate_html()
with open("comparison/causalnex.html", mode='w', encoding='utf-8') as fp:
        fp.write(html)


## True DAG

In [16]:
sm = StructureModel()
sm.add_edges_from(testing_DAGs.edges())
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

html = viz.generate_html()
with open("comparison/truth.html", mode='w', encoding='utf-8') as fp:
        fp.write(html)



## D2C

In [6]:
d2c = D2C(simulated_dags)
# d2c.initialize()
# d2c.save_df('comparison/comparison_descriptors.csv')

In [7]:

d2c.load_descriptors('comparison/comparison_descriptors.csv')

In [17]:
#split train test where test is the rows with graph_id = 4
df = d2c.get_df()
train = df[df['graph_id'] != 4]
test = df[df['graph_id'] == 4]

#train a random forest classifier 
X_train = train.drop(['graph_id', 'edge_source', 'edge_dest', 'is_causal'], axis=1)
y_train = train['is_causal']

X_test = test.drop(['graph_id', 'edge_source', 'edge_dest', 'is_causal'], axis=1)
X_test = test.drop(['graph_id', 'edge_source', 'edge_dest', 'is_causal'], axis=1)
y_test = test['is_causal']

rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf.fit(X_train, y_train)

#predict on test set
y_pred = rf.predict(X_test)
test['is_causal'] = y_pred

G = nx.DiGraph()  # Creates a directed graph

for index, row in test.iterrows():
    if row['is_causal']:
        G.add_edge(int(row['edge_source']), int(row['edge_dest']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['is_causal'] = y_pred


In [19]:
sm = StructureModel()
sm.add_edges_from(G.edges())
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

html = viz.generate_html()
with open("comparison/D2C.html", mode='w', encoding='utf-8') as fp:
        fp.write(html)

