### **1. Simulation of i.i.d. Data with nonlinear, nonidentifiable causal transformations**

In [1]:
# Suppress every warning for the rest of the session so that library
# warnings do not clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore")

# Add the parent directory to the module search path. The project-local
# packages imported below are presumably resolved from there (assumes the
# notebook is started from a sub-folder of the repository) - TODO confirm.
import sys
sys.path.append("..")

# pickle (standard library) plus the project-local helpers used in this
# notebook: data simulation, plotting, causal discovery and RDS export.
import pickle
from Data_Simulation_Framework.IID_Data_Generation_Process import lin_func,relu_func, Data_Generation_Process
from Results_Visualization.Visualization_Tools import plot_sf_er_graph_NodesEdgesMapping
from Causal_Discovery_Models.Causal_Discovery import Causal_Discovery
from Src.Utils import python_pickle_to_rds_r

2024-11-29 00:18:00,444 - C:\Users\Georg Velev\anaconda3\lib\site-packages\castle\backend\__init__.py[line:36] - INFO: You can use `os.environ['CASTLE_BACKEND'] = backend` to set the backend(`pytorch` or `mindspore`).
2024-11-29 00:18:00,691 - C:\Users\Georg Velev\anaconda3\lib\site-packages\castle\algorithms\__init__.py[line:36] - INFO: You are using ``pytorch`` as the backend.


In [None]:
# Label of the nonlinear transformation; it is forwarded to every save_data call.
nonlinear_pattern='ReLU'

# One entry per simulated setting. According to the original notes each entry
# is a list of (probability, transformation) pairs.
nonlinearities=[
    [(1.0,lin_func)],                   # linear patterns only
    [(0.5,lin_func),(0.5,relu_func)],   # equal probability
    [(0.3,lin_func),(0.7,relu_func)],   # 70% probability relu
    [(0.1,lin_func),(0.9,relu_func)],   # 90% probability relu
]

# Generator that is shared by the ER and the SF simulation below.
# (Keyword spellings such as `betta_...` / `edge_desnity_...` belong to the
# project API and must stay as they are.)
data_generator=Data_Generation_Process(
    beta_lower_limit=0.5,
    betta_upper_limit_values=[1.0,2.0,3.0,4.0],
    cont_noise=1.0,
    nr_nodes_values=[10,20,50,100],
    edge_desnity_values=[0.2,0.3,0.4],
    data_scale_values=['original','standardized'],
    num_samples=2500,
    nonlinearities=nonlinearities,
)


def _save_both_sample_sizes(simulation,graph_type):
    """Persist one simulation result twice: large first, then small sample size.

    `simulation` is the object returned by `large_scale_simulation`; its
    positions 0-3 are passed on as frame descriptions, true causal matrices,
    true weighted causal matrices and frames.

    NOTE(review): both calls receive the same frames; presumably `save_data`
    derives the small-sample variant from the `sample_size` label - confirm.
    """
    for sample_size in ('Large_Sample_Size','Small_Sample_Size'):
        data_generator.save_data(
            frames_descriptions=simulation[0],
            true_causal_matrices=simulation[1],
            true_weighted_causal_matrices=simulation[2],
            frames=simulation[3],
            nonlinear_pattern=nonlinear_pattern,
            graph_type=graph_type,
            sample_size=sample_size,
            save_path='../Data_Simulation_Framework/Simulated_Datasets/',
        )


# Step 1: simulate and store the i.i.d. datasets based on the ER graph model.
er_simulation=data_generator.large_scale_simulation(graph_type='ER')
_save_both_sample_sizes(er_simulation,'ER')

# Step 2: derive the average number of edges per number of nodes from the
# ER descriptions; the SF simulation takes this mapping as input.
avg_number_edges=data_generator.get_avg_number_edges_ER_graph(
    frames_descriptions=er_simulation[0],
    save_path_edge_mapping='../Performance_Evaluation_Framework/Results/Avg_Number_Edges.pkl',
)

# Step 3: simulate and store the SF-based datasets.
sf_simulation=data_generator.large_scale_simulation(
    graph_type='SF',
    avg_number_edges=avg_number_edges,
)
_save_both_sample_sizes(sf_simulation,'SF')

In [None]:
# Plot the nodes/edges mapping of both graph types from the frame
# descriptions (position 0 of each simulation result).
plot_sf_er_graph_NodesEdgesMapping(
    sf_frames_descriptions=sf_simulation[0],
    er_frames_descriptions=er_simulation[0],
)

In [None]:
# SF simulation with increased beta upper limits on a single setting
# (20 nodes, edge density 0.3, original scale, 10%/90% lin/relu mixture).
# The original note states that this yields 40 datasets.
beta_upper_limit_generator=Data_Generation_Process(
    beta_lower_limit=0.5,
    betta_upper_limit_values=[1.0,4.0,6.0,8.0],
    cont_noise=1.0,
    nr_nodes_values=[20],
    edge_desnity_values=[0.3],
    data_scale_values=['original'],
    num_samples=2500,
    nonlinearities=[[(0.1,lin_func),(0.9,relu_func)]],
)

# Only the 20-node / 0.3-density entry of the ER edge mapping is needed here.
edges_20_nodes={'Nodes_20':{0.3:avg_number_edges['Nodes_20'][0.3]}}

# The (misspelled) variable name is kept so other cells that refer to it keep working.
betas_simulaiton=beta_upper_limit_generator.large_scale_simulation(
    graph_type='SF',
    avg_number_edges=edges_20_nodes,
)

# Stored under the regular dataset folder with the file-name prefix 'Betas_'.
beta_upper_limit_generator.save_data(
    frames_descriptions=betas_simulaiton[0],
    true_causal_matrices=betas_simulaiton[1],
    true_weighted_causal_matrices=betas_simulaiton[2],
    frames=betas_simulaiton[3],
    nonlinear_pattern=nonlinear_pattern,
    graph_type='SF',
    sample_size='Large_Sample_Size',
    save_path='../Data_Simulation_Framework/Simulated_Datasets/Betas_',
)

### **2. Application of Causal Discovery Models to Simulated Data**

In [2]:
# Load the simulated ER datasets (large sample size, ReLU, 10 nodes).
# pickle executes code while loading: only open files produced by this project.
er_10_nodes_path='../Data_Simulation_Framework/Simulated_Datasets/ER_Large_Sample_Size_Datasets_ReLU_10_nodes.pkl'
with open(er_10_nodes_path,'rb') as datasets_file:
    frames_list=pickle.load(datasets_file)

In [4]:
# Run causal discovery on the first three datasets only (demo run).
# The index_* arguments tell the class at which position of every list
# entry the description, the two adjacency matrices and the frame sit.
causal_discovery=Causal_Discovery(
    frames_list=frames_list[:3],
    index_frame_description=0,
    index_true_adjacency=1,
    index_true_weighted_adjacency=2,
    index_frame=3,
)
causal_discovery.extract_causal_graphs()

# Results as collected by the instance ...
causal_discovery_demo=causal_discovery.causal_discovery_results
# ... and a per-entry copy of them that is handed to the RDS export.
causal_discovery_demo_rds=[
    result_entry.copy()
    for result_entry in causal_discovery_demo
]

DASK Client Dashboard Link:  http://127.0.0.1:8787/status
Using default cache_path: `C:\Users\Georg Velev\Desktop\Humboldt Uni\CSL_Benchmark_Study\Causal_Discovery_iid_Data\Src\cache`


2024-11-28 22:12:42,055 - C:\Users\Georg Velev\anaconda3\lib\site-packages\jax\_src\xla_bridge.py[line:622] - INFO: Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2024-11-28 22:12:42,057 - C:\Users\Georg Velev\anaconda3\lib\site-packages\jax\_src\xla_bridge.py[line:622] - INFO: Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
2024-11-28 22:12:42,064 - C:\Users\Georg Velev\anaconda3\lib\site-packages\jax\_src\xla_bridge.py[line:622] - INFO: Unable to initialize backend 'tpu': module 'jaxlib.xla_extension' has no attribute 'get_tpu_client'


In [6]:
# Export the copied results in RDS format.
save_file_path='../Performance_Evaluation_Framework/Results/Causal_Discovery_Results_Demo.rds'
python_pickle_to_rds_r(
    save_path=save_file_path,
    frames_list=causal_discovery_demo_rds,
)
# Manual step before continuing: run the script Hybrid_Bayesian_Networks.R
# in the folder Causal_Discovery_Models to collect the results of the
# hybrid bayesian networks for the same file.

In [8]:
# Read the hybrid-bayesian-network results that were computed in R.
# pickle executes code while loading: only open files produced by this project.
hbn_results_path='../Performance_Evaluation_Framework/Results/Causal_Discovery_Results_Demo_Hybrid_Bayesian_Networks.pkl'
with open(hbn_results_path,'rb') as hbn_file:
    causal_discovery_demo_hbn=pickle.load(hbn_file)

# Result keys that are copied from the R output into the python results.
hbn_model_names=('pctabu','mmtabu','fedtabu')

for csl_index,python_entry in enumerate(causal_discovery_demo):
    hbn_entry=causal_discovery_demo_hbn[csl_index]

    # Cast the column and index labels of the R-side frame to int so that
    # it can be compared with the frame held on the python side.
    rds_frame=hbn_entry['dataset']
    rds_frame.columns=[int(col) for col in rds_frame.columns]
    rds_frame.index=[int(index_value) for index_value in rds_frame.index]

    # Only merge when both sides refer to the very same dataset; otherwise
    # report the position and leave the python entry untouched.
    if not python_entry[3].equals(rds_frame):
        print('Mismatch found at index: ',csl_index)
        continue

    for model_name in hbn_model_names:
        python_entry[-1][model_name]=hbn_entry[model_name]

In [6]:
# Hand the merged results back to the causal discovery instance ...
causal_discovery.set_causal_discovery_results(current_csl_results=causal_discovery_demo)

# ... and write them to the result folder.
results_pickle_path='../Performance_Evaluation_Framework/Results/Causal_Discovery_Results_Demo.pkl'
causal_discovery.save_csl_results(
    save_path=results_pickle_path,
    save_for_evaluation=True,
)

In [11]:
# Number of keys in the last element of the first result entry.
len(causal_discovery_demo[0][-1].keys())

14

In [None]:
#import avici
#from dask.distributed import Client
#import pickle
#import torch
#import numpy as np
#import networkx as nx

#import math
#import torch.nn as nn
#import copy
#from tqdm.auto import tqdm
#import typing

#from scipy.stats import ttest_ind
#import pandas as pd
#sklearn

#from abc import ABCMeta, abstractmethod
#from pygam import LinearGAM, s
#from pygam.terms import Term, TermList

#from dataclasses import dataclass, field
#from warnings import warn
#import inspect
#import warnings
#from collections import defaultdict
#from copy import deepcopy
#from copy import copy
#from importlib.metadata import version  # type: ignore
#import types
#from itertools import combinations
#from importlib.metadata import version  # type: ignore

#import igraph as ig
#import matplotlib.pyplot as plt
#from cdt.metrics import SID
#from rpy2 import robjects
#from interpret.glassbox import ExplainableBoostingRegressor

#import sys
#sys.path.append("..")
#import seaborn as sns
#import statsmodels.api as sm

In [None]:
import avici,dask, pickle, torch, numpy, networkx, math, copy, tqdm, typing, scipy, pandas, sklearn, abc, pygam, dataclasses, warnings, inspect, collections, copy, importlib, types, itertools, igraph, matplotlib, cdt, rpy2, interpret, seaborn, statsmodels

# Modules whose versions are reported below (each module listed once).
packages=[avici,dask, pickle, torch, numpy, networkx, math, copy, tqdm, typing, scipy, pandas, sklearn,
          abc, pygam, dataclasses, warnings, inspect, collections, importlib, types,
          itertools, igraph, matplotlib, cdt, rpy2, interpret, seaborn, statsmodels]

# Print "<name> == <version>" for every module that exposes __version__.
# The name comes from the module's __name__ attribute instead of parsing
# str(module), and a missing version is detected with getattr instead of a
# bare `except:` (which would also hide unrelated errors and interrupts).
for pkg in packages:
    pkg_version=getattr(pkg,'__version__',None)
    if pkg_version is None:
        # Standard-library modules usually carry no __version__ attribute.
        print(pkg.__name__,'-> no __version__ attribute (standard-library or unversioned module).')
    else:
        print(pkg.__name__,'==',pkg_version)