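This notebook generates synthetic time series, computes D2C descriptors from them, trains a classifier on those descriptors, and evaluates the resulting causal predictions.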

First, we generate the training data: synthetic multivariate time series built with `TSBuilder`.

In [1]:
from multiprocessing import Pool
from d2c.data_generation.builder import TSBuilder
from d2c.descriptors import D2C, DataLoader
from d2c.benchmark import D2CWrapper



In [2]:
# Shared settings reused by the cells below.
# Passed as `n_variables` to DataLoader, D2C and D2CWrapper.
N_VARS = 5
# Passed as `maxlags` to DataLoader, D2C and D2CWrapper.
MAXLAGS = 5
# Passed as `MB_size` to D2C (presumably the Markov-blanket size; confirm against the d2c docs).
MB_SIZE = 3

In [3]:
# Simulate the synthetic time series that every later cell builds on.
# The series dimensions come from the shared N_VARS / MAXLAGS constants so the
# builder cannot silently drift from the DataLoader, D2C and D2CWrapper
# settings, which read the same constants.
tsbuilder = TSBuilder(
    observations_per_time_series=250,
    maxlags=MAXLAGS,
    n_variables=N_VARS,
    time_series_per_process=40,
    processes_to_use=[13, 14],
    noise_std=0.1,
    max_neighborhood_size=2,
    seed=42,  # fixed seed passed to the builder
    max_attempts=200,
    verbose=True,
)

tsbuilder.build()

In [4]:
# Sanity check: display one generated series.
# Index 13 matches an entry of `processes_to_use`, so this is presumably the
# first series of process 13 (confirm the container layout in TSBuilder).
generated_observations = tsbuilder.get_generated_observations()
generated_observations[13][0]

array([[ 0.12572272,  0.19662052,  0.22587989, -0.22398384,  0.20737703],
       [-0.00134852,  0.01275251,  0.06918947, -0.33561031, -0.17130713],
       [-0.04560501, -0.1253232 ,  0.04204847, -0.24689942, -0.19282752],
       ...,
       [-0.02250109,  0.04263855, -0.04275641, -0.05926747, -0.00281437],
       [-0.13430916,  0.00977146, -0.08891246,  0.00441394, -0.15648334],
       [-0.01070794,  0.05603269, -0.24316952,  0.03918236,  0.06313588]])

In [5]:
# Hand the builder's output to a DataLoader configured with the shared
# dimensions; later cells read observations, DAGs and ground truth from it.
dataloader = DataLoader(n_variables=N_VARS, maxlags=MAXLAGS)
dataloader.from_tsbuilder(tsbuilder)

In [6]:
# Inputs for descriptor extraction, both read from the DataLoader.
descriptor_observations = dataloader.get_observations()
descriptor_dags = dataloader.get_dags()

# Configure the descriptor computation with the shared MB_SIZE / N_VARS /
# MAXLAGS settings, then initialise it.
d2c = D2C(
    observations=descriptor_observations,
    dags=descriptor_dags,
    couples_to_consider_per_dag=20,
    MB_size=MB_SIZE,
    n_variables=N_VARS,
    maxlags=MAXLAGS,
    seed=42,
    n_jobs=10,
    full=True,
)

d2c.initialize()

In [7]:
# Collect the descriptors computed by the D2C instance. The rendered output
# shows identifier columns (graph_id, edge_source, edge_dest), the `is_causal`
# label, and the numeric descriptor columns.
descriptors_df = d2c.get_descriptors_df()
# Bare last expression so the notebook renders the frame.
descriptors_df

Unnamed: 0,graph_id,edge_source,edge_dest,is_causal,coeff_cause,coeff_eff,m_cau_q0,m_cau_q1,m_cau_q2,m_cau_q3,...,mbe_mbe_eff_q5,mbe_mbe_eff_q6,n_samples,n_features,n_features/n_samples,skewness_ca,skewness_ef,HOC_1_2,HOC_2_1,HOC_1_3
0,0,26,21,1,0.071611,0.074182,0.000144,0.000287,0.000718,0.001437,...,0.020338,0.020463,245,30,0.122449,-0.098597,-0.096436,0.049554,0.047801,0.384599
1,0,21,26,0,0.074182,0.071611,0.000000,0.000000,0.000000,0.000000,...,0.019750,0.019883,245,30,0.122449,-0.096436,-0.098597,0.047801,0.049554,0.470668
2,0,14,1,1,0.024888,0.060421,0.010407,0.020813,0.052033,0.104066,...,0.016470,0.016572,245,30,0.122449,-0.071024,-0.132746,0.026839,0.018316,0.440013
3,0,1,14,0,0.060421,0.024888,0.000000,0.000000,0.000000,0.000000,...,0.021310,0.021438,245,30,0.122449,-0.132746,-0.071024,0.018316,0.026839,0.355324
4,0,20,10,1,0.078130,0.068584,0.000000,0.000000,0.000000,0.000000,...,0.014063,0.014251,245,30,0.122449,0.156591,0.147501,-0.053803,0.078800,0.374496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,79,22,0,0,0.020693,0.009678,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,245,30,0.122449,0.148407,-0.024200,-0.053924,0.086400,0.142735
1436,79,23,3,0,0.026217,0.059798,0.006529,0.013058,0.032645,0.065291,...,0.000000,0.000000,245,30,0.122449,-0.002741,0.049217,-0.010034,-0.079916,0.415409
1437,79,16,4,0,0.019973,-0.029560,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,245,30,0.122449,0.096691,-0.097108,0.125478,0.073752,0.103269
1438,79,20,2,0,0.083048,0.072246,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,245,30,0.122449,-0.039638,0.157500,0.059145,-0.065952,0.598718


In [8]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Identifier and label columns that must not be fed to the classifier.
NON_FEATURE_COLUMNS = ['graph_id', 'edge_source', 'edge_dest', 'is_causal']

# Positional 50/50 split: first half of the rows trains, second half tests.
# NOTE(review): the displayed frame looks ordered by graph_id, so this appears
# to separate whole graphs rather than shuffle rows; confirm that no graph
# straddles the midpoint.
split_at = len(descriptors_df) // 2
descriptors_df_train = descriptors_df.iloc[:split_at]
descriptors_df_test = descriptors_df.iloc[split_at:]

X_train = descriptors_df_train.drop(columns=NON_FEATURE_COLUMNS)
y_train = descriptors_df_train['is_causal']
X_test = descriptors_df_test.drop(columns=NON_FEATURE_COLUMNS)
y_test = descriptors_df_test['is_causal']

clf = BalancedRandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    random_state=0,
    sampling_strategy='auto',
    replacement=True,
    bootstrap=True,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# A bare expression in the middle of a cell is evaluated and discarded, so the
# accuracy has to be printed explicitly to be visible.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {test_accuracy:.4f}")

# Last expression of the cell: rendered as the cell output.
confusion_matrix(y_test, y_pred)

array([[277, 203],
       [ 62, 178]])

In [9]:
# Apply the trained classifier to the original series through the benchmark
# wrapper.
# NOTE(review): `get_original_observations()` appears to return every series,
# including those whose descriptors were used to fit `clf`; confirm whether
# the evaluation should be restricted to held-out series.
observations = dataloader.get_original_observations()

d2cwrapper = D2CWrapper(
    ts_list=observations,
    n_variables=N_VARS,
    model=clf,
    maxlags=MAXLAGS,
    n_jobs=40,
    full=True,
)

d2cwrapper.run()


<d2c.benchmark.d2c_wrapper.D2CWrapper at 0x7f91e0267a60>

In [38]:
# Ground-truth causal tables held by the DataLoader.
true_causal_dfs = dataloader.get_true_causal_dfs()

# Predicted causal tables produced by the wrapper run. The getter's result is
# consumed through `.values()` downstream, so it is presumably a dict of
# DataFrames.
causal_dfs_d2c = d2cwrapper.get_causal_dfs()

In [39]:
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Build both flat frames straight from their sources instead of from the
# names this cell rebinds. `causal_dfs_d2c` and `true_causal_dfs` change from
# collections to single DataFrames here, so reading them back as collections
# would fail the second time the cell is run.
causal_dfs_d2c_sorted = [
    causal_df.sort_values(by=['from', 'to'])
    for causal_df in d2cwrapper.get_causal_dfs().values()
]

causal_dfs_d2c = pd.concat(causal_dfs_d2c_sorted)
true_causal_dfs = pd.concat(dataloader.get_true_causal_dfs())

# NOTE(review): labels and scores are matched by row position, so both frames
# must list the same edges in the same order. Only the predicted frames are
# sorted here; confirm the ground-truth frames already follow that order.
y_true = true_causal_dfs['is_causal'].astype(int)
y_pred_proba = causal_dfs_d2c['probability']

In [40]:
# Area under the ROC curve for the predicted edge probabilities, using the
# score vector prepared alongside `y_true` in the previous cell.
roc_auc_score(y_true, y_pred_proba)

0.9065042313123076