# Study MPC Fact Accuracy for Random DAGs

In [1]:
import zipfile
import pandas as pd
import os


In [4]:

def read_zipped_csv(zip_path, extract_dir):
    """Extract a zip archive and load the CSV file it contains.

    The CSV is selected from the archive's own member list rather than from
    a directory listing of ``extract_dir``. This way a CSV left behind in
    ``extract_dir`` by an earlier run is never returned by mistake, and a CSV
    stored inside a sub-folder of the archive is still found.

    Args:
        zip_path: Path to the ``.zip`` archive.
        extract_dir: Directory the whole archive is extracted into.

    Returns:
        pandas.DataFrame parsed from the archive's CSV member. When the
        archive holds several CSV files, the first one in sorted member-name
        order is used.

    Raises:
        FileNotFoundError: If the archive contains no CSV file.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Extraction is kept so the files remain available on disk.
        zip_ref.extractall(extract_dir)

        csv_members = sorted(
            name for name in zip_ref.namelist()
            if name.lower().endswith('.csv')
            # Archives created on macOS carry "__MACOSX/._<name>" metadata
            # entries that share the .csv suffix but are not CSV data.
            and not name.startswith('__MACOSX/')
            and not os.path.basename(name).startswith('._')
        )
        if not csv_members:
            raise FileNotFoundError("No CSV file found in the zip archive.")

        # Read the chosen member straight from the archive so the result
        # cannot depend on whatever else is present in extract_dir.
        with zip_ref.open(csv_members[0]) as csv_file:
            return pd.read_csv(csv_file)

# Load the runtime results from the two result folders; each archive is
# unpacked into its own local directory.
df_bnlearn = read_zipped_csv(
    zip_path='../../results/existing/runtime_sourcing_facts_bnlearn_graphs/runtime_results.csv.zip',
    extract_dir='unzipped_bnlearn',
)
df_random = read_zipped_csv(
    zip_path='../../results/existing/runtime_sourcing_facts_random_graphs/runtime_results.csv.zip',
    extract_dir='unzipped_random',
)

# Show which columns the random-graph results provide.
print(df_random.columns)

Index(['n_nodes', 'n_edges', 'seed', 'elapsed_time', 'fact_metadata',
       'true_dag'],
      dtype='object')


In [5]:
# Number of result rows available for each graph size.
df_random['n_nodes'].value_counts()


n_nodes
10    100
15    100
20    100
3      50
4      50
5      50
6      50
7      50
8      50
Name: count, dtype: int64

In [7]:
import json

def extract_fact_stats(fact_metadata, max_cset_size: int = 5) -> dict:
    """Count facts and compute their accuracy per ``node_set`` size.

    Args:
        fact_metadata: Either a JSON string encoding a list of facts, or the
            already-decoded list. For each fact only ``len(fact['node_set'])``
            and ``fact['is_true']`` are read.
        max_cset_size: Largest ``node_set`` size to report. The default of 5
            reproduces the previously fixed range 0..5. Facts whose
            ``node_set`` is larger are not counted.

    Returns:
        A dict that contains, for every size ``n`` in ``0..max_cset_size``
        (in increasing order of ``n``):

        * ``fact_count_cset_{n}``: number of facts with ``len(node_set) == n``
        * ``fact_accuracy_cset_{n}``: sum of their ``is_true`` values divided
          by that count, or ``None`` when there is no fact of that size.
    """
    if isinstance(fact_metadata, str):
        facts = json.loads(fact_metadata)
    else:
        facts = fact_metadata

    n_sizes = max_cset_size + 1
    counts = [0] * n_sizes
    true_totals = [0] * n_sizes

    # Single pass over the facts instead of re-scanning the list per size.
    for fact in facts:
        size = len(fact['node_set'])
        if size < n_sizes:
            counts[size] += 1
            true_totals[size] += fact['is_true']

    stats = {}
    for n in range(n_sizes):
        stats[f'fact_count_cset_{n}'] = counts[n]
        # None (not 0.0) marks "no facts of this size" so that downstream
        # averaging can skip it instead of treating it as zero accuracy.
        stats[f'fact_accuracy_cset_{n}'] = (
            true_totals[n] / counts[n] if counts[n] else None
        )
    return stats

# Restrict to the 7-node graphs; copy so the subset is independent of df_random.
df7 = df_random.loc[df_random['n_nodes'] == 7].copy()

# One stats dict per row, expanded into one column per dict key.
fact_stats = df7['fact_metadata'].apply(extract_fact_stats)
fact_stats_df = pd.DataFrame(fact_stats.tolist())

# Both pieces are re-indexed from 0 so the column-wise concat lines up row by row.
graph_cols = df7[['n_nodes', 'n_edges', 'seed', 'true_dag']].reset_index(drop=True)
stats_cols = fact_stats_df.reset_index(drop=True)
result_df = pd.concat([graph_cols, stats_cols], axis=1)

result_df.head()


Unnamed: 0,n_nodes,n_edges,seed,true_dag,fact_count_cset_0,fact_accuracy_cset_0,fact_count_cset_1,fact_accuracy_cset_1,fact_count_cset_2,fact_accuracy_cset_2,fact_count_cset_3,fact_accuracy_cset_3,fact_count_cset_4,fact_accuracy_cset_4,fact_count_cset_5,fact_accuracy_cset_5
0,7,7,7816,"[[0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0],...",21,0.619048,36,0.666667,19,0.684211,2,0.5,0,,0,
1,7,7,3578,"[[0, 1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0],...",21,0.952381,29,0.896552,16,0.75,1,1.0,0,,0,
2,7,7,2656,"[[0, 0, 1, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 1],...",21,0.666667,12,0.833333,2,0.5,0,,0,,0,
3,7,7,2688,"[[0, 0, 0, 0, 1, 0, 1], [0, 0, 0, 0, 1, 0, 0],...",21,0.809524,42,0.833333,36,0.833333,12,0.75,1,1.0,0,
4,7,7,2494,"[[0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0],...",21,0.52381,15,0.866667,3,1.0,0,,0,,0,


In [11]:
# Per-size column means over result_df: the mean fact count and the mean
# accuracy, where rows without an accuracy value for that size are skipped.
averages = {}
for cset_size in range(6):
    counts = result_df[f'fact_count_cset_{cset_size}']
    accuracies = result_df[f'fact_accuracy_cset_{cset_size}']
    averages.update({
        f'avg_count_cset_{cset_size}': counts.mean(),
        f'avg_accuracy_cset_{cset_size}': accuracies.mean(skipna=True),
    })

# One row per statistic, ordered by name.
pd.DataFrame([averages]).T.sort_index()

Unnamed: 0,0
avg_accuracy_cset_0,0.699048
avg_accuracy_cset_1,0.820746
avg_accuracy_cset_2,0.792056
avg_accuracy_cset_3,0.826666
avg_accuracy_cset_4,0.884615
avg_accuracy_cset_5,1.0
avg_count_cset_0,21.0
avg_count_cset_1,40.04
avg_count_cset_2,23.9
avg_count_cset_3,5.8


# Conclusion

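The reason for the anomaly in the ablation analysis is that there are only a very small number of facts with a conditioning set size larger than 2. For the 7-node random DAGs, for example, there are on average only 5.8 facts per graph with conditioning set size 3, compared with 40.04 for size 1, so the accuracy estimates for the larger sizes rest on very few facts.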