# Results: Generalization to Unseen Dynamics

A critical test for any supervised method is: **Does it work on physics it hasn't seen before?**

We trained TD2C on 9 specific types of non-linear processes (Process IDs 1, 3, 5...) and tested it on 9 *completely different* processes (Process IDs 2, 4, 6...).

### Performance Metrics
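We focus on **Recall** (the fraction of true causal links that are recovered) and **F1-Score** (the balance between precision and recall), because causal discovery is a highly class-imbalanced problem: true links are rare compared with non-links.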

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Load the pickled benchmark results, compute per-run F1 scores for TD2C and
# PCMCI against the ground-truth graphs, and show them as a box plot.

# Define path to results (adjust if you are running locally vs cloud)
RESULTS_PATH = "../data/causal_dfs/causal_dfs_TEST.pkl"

# Positions of the entries we read from the unpickled tuple.
# Expected order: VAR, VARLiNGAM, PCMCI, MVGC, PCMCI-GPDC, Granger, DYNOTEARS, D2C
# NOTE(review): this order depends on your specific run script -- verify it
# against the pickle you actually generated.
D2C_INDEX = 7
PCMCI_INDEX = 2
# NOTE(review): the ground truth is read from the LAST entry. If the tuple
# holds only the 8 method entries listed above, index -1 and index 7 are the
# same element -- confirm that the benchmark script appends the true graphs.
TRUE_GRAPHS_INDEX = -1

# Number of runs (taken from the start of the ground-truth keys) to score.
N_RUNS = 50

# D2C outputs a probability per candidate link; values strictly above this
# cut-off are counted as predicted causal links.
threshold = 0.309

# isfile (not exists) so that a directory at this path yields the friendly
# "not found" message instead of an error from open().
if os.path.isfile(RESULTS_PATH):
    # Load the results generated by reproduce/04_run_benchmarks.py
    import pickle

    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load pickles that you generated yourself or fully trust.
    with open(RESULTS_PATH, "rb") as f:
        loaded_data = pickle.load(f)

    # Unpack tuple (standard format from the benchmark script)
    results_d2c = loaded_data[D2C_INDEX]
    results_pcmci = loaded_data[PCMCI_INDEX]
    true_graphs = loaded_data[TRUE_GRAPHS_INDEX]

    print("Results loaded successfully.")

    # Calculate a simple F1 score for D2C vs PCMCI on the first N_RUNS runs
    from sklearn.metrics import f1_score

    scores = []

    for run_id in list(true_graphs.keys())[:N_RUNS]:
        y_true = true_graphs[run_id]['is_causal']

        # D2C: binarize the predicted probabilities with the cut-off
        y_prob = results_d2c[run_id]['probability']
        y_pred_d2c = (y_prob > threshold).astype(int)
        f1_d2c = f1_score(y_true, y_pred_d2c)

        # PCMCI: its predictions are used as-is (no thresholding here)
        y_pred_pcmci = results_pcmci[run_id]['is_causal']
        f1_pcmci = f1_score(y_true, y_pred_pcmci)

        scores.append({'Method': 'TD2C', 'F1': f1_d2c})
        scores.append({'Method': 'PCMCI', 'F1': f1_pcmci})

    df_scores = pd.DataFrame(scores)

    if df_scores.empty:
        # With no runs the frame has no 'Method'/'F1' columns, so plotting
        # would fail with an unhelpful error; report the situation instead.
        print("⚠️ No runs found in the benchmark results; nothing to plot.")
    else:
        # Plot one box per method
        # NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette`
        # is passed without `hue`; once the minimum seaborn version is known,
        # consider adding hue='Method' (and legend=False) to silence it.
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=df_scores, x='Method', y='F1', palette="Set2")
        plt.title("Generalization to Unseen Synthetic Dynamics")
        plt.ylabel("F1 Score")
        plt.show()

else:
    print("⚠️ Benchmark results not found.")
    print("Please run `reproduce/py_scripts/04_run_benchmarks.py` to generate the data.")