# Run experiments on the fMRI datasets

In [2]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Record the numpy and pandas versions this notebook was executed with
print([np.__version__, pd.__version__])
# Show arrays with 3 decimals and without scientific notation for small values
np.set_printoptions(precision=3, suppress=True)


from src.data_preprocessing import preprocess_data
from src.plotting import plot_heatmap
from src.causal_matrix_evaluation import evaluate_causal_matrices
from src.run_causal_discovery import run_varlingam, run_pcmci, run_varlingam_bootstrap
from src.robust_varlingam import run_rcv_varlingam
from src.robust_pcmci import run_rcv_pcmci

['1.24.4', '2.0.3']


## Generate ground-truth adjacency matrices from the relationship files

In [3]:
def create_adjacency_matrices(input_file, num_nodes):
    """Build one binary adjacency matrix per time lag from an edge-list CSV.

    The CSV has no header row; its three columns are read as
    ``cause, effect, lag``. For every row, entry ``[effect, cause]`` of the
    matrix belonging to ``lag`` is set to 1. Node indices are used exactly as
    they appear in the file (no offset is applied), so they must already be
    0-based.

    Args:
        input_file: Path or file-like object accepted by ``pd.read_csv``.
        num_nodes: Number of nodes; every matrix is ``num_nodes x num_nodes``.

    Returns:
        tuple: ``(B_matrices, max_lag)`` where ``B_matrices`` is a list of
        ``max_lag + 1`` arrays (index = lag) and ``max_lag`` is the largest
        lag in the file as a Python ``int`` (0 when the file has no edges).

    Raises:
        ValueError: If a value is missing, non-numeric or not a whole number,
            or if a lag is negative.
        IndexError: If a node index lies outside ``[0, num_nodes)``.
    """
    # Read the CSV file
    df = pd.read_csv(input_file, names=['cause', 'effect', 'lag'])

    # Work on one numeric array so integers stored as "1.0" are accepted
    # while fractional or missing values are rejected explicitly.
    edges = df[['cause', 'effect', 'lag']].to_numpy(dtype=float)
    if not np.all(np.mod(edges, 1) == 0):
        raise ValueError(f"{input_file}: cause, effect and lag must all be integers")
    edges = edges.astype(int)

    lags = edges[:, 2]
    if (lags < 0).any():
        raise ValueError(f"{input_file}: lag must be non-negative")

    # Negative indices would otherwise wrap around silently in numpy.
    nodes = edges[:, :2]
    if ((nodes < 0) | (nodes >= num_nodes)).any():
        raise IndexError(
            f"{input_file}: node index outside the valid range [0, {num_nodes})"
        )

    # Automatically detect the maximum lag (an empty edge list yields lag 0)
    max_lag = int(lags.max()) if lags.size else 0

    # Initialize adjacency matrices
    B_matrices = [np.zeros((num_nodes, num_nodes)) for _ in range(max_lag + 1)]

    # Populate the adjacency matrices: rows are effects, columns are causes
    for cause, effect, lag in edges:
        B_matrices[lag][effect, cause] = 1

    print(f"Processing {input_file}")
    print(f"Max lag: {max_lag}")
    print(f"B_matrices shape: {B_matrices[0].shape}")

    return B_matrices, max_lag

def save_ground_truth(B_matrices, filepath):
    """Write a sequence of matrices to a single CSV file.

    Each matrix is written with ``np.savetxt`` (comma-delimited, three
    decimals) and consecutive matrices are separated by one empty line.
    Missing parent directories of ``filepath`` are created first, so the
    function also works on a fresh checkout.

    Args:
        B_matrices: Sequence of 2-D arrays to write, in order.
        filepath: Destination file; overwritten if it already exists.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # empty when filepath is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w') as f:
        for i, B in enumerate(B_matrices):
            if i > 0:
                # Blank line between matrices, none after the last one
                f.write('\n')
            np.savetxt(f, B, delimiter=',', fmt='%.3f')

def get_num_vars(timeseries_file):
    """Return the number of variables (data columns) in a time-series CSV.

    Only the header row is parsed. A ``Date`` column, or failing that a
    ``timestamp`` column, is not counted, mirroring how the experiment loop
    drops that column before running causal discovery; the ground-truth
    matrices therefore get the same size as the analysed data.

    Args:
        timeseries_file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        int: Number of columns, excluding at most one time-index column.
    """
    # nrows=0 reads the header only; no data rows are needed to count columns
    columns = pd.read_csv(timeseries_file, nrows=0).columns.tolist()
    if "Date" in columns or "timestamp" in columns:
        return len(columns) - 1
    return len(columns)

# Directories for the edge lists (input), the adjacency matrices (output)
# and the time series used to determine the matrix size
input_base_path = 'data/real/fMRI/relationships/'
output_base_path = 'data/real/fMRI/ground_truth/'
timeseries_base_path = 'data/real/fMRI/time_series/'

# Convert the edge list of each of the 28 datasets (numbered 1..28)
for index in range(1, 29):
    input_file = input_base_path + f'sim{index}_gt_processed.csv'
    output_file = output_base_path + f'sim{index}_gt_processed_adj.csv'
    timeseries_file = timeseries_base_path + f'timeseries{index}.csv'

    # The matrix size is the variable count reported for the time series
    n_vars = get_num_vars(timeseries_file)

    # Only the matrices are needed here; the detected max lag is discarded
    B_matrices, _ = create_adjacency_matrices(input_file, n_vars)
    save_ground_truth(B_matrices, output_file)

    print(
        f"Saved adjacency matrices for file {index} to {output_file}",
        f"Number of variables: {n_vars}\n",
        sep='\n',
    )

print("All files processed successfully.")

Processing data/real/fMRI/relationships/sim1_gt_processed.csv
Max lag: 1
B_matrices shape: (5, 5)
Saved adjacency matrices for file 1 to data/real/fMRI/ground_truth/sim1_gt_processed_adj.csv
Number of variables: 5

Processing data/real/fMRI/relationships/sim2_gt_processed.csv
Max lag: 1
B_matrices shape: (10, 10)
Saved adjacency matrices for file 2 to data/real/fMRI/ground_truth/sim2_gt_processed_adj.csv
Number of variables: 10

Processing data/real/fMRI/relationships/sim3_gt_processed.csv
Max lag: 1
B_matrices shape: (15, 15)
Saved adjacency matrices for file 3 to data/real/fMRI/ground_truth/sim3_gt_processed_adj.csv
Number of variables: 15

Processing data/real/fMRI/relationships/sim4_gt_processed.csv
Max lag: 1
B_matrices shape: (50, 50)
Saved adjacency matrices for file 4 to data/real/fMRI/ground_truth/sim4_gt_processed_adj.csv
Number of variables: 50

Processing data/real/fMRI/relationships/sim5_gt_processed.csv
Max lag: 1
B_matrices shape: (5, 5)
Saved adjacency matrices for file

In [4]:
def ground_truth_to_matrices(csv_path):
    """Parse a file of blank-line-separated CSV matrices into numpy arrays.

    Args:
        csv_path: Path of a text file in which each matrix is a block of
            comma-separated rows and blocks are separated by one empty line.

    Returns:
        list: One float array per block, in file order.
    """
    with open(csv_path, 'r') as handle:
        # Leading/trailing whitespace is dropped before the text is split
        raw_text = handle.read().strip()

    def _parse_block(block):
        # One text line per matrix row, one comma-separated field per entry
        rows = [[float(cell) for cell in line.split(',')] for line in block.split('\n')]
        return np.array(rows)

    # An empty line (two consecutive newlines) marks the matrix boundary
    return [_parse_block(block) for block in raw_text.split('\n\n')]

# Function to load ground truth
def load_ground_truth(index):
    """Load the ground-truth adjacency matrices of dataset ``index``.

    Args:
        index: Dataset number used to build the ground-truth file name.

    Returns:
        The list produced by ``ground_truth_to_matrices``, or ``None`` (after
        printing a message) when the ground-truth file does not exist.
    """
    ground_truth_path = f'data/real/fMRI/ground_truth/sim{index}_gt_processed_adj.csv'
    try:
        matrices = ground_truth_to_matrices(ground_truth_path)
    except FileNotFoundError:
        # A missing file is reported, not raised, so callers can skip the dataset
        print(f"Ground truth file not found: {ground_truth_path}")
        matrices = None
    return matrices

def save_adjacency_matrices_to_results(B_matrices, filepath):
    """Write estimated adjacency matrices to a single CSV file.

    Each matrix is written with ``np.savetxt`` (comma-delimited, three
    decimals) and consecutive matrices are separated by one empty line.
    Missing parent directories of ``filepath`` are created first, so a
    finished run is not lost because a results folder does not exist yet.

    Args:
        B_matrices: Sequence of 2-D arrays to write, in order.
        filepath: Destination file; overwritten if it already exists.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # empty when filepath is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w') as f:
        for i, B in enumerate(B_matrices):
            if i > 0:
                # Blank line between matrices, none after the last one
                f.write('\n')
            np.savetxt(f, B, delimiter=',', fmt='%.3f')

In [6]:
def _load_timeseries(index):
    """Read the time series of dataset ``index`` and drop its time-index column, if any.

    Returns:
        tuple: ``(values, columns)`` - the remaining data as an array and the
        list of the remaining column names.
    """
    frame = pd.read_csv(f'data/real/fMRI/time_series/timeseries{index}.csv')
    # At most one column is removed; 'Date' takes precedence over 'timestamp'
    for time_column in ('Date', 'timestamp'):
        if time_column in frame.columns:
            frame = frame.drop(columns=[time_column])
            break
    return frame.values, frame.columns.tolist()


def _discover_adjacency_matrices(method, data, columns):
    """Run the causal discovery routine named ``method`` and return its adjacency matrices."""
    if method == 'varlingam':
        return run_varlingam(data).adjacency_matrices_
    if method == 'pcmci':
        return run_pcmci(data, columns)
    if method == 'rcv_varlingam':
        return run_rcv_varlingam(data)
    if method == 'rcv_pcmci':
        return run_rcv_pcmci(data)
    if method == 'varlingam_bootstrap':
        return run_varlingam_bootstrap(data)
    raise ValueError(f"Unknown causal discovery method: {method!r}")


def _append_average_row(df_results):
    """Return ``df_results`` with one extra row holding 'mean ± std' of each metric."""
    avg_result = df_results.mean()
    std_result = df_results.std()

    avg_row = {}
    for column in df_results.columns:
        if column == 'dataset':
            avg_row[column] = 'average'
        elif column == 'runtime':
            # Runtime is reported as a plain mean, without the spread
            avg_row[column] = f"{avg_result[column]:.4f}"
        else:
            avg_row[column] = f"{avg_result[column]:.4f} ± {std_result[column]:.4f}"

    return pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)


def run_all_experiments(methods=None, dataset_indices=range(1, 29), skip_datasets=(4,)):
    """Run causal discovery on the fMRI datasets and save matrices and metrics.

    For every dataset with a ground-truth file and every requested method, the
    time series is preprocessed, the method is run and timed, the estimated
    adjacency matrices are written to
    ``results/real/fMRI/timeseries{index}/adj_matrices_{method}.csv`` and the
    estimate is scored with ``evaluate_causal_matrices``. Per method, all
    scores plus an average row are written to
    ``results/real/fMRI/performance_{method}.csv``.

    Args:
        methods: Method name or iterable of names out of 'varlingam', 'pcmci',
            'rcv_varlingam', 'rcv_pcmci' and 'varlingam_bootstrap'. Defaults to
            ``['varlingam_bootstrap']``, the only method the previous
            hard-coded version actually ran.
        dataset_indices: Dataset numbers to process (default 1..28).
        skip_datasets: Dataset numbers to leave out (default: dataset 4).

    Raises:
        ValueError: If ``methods`` contains an unsupported name. This is
            checked before any dataset is processed.
    """
    supported_methods = ('varlingam', 'pcmci', 'rcv_varlingam', 'rcv_pcmci', 'varlingam_bootstrap')
    if methods is None:
        methods = ['varlingam_bootstrap']
    elif isinstance(methods, str):
        methods = [methods]
    else:
        methods = list(methods)

    unknown_methods = [method for method in methods if method not in supported_methods]
    if unknown_methods:
        raise ValueError(
            f"Unknown causal discovery method(s): {unknown_methods}; "
            f"expected a subset of {list(supported_methods)}"
        )

    results = {method: [] for method in methods}

    for index in dataset_indices:
        if index in skip_datasets:
            print(f"Skipping dataset {index}")
            continue
        print(f"Processing dataset {index}")

        # Load ground truth for this dataset
        ground_truth_matrices = load_ground_truth(index)
        if ground_truth_matrices is None:
            print(f"Skipping dataset {index} due to missing ground truth")
            continue

        # The file is read once per dataset instead of once per method
        raw_values, columns = _load_timeseries(index)

        for method in methods:
            # Each method gets its own copies, so one run cannot affect the next
            data = preprocess_data(raw_values.copy(), list(columns))

            # Time only the causal discovery call itself
            start_time = time.perf_counter()
            adjacency_matrices = _discover_adjacency_matrices(method, data, columns)
            runtime = round(time.perf_counter() - start_time, 4)

            # Only the saved copy is cut down to the number of ground-truth lags
            adjacency_matrices_save = adjacency_matrices[:len(ground_truth_matrices)]
            output_file = f'results/real/fMRI/timeseries{index}/adj_matrices_{method}.csv'
            save_adjacency_matrices_to_results(adjacency_matrices_save, output_file)

            # The evaluation receives the untruncated matrices, as before.
            # NOTE(review): confirm evaluate_causal_matrices handles a differing
            # number of lag matrices.
            evaluation = evaluate_causal_matrices(ground_truth_matrices, adjacency_matrices)

            results[method].append({
                'dataset': index,
                'Frobenius': evaluation['fro'],
                'SHD': evaluation['shd'],
                'F1': evaluation['f1'],
                'F1_directed': evaluation['f1_directed'],
                'runtime': runtime
            })

    # Calculate average results and save all results for each method
    for method in methods:
        if not results[method]:
            # Every dataset was skipped; there is nothing to average or save
            print(f"No results for method {method}; performance file not written")
            continue

        df_results = _append_average_row(pd.DataFrame(results[method]))
        df_results.to_csv(f'results/real/fMRI/performance_{method}.csv', index=False)

# Run all experiments; adjacency matrices and performance CSVs are written
# under results/real/fMRI/
run_all_experiments()

Processing dataset 1
Processing dataset 2
Processing dataset 3
Skipping dataset 4
Processing dataset 5
Processing dataset 6
Processing dataset 7
Processing dataset 8
Processing dataset 9
Processing dataset 10
Processing dataset 11
Processing dataset 12
Processing dataset 13
Processing dataset 14
Processing dataset 15
Processing dataset 16
Processing dataset 17
Processing dataset 18
Processing dataset 19
Processing dataset 20
Processing dataset 21
Processing dataset 22
Processing dataset 23
Processing dataset 24
Processing dataset 25
Processing dataset 26
Processing dataset 27
Processing dataset 28
