# Run experiments on the FinanceCPT dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# Print the NumPy and pandas versions in use.
print([np.__version__, pd.__version__])
# Print arrays with 3 decimal places and in fixed-point (non-scientific) notation.
np.set_printoptions(precision=3, suppress=True)


from src.data_preprocessing import preprocess_data
from src.plotting import plot_heatmap
from src.causal_matrix_evaluation import evaluate_causal_matrices
from src.run_causal_discovery import run_varlingam, run_pcmci, run_varlingam_bootstrap
from src.robust_varlingam import run_rcv_varlingam
from src.robust_pcmci import run_rcv_pcmci

## Generate ground-truth adjacency matrices from the relationship files

In [None]:
def create_adjacency_matrices(input_file, num_nodes):
    """Build one binary adjacency matrix per lag from a relationships CSV.

    The file is read without a header row; its three columns are taken as
    ``cause``, ``effect`` and ``lag``. Node indices are used directly as
    array positions (no offset is applied), so they must already be
    0-based and lie in ``[0, num_nodes)``.

    Args:
        input_file: Path to the headerless ``cause,effect,lag`` CSV file.
        num_nodes: Number of variables; every returned matrix has shape
            ``(num_nodes, num_nodes)``.

    Returns:
        A tuple ``(B_matrices, max_lag)``. ``B_matrices`` is a list of
        ``max_lag + 1`` arrays in which ``B_matrices[lag][effect, cause]``
        is 1 for every listed relationship and 0 elsewhere. ``max_lag`` is
        the largest lag found in the file, as a plain ``int``.

    Raises:
        ValueError: If the file yields no rows, holds missing or
            non-numeric values, or contains a negative lag.
        IndexError: If a node index falls outside ``[0, num_nodes)``.
    """
    # Read the CSV file
    df = pd.read_csv(input_file, names=['cause', 'effect', 'lag'])

    # Cast everything to integers once. Iterating row by row would hand
    # back floats as soon as pandas infers one float column, and floats
    # cannot be used as list or array indices.
    edges = df[['cause', 'effect', 'lag']].astype(int).to_numpy()
    if edges.size == 0:
        raise ValueError(f"No relationships found in {input_file}")

    # Negative values would not fail on their own: Python/NumPy indexing
    # wraps them around and silently marks the wrong matrix or cell.
    lags = edges[:, 2]
    if lags.min() < 0:
        raise ValueError(f"Negative lag found in {input_file}")
    nodes = edges[:, :2]
    if nodes.min() < 0 or nodes.max() >= num_nodes:
        raise IndexError(
            f"Node index outside [0, {num_nodes}) found in {input_file}"
        )

    # Automatically detect the maximum lag
    max_lag = int(lags.max())

    # Initialize adjacency matrices (index 0 holds lag 0)
    B_matrices = [np.zeros((num_nodes, num_nodes)) for _ in range(max_lag + 1)]

    # Populate the adjacency matrices: rows are effects, columns are causes
    for cause, effect, lag in edges.tolist():
        B_matrices[lag][effect, cause] = 1

    print(f"Processing {input_file}")
    print(f"Max lag: {max_lag}")
    print(f"B_matrices shape: {B_matrices[0].shape}")

    return B_matrices, max_lag

def save_ground_truth(B_matrices, filepath):
    """Write a sequence of matrices to one CSV file, separated by blank lines.

    Each matrix is written with ``np.savetxt`` using comma delimiters and
    three decimal places. Consecutive matrices are separated by a single
    empty line; no empty line follows the last matrix.

    Args:
        B_matrices: Sequence of 2-D arrays to write, in order.
        filepath: Destination file. Missing parent directories are created
            so a fresh checkout does not fail with ``FileNotFoundError``.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, 'w') as f:
        for i, B in enumerate(B_matrices):
            # Separator goes before every matrix except the first, which
            # yields exactly one blank line between neighbours.
            if i > 0:
                f.write('\n')
            np.savetxt(f, B, delimiter=',', fmt='%.3f')

# Directories holding the relationship lists (input) and the generated
# adjacency-matrix files (output).
input_base_path = 'data/real/FinanceCPT/relationships/'
output_base_path = 'data/real/FinanceCPT/ground_truth/'

# Convert the relationship list of each variant, A through E, into a
# ground-truth file. Every variant is built with 25 nodes.
for letter in ('A', 'B', 'C', 'D', 'E'):
    input_file = f'{input_base_path}random-rels_20_1{letter}.csv'
    output_file = f'{output_base_path}random-rels_20_1{letter}_adj.csv'

    # The detected maximum lag is not needed here, only the matrices.
    B_matrices, _ = create_adjacency_matrices(input_file, 25)
    save_ground_truth(B_matrices, output_file)

    print(f"Saved adjacency matrices for file {letter} to {output_file}\n")

print("All files processed successfully.")

In [None]:
def ground_truth_to_matrices(csv_path):
    """Parse a file of blank-line-separated CSV matrices into NumPy arrays.

    The file content is stripped of surrounding whitespace and split on
    empty lines; each resulting chunk is read as one matrix whose rows are
    lines and whose entries are comma-separated floats.

    Args:
        csv_path: Path of the file to read.

    Returns:
        A list of 2-D float arrays, one per chunk, in file order.
    """
    with open(csv_path, 'r') as handle:
        raw_text = handle.read()

    def parse_chunk(chunk):
        # One text line per matrix row, one comma-separated field per entry.
        rows = []
        for line in chunk.split('\n'):
            rows.append([float(cell) for cell in line.split(',')])
        return np.array(rows)

    # An empty line (two consecutive newlines) marks a matrix boundary.
    return [parse_chunk(chunk) for chunk in raw_text.strip().split('\n\n')]

# Function to load ground truth
def load_ground_truth(letter):
    """Load the ground-truth adjacency matrices for one FinanceCPT variant.

    Args:
        letter: Variant letter inserted into the ground-truth file name.

    Returns:
        The list of matrices parsed by ``ground_truth_to_matrices``, or
        ``None`` (after printing a notice) when the file does not exist.
    """
    ground_truth_path = f'data/real/FinanceCPT/ground_truth/random-rels_20_1{letter}_adj.csv'
    try:
        matrices = ground_truth_to_matrices(ground_truth_path)
    except FileNotFoundError:
        # A missing file is reported and signalled with None, not raised.
        print(f"Ground truth file not found: {ground_truth_path}")
        matrices = None
    return matrices

def save_adjacency_matrices_to_results(B_matrices, filepath):
    """Save estimated adjacency matrices to a single CSV results file.

    Matrices are written one after another with ``np.savetxt`` (comma
    delimiter, three decimals). A single empty line separates consecutive
    matrices and nothing is appended after the last one.

    Args:
        B_matrices: Sequence of 2-D arrays to write, in order.
        filepath: Destination file. Its parent directories are created when
            missing, so results computed by a long run are not lost to a
            ``FileNotFoundError`` at save time.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    destination = Path(filepath)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with open(destination, 'w') as f:
        for i, B in enumerate(B_matrices):
            np.savetxt(f, B, delimiter=',', fmt='%.3f')
            # Blank line between matrices, but not after the final one.
            if i < len(B_matrices) - 1:
                f.write('\n')

In [None]:
def _load_returns(data_type):
    """Load and preprocess the returns series for one FinanceCPT variant.

    Returns a tuple ``(data, columns)``: the output of ``preprocess_data``
    and the list of variable names with any time column removed.
    """
    # The returns file must belong to the same variant as the ground truth
    # it will be scored against.
    frame = pd.read_csv(
        f'data/real/FinanceCPT/returns/random-rels_20_1{data_type}_returns30007000.csv'
    )
    columns = frame.columns.tolist()

    # Drop the first time column present; "Date" takes precedence.
    for time_column in ('Date', 'timestamp'):
        if time_column in columns:
            frame = frame.drop([time_column], axis=1)
            columns.remove(time_column)
            break

    return preprocess_data(frame.values, columns), columns


def run_experiment(data_type):
    """Run every configured causal-discovery method on one FinanceCPT variant.

    For each method the returns data is loaded and preprocessed, the method
    is run and timed, the estimated adjacency matrices are plotted and
    saved, and the evaluation metrics plus runtime are written to a
    per-method CSV. All outputs go to
    ``results/real/FinanceCPT/random-rels_20_1{data_type}/``, which is
    created if it does not exist.

    Args:
        data_type: Variant letter (e.g. ``'A'``). It selects the
            ground-truth file, the returns file and the results directory.

    Returns:
        None. If the ground-truth file is missing, a notice is printed and
        nothing is run.

    Raises:
        ValueError: If a name in the method list has no runner attached.
    """
    import os  # local import keeps this cell self-contained

    methods = ['varlingam', 'pcmci', 'rcv_varlingam', 'rcv_pcmci']#, 'varlingam_bootstrap']

    # Load ground truth for this data type
    ground_truth_matrices = load_ground_truth(data_type)
    if ground_truth_matrices is None:
        print("Skipping experiments due to missing ground truth")
        return

    # Create the results directory up front so that a long-running method
    # cannot lose its output to a missing folder at save time.
    output_dir = f'results/real/FinanceCPT/random-rels_20_1{data_type}'
    os.makedirs(output_dir, exist_ok=True)

    for method in methods:
        # Data is reloaded for every method, so each one starts from
        # freshly preprocessed input.
        data, columns = _load_returns(data_type)

        # Run causal discovery method (only this call is timed)
        start_time = time.time()

        if method == 'varlingam':
            result = run_varlingam(data)
            adjacency_matrices = result.adjacency_matrices_
        elif method == 'pcmci':
            adjacency_matrices = run_pcmci(data, columns)
        elif method == 'rcv_varlingam':
            adjacency_matrices = run_rcv_varlingam(data)
        elif method == 'rcv_pcmci':
            adjacency_matrices = run_rcv_pcmci(data)
        elif method == 'varlingam_bootstrap':
            adjacency_matrices = run_varlingam_bootstrap(data)
        else:
            # Without this, an unknown name would silently reuse the
            # previous method's matrices or fail later with a NameError.
            raise ValueError(f"Unknown method: {method}")

        end_time = time.time()
        runtime = round(end_time - start_time, 4)

        # Plot and save at most as many lag matrices as the ground truth has.
        if len(adjacency_matrices) > len(ground_truth_matrices):
            adjacency_matrices_save = adjacency_matrices[:len(ground_truth_matrices)]
        else:
            adjacency_matrices_save = adjacency_matrices
        plot_heatmap(adjacency_matrices_save, columns, title=f"Heatmap of Adjacency Matrices from {method}")
        output_file = f'{output_dir}/adj_matrices_{method}.csv'
        save_adjacency_matrices_to_results(adjacency_matrices_save, output_file)

        # Evaluate results
        # NOTE(review): the untruncated estimate is evaluated, not the
        # truncated copy saved above - confirm that
        # evaluate_causal_matrices handles differing lag counts.
        evaluation = evaluate_causal_matrices(ground_truth_matrices, adjacency_matrices)

        # Save results to CSV (a single row per method)
        df_results = pd.DataFrame([{
            'Frobenius': evaluation['fro'],
            'SHD': evaluation['shd'],
            'F1': evaluation['f1'],
            'F1_directed': evaluation['f1_directed'],
            'runtime': runtime,
        }])
        df_results.to_csv(f'{output_dir}/performance_{method}.csv', index=False)

In [None]:
# Variants for which ground truth is generated earlier in this notebook.
data_types = list('ABCDE')
# Restrict this run to variant E only; remove this line to run all variants.
data_types = ['E']

for data_type in data_types:
    print(f"Running experiments for random-rels_20_1{data_type} data...")
    run_experiment(data_type)

print("All experiments completed.")