In [1]:
import os
import pandas as pd
import numpy as np

## Compute the level of agreement between indicators

In [None]:
# Folder from which the input CSV files are read
input_directory = "path/FDextracted"

In [63]:
# Translation table: column header as written in the CSV files -> short,
# identifier-friendly method label. Insertion order is the order in which the
# methods appear in downstream tables.
method_map = dict(
    [
        ("Alencar et al.", "Alencar"),
        ("Noguera et al.", "Noguera"),
        ("Osman et al.", "Osman"),
        ("Ford and Labosier", "Ford_and_Labosier"),
        ("Pendergrass et al.", "Pendergrass"),
        ("Christian et al.", "Christian"),
    ]
)

In [64]:
# Empty accumulator; the loading loop appends one filtered DataFrame per CSV file
rows_list = list()

In [65]:
# Collect every row flagged by at least one method from each input CSV, then
# report pairwise agreement between methods.
#
# Fixes relative to the earlier version of this cell:
#   * reads from `input_directory` (the name the configuration cell defines);
#     `input_dir` was never assigned in this notebook, so a fresh
#     Restart & Run All stopped here with a NameError.
#   * resets `rows_list` locally so re-running the cell cannot stack a second
#     copy of every file on top of the first.
#   * walks the directory in sorted order, because os.listdir makes no
#     ordering promise and the row order of `results` would otherwise vary.
#   * treats "Date" as a required column, so a file without it is skipped with
#     the usual message instead of raising KeyError during column selection.
method_cols = list(method_map.keys())
methods = list(method_map.values())
required_cols = method_cols + ["Date"]

rows_list = []

for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith(".csv"):
        continue

    # Label = text after the last "_" in the file name, up to the first "."
    # (presumably the HUC4 code - confirm against the file naming scheme).
    huc4_code = filename.split('_')[-1].split('.')[0]
    df = pd.read_csv(os.path.join(input_directory, filename))

    # Skip files that do not have the expected columns
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"Skipping {filename}, missing columns: {missing_cols}")
        continue

    # Keep only rows where at least one method column equals 1
    filtered_df = df[df[method_cols].eq(1).any(axis=1)].copy()
    filtered_df["HUC4"] = huc4_code

    # Rename for standardization
    filtered_df = filtered_df.rename(columns=method_map)
    rows_list.append(filtered_df[["Date", "HUC4"] + methods])

# Combine all results
if rows_list:
    results = pd.concat(rows_list, ignore_index=True)

    # Boolean "method flagged this row" table and per-method counts, computed
    # once instead of inside the pair loop.
    detected = results[methods].eq(1)
    event_counts = detected.sum()

    # Entry [m1, m2] is the percentage of rows flagged by m1 that m2 flagged
    # too. The denominator is m1's count, so the matrix is asymmetric: it is a
    # conditional overlap, not a statistical correlation. The variable name and
    # printed title are left unchanged so existing references and recorded
    # output still match.
    correlation_matrix = pd.DataFrame(index=methods, columns=methods, dtype=float)

    for m1 in methods:
        m1_total = event_counts[m1]
        for m2 in methods:
            if m1 == m2:
                correlation_matrix.loc[m1, m2] = 100.0
            elif m1_total:
                both_agree = (detected[m1] & detected[m2]).sum()
                correlation_matrix.loc[m1, m2] = both_agree / m1_total * 100
            else:
                # m1 never flagged anything: no denominator, report 0
                correlation_matrix.loc[m1, m2] = 0.0

    print("Agreement correlation matrix:")
    print(correlation_matrix.round(1))
else:
    print("No valid data found in the input directory.")

Skipping huc4_event_summary_by_method_threshold.csv, missing columns: ['Alencar et al.', 'Noguera et al.', 'Osman et al.', 'Ford and Labosier', 'Pendergrass et al.', 'Christian et al.']
Agreement correlation matrix:
                   Alencar  Noguera  Osman  Ford_and_Labosier  Pendergrass  \
Alencar              100.0     49.2   65.4               49.0         27.1   
Noguera               35.0    100.0   51.7               33.0         24.2   
Osman                 20.6     22.9  100.0               39.5         13.8   
Ford_and_Labosier     27.4     26.0   70.3              100.0         15.9   
Pendergrass           29.3     36.8   47.4               30.8        100.0   
Christian             15.2     18.4   35.5               20.8         11.7   

                   Christian  
Alencar                 22.2  
Noguera                 19.2  
Osman                   16.3  
Ford_and_Labosier       17.1  
Pendergrass             18.5  
Christian              100.0  
