In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
import matplotlib as mpl
import re
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import os

# Manuscript typography: apply every font-size override in a single update call
mpl.rcParams.update({
    'font.size': 20,
    'axes.labelsize': 28,
    'xtick.labelsize': 24,
    'ytick.labelsize': 24,
    'legend.fontsize': 18,
})


In [None]:

def spearman_correlation_by_classification(data: pd.DataFrame, metric1: str, metric2: str):
    """
    Spearman rank correlation between two columns, evaluated separately per river class.

    Rows are partitioned on the "Classification" column into:
      - "B":       rows whose Classification equals "B".
      - "LSW/HSW": rows whose Classification is "LSW" or "HSW".
    Rows with any other classification are ignored. Within each group, rows where
    either metric is missing are dropped before the correlation is computed.

    Parameters:
        data (pd.DataFrame): Table holding both metric columns and "Classification".
        metric1 (str): Column name of the first metric.
        metric2 (str): Column name of the second metric.

    Returns:
        dict: Keys "B" and "LSW/HSW" (in that order). Each value is the object
              returned by scipy.stats.spearmanr, which unpacks as (rho, p_value).

    Raises:
        ValueError: If either metric column or "Classification" is absent from `data`.
    """
    # Fail early when any needed column is absent
    missing = {metric1, metric2, "Classification"} - set(data.columns)
    if missing:
        raise ValueError(f"Missing required columns in DataFrame: {missing}")

    # Boolean row selectors, one per reported group
    classification = data["Classification"]
    group_masks = {
        "B": classification == "B",
        "LSW/HSW": classification.isin(["LSW", "HSW"]),
    }

    results = {}
    for label, mask in group_masks.items():
        # Keep only complete (metric1, metric2) pairs for this group
        pairs = data.loc[mask, [metric1, metric2]].dropna()
        results[label] = spearmanr(pairs[metric1], pairs[metric2])

    return results



In [None]:

# Example usage:
if __name__ == "__main__":
    # Replace the file path with your actual CSV file location.
    file_path = r"C:\Users\Feifei\Box\BR_remote_sensing\ebi_combined_statistics.csv"
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        # Stop here with exit status 1. The interactive `exit` helper is not meant
        # for program flow: it is absent when the `site` module is skipped, and
        # under IPython/Jupyter it is a shell object that may not abort the cell,
        # which would let the code below run with `df` undefined.
        raise SystemExit(1)

    # Define the two metrics you want to compare (update with actual column names from your CSV).
    metric_a = "Iw"  # Replace with your actual column name for metric 1
    metric_b = "norm_migration_rate"  # Replace with your actual column name for metric 2

    try:
        results = spearman_correlation_by_classification(df, metric_a, metric_b)
        # Each value unpacks as (rho, p_value)
        for group, (rho, p_value) in results.items():
            print(f"Group '{group}': Spearman rho = {rho}, p-value = {p_value}")
    except ValueError as ve:
        # Raised when a required column is missing from the CSV
        print(ve)

In [None]:
import os
import glob

def count_folders_with_reprojected_gpkg(base_path):
    """
    Count the immediate sub-folders of base_path that contain at least one entry
    whose name matches '*_reprojected.gpkg'.

    Only direct children of base_path are inspected; the search does not recurse
    into nested folders. Plain files directly inside base_path are ignored.

    Parameters:
        base_path (str): The path to the directory containing the river folders.

    Returns:
        int: Number of sub-folders holding a '*_reprojected.gpkg' match.

    Raises:
        OSError: Propagated from os.listdir when base_path cannot be listed
                 (for example, when it does not exist).
    """
    count = 0
    for folder in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder)
        if not os.path.isdir(folder_path):
            continue
        # Escape the directory part so glob metacharacters in folder names
        # ('[', ']', '*', '?') are matched literally instead of being treated
        # as wildcards; only the trailing file pattern is a real wildcard.
        pattern = os.path.join(glob.escape(folder_path), "*_reprojected.gpkg")
        if glob.glob(pattern):
            count += 1
    return count

# Runs only when this code executes as __main__ (true for a notebook cell or a direct script run).
if __name__ == "__main__":
    # Update the path as needed for your environment; it is passed as the base
    # directory whose immediate sub-folders are scanned.
    ebi_results_path = r"C:\Users\Feifei\Box\BR_remote_sensing\ebi_results"
    
    # Count the matching sub-folders and print the total.
    num_folders = count_folders_with_reprojected_gpkg(ebi_results_path)
    print(f"Number of folders having a '_reprojected.gpkg' file: {num_folders}")


In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score  # ← import metrics

# 1-2) Read the combined statistics table and give the slope column (whose raw
#      header carries a trailing space) a code-friendly name in one chained step
csv_path = r"C:\Users\Feifei\Box\BR_remote_sensing\ebi_combined_statistics.csv"
df = pd.read_csv(csv_path).rename(columns={'Slope (cm/km) ': 'Slope_cm_per_km'})

# 3) LaTeX axis labels keyed by column name (used for features and targets alike)
label_map = dict([
    ('dim_Q', r'$\mathit{Q}^*$'),
    ('cov_discharge_site', r'$Q_{CV}$'),
    ('Slope_cm_per_km', r'$\mathit{S}$'),
    ('mean_ebi_site', r'$\langle\overline{\mathit{eBI}}\rangle$'),
    ('std_ebi_site', r'$\langle\overline{\mathit{eBI_{std}}}\rangle$'),
    ('T_R', r'$T_{R}$'),
    ('CB/Aw', r'$CB_{norm}$'),
    ('eBI_BI_ratio_site', r'$\langle\overline{\mathit{eBI}}\rangle/\langle\overline{\mathit{BI}}\rangle$'),
    ('Iw', r'$I_{w}$'),
])

# 4) Predictor columns for each modelled target; insertion order sets the
#    order in which targets are processed downstream
features_map = dict([
    ('mean_ebi_site', ['dim_Q', 'cov_discharge_site', 'Slope_cm_per_km']),
    ('T_R', ['dim_Q', 'cov_discharge_site', 'mean_ebi_site', 'std_ebi_site']),
    ('CB/Aw', ['dim_Q', 'cov_discharge_site', 'T_R', 'mean_ebi_site', 'std_ebi_site']),
    ('eBI_BI_ratio_site', ['dim_Q', 'Slope_cm_per_km', 'Iw', 'cov_discharge_site']),
])

def fit_and_plot_rf(data, river_type, target, features,
                    outdir=r"C:\Users\Feifei\Box\BR_remote_sensing\figures"):
    """
    Fit a random forest for one target, report hold-out skill, and save a
    horizontal bar chart of the feature importances as a PDF.

    Parameters:
        data (pd.DataFrame): Rows to model; must contain `target` and every
            column listed in `features`.
        river_type (str): Label used in the console line and the figure title.
            'Braided' selects the '_B' filename suffix; any other value
            selects '_W'.
        target (str): Name of the column to predict.
        features (list[str]): Names of the predictor columns.
        outdir (str): Directory the PDF is written to; created if missing.
            Defaults to the location this function always used before.

    Returns:
        None. Side effects: prints RMSE and R² for the 30 % test split, writes
        '11_<target><suffix>.pdf' into `outdir`, and shows the figure.
    """
    # a) Prepare X, y
    X = data[features]
    y = data[target]

    # b) Train/test split (fixed seed so the split is repeatable)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # c) Fit random forest
    rf = RandomForestRegressor(n_estimators=500, random_state=42)
    rf.fit(X_train, y_train)

    # d) Compute predictions and metrics.
    # RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword
    # was deprecated in scikit-learn 1.4 and later removed, so it raises
    # TypeError on current releases. This form works on every version.
    y_pred = rf.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2   = r2_score(y_test, y_pred)

    # print to console
    print(f"{river_type} {target} — RMSE: {rmse:.3f}, R²: {r2:.3f}")

    # e) Extract importances and sort ascending so the largest bar ends up on top.
    # Columns without a LaTeX label fall back to their raw name instead of
    # raising KeyError.
    imps = rf.feature_importances_
    order = imps.argsort()
    y_labels = [label_map.get(features[i], features[i]) for i in order]

    # f) Plot feature importances
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.barh(y_labels, imps[order], align='center')
    ax.set_xlabel("Feature Importance")
    ax.set_title(f"{river_type}\n{label_map.get(target, target)}", fontsize=14)

    # annotate metrics inside the plot (bottom-right corner, axes coordinates)
    ax.text(
        0.95, 0.05,
        f"RMSE: {rmse:.2f}\n$R^2$: {r2:.2f}",
        transform=ax.transAxes,
        ha="right", va="bottom",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7)
    )

    plt.tight_layout()

    # g) Sanitize filename (targets such as 'CB/Aw' contain path separators)
    #    and choose suffix
    safe_target = re.sub(r'[<>:"/\\|?*]', '_', target)
    suffix = '_B' if river_type == 'Braided' else '_W'
    os.makedirs(outdir, exist_ok=True)
    filename = os.path.join(outdir, f"11_{safe_target}{suffix}.pdf")

    # h) Save to PDF
    fig.savefig(
        filename,
        format='pdf',
        dpi=500,
        bbox_inches='tight',
        transparent=True
    )
    plt.show()
    # Release the figure: this function is called repeatedly in a loop, and
    # not every backend closes figures after show().
    plt.close(fig)

# 5) Partition the table by river class: braided ('B') vs wandering ('LSW'/'HSW')
df_braided   = df.loc[df['Classification'].eq('B')]
df_wandering = df.loc[df['Classification'].isin(['LSW','HSW'])]

# 6) For every target, fit and plot the braided subset first, then the wandering one
for target, feats in features_map.items():
    for subset, river_type in ((df_braided, "Braided"), (df_wandering, "Wandering")):
        fit_and_plot_rf(subset, river_type, target, feats)



