# Orbit Gauge Analysis

In [None]:

# Credit to Nikki Tebaldi, https://podaac.github.io/tutorials/notebooks/datasets/SWOT_L4_DAWG_SOS_DISCHARGE.html#plot-discharge-timeseries
import datetime
import time
import pathlib
import os,sys
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import netCDF4 as nc
import numpy as np
import pandas as pd
import cartopy
import geopandas as gpd
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import folium
import requests
from io import StringIO
import h5py
import math
import zipfile
from datetime import timedelta
import json
import seaborn as sns 
import warnings
from tqdm import tqdm 
from itertools import islice
from scipy.stats import pearsonr
from itertools import combinations
from scipy.stats import genextreme
from scipy.stats import ttest_1samp
from scipy.stats import ttest_rel
from scipy import stats
from scipy.stats import median_abs_deviation, shapiro, ttest_1samp, wilcoxon

#Specialized Functions
from swotOrbitFunctions_gauged import *


# Notebook-wide matplotlib defaults: serif text, large fonts, dashed grid.
plt.rcParams.update({
    'font.family': 'serif',
    'axes.titlesize': 30,      # title font size
    'axes.labelsize': 22,      # axis label font size
    'xtick.labelsize': 20,     # x tick label font size
    'ytick.labelsize': 20,     # y tick label font size
    'legend.fontsize': 20,     # legend font size
    'font.size': 20,           # default text font size
    'figure.titlesize': 24,    # figure-level title font size
    'lines.linewidth': 2,      # plotted line width
    'axes.linewidth': 2,       # axes frame width
    'axes.grid': True,         # draw a grid by default
    'grid.linestyle': '--',    # dashed grid lines
    'grid.alpha': 0.5,         # grid transparency
    'figure.figsize': (10, 6)  # default (width, height) in inches
})

# Colors keyed by the values of the 'algo' column.
# 'neobam' and 'geobam' intentionally share one color: 'geobam' rows are
# relabelled to 'neobam' when the data are loaded.
color_dict = {
    'sic4dvar': 'green',
    'momma': 'blue',
    'neobam': 'purple',
    'consensus': 'sienna',
    'metroman': 'orange',
    'geobam': 'purple',
    'hivdi': 'deeppink',
    'sad': 'tomato',
    'gauge': 'dimgrey',
    'gauge_swot_match': 'silver'
}

# Colors keyed by run / orbit label (the keys of the dfs_* dictionaries).
color_dict_runCombos = {
    'Continuous': '#D83A34',
    'Fast': '#FD8500',
    'Science': '#2B9EB3',
    'Sampled': '#6610F2',
    'gauge': 'black',
    'Continuous-FSO': '#FFEE32',
    'Continuous-SO': 'blue',
}

# Observations before this date go to the "fast" split and observations on
# or after it to the "science" split of the Continuous run.
divide_date = pd.Timestamp('2023-07-11')


In [None]:
# CALL Gauge Matched

# Load one gauge-matched table per run; exact duplicate rows and rows
# without a 'time' value are discarded.
runA_perm_gauge = pd.read_csv('/all_gauge_a').drop_duplicates().dropna(subset='time')
runB_perm_gauge = pd.read_csv('/all_gauge_b').drop_duplicates().dropna(subset='time')
runC_perm_gauge = pd.read_csv('/all_gauge_c').drop_duplicates().dropna(subset='time')
runE_perm_gauge = pd.read_csv('/all_gauge_e').drop_duplicates().dropna(subset='time')

# Runs A (Fast) and E (Sampled) keep only observations before divide_date.
# .copy() turns each boolean-indexed subset into an independent frame so the
# column assignment in the loop below does not write into a slice.
runE_perm_gauge = runE_perm_gauge[pd.to_datetime(runE_perm_gauge['time']) < divide_date].copy()
runA_perm_gauge = runA_perm_gauge[pd.to_datetime(runA_perm_gauge['time']) < divide_date].copy()

# Add continuous orbit split: before vs. on/after divide_date
runC_perm_gauge_fast = runC_perm_gauge[pd.to_datetime(runC_perm_gauge['time']) < divide_date].copy()
runC_perm_gauge_science = runC_perm_gauge[pd.to_datetime(runC_perm_gauge['time']) >= divide_date].copy()

dfs_gauge = {
    'Fast': runA_perm_gauge,
    'Science': runB_perm_gauge,
    'Continuous': runC_perm_gauge,
    'Sampled': runE_perm_gauge,
    'Continuous-FSO': runC_perm_gauge_fast,
    'Continuous-SO': runC_perm_gauge_science
}

for label, df in dfs_gauge.items():
    # Treat 'geobam' rows as 'neobam' so both names form one algorithm.
    df['algo'] = df['algo'].replace({'geobam': 'neobam'})
    df = calc_cons(df=df)

    # NOTE(review): this loop used to build a subset of `df` restricted to
    # reaches that have a 'gauge' row with non-null 'gauge_discharge', but it
    # assigned that subset to `runE_perm_gauge` (overwriting the Sampled frame
    # on every iteration) and never stored it in dfs_gauge. The stray
    # assignment is removed; stored frames stay unfiltered exactly as before.
    # If the filter is meant to apply, enable:
    #     valid_reaches = df.loc[(df['algo'] == 'gauge') & (df['gauge_discharge'].notna()), 'reach_id'].unique()
    #     df = df[df['reach_id'].isin(valid_reaches)]

    # Replacing the value of an existing key while iterating is safe
    # (the dictionary size does not change).
    dfs_gauge[label] = df

### CV Filtering Analysis

In [None]:
# INITIAL CV - build the per-run tables used to preview the proposed CV filter.
# Disabled alternative that ran calculate_metrics on each frame of `filtered_dfs`:
# dfs_gauge_swot_metrics = {}
# for label, df in filtered_dfs.items():
#     df = calculate_metrics(df=df, reaches=list(df.reach_id.unique()))
#     dfs_gauge_swot_metrics[label] = df 
#     print('done: ', label)

# NOTE(review): `dfs_gauge_swot` is not assigned in any visible cell, so this
# fails on a fresh kernel unless the star import provides it. Presumably it is
# a dict of per-run DataFrames like `dfs_gauge` - confirm and define it here.
# append_RMD / append_coeffVar each take and return the dict; presumably they
# add the RMD and CV columns that a later cell drops - TODO confirm.
dfs_gauge_swot_metrics =append_RMD(dfs_q=dfs_gauge_swot)
dfs_gauge_swot_metrics = append_coeffVar(dfs_q=dfs_gauge_swot_metrics)

In [None]:
# CV figures

# Two candidate CV cutoffs. Later cells index `rejected_data` with the keys
# f'CV > {algo_threshold}' and f'CV > {algo_threshold2}'.
algo_threshold=0.25
algo_threshold2=0.4

# Uses the Continuous run and the 'consensus' algorithm only. Returns a
# summary table and a mapping of the rows rejected under each cutoff.
summary_df, rejected_data = plot_metric_cdfs_with_filters(
    df=dfs_gauge_swot_metrics['Continuous'],
    algo='consensus',
    algo_threshold=algo_threshold,
    algo_threshold2=algo_threshold2
)

# Display the summary table as the cell output.
summary_df

In [None]:
# Rejected reach ID analysis
# One reach-count plot per CV cutoff, always compared against the full
# Continuous table; the gauge and consensus series are excluded.
for cv_cutoff in (algo_threshold, algo_threshold2):
    cv_filter_label = f'CV > {cv_cutoff}'
    plot_reach_id_counts_by_algo(
        df_orig=dfs_gauge_swot_metrics['Continuous'],
        df=rejected_data[cv_filter_label],
        filter_condition=cv_filter_label,
        color_dict=color_dict,
        exclude_algos=['gauge', 'consensus'],
    )



In [None]:
# Rejected reach hydrographs
# Plots the rows rejected by the first cutoff (CV > algo_threshold).
# The helper is assumed to need discharge and metric columns in that table - TODO confirm.
plot_discharge_with_metrics(df_metrics=rejected_data[f'CV > {algo_threshold}'], divide_date=pd.to_datetime('2023-07-11'), color_dict=color_dict)


In [None]:

# NOW DO CV FILTERING ON ACTUAL DATA
# Re-applies the geobam -> neobam relabel and calc_cons to every frame in
# dfs_gauge, writing each result back under the same key.
for label, df in dfs_gauge.items():
    df['algo'] = df['algo'].replace({'geobam': 'neobam'})
    df = calc_cons(df=df)
    dfs_gauge[label] = df 
# Sanity check: list the algorithm labels present in the Science run.
print(dfs_gauge['Science'].algo.unique())

# First pass of the RMD / CV helpers, feeding the threshold step below.
dfs_gauge =append_RMD(dfs_q=dfs_gauge)
dfs_gauge = append_coeffVar(dfs_q=dfs_gauge)

# 0.25 equals `algo_threshold` from the CV figures cell.
# NOTE(review): which side of the threshold is removed is decided inside the helper - confirm.
dfs_gauge = remove_low_cv_and_recalc_consensus(dfs_gauge, CV_thresh = 0.25)

# Drop the first-pass columns; presumably so the second pass below
# recomputes them on the filtered data - TODO confirm.
for label, df in dfs_gauge.items():
        df = df.drop(columns=['CV', 'CV_cons', 'CV_gauge','RMD_cons'])
        dfs_gauge[label] = df  # Save the modified DataFrame back
        
# Second pass of the RMD / CV helpers on the filtered frames.
dfs_gauge =append_RMD(dfs_q=dfs_gauge)
dfs_gauge = append_coeffVar(dfs_q=dfs_gauge)

# Prints the whole Science frame (pandas truncates long output).
print(dfs_gauge['Science'])

# Replace each frame with the output of calculate_metrics, run over all of
# that frame's reach_ids.
for label, df in dfs_gauge.items():
    df = calculate_metrics(df=df, reaches=list(df.reach_id.unique()))
    dfs_gauge[label] = df
    print('done calculating metrics for :', label)

In [None]:
# CALL Gauge full range
# Load one full-range gauge table per run; exact duplicate rows and rows
# without a 'time' value are discarded.
runA_perm_fullRange = pd.read_csv('/all_gauge_fullRange_a.csv').drop_duplicates().dropna(subset='time')
runB_perm_fullRange = pd.read_csv('/all_gauge_fullRange_b.csv').drop_duplicates().dropna(subset='time')
runC_perm_fullRange = pd.read_csv('/all_gauge_fullRange_c.csv').drop_duplicates().dropna(subset='time')
runE_perm_fullRange = pd.read_csv('/all_gauge_fullRange_e.csv').drop_duplicates().dropna(subset='time')

# Runs A (Fast) and E (Sampled) keep only observations before divide_date.
# .copy() turns each boolean-indexed subset into an independent frame so the
# column assignment in the loop below does not write into a slice.
runA_perm_fullRange = runA_perm_fullRange[pd.to_datetime(runA_perm_fullRange['time']) < divide_date].copy()
runE_perm_fullRange = runE_perm_fullRange[pd.to_datetime(runE_perm_fullRange['time']) < divide_date].copy()

# Add continuous orbit split: before vs. on/after divide_date
runC_perm_fullRange_fast = runC_perm_fullRange[pd.to_datetime(runC_perm_fullRange['time']) < divide_date].copy()
runC_perm_fullRange_science = runC_perm_fullRange[pd.to_datetime(runC_perm_fullRange['time']) >= divide_date].copy()

dfs_gauge_fullRange = {
    'Fast': runA_perm_fullRange,
    'Science': runB_perm_fullRange,
    'Continuous': runC_perm_fullRange,
    'Sampled': runE_perm_fullRange,
    'Continuous-FSO': runC_perm_fullRange_fast,
    'Continuous-SO': runC_perm_fullRange_science
}

for label, df in dfs_gauge_fullRange.items():
    # Treat 'geobam' rows as 'neobam' so both names form one algorithm.
    df['algo'] = df['algo'].replace({'geobam': 'neobam'})
    df = calc_cons(df=df)

    # NOTE(review): this loop used to build a subset of `df` restricted to
    # reaches that have a 'gauge' row with non-null 'gauge_discharge', but it
    # assigned that subset to `runE_perm_gauge` (overwriting the gauge-matched
    # Sampled frame with full-range data) and never stored it in
    # dfs_gauge_fullRange. The stray assignment is removed; stored frames stay
    # unfiltered exactly as before. If the filter is meant to apply, enable:
    #     valid_reaches = df.loc[(df['algo'] == 'gauge') & (df['gauge_discharge'].notna()), 'reach_id'].unique()
    #     df = df[df['reach_id'].isin(valid_reaches)]

    dfs_gauge_fullRange[label] = df

In [None]:

# Same CV-filter sequence as for the gauge-matched data, applied to the
# full-range frames: first pass of the RMD / CV helpers ...
dfs_gauge_fullRange = append_RMD(dfs_q=dfs_gauge_fullRange)
dfs_gauge_fullRange = append_coeffVar(dfs_q=dfs_gauge_fullRange)
    

# ... threshold step (0.25 equals `algo_threshold` from the CV figures cell;
# which side is removed is decided inside the helper - confirm) ...
dfs_gauge_fullRange = remove_low_cv_and_recalc_consensus(dfs_gauge_fullRange, CV_thresh = 0.25)

# ... drop the first-pass columns; presumably so they are recomputed below - TODO confirm.
for label, df in dfs_gauge_fullRange.items():
        df = df.drop(columns=['CV', 'CV_cons', 'CV_gauge','RMD_cons'])
        dfs_gauge_fullRange[label] = df  # Save the modified DataFrame back


# Second pass of the RMD / CV helpers on the filtered frames.
dfs_gauge_fullRange =append_RMD(dfs_q=dfs_gauge_fullRange)
dfs_gauge_fullRange = append_coeffVar(dfs_q=dfs_gauge_fullRange)

# Prints the whole Fast frame (pandas truncates long output).
print(dfs_gauge_fullRange['Fast'])

# Replace each frame with the output of calculate_metrics, run over all of
# that frame's reach_ids.
for label, df in dfs_gauge_fullRange.items():
    df = calculate_metrics(df=df, reaches=list(df.reach_id.unique()))
    dfs_gauge_fullRange[label] = df
    print('done calculating metrics for :', label)

# Copy the gauge rows of the gauge-matched data into the full-range frames
# under the label 'gauge_swot_match'.
# Side effects: (1) the relabel is written into the dfs_gauge frames in place,
# so from here on dfs_gauge has 'gauge_swot_match' rows and no 'gauge' rows;
# (2) re-running this cell appends the same rows again.
for label, df in dfs_gauge.items():
    # Relabel the 'gauge' rows in place (earlier variant kept for reference):
    #df['algo'] = df['algo'].replace('gauge', 'gauge_swot_match')
    df.loc[df['algo'] == 'gauge', 'algo'] = 'gauge_swot_match'
    gauge_rows = df[df['algo'] == 'gauge_swot_match'].copy()
    
    # Append the relabelled rows to the full-range frame with the same run label.
    dfs_gauge_fullRange[label] = pd.concat([dfs_gauge_fullRange[label], gauge_rows], ignore_index=True)


In [None]:
#SAVE AS NEEDED
# pd.DataFrame(dfs_gauge_fullRange['Fast']).to_csv('//data/abcd_perm/dfs_gauge_fullRange_a_perm_filtered.csv', index=False)
# pd.DataFrame(dfs_gauge_fullRange['Science']).to_csv('//data/abcd_perm/dfs_gauge_fullRange_b_perm_filtered.csv', index=False)
# pd.DataFrame(dfs_gauge_fullRange['Continuous']).to_csv('//data/abcd_perm/dfs_gauge_fullRange_c_perm_filtered.csv', index=False)
# pd.DataFrame(dfs_gauge_fullRange['Sampled']).to_csv('//data/abcd_perm/dfs_gauge_fullRange_e_perm_filtered.csv', index=False)


# pd.DataFrame(dfs_gauge['Fast']).to_csv('//data/abcd_perm/dfs_gauge_a_perm_filtered.csv', index=False)
# pd.DataFrame(dfs_gauge['Science']).to_csv('//data/abcd_perm/dfs_gauge_b_perm_filtered.csv', index=False)
# pd.DataFrame(dfs_gauge['Continuous']).to_csv('//data/abcd_perm/dfs_gauge_c_perm_filtered.csv', index=False)
# pd.DataFrame(dfs_gauge['Sampled']).to_csv('//data/abcd_perm/dfs_gauge_e_perm_filtered.csv', index=False)


# Hydrographs

In [None]:
# Plot quad plot of reach IDs

# The four reaches drawn in the quad figure.
# (reach_id 74294400031 was left disabled in the list.)
quad_reach_ids = [78100600061, 23214100051, 81390400111, 74294400231]

# Series labels passed to the plotting helper. Defined in this cell because
# the call below previously used `labels` before any cell had assigned it,
# which raised NameError on a fresh kernel run top to bottom.
labels = ['Fast', 'Science', 'Continuous', 'Sampled', 'gauge']

# Colors keyed by run label, plus the gauge series.
color_dict_timeseries = {
    'Continuous': '#D83A34',
    'Fast': '#FD8500',
    'Science': '#2B9EB3',
    'Sampled': '#6610F2',
    'gauge': 'black'
}

plot_quad_consensus(
    dfs_dict=dfs_gauge_fullRange,
    labels=labels,
    divide_date=pd.to_datetime('2023-07-11'),
    color_dict=color_dict_timeseries,
    algo='consensus',
    reach_ids=quad_reach_ids
)


In [None]:
# Plot all hydrographs; edit the function to restrict it to certain reach IDs

# Series labels passed to the plotting helper.
labels = ['Fast', 'Science', 'Continuous', 'Sampled', 'gauge']

# Colors keyed by run label, plus the gauge series.
color_dict_timeseries = {
    'Continuous': '#D83A34',
    'Fast': '#FD8500',
    'Science': '#2B9EB3',
    'Sampled': '#6610F2',
    'gauge': 'black'
}

# Consensus discharge from the full-range frames; the date passed here is
# the same value as the notebook-level `divide_date`.
plot_consensus_from_multiple_dfs(
    dfs_dict=dfs_gauge_fullRange,
    labels=labels,
    divide_date=pd.to_datetime('2023-07-11'),
    color_dict=color_dict_timeseries,
    algo='consensus'
)

In [None]:
# Print statistics for dfs_gauge

# 1. Collect the reach_ids present in each run we want to compare
runs_to_compare = ['Fast', 'Continuous', 'Science']
reach_ids_by_run = {}

for label in runs_to_compare:
    if label not in dfs_gauge:
        continue
    reach_ids_by_run[label] = set(dfs_gauge[label]['reach_id'].unique())
    print(f"Total gauges in {label}: {len(reach_ids_by_run[label])}")

# Reaches shared by all three runs (only computed when all three were found)
if len(reach_ids_by_run) == 3:
    common_reach_ids = set.intersection(
        reach_ids_by_run['Fast'],
        reach_ids_by_run['Science'],
        reach_ids_by_run['Continuous'],
    )
    print(f"\nGauges that overlap Fast/Science/Continuous: {len(common_reach_ids)}")

# 2. Total reach_ids in the Sampled run, when that run is present
if 'Sampled' in dfs_gauge:
    sampled_reach_ids = set(dfs_gauge['Sampled']['reach_id'].unique())
    print(f"Total gauges in Sampled: {len(sampled_reach_ids)}")

# 3. Reaches of the Continuous run with at least one row before 2023-07-12
if 'Continuous' in dfs_gauge:
    cutoff_date = pd.to_datetime('2023-07-12')
    # Work on a copy so the datetime conversion does not touch dfs_gauge.
    df_cont = dfs_gauge['Continuous'].copy()
    df_cont['time'] = pd.to_datetime(df_cont['time'])

    is_before_cutoff = df_cont['time'] < cutoff_date
    gauges_before_cutoff = df_cont[is_before_cutoff]['reach_id'].unique()
    print(f"\nGauges with data before 2023-07-12 in Continuous orbit: {len(gauges_before_cutoff)}")

In [None]:
#Get the gauge match reaches and lat/lon for plotting
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from pathlib import Path
import glob

# Colors keyed by run / orbit label. Same keys and values as the
# color_dict_runCombos defined in the notebook's first code cell.
color_dict_runCombos = {
    'Continuous': '#D83A34',
    'Fast': '#FD8500',
    'Science': '#2B9EB3',
    'Sampled': '#6610F2',
    'gauge': 'black',
    'Continuous-FSO': '#FFEE32',
    'Continuous-SO': 'blue',
}

# Read all SWORD v16 geopackage files
sword_path = '//SWORDv16/gpkg/'
# Sorted so the combined table has the same row order on every run
# (glob returns matches in arbitrary order).
sword_files = sorted(glob.glob(f'{sword_path}*_sword_reaches_v16.gpkg'))

print(f"Found {len(sword_files)} SWORD geopackage files")

# Fail early with the search pattern in the message; otherwise pd.concat
# below raises a generic "No objects to concatenate" error.
if not sword_files:
    raise FileNotFoundError(
        f"No SWORD reach geopackages matched '{sword_path}*_sword_reaches_v16.gpkg'"
    )

# Read and combine all SWORD reach data
sword_dfs = []
for sword_file in sword_files:
    print(f"Reading {Path(sword_file).name}...")
    gdf = gpd.read_file(sword_file)
    # Keep only the reach id, the x / y coordinate columns and the geometry
    gdf = gdf[['reach_id', 'x', 'y', 'geometry']].copy()
    sword_dfs.append(gdf)

# Combine all SWORD data
sword_reaches = pd.concat(sword_dfs, ignore_index=True)
# Integer ids on both sides so the isin() match below compares like with like.
sword_reaches['reach_id'] = sword_reaches['reach_id'].astype(int)
print(f"Total SWORD reaches loaded: {len(sword_reaches)}")

# For every run in dfs_gauge, select the SWORD rows of its gauged reaches
dfs_gauge_geo = {}

for run_type, df in dfs_gauge.items():
    # Note: this converts the column in the dfs_gauge frame itself.
    df['reach_id'] = df['reach_id'].astype(int)

    # Reaches that have at least one 'consensus' row in this run
    gauged_reaches = df[df['algo'] == 'consensus']['reach_id'].unique()
    print(f"\n{run_type}: {len(gauged_reaches)} unique reach_ids")

    # Keep the SWORD rows for those reaches and tag them with the run label
    df_geo = sword_reaches[sword_reaches['reach_id'].isin(gauged_reaches)].copy()
    df_geo['run_type'] = run_type

    dfs_gauge_geo[run_type] = df_geo
    print(f"{run_type}: {len(df_geo)} gauged reaches matched in SWORD")

def map_gauges_by_orbit(dict_of_dfs, scaling, color_dict, algo_name, common_reach_ids):
    """
    Draw one global map of gauged reach locations per run type.

    For every run except 'Continuous-SO' and 'Continuous-FSO' that has at
    least one row, the full table is first written to CSV, then only the
    reaches listed in ``common_reach_ids`` are plotted and the figure is
    saved as PNG and shown.

    Parameters:
        dict_of_dfs (dict): run type -> DataFrame with 'reach_id', 'x', 'y' columns
        scaling (str): accepted for call compatibility; not used in the body
        color_dict (dict): run type -> marker color ('gray' when a run is missing)
        algo_name (str): algorithm name used in the PNG file name
        common_reach_ids: collection of reach_ids to keep on the map

    Returns:
        None
    """
    skipped_runs = ['Continuous-SO', 'Continuous-FSO']

    # Background layers, added in this order with these keyword arguments.
    map_layers = (
        (cfeature.LAND, dict(facecolor='lightgray', alpha=0.3)),
        (cfeature.OCEAN, dict(facecolor='lightblue', alpha=0.3)),
        (cfeature.COASTLINE, dict(linewidth=0.5)),
        (cfeature.BORDERS, dict(linewidth=0.5, linestyle=':')),
        (cfeature.RIVERS, dict(alpha=0.5)),
    )

    for run_type, df_geo in dict_of_dfs.items():
        # Guard clauses: skip the split runs and runs without any rows.
        if run_type in skipped_runs:
            continue
        if len(df_geo) == 0:
            continue

        # The CSV holds every reach of the run; the map only the common ones.
        df_geo.to_csv(f'//figs/gauge_locations_{run_type}_consensus.csv')
        shared_geo = df_geo[df_geo['reach_id'].isin(common_reach_ids)]

        fig = plt.figure(figsize=(16, 10))
        ax = plt.axes(projection=ccrs.PlateCarree())

        for layer, layer_style in map_layers:
            ax.add_feature(layer, **layer_style)

        ax.scatter(
            shared_geo['x'],
            shared_geo['y'],
            c=color_dict.get(run_type, 'gray'),
            s=50,
            alpha=0.5,
            label=run_type,
            transform=ccrs.PlateCarree(),
            edgecolors='black',
            linewidths=0.5,
        )

        ax.set_extent([-180, 180, -60, 75], crs=ccrs.PlateCarree())
        ax.gridlines(draw_labels=True, dms=True, x_inline=False, y_inline=False, alpha=0.3)

        plt.legend(loc='lower left', fontsize=12, framealpha=0.9)
        plt.title(f'Gauged Reach Locations for {run_type}', fontsize=16, pad=20)

        plt.tight_layout()
        plt.savefig(
            f'//figs/gauge_locations_{run_type}_{algo_name}.png',
            dpi=300,
            bbox_inches='tight',
        )
        plt.show()
        plt.close()

# Call the mapping function
# `common_reach_ids` must already exist: the statistics cell above assigns it
# only when Fast, Science and Continuous are all present in dfs_gauge.
map_gauges_by_orbit(
    dict_of_dfs=dfs_gauge_geo,
    scaling='gauge',
    color_dict=color_dict_runCombos,
    algo_name='consensus',
    common_reach_ids=common_reach_ids
)

## Priors

In [None]:
# Load one prior-discharge table per run (file suffix a / b / c / e).
priors_a = pd.read_csv('/prior_q_a_fsReaches.csv')
priors_b = pd.read_csv('/prior_q_b_fsReaches.csv')
priors_c = pd.read_csv('/prior_q_c_fsReaches.csv')
priors_e = pd.read_csv('/prior_q_e_fsReaches.csv')



# Keyed by the same run labels as dfs_gauge (without the Continuous splits).
dfs_priors = {
    'Fast': priors_a,
    'Science': priors_b,
    'Continuous': priors_c,
    'Sampled': priors_e,

}



In [None]:
# Match gauge data to prior data
dfs_gauge_priors = match_priors_to_gauge_data(dfs_gauge, dfs_priors)

# Columns previewed for each run when all of them are present.
preview_columns = [
    'reach_id',
    'time',
    'month',
    'matched_model_monthly_q',
    'matched_gauge_monthly_q',
    'matched_gauge_CAL_flag',
]

# Verify the results: list every frame's columns and show a short preview.
for orbit_type, df in dfs_gauge_priors.items():
    print(f"\n{orbit_type} DataFrame:")
    print(df.columns)

    # A missing preview column raises KeyError; report it and move on.
    try:
        preview = df[preview_columns].head()
    except KeyError:
        print("Unable to print selected columns")
    else:
        print(preview)

In [None]:
# Add continuous orbit split: rows before divide_date vs. on/after it
continuous_priors = dfs_gauge_priors['Continuous']
continuous_times = pd.to_datetime(continuous_priors['time'])
priors_c_fast = continuous_priors[continuous_times < divide_date]
priors_c_science = continuous_priors[continuous_times >= divide_date]

# Rebuild the dict: the four existing runs in a fixed order, then the splits.
dfs_gauge_priors = {
    **{run: dfs_gauge_priors[run] for run in ('Fast', 'Science', 'Continuous', 'Sampled')},
    'Continuous-FSO': priors_c_fast,
    'Continuous-SO': priors_c_science,
}


In [None]:
# Plot the monthly prior and gauge/consensus discharge
# Colors come from the per-algorithm color_dict; num_reaches=5 is passed to the helper.
plot_multiple_reaches(dfs_gauge_priors, color_dict, num_reaches=5)



In [None]:
# For each run, append the matched monthly prior as an extra pseudo-algorithm
# ('monthly_priors') so it can be handled like any other 'algo' series.
dfs_gauge_priors_pivoted = {}
for label, df in dfs_gauge_priors.items():
    # Work on a copy so the frames in dfs_gauge_priors are left untouched.
    df = df.copy()
    df['time'] = pd.to_datetime(df['time'])
    # Keep rows with 1 < Q < 1e7 (rows with missing Q fail both comparisons).
    df = df[(df['Q'] > 1) & (df['Q'] < 1e7)]

    # Print columns to debug
    print(f"Columns in {label}:", list(df.columns))

    # One prior row per consensus row. Filter first and copy afterwards so the
    # assignments below act on an independent frame instead of a slice
    # (copying before filtering triggered chained-assignment warnings).
    model_df = df[df['algo'] == 'consensus'].copy()
    model_df['algo'] = 'monthly_priors'

    # The prior value takes the place of Q in these rows.
    model_df.loc[:, 'Q'] = model_df['matched_model_monthly_q']

    # Combine original and prior rows
    full_df = pd.concat([df, model_df], ignore_index=True)
    dfs_gauge_priors_pivoted[label] = full_df

In [None]:


# Compare metric distributions across runs for the consensus algorithm,
# with the prior-matched frames passed alongside.
# Note: this rebinds `common_reach_ids` to the helper's return value,
# replacing the set computed in the statistics cell.
common_reach_ids = plot_algorithm_metric_cdfs_from_dict(
    dict_of_dfs=dfs_gauge,
    scaling='gauge',
    color_dict=color_dict_runCombos,
    algo_name='consensus',
    dfs_gauge_priors=dfs_gauge_priors 
)

In [None]:
# Seasonal log ratio by orbit: compare priors to gauge
# The three *_algo arguments are values of the 'algo' column: 'monthly_priors'
# is added by the pivot cell, 'gauge_swot_match' by the full-range cell.
plot_seasonal_log_ratio_by_orbit(
    dfs_dict=dfs_gauge_priors_pivoted,
    consensus_algo='consensus',
    gauge_algo='gauge_swot_match',
    monthly_prior_algo='monthly_priors',
    output_dir='//figs/'
)


In [None]:
# Seasonal log ratio by continent: compare priors to gauge
# Same inputs as the by-orbit call; only the helper differs.

plot_seasonal_log_ratio_by_continent(
    dfs_dict=dfs_gauge_priors_pivoted,
    consensus_algo='consensus',
    gauge_algo='gauge_swot_match',
    monthly_prior_algo='monthly_priors',
    output_dir='//figs/'
)

### Comparisons and Stats

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp, wilcoxon, shapiro

# Grouped boxplot comparing prior/gauge by orbit
# Runs on the gauge-matched frames for the consensus algorithm; colors are keyed by run label.

plot_grouped_boxplot_differences(
    dict_of_dfs=dfs_gauge,
    algo_name='consensus',
    color_dict=color_dict_runCombos,
    plot=True
)



## Region

In [None]:
# Grouped boxplot comparing prior/gauge by continent 

# By reach: runs on the gauge-matched frames for the consensus algorithm.


plot_grouped_boxplot_runs_by_reach(dfs_dict=dfs_gauge, consensus_algo='consensus')


In [None]:
# Quantile POint performance 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error



# Quantile point performance on the full-range frames, comparing the
# 'consensus' rows against the 'gauge' rows.
results_by_df = plot_nd_scatter_by_quantile_per_df(
    dfs_dict=dfs_gauge_fullRange,
    consensus_algo='consensus',
    gauge_algo='gauge'
)

all_tables = []

for orbit_name, res in results_by_df.items():
    # The Continuous splits are left out of the summary table.
    if orbit_name in ['Continuous-SO', 'Continuous-FSO']:
        continue

    # res['metrics'] is transposed so each of its keys becomes one row.
    metrics = res['metrics']
    metrics_df = pd.DataFrame(metrics).T

    # Add orbit name as the first column
    metrics_df.insert(0, "Orbit", orbit_name)

    # Move the row labels into a 'Quantile' column
    metrics_df = metrics_df.reset_index().rename(columns={'index': 'Quantile'})

    all_tables.append(metrics_df)

# Combine all into one table
final_table = pd.concat(all_tables, ignore_index=True)

# Sort by Quantile, then by Orbit. The second key makes the row order within
# a quantile deterministic; sorting on 'Quantile' alone left it unspecified.
final_table = final_table.sort_values(by=["Quantile", "Orbit"]).reset_index(drop=True)

final_table

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from scipy.stats import wilcoxon, binomtest

    
# Algo differences by continent w/stats
# Runs on the gauge-matched frames; colors are keyed by algorithm (color_dict).
            
plot_selected_algorithm_differences_grouped_by_continent(
    dict_of_dfs=dfs_gauge,
    algo_name='consensus',
    color_dict=color_dict,
    plot=True
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import cm  # old-style import for compatibility


# Per-reach metric-difference heatmaps for the consensus algorithm,
# built from the gauge-matched frames.
plot_per_reach_metric_difference_heatmaps_switched_axes(
    dict_of_dfs=dfs_gauge,
    algo_name='consensus'
)

## Season

In [None]:
# REACH
# Uses 'gauge_swot_match' as the gauge label because the full-range cell
# relabels the 'gauge' rows of dfs_gauge in place.

plot_orbitwise_log_ratio_vs_gauge_by_season(dfs_dict=dfs_gauge, consensus_label='consensus', gauge_label='gauge_swot_match', plot=True)

# Orbit Regime Characteristics Gauge Summary

In [None]:
#Regime
# One regime summary table per 'algo' label, each written to its own CSV.


def _regime_summary(algo_name):
    """Return (per-run summaries, combined table) for one 'algo' label.

    The per-run summaries are what summarize_overall_Q returns for the
    full-range frames; the combined table is their row-wise concatenation
    passed through summarize_peaks.
    """
    per_run = summarize_overall_Q(dfs_q=dfs_gauge_fullRange, algo=algo_name)
    combined = pd.concat(per_run.values(), ignore_index=True)
    return per_run, summarize_peaks(combined)


def _write_regime_csv(table, csv_path):
    """Write ``table`` to ``csv_path`` with two-decimal floats and report success."""
    table.to_csv(csv_path, float_format="%.2f", index=False)
    print('save successful')


# Consensus (also list the resulting column names)
aggregated_q_overall, overall_summary_all_cons = _regime_summary('consensus')
print(overall_summary_all_cons.columns.values.tolist())
_write_regime_csv(overall_summary_all_cons, '//data/abcd_perm/consensus_gauges_regime_summary.csv')

#Gauge full Range
aggregated_q_overall, aggregated_q_overall_gauge_fullRange = _regime_summary('gauge')
_write_regime_csv(aggregated_q_overall_gauge_fullRange, '//data/abcd_perm/gauge_gauges_regime_summary.csv')

#Gauge SWOT Match
aggregated_q_overall, overall_summary_all_gauge_swot = _regime_summary('gauge_swot_match')
_write_regime_csv(overall_summary_all_gauge_swot, '//data/abcd_perm/gauge_swot_match_gauges_regime_summary.csv')

# Individual algorithms
aggregated_q_overall, overall_summary_all_metroman = _regime_summary('metroman')
_write_regime_csv(overall_summary_all_metroman, '//data/abcd_perm/metroman_gauges_regime_summary.csv')

aggregated_q_overall, overall_summary_all_sic4dvar = _regime_summary('sic4dvar')
_write_regime_csv(overall_summary_all_sic4dvar, '//data/abcd_perm/sic4dvar_gauges_regime_summary.csv')

aggregated_q_overall, overall_summary_all_momma = _regime_summary('momma')
_write_regime_csv(overall_summary_all_momma, '//data/abcd_perm/momma_gauges_regime_summary.csv')

# The variable keeps its 'geobam' name, but the rows are labelled 'neobam'.
aggregated_q_overall, overall_summary_all_geobam = _regime_summary('neobam')
_write_regime_csv(overall_summary_all_geobam, '//data/abcd_perm/neobam_gauges_regime_summary.csv')

In [None]:
#By season
# One seasonal regime summary table per 'algo' label, each written to its own CSV.


def _seasonal_regime_summary(algo_name):
    """Return (per-run seasonal summaries, combined table) for one 'algo' label.

    The per-run summaries are what summarize_overall_Q_by_season returns for
    the full-range frames; the combined table is their row-wise concatenation.
    """
    per_run = summarize_overall_Q_by_season(dfs_q=dfs_gauge_fullRange, algo=algo_name)
    return per_run, pd.concat(per_run.values(), ignore_index=True)


def _write_seasonal_regime_csv(table, csv_path):
    """Write ``table`` to ``csv_path`` with two-decimal floats and report success."""
    table.to_csv(csv_path, float_format="%.2f", index=False)
    print('save successful')


# Consensus (also list the resulting column names)
aggregated_q_overall_season, overall_summary_all_cons_season = _seasonal_regime_summary('consensus')
print(overall_summary_all_cons_season.columns.values.tolist())
_write_seasonal_regime_csv(overall_summary_all_cons_season, '//data/abcd_perm/consensus_gauges_regime_season_summary.csv')

#Gauge full Range
aggregated_q_overall_season, aggregated_q_overall_gauge_fullRange_season = _seasonal_regime_summary('gauge')
_write_seasonal_regime_csv(aggregated_q_overall_gauge_fullRange_season, '//data/abcd_perm/gauge_gauges_regime_season_summary.csv')

#Gauge SWOT Match
aggregated_q_overall_season, overall_summary_all_gauge_swot_season = _seasonal_regime_summary('gauge_swot_match')
_write_seasonal_regime_csv(overall_summary_all_gauge_swot_season, '/gauge_swot_match_gauges_regime_season_summary.csv')

# Individual algorithms
aggregated_q_overall_season, overall_summary_all_metroman_season = _seasonal_regime_summary('metroman')
_write_seasonal_regime_csv(overall_summary_all_metroman_season, '/metroman_gauges_regime_season_summary.csv')

aggregated_q_overall_season, overall_summary_all_sic4dvar_season = _seasonal_regime_summary('sic4dvar')
_write_seasonal_regime_csv(overall_summary_all_sic4dvar_season, '/sic4dvar_gauges_regime_season_summary.csv')

aggregated_q_overall_season, overall_summary_all_momma_season = _seasonal_regime_summary('momma')
_write_seasonal_regime_csv(overall_summary_all_momma_season, '/momma_gauges_regime_season_summary.csv')

# The variable keeps its 'geobam' name, but the rows are labelled 'neobam'.
aggregated_q_overall_season, overall_summary_all_geobam_season = _seasonal_regime_summary('neobam')
_write_seasonal_regime_csv(overall_summary_all_geobam_season, '/neobam_gauges_regime_season_summary.csv')

#### Flow Duration Curves

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, wasserstein_distance


# Colors keyed by run / orbit label. This rebinds color_dict_runCombos:
# 'Continuous-SO' is 'turquoise' here, whereas the earlier definitions use 'blue'.
color_dict_runCombos = {
    'Continuous': '#D83A34',
    'Fast': '#FD8500',
    'Science': '#2B9EB3',
    'Sampled': '#6610F2',
    'gauge': 'black',
    'Continuous-FSO': '#FFEE32',
    'Continuous-SO': 'turquoise',
}

# Per-reach CDFs of the 'Q' variable for the consensus algorithm, built from
# the full-range frames; the helper's return value is kept as summary_metrics.
summary_metrics = plot_reach_consensus_cdfs(
    df_dict=dfs_gauge_fullRange,
    variable='Q',
    algo='consensus',
    color_dict=color_dict_runCombos
)

# Display the returned table as the cell output.
summary_metrics

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, wasserstein_distance

# TAKES A WHILE - needs to be parallelized
# Rebinds summary_metrics (previously the result of plot_reach_consensus_cdfs).
# Compares three 'algo' series on the 'Q' variable of the full-range frames.
summary_metrics = plot_reach_cdfs_horizontal_with_ks_emd(
    df_dict=dfs_gauge_fullRange,
    variable='Q',
    algos=['consensus', 'gauge', 'gauge_swot_match']
)

# Display the first rows of the table
print(summary_metrics.head())


#### Percentile analysis

In [None]:
# Which peak columns to analyse: any '*_max_date' column whose name contains this text.
peak_type = '90'

# NOTE(review): `overall_summary` is not assigned in any visible cell; it is
# presumably a combined regime summary table with 'run', 'algo', 'reach_id'
# and '*_max_date' columns - confirm where it is built.

# --- Add season columns based on peak max_date ---
# Done once, before the per-orbit copies are taken. Previously the new columns
# were written to `overall_summary` after each orbit's copy already existed,
# so the first orbit in the loop had no season columns on a fresh run.
for col in list(overall_summary.columns):
    if col.endswith('_max_date') and peak_type in col:  # or use '_start' if that's what defines peak timing
        season_col = col.replace('_max_date', '_season')
        overall_summary[season_col] = overall_summary[col].apply(
            lambda d: get_season_orbits(d) if pd.notna(d) else np.nan
        )

for orb in ['Fast', 'Science', 'Sampled', 'Continuous']:

    # Rows of this run only (independent copy, season columns included).
    seasonal_summary_cont = overall_summary[overall_summary['run'] == orb].copy()

    # Every season column present in the table is melted into long form.
    season_cols = [col for col in seasonal_summary_cont.columns if col.endswith('_season')]
    if not season_cols:
        print(f"No data to plot for orbit: {orb}")
        continue

    df_melt = seasonal_summary_cont.melt(
        id_vars=['algo', 'reach_id'],
        value_vars=season_cols,
        var_name='peak',
        value_name='season'
    )
    # Drop missing season values
    df_melt = df_melt.dropna(subset=['season'])

    # Keep only the algorithms that are plotted
    df_melt = df_melt[df_melt['algo'].isin(['consensus', 'gauge', 'sic4dvar', 'neobam', 'metroman', 'momma'])]

    # --- Count peaks per reach_id, algorithm and season ---
    counts_per_reach = (
        df_melt.groupby(['reach_id', 'algo', 'season'])
        .size()
        .reset_index(name='n_peaks')
    )

    # --- Average across reach_ids ---
    avg_counts = (
        counts_per_reach.groupby(['algo', 'season'])['n_peaks']
        .mean()
        .reset_index()
    )

    # Check if there's data to plot
    if avg_counts.empty or avg_counts['n_peaks'].isna().all():
        print(f"No data to plot for orbit: {orb}")
        continue

    # --- Plot as barplot ---
    plt.figure(figsize=(24, 12))
    ax = sns.barplot(
        data=avg_counts,
        x='season',
        y='n_peaks',
        hue='algo',
        palette=color_dict,
        hue_order=['gauge', 'consensus', 'sic4dvar', 'neobam', 'momma', 'metroman']
    )

    # --- Annotate bars with their height (one decimal place) ---
    for p in ax.patches:
        height = p.get_height()
        if pd.isna(height) or height <= 0:
            continue
        ax.annotate(
            f"{height:.1f}",
            (p.get_x() + p.get_width() / 2., height),
            ha='center', va='bottom',
            fontsize=25, color='black',
            xytext=(0, 3), textcoords='offset points'
        )

    # --- Add padding above bars for annotations ---
    ymax = avg_counts['n_peaks'].max()
    if pd.notna(ymax) and np.isfinite(ymax):  # skip when the maximum is not a usable number
        plt.ylim(0, ymax * 1.15)  # add 15% headroom (tweak as needed)

    plt.title(f"{orb} Run - Q{peak_type}", fontsize=44)
    plt.ylabel(f"Average Reach Q{peak_type} Events", fontsize=44)
    plt.xticks(fontsize=40)
    plt.yticks(fontsize=40)
    plt.xlabel("Season", fontsize=44)
    plt.legend(title='Algorithm', title_fontsize=28, fontsize=26)
    plt.tight_layout()
    plt.savefig(f'//figs/seasonPeakCount_{orb}_{peak_type}.png', dpi=350)
    plt.show()
    # Release the figure so the four large plots do not accumulate in memory.
    plt.close()

In [None]:
# REVISIT TIME PLOTS

# Earlier palette, kept for reference:
# color_dict_timeseries = {
#     'Fast': 'lightskyblue', 
#     'Science': 'mediumblue', 
#     'Continuous': 'darkviolet', 
#     'Sampled': 'crimson', 
#     'gauge': 'darkgrey'
# }

# Both helpers take the full-range frames and the run-keyed palette
# (color_dict_timeseries is assigned in the hydrograph cells).
get_revisit_times_overall(dfs_q=dfs_gauge_fullRange, color_dict_timeseries=color_dict_timeseries)
get_revisit_times(dfs_q=dfs_gauge_fullRange, color_dict_timeseries=color_dict_timeseries)


In [None]:

# Same summary plot for two peak types ("10" and "90").
# NOTE(review): `overall_summary`, `peaks_long` and `overlaps` are not assigned
# in any visible cell; they must come from the star import or an earlier
# session - confirm and define them so the notebook runs on a fresh kernel.
plot_q90_summary_by_run(overall_summary, peaks_long, overlaps, color_dict_timeseries, peak_type="10")
plot_q90_summary_by_run(overall_summary, peaks_long, overlaps, color_dict_timeseries, peak_type="90")

# Algo Analysis

In [None]:
# Build the per-algorithm metric summary from the gauge-matched frames and display it
summary_algo_df = plot_metric_cdfs_by_algo(
    dict_of_dfs=dfs_gauge,    # dict of DataFrames keyed by orbit label
    scaling='gauge',
    color_dict=color_dict_timeseries,  # colors keyed by orbit label (the dict keys)
    algo_name=None          # None here; an algorithm name such as 'consensus' may be passed instead
)
# Display the returned table as the cell output.
summary_algo_df

In [None]:
# Show the same table ordered by metric, then orbit (summary_algo_df itself is not modified).
summary_algo_df.sort_values(by=['Metric', 'Orbit'])

In [None]:
# Metrics by orbit/run
# The returned long-format table is summarized by the next cell.

combined_df = plot_metric_cdfs_faceted_by_orbit(
    dict_of_dfs=dfs_gauge,          # dict of DataFrames keyed by orbit label
    scaling='gauge',
    color_dict_algo=color_dict,     # colors keyed by algorithm name
    algo_name=None                  # None here; an algorithm name such as 'consensus' may be passed instead
)

In [None]:
def summarize_median_p67_by_orbit_algo_metric(combined_df):
    """
    Aggregate metric values per (orbit, Metric, algo) group.

    For every group this reports the median and the 67th percentile of the
    non-missing 'Value' entries, plus the number of distinct reach_ids.

    Parameters:
    - combined_df (DataFrame): long-format table with columns
        ['Metric', 'Value', 'algo', 'orbit', 'reach_id']

    Returns:
    - summary_df (DataFrame): one row per group, with columns
        ['orbit', 'Metric', 'algo', 'median', 'p67', 'n']
    """

    def _median_of_valid(values):
        # Missing values are removed before the median is taken.
        return np.median(values.dropna())

    def _p67_of_valid(values):
        # Missing values are removed before the percentile is taken.
        return np.percentile(values.dropna(), 67)

    def _distinct_count(reach_ids):
        return reach_ids.nunique()

    per_group = combined_df.groupby(['orbit', 'Metric', 'algo'])
    aggregated = per_group.agg(
        median=('Value', _median_of_valid),
        p67=('Value', _p67_of_valid),
        n=('reach_id', _distinct_count),
    )
    summary_df = aggregated.reset_index()
    return summary_df

# Summarize the faceted-CDF table, then display only the r, nBIAS and NSE
# rows ordered by algorithm and metric.
summary = summarize_median_p67_by_orbit_algo_metric(combined_df)
summary[summary['Metric'].isin(['r','nBIAS','NSE'])].sort_values(['algo','Metric'])


In [None]:
# Algo and Gauge CV


# 'algo' labels to include in the CV plot (both 'neobam' and 'geobam' are listed).
algos_to_plot = ['hivdi', 'sic4dvar', 'momma', 'neobam', 'consensus', 'geobam', 'metroman', 'gauge_swot_match','gauge']

plot_cdf_coeff(dfs_q=dfs_gauge_fullRange, color_dict=color_dict, algos_to_plot = algos_to_plot)


# RMD: CDF of the 'RMD_cons' column from the gauge-matched frames
    
cdfPlot_RMD(dfs_q=dfs_gauge, color_dict=color_dict, column_to_plot = 'RMD_cons')
# Disabled variant for the 'RMD_gauge' column:
#cdfPlot_RMD(dfs_q=dfs_q, color_dict=color_dict, column_to_plot = 'RMD_gauge')