# Multi-model ensembles

The FDCs estimated by individual models are combined into multi-model ensembles in this notebook.  The goal is to evaluate whether multi-model ensembles can exploit low rank correlation to improve performance overall.


In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

import multiprocessing as mp
import json

from utils.kde_estimator import KDEEstimator
from utils.fdc_estimator_context import FDCEstimationContext
from utils.fdc_data import StationData
from utils.evaluation_metrics import EvaluationMetrics

import utils.data_processing_functions as dpf

In [2]:
# Catchment attributes for the study region.
attr_fpath = 'data/catchment_attributes_with_runoff_stats.csv'
# The ID column is accessed below as 'official_id', so that spelling must be in
# the dtype mapping for the string dtype to take effect (purely numeric IDs
# would otherwise be eligible for integer parsing and lose leading zeros).
# The original 'Official_ID' key is kept in case the file carries that spelling.
attr_df = pd.read_csv(attr_fpath, dtype={'Official_ID': str, 'official_id': str})
station_ids = sorted(attr_df['official_id'].unique().tolist())

# streamflow folder from (updated) HYSETS
HYSETS_DIR = Path('/home/danbot/code/common_data/HYSETS')
# Read the HYSETS IDs as strings as well so the `isin` comparison below is
# string-to-string.
hs_df = pd.read_csv('data/HYSETS_watershed_properties.txt', sep=';', dtype={'Official_ID': str})
hs_df = hs_df[hs_df['Official_ID'].isin(station_ids)]
hs_df.head(2)

Unnamed: 0,Watershed_ID,Source,Name,Official_ID,Centroid_Lat_deg_N,Centroid_Lon_deg_E,Drainage_Area_km2,Drainage_Area_GSIM_km2,Flag_GSIM_boundaries,Flag_Artificial_Boundaries,...,Land_Use_Wetland_frac,Land_Use_Water_frac,Land_Use_Urban_frac,Land_Use_Shrubs_frac,Land_Use_Crops_frac,Land_Use_Snow_Ice_frac,Flag_Land_Use_Extraction,Permeability_logk_m2,Porosity_frac,Flag_Subsoil_Extraction
846,847,HYDAT,CROWSNEST RIVER AT FRANK,05AA008,49.59732,-114.4106,402.6522,,0,0,...,0.0103,0.0065,0.0328,0.0785,0.0015,0.0002,1,-15.543306,0.170479,1
849,850,HYDAT,CASTLE RIVER NEAR BEAVER MINES,05AA022,49.48866,-114.1444,820.651,,0,0,...,0.0058,0.0023,0.0105,0.1156,0.0246,0.0,1,-15.929747,0.150196,1


In [3]:
# Lookup tables built column-wise from the filtered HYSETS properties table.
# HYSETS watershed ID -> official station ID
watershed_id_dict = dict(zip(hs_df['Watershed_ID'], hs_df['Official_ID']))
# official station ID -> HYSETS watershed ID (the inverse mapping)
official_id_dict = dict(zip(hs_df['Official_ID'], hs_df['Watershed_ID']))
# official station ID -> drainage area (km2, per the column name)
da_dict = dict(zip(hs_df['Official_ID'], hs_df['Drainage_Area_km2']))

In [4]:
# retrieve LSTM ensemble predictions
LSTM_ensemble_result_folder = '/home/danbot/code/neuralhydrology/data/ensemble_results_20250514'# uses NSE mean as loss function
# LSTM_ensemble_result_folder = '/home/danbot/code/neuralhydrology/data/ensemble_results_20250627'# uses NSE 95% as loss function
lstm_result_files = os.listdir(LSTM_ensemble_result_folder)
# The station ID is the part of each file name before the first underscore.
lstm_result_stns = [e.split('_')[0] for e in lstm_result_files]

# filter for the common stations between BCUB region and LSTM-compatible (i.e. 1980-)
# Sorted so the station order (and everything derived from it, e.g. processing
# order and column order) is identical from run to run; iterating a bare set of
# strings is not stable across interpreter sessions.
daymet_concurrent_stations = sorted(set(station_ids) & set(lstm_result_stns))
print(f'There are {len(daymet_concurrent_stations)} monitored basins concurrent with LSTM ensemble results.')


There are 715 monitored basins concurrent with LSTM ensemble results.


Load the global mean PMF and resample to the higher resolution evaluation grid (12 bits)

In [5]:
# Observed (baseline) PMFs produced by the previous notebook; the first CSV
# column is used as the index and the station IDs are the column labels.
pmf_path = Path(os.getcwd()).joinpath('data', 'results', 'baseline_distributions', 'pmf_obs.csv')
pmf_df = pd.read_csv(pmf_path, index_col=0)
# Keep only the LSTM-concurrent stations that actually have a baseline PMF.
pmf_stations = pmf_df.loc[:, [stn for stn in daymet_concurrent_stations if stn in pmf_df.columns]].columns

In [6]:
# Stations removed from the analysis; see Notebook 1 for the reasons.
exclude_stations = [
    '08FA009', '08GA037', '08NC003', '12052500', '12090480', '12107950',
    '12108450', '12119300', '12119450', '12200684', '12200762', '12203000',
    '12409500', '15056070', '15081510', '12323760', '12143700', '12143900',
    '12398000', '12058800', '12137800', '12100000',
]

# Preserve the order of pmf_stations while dropping the excluded IDs.
official_ids_to_include = list(filter(lambda stn: stn not in exclude_stations, pmf_stations))

In [7]:
# Out-of-sample parameter predictions, keyed by station ID.
parameter_prediction_results_folder = os.path.join('data', 'results', 'parameter_prediction_results')
predicted_params_fpath = os.path.join(
    parameter_prediction_results_folder,
    'OOS_parameter_predictions.csv',
)
rdf = pd.read_csv(
    predicted_params_fpath,
    index_col=['official_id'],
    dtype={'official_id': str},
)
# {station ID: {column name: value}}
predicted_param_dict = rdf.to_dict(orient='index')
# Show the available fields for one example station.
predicted_param_dict['0212414900'].keys()

dict_keys(['uar_mean_predicted', 'uar_mean_actual', 'uar_std_predicted', 'uar_std_actual', 'uar_median_predicted', 'uar_median_actual', 'uar_mad_predicted', 'uar_mad_actual', 'log_uar_mean_predicted', 'log_uar_mean_actual', 'log_uar_std_predicted', 'log_uar_std_actual', 'log_uar_median_predicted', 'log_uar_median_actual', 'log_uar_mad_predicted', 'log_uar_mad_actual'])

In [8]:
# Paths and options used to build the shared FDC estimation context.
LSTM_forcings_folder = '/home/danbot/neuralhydrology/data/BCUB_catchment_mean_met_forcings_20250320'
# LSTM_ensemble_result_folder = '/home/danbot/code/neuralhydrology/data/ensemble_results'
attr_df_fpath = os.path.join('data', 'catchment_attributes_with_runoff_stats.csv')
baseline_distribution_folder = 'data/results/baseline_distributions'

methods = ('parametric', 'lstm', 'knn',)
# methods = ('knn',)

# NOTE(review): with the flag set to True the start date below becomes 1950,
# i.e. records are NOT restricted to the 1980-present Daymet period. How the
# flag is interpreted is decided inside FDCEstimationContext - confirm there.
include_pre_1980_data = True
# 1980-01-01 is the default (Daymet) start date; 1950-01-01 is used when
# pre-1980 data are included. This value is not passed to the context below.
daymet_start_date = '1950-01-01' if include_pre_1980_data else '1980-01-01'

# Predicted-parameter columns (Notebook 3) handed to the context as the
# parametric targets.
target_cols = [
    'uar_mean_predicted',
    'uar_std_predicted',
    'uar_median_predicted',
    'uar_mad_predicted',
    'log_uar_mean_predicted',
    'log_uar_std_predicted',
    'log_uar_median_predicted',
    'log_uar_mad_predicted',
]

input_data = dict(
    attr_df_fpath=attr_df_fpath,
    LSTM_forcings_folder=LSTM_forcings_folder,
    LSTM_ensemble_result_folder=LSTM_ensemble_result_folder,
    include_pre_1980_data=include_pre_1980_data,
    predicted_param_dict=predicted_param_dict,
    divergence_measures=['DKL', 'EMD'],
    eps=1e-12,
    min_flow=1e-4,
    n_grid_points=2**12,
    min_record_length=5,
    minimum_days_per_month=20,
    parametric_target_cols=target_cols,
    all_station_ids=official_ids_to_include,
    daymet_concurrent_stations=daymet_concurrent_stations,
    baseline_distribution_folder=baseline_distribution_folder,
    delta=0.01,
)

fdc_context = FDCEstimationContext(**input_data)


    Using all stations in the catchment data with a baseline PMF (validated): 715
    ...overlap dict loaded from data/record_overlap_dict.json


In [9]:
def compute_multi_model_ensemble_pmf(stn, rev_date, which_models, result_folder=None):
    """Build equally weighted multi-model ensemble PMFs for one station.

    Each kNN model whose key contains '_NN_attribute_dist_ID2_freqEnsemble' is
    averaged, bin by bin, with the other members named by ``which_models`` and
    the result is renormalized to sum to 1.

    Parameters
    ----------
    stn : str
        Station ID; results are read from '<stn>_fdc_results.json'.
    rev_date : str
        Suffix of the LSTM results sub-folder ('lstm_<rev_date>').
    which_models : str
        One of 'knn-lstm', 'knn-lstm-parametric' or 'knn-parametric'.
    result_folder : str or pathlib.Path
        Folder holding the 'knn', 'lstm_<rev_date>' and 'parametric'
        sub-folders. Required despite the ``None`` default.

    Returns
    -------
    dict
        Maps each selected kNN model key (in sorted order) to the ensemble PMF
        as a numpy array.

    Raises
    ------
    ValueError
        If ``which_models`` is not recognized (checked before any file I/O).
    TypeError
        If ``result_folder`` is None.
    AssertionError
        If no matching kNN / parametric model is found, or a member PMF does
        not sum to 1.
    """
    # Non-kNN members required by each supported ensemble.
    members_by_ensemble = {
        'knn-lstm': ('lstm',),
        'knn-lstm-parametric': ('lstm', 'parametric'),
        'knn-parametric': ('parametric',),
    }
    if which_models not in members_by_ensemble:
        raise ValueError(f'which_models {which_models} not recognized, must be one of knn-lstm, knn-lstm-parametric, knn-parametric')
    if result_folder is None:
        raise TypeError('result_folder is required (str or Path), got None')
    # Accept plain strings as well as Path objects.
    result_folder = Path(result_folder)
    members = members_by_ensemble[which_models]

    def _load_json(fpath):
        """Read one per-station JSON results file."""
        with open(fpath, 'r') as file:
            return json.load(file)

    # kNN members: one ensemble is produced per matching kNN model.
    knn_dict = _load_json(result_folder / 'knn' / f'{stn}_fdc_results.json')
    knn_models = sorted(k for k in knn_dict if '_NN_attribute_dist_ID2_freqEnsemble' in k)
    assert knn_models, f'No knn model found for {stn}'
    knn_pmfs = {m: np.asarray(knn_dict[m]['pmf'], dtype=float) for m in knn_models}
    for m, knn_pmf in knn_pmfs.items():
        assert np.isclose(np.sum(knn_pmf), 1.0), f'KNN PMF does not sum to 1 for {stn} model {m}'

    # Only the members that take part in the requested ensemble are loaded.
    other_pmfs = []
    if 'lstm' in members:
        lstm_dict = _load_json(result_folder / f'lstm_{rev_date}' / f'{stn}_fdc_results.json')
        lstm_pmf = np.asarray(lstm_dict['frequency']['pmf'], dtype=float)
        assert np.isclose(np.sum(lstm_pmf), 1.0), f'LSTM PMF does not sum to 1 for {stn}'
        other_pmfs.append(lstm_pmf)
    if 'parametric' in members:
        param_dict = _load_json(result_folder / 'parametric' / f'{stn}_fdc_results.json')
        # The first key containing 'PredictedLog' is used as the parametric member.
        param_model = [k for k in param_dict if 'PredictedLog' in k]
        assert param_model, f'No parametric model found for {stn}'
        param_pmf = np.asarray(param_dict[param_model[0]]['pmf'], dtype=float)
        assert np.isclose(np.sum(param_pmf), 1.0), f'Parametric PMF does not sum to 1 for {stn}'
        other_pmfs.append(param_pmf)

    # Every member sums to 1, so the bin-wise mean is an equally weighted
    # mixture; it is renormalized to remove floating point drift.
    ensemble_pmfs = {}
    for m, knn_pmf in knn_pmfs.items():
        ensemble_pmf = np.mean([knn_pmf, *other_pmfs], axis=0)
        ensemble_pmf /= np.sum(ensemble_pmf)
        ensemble_pmfs[m] = ensemble_pmf
    return ensemble_pmfs

In [10]:
def compute_ensemble_divergence(stn, rev_date, pmf_obs_df, which_models, result_folder=None):
    """Evaluate every multi-model ensemble PMF of a station against its observed PMF.

    The evaluation grid (``log_x``, ``log_w``) is taken from a ``StationData``
    object built with the notebook-level ``fdc_context``. The observed PMF is
    the ``stn`` column of ``pmf_obs_df``.

    Returns a tuple ``(results, ensemble_pmfs)``: ``results`` maps each kNN
    model key to the output of ``_evaluate_fdc_metrics_from_pmf`` and
    ``ensemble_pmfs`` maps the same keys to the ensemble PMFs.
    """
    station = StationData(fdc_context, stn)
    eval_object = EvaluationMetrics(log_x=station.log_x, log_w=station.log_w)
    ensemble_pmfs = compute_multi_model_ensemble_pmf(
        stn,
        rev_date,
        which_models=which_models,
        result_folder=result_folder,
    )
    results = {
        model_key: eval_object._evaluate_fdc_metrics_from_pmf(pmf, pmf_obs_df[stn].values)
        for model_key, pmf in ensemble_pmfs.items()
    }
    return (results, ensemble_pmfs)

In [11]:
ensembles = {}
ensemble_folder = 'data/results/ensemble_results/'
results_folder = Path('data/results/fdc_estimation_results/')
assert os.path.exists(results_folder), f'Results folder {results_folder} does not exist'
process_ensembles = True

# Output sub-folder -> ensemble label passed to compute_ensemble_divergence.
ensemble_by_subfolder = {
    'knn_lstm/': 'knn-lstm',
    'knn_lstm_lognorm/': 'knn-lstm-parametric',
    'knn_lognorm/': 'knn-parametric',
}

for ep, model_ensemble in ensemble_by_subfolder.items():
    folder = Path(ensemble_folder) / ep
    if not folder.exists():
        folder.mkdir(parents=True)

    # The LSTM revision date is the trailing '_'-separated token of the folder name.
    rev_date = LSTM_ensemble_result_folder.split('_')[-1]
    n = 0
    if not process_ensembles:
        continue

    max_nae = 0
    for n, stn in enumerate(daymet_concurrent_stations, start=1):
        ensemble_output_fpath = folder / f'{stn}-{model_ensemble}.csv'
        # Existing per-station outputs are reused, which makes the cell resumable.
        if ensemble_output_fpath.exists():
            continue
        results, ensemble_pmfs = compute_ensemble_divergence(
            stn, rev_date, pmf_df, which_models=model_ensemble, result_folder=results_folder
        )
        results_df = pd.DataFrame(results)
        # Column labels are reduced to the second '_'-separated token of the
        # kNN model key (read back as the neighbour count in the next cell).
        results_df.columns = [label.split('_')[1] for label in results_df.columns]
        results_df.index.name = 'metric'
        results_df.to_csv(ensemble_output_fpath, index=True)

        # Progress is only reported for stations that were actually processed.
        if n % 50 == 0:
            print(f'{n}/{len(daymet_concurrent_stations)} processed')

In [12]:
# Collate the per-station ensemble metrics into one summary CSV per ensemble
# type and per number of neighbours (columns '1' .. '10').
for ep in ['knn_lstm/', 'knn_lstm_lognorm/', 'knn_lognorm/']:
    folder = Path(ensemble_folder) / ep
    if not process_ensembles:
        break
    which_ensemble = '-'.join(ep.split('/')[0].split('_'))
    nn_ensemble_results = {which_ensemble: {}}

    # Read every station file ONCE (previously each file was re-read for each
    # of the ten neighbour counts). The listing is sorted so the row order of
    # the summaries is reproducible, and anything that is not a CSV is skipped.
    station_frames = []
    for f in sorted(os.listdir(folder)):
        if not f.endswith('.csv'):
            continue
        # NOTE(review): files are written as '<stn>-<ensemble>.csv', so this ID
        # still carries the '-<ensemble>' suffix. Left unchanged because
        # downstream readers of the summaries are not visible here - confirm.
        stn = f.split('.')[0]
        df = pd.read_csv(os.path.join(folder, f))
        if 'Unnamed: 0' in df.columns:
            df = df.rename(columns={'Unnamed: 0': 'metric'})
        station_frames.append((stn, df.set_index('metric')))

    for n in range(1, 11):
        nn_results = []
        for stn, df in station_frames:
            # {metric: value} for the n-neighbour ensemble, tagged with the station
            res = df[str(n)].to_dict()
            res['stn_id'] = stn
            nn_results.append(res)

        nn_df = pd.DataFrame(nn_results).set_index('stn_id')
        fname = f'{which_ensemble}_{n}NN.csv'
        nn_df.to_csv(os.path.join(ensemble_folder, fname), index=True)
        print(f'    ...saved {len(nn_df)} results to {fname}')

    ...saved 715 results to knn-lstm_1NN.csv
    ...saved 715 results to knn-lstm_2NN.csv
    ...saved 715 results to knn-lstm_3NN.csv
    ...saved 715 results to knn-lstm_4NN.csv
    ...saved 715 results to knn-lstm_5NN.csv
    ...saved 715 results to knn-lstm_6NN.csv
    ...saved 715 results to knn-lstm_7NN.csv
    ...saved 715 results to knn-lstm_8NN.csv
    ...saved 715 results to knn-lstm_9NN.csv
    ...saved 715 results to knn-lstm_10NN.csv
    ...saved 715 results to knn-lstm-lognorm_1NN.csv
    ...saved 715 results to knn-lstm-lognorm_2NN.csv
    ...saved 715 results to knn-lstm-lognorm_3NN.csv
    ...saved 715 results to knn-lstm-lognorm_4NN.csv
    ...saved 715 results to knn-lstm-lognorm_5NN.csv
    ...saved 715 results to knn-lstm-lognorm_6NN.csv
    ...saved 715 results to knn-lstm-lognorm_7NN.csv
    ...saved 715 results to knn-lstm-lognorm_8NN.csv
    ...saved 715 results to knn-lstm-lognorm_9NN.csv
    ...saved 715 results to knn-lstm-lognorm_10NN.csv
    ...saved 715 r

### Concatenate all results into a single data structure for easier plotting and comparison

Compute baseline values for two "null" models applied at every location: the global mean PMF and the uniform distribution.  These are benchmarks to help understand how much value is added by using different models to predict FDCs.

In [13]:
from multiprocessing import Pool

results_dfs = {}
lstm_rev_date = LSTM_ensemble_result_folder.split('_')[-1]
sub_folder = f'lstm_{lstm_rev_date}'
# results_folder = '/media/danbot/Samsung_T5/fdc_estimation_results/'
results_folder = 'data/results/fdc_estimation_results/'
# Unique, sorted station IDs: a station with several files is loaded only once
# and the processing order is reproducible.
# NOTE(review): the stations are listed from the 'knn' sub-folder although the
# message below names the LSTM sub-folder - confirm which one is intended.
completed_stns = sorted({c.split('_')[0] for c in os.listdir(os.path.join(results_folder, 'knn'))})
print(f'Found {len(set(completed_stns))} completed stations in {sub_folder} results folder.')

for method in ['parametric', 'lstm', 'knn']:
    print(f'   Loading {method} results')
    # Collated results are cached in one CSV per method (per revision for LSTM).
    method_results_fpath = os.path.join('data', 'results', f'{method}_all_results.csv')
    if method == 'lstm':
        rev_date = LSTM_ensemble_result_folder.split('_')[-1]
        method_results_fpath = os.path.join('data', 'results', f'{method}_all_results_{rev_date}.csv')
    if os.path.exists(method_results_fpath):
        results_dfs[method] = pd.read_csv(method_results_fpath, dtype={'Official_ID': str})
    else:
        print(f'   {method} results not found in {method_results_fpath}, loading from individual station files...')
        res_folder = os.path.join(results_folder, method)
        if method == 'lstm':
            res_folder = os.path.join(results_folder, f'{method}_{rev_date}')
        args = [(stn, res_folder, method) for stn in completed_stns]

        # Per-station files are loaded in parallel by dpf.load_results.
        with Pool() as pool:
            results_list = pool.map(dpf.load_results, args)

        # Concatenate once; the same frame is validated, stored and cached.
        method_results = pd.concat(results_list, ignore_index=True)
        # KL divergence must be defined and non-negative; anything else means
        # the upstream results are corrupt, so stop before caching them.
        bad_dkl = method_results[method_results['KLD'].isna() | (method_results['KLD'] < 0)].copy()
        if not bad_dkl.empty:
            print(f'Warning: {len(bad_dkl)} {method} rows with NaN or negative DKL values.')
            bad_stns = bad_dkl['Official_ID'].values
            raise Exception(f'Results have {len(bad_stns)} NaN or negative DKL values: {bad_stns}')
        results_dfs[method] = method_results
        print(f'   Loaded {len(set(completed_stns))} station results for {method} results')
        method_results.to_csv(method_results_fpath, index=False)


Found 715 completed stations in lstm_20250514 results folder.
   Loading parametric results
   Loading lstm results
   Loading knn results


In [14]:
# Re-express the metrics so that zero is the best score for all of them.
# WARNING: the frames are modified in place, so running this cell twice
# applies the transforms twice; reload results_dfs before re-running.
for k, r in results_dfs.items():
    # exponentiate to express RMSE as a percent multiplicative deviation
    r['RMSE'] = (np.exp(r['RMSE']) - 1) * 100
    # fraction -> percentage
    r['RB'] = r['RB'] * 100
    # complement of VE as a percentage, 0 is perfect
    r['NAE'] = (1 - r['VE']) * 100
    # 1 - score, so 0 is perfect
    r['NSE'] = 1 - r['NSE']
    r['KGE'] = 1 - r['KGE']

In [15]:
def split_knn_label_col(df):
    """Split the '_'-separated kNN ``Label`` column into separate columns.

    Each label must have exactly seven parts, read in order as
    Official_ID, k, NN, tree_type, dist, ensemble_weight, ensemble_method.
    The helper parts ('NN', 'dist') are dropped, as are the 'MDB', 'n_parts',
    'minYears' and 'minOverlapPct' columns when present. Where a split column
    duplicates an existing column name (e.g. 'Official_ID'), the existing
    column is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        kNN results with a string ``Label`` column. The input is NOT modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and the added columns 'k',
        'tree_type', 'ensemble_weight' and 'ensemble_method' (as strings).

    Raises
    ------
    AssertionError
        If the labels do not all have the same number of parts.
    ValueError
        If the labels do not have the expected number of parts.
    """
    # format_a_cols = ["Official_ID", "k", "NN", 'concurrent', 'tree_type', 'dist', 'weighting', 'ensemble_method']
    format_cols = ["Official_ID", "k", "NN", 'tree_type', 'dist', 'ensemble_weight', 'ensemble_method']

    # Work on a new frame so the caller's DataFrame is left untouched.
    base = df.drop(columns=['MDB'], errors='ignore').reset_index(drop=True)

    n_parts = base['Label'].str.split('_').str.len()
    assert len(set(n_parts)) == 1, "Not all labels have the same number of parts"
    if n_parts.iloc[0] != len(format_cols):
        # Fail with an explicit message instead of a column-length mismatch.
        raise ValueError(
            f'Expected {len(format_cols)} "_"-separated parts in Label, got {n_parts.iloc[0]}'
        )

    label_parts = base['Label'].str.split('_', expand=True)
    label_parts.columns = format_cols
    merged = pd.concat([base, label_parts], axis=1)

    # Drop helper columns, then keep the first occurrence of any duplicated name.
    merged = merged.drop(columns=['NN', 'dist', 'n_parts', 'minYears', 'minOverlapPct'], errors='ignore')
    merged = merged.loc[:, ~merged.columns.duplicated()]
    return merged

In [16]:
# Distinct parametric model labels (set-based, so the order is arbitrary).
parametric_targets = list(set(results_dfs['parametric']['Label'].values))

# Expand the kNN labels into separate columns and keep a formatted copy on disk.
results_dfs['knn'] = split_knn_label_col(results_dfs['knn'])
knn_formatted_results = results_dfs['knn'].copy()
knn_formatted_fpath = os.path.join('data', 'results', 'knn_all_results_formatted.csv')
knn_formatted_results.to_csv(knn_formatted_fpath, index=False)


### Load the total sample mean PMF

Here we want to pre-compute benchmark performance measures based on the "global" mean PMF and the uniform distribution.  These represent null models to provide context for comparing the value added by using different models to predict FDCs.

In [14]:
# Load the mean global PMF (8-bit grid, per the file name) and drop the
# unnamed index column written by pandas.
mean_pmf_df_bits = pd.read_csv('data/results/mean_distribution_8bits.csv').drop(columns=['Unnamed: 0'])

# Resample onto a finer, evenly spaced grid covering the same log range.
# NOTE(review): the original comments and the name `pmf_12bit_resampled` refer
# to a 12-bit grid, but the grid built here has 2**10 bins. Confirm which
# resolution the evaluation expects before relying on this benchmark.
a = mean_pmf_df_bits['left_log_edges'].min()
b = mean_pmf_df_bits['right_log_edges'].max()
log_edges_10bit = np.linspace(a, b, 2**10 + 1)
# bin centres are the midpoints of consecutive edges
log_x_10bit = (log_edges_10bit[:-1] + log_edges_10bit[1:]) * 0.5

# Linear interpolation of the coarse PMF at the fine-grid bin centres.
pmf_12bit_resampled = np.interp(
    log_x_10bit,
    mean_pmf_df_bits['log_x'],
    mean_pmf_df_bits['pmf'],
)

mean_pmf_df = pd.DataFrame({
    'left_log_edges': log_edges_10bit[:-1],
    'right_log_edges': log_edges_10bit[1:],
    'log_x': log_x_10bit,
    'pmf': pmf_12bit_resampled,
})
# Interpolated values no longer sum to 1, so renormalize.
mean_pmf_df['pmf'] = mean_pmf_df['pmf'] / mean_pmf_df['pmf'].sum()

In [15]:
# Start from the (already formatted) parametric and LSTM results; rows for the
# two null models (global mean PMF, uniform) are appended per station below.
fdc_df = pd.concat([results_dfs['parametric'], results_dfs['lstm']], axis=0)
pmf_path = Path(os.getcwd()) / 'data' / 'results' / 'baseline_distributions' / f'pmf_obs.csv'
pmf_obs_df = pd.read_csv(pmf_path)
# Rebuild the bin edges / widths of the mean-PMF grid for the evaluator.
log_edges = np.concatenate([mean_pmf_df['left_log_edges'].values[:1], mean_pmf_df['right_log_edges'].values])
log_w = np.diff(log_edges)
eval_obj = EvaluationMetrics(log_x=mean_pmf_df['log_x'].values, log_w=log_w)

formatted_fdc_results_fpath = 'data/results/formatted_results_by_performance_measure.csv'

# NOTE(review): when the cached file already exists, fdc_df keeps only the
# model rows (no baseline rows) and the cache is not read back here - confirm
# that later cells load the file themselves.
if not os.path.exists(formatted_fdc_results_fpath):

    # Evaluator output keys and the column names they are stored under.
    # result_keys = ['kld', 'emd', 'rmse', 'mean_error', 'pct_vol_bias', 'mean_abs_rel_error', 'nse', 'kge', 've', 'pb_50', 'vb_pmf', 'vb_fdc', 'mean_frac_diff']
    # df_labels = ['KLD', 'EMD', 'RMSE', 'MB', 'RB', 'MARE', 'NSE', 'KGE', 'VE', 'PB_50', 'VB_PMF', 'VB_FDC', 'MEAN_FRAC_DIFF']
    result_keys = ['kld', 'emd', 'rmse', 'mean_error', 'pct_vol_bias', 'mean_abs_rel_error', 'nse', 'kge', 've']
    df_labels = ['KLD', 'EMD', 'RMSE', 'MB', 'RB', 'MARE', 'NSE', 'KGE', 'VE']

    # The two null-model distributions do not depend on the station.
    mean_pmf = mean_pmf_df['pmf'].values
    u = np.ones_like(mean_pmf) / len(mean_pmf)
    model_columns = list(fdc_df.columns)

    # Rows are collected and concatenated once, instead of growing the frame
    # with a concat per row.
    baseline_rows = []
    for stn in fdc_df['Official_ID'].unique():
        pmf_obs_baseline = pmf_obs_df[stn].values
        stn_data = StationData(fdc_context, stn)
        # NOTE(review): prior_adjusted_pmf is computed but never used below.
        _, prior_adjusted_pmf = stn_data._compute_adjusted_distribution_with_mixed_uniform(mean_pmf)

        for new_dist, label in zip([mean_pmf, u], ['Mean_PMF', 'Uniform']):
            new_eval = eval_obj._evaluate_fdc_metrics_from_pmf(new_dist, pmf_obs_baseline)

            # Prepare a new row with the results for this station
            new_row = {dl: new_eval[rk] for rk, dl in zip(result_keys, df_labels)}
            new_row['Official_ID'] = stn
            new_row['Label'] = label

            # Add missing columns as NaN if needed
            for col in model_columns:
                if col not in new_row:
                    new_row[col] = np.nan

            # Apply the same "zero is best" formatting used for the model
            # results, including the KGE transform that was previously
            # missing for the baseline rows.
            new_row['RMSE'] = 100 * (np.exp(new_row['RMSE']) - 1)
            new_row['RB'] = 100 * new_row['RB']
            new_row['NSE'] = 1 - new_row['NSE']
            new_row['KGE'] = 1 - new_row['KGE']
            new_row['NAE'] = 100 * (1 - new_row['VE'])
            baseline_rows.append(new_row)

    fdc_df = pd.concat([fdc_df, pd.DataFrame(baseline_rows)], ignore_index=True)
    fdc_df = fdc_df.sort_values(by=['Official_ID']).reset_index(drop=True)
    fdc_df.to_csv(formatted_fdc_results_fpath, index=False)
        