In [1]:
import os
import pandas as pd
import numpy as np
from time import time

import geopandas as gpd
from shapely.geometry import Point
import xyzservices.providers as xyz

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot, row, column
from bokeh.transform import factor_cmap, linear_cmap
from bokeh.models import ColumnDataSource, LinearAxis, Range1d
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7, Category20, Bokeh6, Bokeh7, Bokeh8, Greys256

import xgboost as xgb
xgb.config_context(verbosity=2)

from sklearn.cluster import AgglomerativeClustering

from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    roc_curve, auc,
    accuracy_score,
    confusion_matrix,
)

from scipy.stats import linregress
from scipy.stats import lognorm, norm
from scipy.special import kl_div


# import jax
# import jax.numpy as jnp
# from jax.scipy.stats import gaussian_kde as jkde
# from jax import config as jax_config
# jax_config.update("jax_enable_x64", False)

# from jax import jit
# from jax import vmap

from KDEpy import FFTKDE

import data_processing_functions as dpf

import multiprocessing as mp
from functools import partial

# from sklearn.model_selection import StratifiedKFold
output_notebook()

In [2]:
# Root for data paths built with BASE_DIR: the working directory at the time this cell runs.
BASE_DIR = os.getcwd()

In [4]:
# Read the catchment attribute table; column names are lower-cased on load.
fname = 'BCUB_watershed_attributes_updated_20250227.csv'
attr_df = pd.read_csv(os.path.join('data', fname)).rename(columns=str.lower)

# Mean temperature is taken as the midpoint of the min and max columns.
attr_df['tmean'] = attr_df['tmin'].add(attr_df['tmax']).div(2.0)

station_ids = attr_df['official_id'].to_numpy()
print(f'There are {len(station_ids)} monitored basins in the attribute set.')

There are 1308 monitored basins in the attribute set.


In [5]:
# Load the lognormal fit parameter results and attach them to attr_df by station id.
ln_fit_fname = 'LN_fit_method_comparison_20250128.csv'
ln_fit_fpath = os.path.join('data', 'results', ln_fit_fname)
ln_df = pd.read_csv(ln_fit_fpath)

# Keep only stations present in the attribute table. The explicit .copy() makes
# ln_df an independent frame so the column assignments below do not write to a
# slice of the frame returned by read_csv (SettingWithCopyWarning).
ln_df = ln_df[ln_df['official_id'].isin(attr_df['official_id'].values)].copy()

# NOTE(review): 3.6 / 1000 is the L/s/km^2 -> mm/hour factor. If mean_uar / sd_uar
# are in L/s/km^2 (as the `*_uar` series built later in this notebook appear to be),
# mm/day would need 86.4 / 1000 -- confirm the units before relying on the
# `_mm_day` column names.
ln_df['mean_runoff_mm_day'] = ln_df['mean_uar'] * 3.6 / 1000
ln_df['sd_runoff_mm_day'] = ln_df['sd_uar'] * 3.6 / 1000

# Every ln_df column that attr_df does not already have is copied across.
target_columns = [c for c in ln_df.columns if c not in attr_df.columns]
ln_by_station = ln_df.set_index('official_id')
for tc in target_columns:
    # dict of 'official_id': value for this column (last row wins on duplicate ids)
    target_dict = ln_by_station[tc].to_dict()
    # .map leaves NaN for stations without a fit instead of raising KeyError
    attr_df[tc] = attr_df['official_id'].map(target_dict)

# create a dict of 'official_id': 'drainage area'
da_dict = attr_df.set_index('official_id')['drainage_area_km2'].to_dict()

In [6]:
# Preview the first rows of the lognormal-fit table loaded above.
ln_df.head()

Unnamed: 0,official_id,record_length_yrs,mean_uar,sd_uar,mean_logx,sd_logx,LN_MMO_sd_hat,LN_MMO_mu_hat,scipy.norm mu_hat,scipy.norm sd_hat,scipy.lognorm mu_hat,scipy.lognorm sd_hat,DKL(AKDE||LN Func,DKL(AKDE||Scipy Norm,DKL(AKDE||Scipy LN,DKL(AKDE||KDE,mean_runoff_mm_day,sd_runoff_mm_day
0,08BB005,29.238356,23.291309,23.432722,2.542487,1.176584,0.699219,2.798471,2.542487,1.176584,2.542487,1.176584,0.617094,0.333133,0.333133,0.001319,0.083849,0.084358
1,15031000,10.00274,1375.179864,1846.653356,5.834176,2.241452,1.030773,6.710953,5.834176,2.241452,5.834176,2.241452,1.954496,0.487795,0.487795,0.070881,4.950648,6.647952
2,15039900,16.268493,766.120211,811.452431,5.967802,1.254919,0.752285,6.265196,5.967802,1.254919,5.967802,1.254919,0.623047,0.285162,0.285162,0.000852,2.758033,2.921229
3,15040000,20.106849,99.955452,98.630142,4.008159,1.182995,0.679889,4.26478,4.008159,1.182995,4.008159,1.182995,0.668964,0.354124,0.354124,0.001314,0.35984,0.355069
4,15041200,31.465753,22.912254,23.171799,2.532035,1.165159,0.704475,2.779435,2.532035,1.165159,2.532035,1.165159,0.577014,0.316482,0.316482,0.001911,0.082484,0.083418


In [7]:
# List the processed divergence input files and load the estimated-vs-observed
# lognormal fit results. All paths are anchored on BASE_DIR so the cell does not
# depend on the working directory still being the one the notebook started in.
input_folder = os.path.join(
    BASE_DIR, "data", "processed_divergence_inputs",
)
pairs_files = os.listdir(input_folder)

# rev_date / n_rows are only referenced by the disabled MEMBAKDE load below.
rev_date = '20250119'
n_rows = None
# parametric_df = pd.read_csv(os.path.join(input_folder, f'MEMBAKDE_results_{rev_date}.csv'), nrows=n_rows)
# parametric_df.head()

fname = 'Results_estimated_vs_observed_LN_fits_20250118.csv'
bootstrap_result_fpath = os.path.join(BASE_DIR, 'data', 'parametric_fits', fname)
# Drop the saved index column if it exists; errors='ignore' keeps the cell
# working when the CSV was written without it.
param_df = pd.read_csv(bootstrap_result_fpath).drop(columns='Unnamed: 0', errors='ignore')
param_df.head()


Unnamed: 0,official_id,obs_mean_mm_day,obs_std,pred_mean_mm_day,pred_sigma,KL_KDE_AKDE_2.5,KL_KDE_AKDE_50,KL_KDE_AKDE_97.5,KL_AKDE_LNobs_2.5,KL_AKDE_LNobs_50,KL_AKDE_LNobs_97.5,KL_AKDE_LNest_2.5,KL_AKDE_LNest_50,KL_AKDE_LNest_97.5
0,05AA023,0.763394,1.286813,0.821331,1.201441,0.004435,0.00669,0.011342,1.026485,1.044057,1.065971,0.700195,0.715953,0.735906
1,05AA035,0.772076,1.449857,0.662123,1.017802,0.021181,0.02906,0.043829,0.687753,0.749125,0.806108,0.678864,0.744101,0.802399
2,05AD033,4.073865,5.079821,3.869882,4.717802,0.039788,0.053163,0.073777,0.698456,0.78305,0.865009,0.741019,0.827571,0.912939
3,05BF017,1.162493,1.957323,1.48197,1.963457,2.081178,2.130511,2.189047,0.864798,0.900978,0.940367,0.476573,0.506467,0.536577
4,05BJ010,0.821697,1.169613,0.695347,1.056124,0.011753,0.021304,0.038843,1.333488,1.355958,1.382492,1.697818,1.721834,1.752463


In [12]:
# Measurement error model: knots of (magnitude, relative error). These two arrays
# are read as globals by the local bandwidth helpers defined further down.
error_points = np.array([0.01, 0.1, 1.0, 10, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7])  # Magnitude points in L/s/km^2
error_values = np.array([1., 0.5, 0.25, 0.1, 0.05, 0.05, 0.075, 0.1, 0.15, 0.20])    # Associated errors (as proportions)

efig = figure(title="Estimated Measurement Error Model", width=600, height=400, x_axis_type='log')
efig.line(error_points, error_values, line_color='red', line_width=2, legend_label='Measurement Error Model')
# NOTE(review): the knots are commented as L/s/km^2 but this label says m^3/s --
# confirm which unit the model is defined in.
efig.xaxis.axis_label = r'$$\text{Flow } m^3/s$$'
# Bokeh only renders axis labels as LaTeX when they are wrapped in $$...$$
# (or \[...\] / \(...\)); a single-$ label is shown as literal text.
efig.yaxis.axis_label = r'$$\text{Error } [\text{x}100\%]$$'
efig.legend.background_fill_alpha = 0.5
efig = dpf.format_fig_fonts(efig, font_size=14)

layout = gridplot([efig], ncols=2, width=500, height=350)
show(layout)

In [6]:
def local_bandwidth_func(x):
    """
    Bandwidth lookup from the measurement error model.

    Linearly interpolates the module-level (error_points, error_values) knots
    at x via np.interp. Outside the knot range the end values are held:
    x < error_points[0] gives error_values[0] and x > error_points[-1] gives
    error_values[-1].
    """
    bandwidth = np.interp(x, error_points, error_values)
    return bandwidth


def single_kde_fit(data, log_grid):
    """
    Fit a single KDE to log(data) and return it as a PMF over log_grid.

    The density is estimated with KDEpy's FFTKDE using bw='ISJ', evaluated on
    log_grid, and converted to probability masses by normalising its
    cumulative sum.

    Parameters
    ----------
    data : array-like of positive values (the log is taken here).
    log_grid : evaluation grid in log space, passed straight to FFTKDE.evaluate.

    Returns
    -------
    np.ndarray : PMF with one entry per grid point, summing to 1.

    Raises
    ------
    AssertionError : if the trapezoid integral of the density over log_grid
        is not within 1e-4 of 1, or the PMF does not sum to 1 within 1e-3.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use whichever this NumPy provides.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    kde_pdf = FFTKDE(bw='ISJ').fit(np.log(data)).evaluate(log_grid)
    pdf_check = trapezoid(kde_pdf, x=log_grid)
    # check that the numerical integration over the KDE pdf is close to 1
    assert np.isclose(pdf_check, 1, atol=1e-4), pdf_check
    kde_cdf = np.cumsum(kde_pdf)
    kde_cdf /= kde_cdf[-1]
    kde_pmf = np.diff(kde_cdf, prepend=0)
    assert np.abs(np.sum(kde_pmf) - 1.0) <= 0.001
    return kde_pmf

In [None]:
def compute_lognorm_pmf(log_x, mu, sigma, integral_tol=1e-4):
    """
    Discretise a lognormal distribution onto a grid given in log space.

    In log space the lognormal is a normal density with mean mu and standard
    deviation sigma; it is evaluated on log_x and converted to probability
    masses by normalising its cumulative sum.

    Parameters
    ----------
    log_x : array-like, grid of log-values.
    mu, sigma : location and scale of the normal density in log space
        (scalars or one-element array-likes such as a pandas Series).
    integral_tol : absolute tolerance for the check that the density
        integrates to 1 over log_x.

    Returns
    -------
    np.ndarray : PMF with one entry per grid point, summing to 1.

    Raises
    ------
    ValueError : if sigma is not strictly positive.

    Warns
    -----
    RuntimeWarning : if the trapezoid integral of the density over log_x
        differs from 1 by more than integral_tol (the PMF is still
        renormalised and returned).
    """
    import warnings

    log_x = np.asarray(log_x, dtype=float)
    mu = np.asarray(mu, dtype=float)
    sigma = np.asarray(sigma, dtype=float)
    if np.any(sigma <= 0):
        raise ValueError(f"sigma must be positive, got {sigma}")

    norm_pdf = norm.pdf(log_x, loc=mu, scale=sigma)

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use whichever this NumPy provides.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    norm_check = trapezoid(norm_pdf, x=log_x)
    if not np.isclose(norm_check, 1, atol=integral_tol):
        warnings.warn(
            f"Normal density integrates to {float(norm_check):.4f} over the grid "
            f"(tolerance {integral_tol}); the PMF is renormalised.",
            RuntimeWarning,
            stacklevel=2,
        )

    norm_cdf = np.cumsum(norm_pdf)
    norm_cdf /= norm_cdf[-1]
    norm_pmf = np.diff(norm_cdf, prepend=0)
    return norm_pmf

In [None]:
def plot_distributions(data, log_grid, kde_pmf, skde, gkde, fkde, ln_pmf):
    """
    Show a Bokeh figure comparing the data histogram with the fitted PMFs.

    The data are binned into 64 bins in log space and drawn as quads on a
    log x-axis; kde_pmf, ln_pmf, skde and gkde are drawn as lines against
    exp(log_grid). fkde is accepted but not drawn.

    NOTE(review): the histogram masses are normalised over 64 bins while the
    line PMFs are normalised over len(log_grid) points, so bar and line
    heights are on different scales unless the two counts match.
    """
    # Histogram of the log data, converted to one probability mass per bin.
    density, log_edges = np.histogram(np.log(data), bins=64, density=True)
    cumulative = np.cumsum(density)
    cumulative /= cumulative[-1]
    hist_pmf = np.diff(cumulative, prepend=0)

    # Bin edges and grid back in linear units for the log-scaled x-axis.
    edges = np.exp(log_edges)
    x = np.exp(log_grid)
    equiv_years = len(data) / 365.0

    fig = figure(width=600, height=500, x_axis_type='log')
    fig.quad(
        top=hist_pmf,
        bottom=0,
        left=edges[:-1],
        right=edges[1:],
        color='lightgreen',
        alpha=0.6,
        legend_label=f'Data (N={equiv_years:.1f} years)',
    )

    # (values, line style) in drawing order
    curves = [
        (kde_pmf, dict(color='dodgerblue', line_width=2, legend_label='KDE', alpha=0.5)),
        (ln_pmf, dict(color='orange', line_width=2, line_dash='solid', legend_label='LN MLE')),
        (skde, dict(color='darkblue', line_width=3, line_dash='dotted', legend_label='EpaKDE')),
        (gkde, dict(color='black', line_width=3, line_dash='dashed', legend_label='GKDE')),
    ]
    for values, style in curves:
        fig.line(x, values, **style)

    fig.xaxis.axis_label = r'$$\text{Runoff } [L s^{-1} \text{km}^{-2}]$$'
    fig.yaxis.axis_label = r'$$\text{Pr}(X)$$'
    fig.legend.click_policy = 'hide'
    fig = dpf.format_fig_fonts(fig, font_size=14)
    show(fig)

In [None]:
def deconvolution_kde_fft_gaussian_local(
    data: np.ndarray,
    x_grid: np.ndarray
) -> np.ndarray:
    """
    Attempt at a Gaussian KDE with per-observation bandwidth using FFTKDE.

    Each observation's log-value and the evaluation grid are divided by that
    observation's bandwidth (from local_bandwidth_function), a unit-bandwidth
    Gaussian FFTKDE is fitted to the rescaled data, and the evaluated density
    is normalised into a PMF.

    NOTE(review): for 1-D inputs the rescaled grid has shape
    (len(data), len(x_grid)) and is flattened before evaluation, so the
    returned PMF has len(data) * len(x_grid) entries rather than one per grid
    point, and the flattened grid is no longer a single monotone grid.
    Dividing each point by its own bandwidth also changes the spacing between
    observations, which is not the same as placing a kernel of width h_i at
    each log-value -- confirm the intended estimator before using this. The
    only visible call to this function is commented out.

    Raises
    ------
    ValueError : if any bandwidth is not strictly positive.
    """
    
    logdata = np.log(data)  # Log-transform data
    h = local_bandwidth_function(data)  # Compute adaptive bandwidth

    if np.any(h <= 0):
        raise ValueError("Bandwidth values must be positive.")

    # Normalize data by local bandwidth (rescaling step)
    scaled_data = logdata / h
    # h[:, None] is a column, so this yields one rescaled copy of x_grid per observation
    scaled_x_grid = np.asarray(x_grid) / h[:, None]  # Broadcast over data

    # Apply FFT-based KDE
    kde = FFTKDE(kernel="gaussian", bw=1.0).fit(scaled_data)  # Use unit bandwidth
    # NOTE(review): KDEpy's FFTKDE presumably expects a sorted, equidistant grid -- TODO confirm
    pdf_values = kde.evaluate(scaled_x_grid.flatten())

    # Normalize and convert PDF to PMF
    cdf = np.cumsum(pdf_values)
    cdf /= cdf[-1]  # Normalize
    pmf = np.diff(cdf, prepend=0)

    return pmf

In [None]:
def local_bandwidth_function(x):
    """
    Local bandwidth for value(s) x: linear interpolation of the module-level
    (error_points, error_values) knots, holding the end values outside the
    knot range.
    """
    bandwidth = np.interp(x, error_points, error_values)
    return bandwidth
    

def epanechnikov_cdf_array(u: np.ndarray) -> np.ndarray:
    """
    Element-wise CDF of the Epanechnikov kernel.

    For -1 < u < 1 the value is 0.5 + 0.75*u - 0.25*u^3; it is exactly 0 for
    u <= -1 and exactly 1 for u >= 1. The result is a float64 array with the
    same shape as u.
    """
    # Cubic evaluated everywhere; the tails are overwritten below.
    interior = 0.5 + 0.75*u - 0.25*(u**3)

    # Nested selection: lower tail -> 0, upper tail -> 1, otherwise the cubic.
    clamped = np.where(u <= -1, 0.0, np.where(u >= 1, 1.0, interior))

    return np.asarray(clamped, dtype=np.float64)
    

def intervals_for_unique_values_adjacent(u: np.ndarray) -> dict:
    """
    Given a sorted array of unique values 'u', define intervals as follows:
      - For u[0], the interval is [max(1e-4, 0.099*u[0]), u[1]]
      - For 0 < i < n-1, the interval is [u[i-1], u[i+1]]
      - For u[n-1], the interval is [u[n-2], 1.2501*u[n-1]]

    With a single value the interval is
    [max(1e-4, 0.099*u[0]), 1.2501*u[0]]; an empty input gives an empty dict.

    NOTE(review): the 0.099 and 1.2501 multipliers look hand-tuned -- confirm
    they are the intended outer margins.

    Parameters
    ----------
    u : sorted 1-D array (or array-like) of unique values.

    Returns
    -------
    dict :
       { u[i] : (left_edge, right_edge) }
    """
    u = np.asarray(u)
    if u.size == 0:
        return {}

    # Outer edges are scaled copies of the extreme values; every other edge
    # is simply the neighbouring unique value.
    first_left = max(1e-4, 0.099*u[0])
    last_right = 1.2501*u[-1]

    # Written with u[:-1] / u[1:] so that a single-value input (where those
    # slices are empty) is handled by the same expressions.
    left_edges = np.r_[first_left, u[:-1]]
    right_edges = np.r_[u[1:], last_right]
    return dict(zip(u, zip(left_edges, right_edges)))


def deconvolution_kde_epanechnikov_local(
    data: np.ndarray,
    x_grid: np.ndarray
) -> np.ndarray:
    """
    Locally adaptive Epanechnikov density estimate on a log-space grid.

    Each observation is treated as uniform (in log space) over the interval
    given by intervals_for_unique_values_adjacent for its value, smoothed by
    an Epanechnikov kernel whose bandwidth comes from
    local_bandwidth_function. The per-observation density at grid point x is
    [F((x - log L)/h) - F((x - log R)/h)] / (log R - log L), with F the
    Epanechnikov CDF; the densities are averaged over observations and
    normalised into a PMF.

    Interval, width and bandwidth depend only on the observed value, so the
    work is done once per unique value and weighted by its count. This gives
    the same average as looping over every observation with a
    (grid x n_unique) matrix instead of (grid x n_observations).

    Parameters
    ----------
    data : 1-D array of observations in linear units.
    x_grid : 1-D evaluation grid in log space.

    Returns
    -------
    np.ndarray : PMF with one entry per grid point, summing to 1.

    Raises
    ------
    ValueError : if data is empty, if any interval has non-positive
        log-width, or if any bandwidth is not strictly positive.
    """
    data = np.asarray(data)
    x_grid = np.asarray(x_grid)
    if data.size == 0:
        raise ValueError("data must contain at least one observation.")

    unique_vals, counts = np.unique(data, return_counts=True)
    intervals_dict = intervals_for_unique_values_adjacent(unique_vals)

    # Interval boundaries for each unique value, then in log space
    bounds = np.array([intervals_dict[val] for val in unique_vals], dtype=np.float64)
    log_L = np.log(bounds[:, 0])
    log_R = np.log(bounds[:, 1])

    # Compute log-widths (check for invalid cases where L >= R)
    w = log_R - log_L
    if np.any(w <= 0):
        raise ValueError(f"Invalid intervals detected for values: {unique_vals[w <= 0]}")

    # Vectorized bandwidth calculation
    h = local_bandwidth_function(unique_vals)
    if np.any(h <= 0):
        raise ValueError(f"Bandwidth must be positive for values: {unique_vals[h <= 0]}")

    # Timed locally: the timer must not rely on a global set by the calling cell.
    t_start = time()
    # Matrices of Epanechnikov CDF values, shape (len(x_grid), n_unique)
    cdf_left = epanechnikov_cdf_array((x_grid[:, None] - log_L[None, :]) / h[None, :])
    cdf_right = epanechnikov_cdf_array((x_grid[:, None] - log_R[None, :]) / h[None, :])
    print(f'    {time() - t_start:.2f}s for epanechnikof cdf_vec')

    # Count-weighted average of the per-value densities
    weights = counts / (w * data.size)
    pdf_values = (cdf_left - cdf_right) @ weights

    cdf = np.cumsum(pdf_values)
    cdf /= cdf[-1]
    pmf = np.diff(cdf, prepend=0)
    return pmf

In [None]:
# Spot-check the density estimators on a few stations: for each one, build the
# unit-area runoff series, fit the local-bandwidth KDEs, the lognormal PMF and a
# single FFT KDE on a shared log grid, and plot them together.
n_bootstrap_samples = 100
k_nearest = 10
to_check = [
'05AD003', '05AD031', '05AB022', '05AB030', '12091050'
]
# The loop variable is `stn_row`, not `row`, so it does not overwrite the
# `row` layout function imported from bokeh.layouts.
# (To run every station, iterate over attr_df.iterrows() instead.)
for _, stn_row in attr_df[attr_df['official_id'].isin(to_check)].iterrows():
    target_stn = stn_row['official_id']
    da = stn_row['drainage_area_km2']
    print(target_stn)

    # Lognormal parameters as plain floats; skip stations without a fit
    # before doing any of the expensive work.
    ln_params = ln_df[ln_df['official_id'] == target_stn]
    if ln_params.empty:
        print(f'    no lognormal fit parameters for {target_stn}, skipping')
        print('')
        continue
    mean_logx = float(ln_params['mean_logx'].iloc[0])
    sd_logx = float(ln_params['sd_logx'].iloc[0])

    # import streamflow data
    stn_df = dpf.get_timeseries_data(target_stn)
    stn_df[f'{target_stn}_uar'] = 1000 * stn_df[target_stn] / da
    data = stn_df[f'{target_stn}_uar'].dropna().values
    # log_data / lin_grid are not read in this cell; kept in case other cells use them.
    log_data = np.log(data)

    # Evaluation grid in log space, padded beyond the observed range
    epsilon = 1e-6 
    minx, maxx = np.min(data) - epsilon, np.max(data) + epsilon
    n_eval_pts = 2**12
    log_grid = np.linspace(np.log(minx) - 2., np.log(maxx) + 1., n_eval_pts)
    lin_grid = np.exp(log_grid)

    # akde = error_based_kde_fit_bootstrap(data, log_grid)
    t0 = time()
    skde = deconvolution_kde_epanechnikov_local(data, log_grid)
    # NOTE(review): deconvolution_kde_gaussian_local is not defined in the cells
    # visible here -- confirm it is defined elsewhere before a Restart & Run All.
    gkde = deconvolution_kde_gaussian_local(data, log_grid)
    # fkde = deconvolution_kde_fft_gaussian_local(data, log_grid)
    t1 = time()
    print(f'{t1-t0:.2f}s for epanechnikov kernel local bandwidth')

    # lognormal PMF on the same grid
    ln_pmf = compute_lognorm_pmf(log_grid, mean_logx, sd_logx)
    fkde = None

    # fit KDE
    kde_pmf = single_kde_fit(data, log_grid)
    plot_distributions(data, log_grid, kde_pmf, skde, gkde, fkde, ln_pmf)
    print('')
