In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats

from bokeh.plotting import figure, show
from bokeh.layouts import row, column
from bokeh.io import output_notebook
from bokeh.palettes import Vibrant4
from numba import jit

from scipy.interpolate import interp1d

output_notebook()

In [2]:
def create_fig(df, bitrate):
    fig1 = figure(width=500, height=350, title=f'{bitrate} bit compression comparison (N={len(df)})',
                y_axis_type='log')
    fig2 = figure(width=500, height=350, title='Attribute distance vs. Compression',
                 y_axis_type='log')
    cs = Vibrant4
    n = 0
    min_compression = df[[c for c in df.columns if c.endswith('compression_ratio')]].copy().min().min()
    max_compression = df[[c for c in df.columns if c.endswith('compression_ratio')]].copy().max().max()
    for m in ['uniform', 'equiprobable', 'proportional']:
        # Fit a regression model
        xl, yl = f'{dist_measure}_attr_dist', f'{m}_compression_ratio'
        data = df[[xl, yl]].copy().dropna(how='any')
        slope, intercept, r_value, p_value, std_err = stats.linregress(data[xl], data[yl])
        r2 = r_value**2
        if r2 < 0.1:
            print(f'R^2={r_value**2:.2f}, slope is meaningless.')
        else:
            print(f'{bitrate}bit {m}: Y={slope:.2f}x + {intercept:.2f} (R^2={r_value**2:.2f})')
    
        # ll_quad = df[(df[f'{m}_num_symbols'] <= 2**bitrate) & (df[f'{m}_compression_ratio'] < 1.)].copy()
        pct_comp = len(df[df[f'{m}_compression_ratio'] < 1.].copy()) / len(df)
        # pct_good = len(ll_quad) / len(df)
        fig1.circle(df[f'{m}_num_symbols'], df[f'{m}_compression_ratio'], color=cs[n],
                  legend_label=f'{m}', alpha=0.8, size=5)

        fig2.circle(data[xl], data[yl], color=cs[n],
                  legend_label=f'{m} {pct_comp:.2f}', alpha=0.8, size=5)
        xx = np.linspace(0, data[xl].max(), 1000)
        fig2.line(xx, [slope*x + intercept for x in xx], line_dash='dashed', 
                  line_width=3, color=cs[n], legend_label=f'{m}: {r_value**2:.2f}')
        n += 1
        
    fig1.line([2**bitrate, 2**bitrate], [min_compression, max_compression], 
             legend_label='Quantization bitrate', line_dash='dashed', color='firebrick')
    max_symbols = df[[c for c in df.columns if c.endswith('num_symbols')]].copy().max().max()
    fig1.line([0, max_symbols], [1, 1], line_dash='dashed', color='grey', 
             legend_label='Zero compression')
    max_dist = df[[c for c in df.columns if c == f'{dist_measure}_attr_dist']].copy().max().max()
    fig2.line([0, max_dist], [1, 1], line_dash='dashed', color='grey',
              legend_label='Zero compression')
    fig1.xaxis.axis_label = 'Number of symbols in residuals'
    fig1.yaxis.axis_label = 'Compression Ratio [%] (lower = more compression)'
    fig1.legend.location = 'top_left'
    fig1.legend.click_policy = 'hide'
    
    fig2.xaxis.axis_label = f'{dist_measure} Attribute distance'
    fig2.yaxis.axis_label = 'Compression Ratio'
    fig2.legend.location = 'top_left'
    fig2.legend.click_policy = 'hide'
    return row([fig1, fig2])

In [45]:
dist_measure = 'L2'

df = pd.read_csv(f'compression_test_results_exponent_optimization.csv')
df['dist_scaled'] = (df['distance'] - df['distance'].min()) / (df['distance'].max() - df['distance'].min())
l = 'area_ratio'
df[l] = np.log10(df['target_area'] / df['proxy_area'])
df['AR_scaled'] = (df[l] - df[l].min()) / (df[l].max() - df[l].min())
l = 'Slope_deg_diff'
df['slope_scaled'] = (df[l] - df[l].min()) / (df[l].max() - df[l].min())

22406

In [65]:

attr_cols = ['Drainage_Area_km2_diff', 'Elevation_m_diff', 'Slope_deg_diff',
       'Gravelius_diff', 'Perimeter_diff', 'Aspect_deg_diff',
       'Land_Use_Forest_frac_diff', 'Land_Use_Grass_frac_diff',
       'Land_Use_Wetland_frac_diff', 'Land_Use_Water_frac_diff',
       'Land_Use_Urban_frac_diff', 'Land_Use_Shrubs_frac_diff',
       'Land_Use_Crops_frac_diff', 'Land_Use_Snow_Ice_frac_diff',
       'Permeability_logk_m2_diff', 'Porosity_frac_diff']

dist_attrs = attr_cols 
df_norm = pd.DataFrame()
for c in dist_attrs:
    df_norm[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min())

print(len(dist_attrs) + 2)
df['dist_measure'] = df_norm[dist_attrs].abs().sum(axis=1)
df['dist_measure'] += df['AR_scaled'] + df['dist_scaled']#['AR_scaled', 'dist_scaled', 'slope_scaled']

print(df['dist_measure'].min(), df['dist_measure'].max())
print(df['AR_scaled'].min(), df['AR_scaled'].max())
print(df['dist_scaled'].min(), df['dist_scaled'].max())
print(df['kge_AR'].min(), df['kge_AR'].max())



18
3.046961134352765 10.800834815826272
0.0 1.0
0.0 1.0
-3709792.841905464 0.8517760088123147


In [66]:
df.columns
df['sse_diff'] = (df['target_sse'] - df['proxy_sse']) / df['target_area']

In [67]:
# @jit(nopython=True)
def bootstrap_confidence_intervals(x, y, num_bins, n_bootstrap=5000, confidence_level=0.95):
    # Calculate bin edges based on quantiles
    sorted_x = np.sort(x)
    indices = np.linspace(0, len(sorted_x) - 1, num_bins + 1).astype(int)
    # bin_edges = np.quantile(sorted_x, np.linspace(0, 1, num_bins + 1))
    bin_edges = sorted_x[indices]

    # Initialize lists to store bootstrap results for each bin
    bootstrap_results = [[] for _ in range(num_bins)]

    for i in range(n_bootstrap):
        # Resample entire dataset with replacement
        indices = np.random.choice(len(x), size=len(x), replace=True)
        x_resampled = x[indices]
        y_resampled = y[indices]

        # Bin the resampled data and compute statistics
        bin_indices = np.digitize(x_resampled, bin_edges)
        for j in range(1, num_bins + 1):
            y_in_bin = y_resampled[bin_indices == j]
            if y_in_bin.size > 0:
                bootstrap_results[j - 1] += list(y_in_bin)  # Change to median or other statistic if desired

    # Convert lists to NumPy arrays for percentile computation
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    lower_bounds = np.zeros(num_bins)
    upper_bounds = np.zeros(num_bins)

    for j in range(num_bins):
        bin_data = np.array(bootstrap_results[j])
        bin_data = bin_data[~np.isnan(bin_data)]
        lower_bounds[j] = np.percentile(bin_data, (1 - confidence_level) / 2 * 100)
        upper_bounds[j] = np.percentile(bin_data, (1 + confidence_level) / 2 * 100)

    return bin_centers, lower_bounds, upper_bounds




In [68]:
# diffs are proxy - target
diff_cols = ['Drainage_Area_km2_diff']
min_diff, max_diff = df[diff_cols].min(), df[diff_cols].max()
print(min_diff)
print(max_diff)

Drainage_Area_km2_diff   -771983.4242
dtype: float64
Drainage_Area_km2_diff    771976.1722
dtype: float64


In [69]:
fig1 = figure(width=400, height=300, title=f'SSE vs. distance (N={len(df)})',
            y_axis_type='log')
fig1.circle(df['dist_scaled'], df['target_sse'], alpha=0.25)
fig1.yaxis.axis_label = 'SSE'
fig1.xaxis.axis_label = 'Scaled Attribute Distance'
fig2 = figure(width=400, height=300, title=f'SSE Proxy vs. Target (N={len(df)})',
             y_axis_type='log', x_axis_type='log')
fig2.circle(df['proxy_sse'], df['target_sse'], alpha=0.2)
fig2.line([0.1, 1E10], [0.1, 1E10], line_dash='dashed',
          color='red', line_width=3)
fig2.xaxis.axis_label = 'Proxy SSE'
fig2.yaxis.axis_label = 'Target SSE'

y_label = 'kge_AR'
bin_centres, lower_bounds, upper_bounds = bootstrap_confidence_intervals(df['dist_measure'].values, df[y_label].values, 20)
# for ll, mm, uu in zip(lower_bounds, bin_centres, upper_bounds):
#     print(ll, mm, uu)
fig3 = figure(width=400, height=300, title=f'KGE vs. distance (N={len(df)})',
              y_range=(-5, 5), x_range=(0.1, len(dist_attrs))) #y_axis_type='log', x_axis_type='log')

fig3.circle(df['dist_measure'], df[y_label], alpha=0.2)
fig3.line([0, len(dist_attrs)], [0., 0.], color='red', legend_label='0 KGE',line_width=3,
         line_dash='dashed') 
fig3.varea(bin_centres, lower_bounds, upper_bounds, color='blue', legend_label='95%', fill_alpha=0.4) 

fig3.xaxis.axis_label = 'Scaled Attribute Distance'
fig3.yaxis.axis_label = 'Proxy vs. Target Difference'
fig3.legend.location='top_left'

y_label = 'nse_AR'
bin_centres, lower_bounds, upper_bounds = bootstrap_confidence_intervals(df['dist_measure'].values, df[y_label].values, 20)
print('')
for ll, mm, uu in zip(lower_bounds, bin_centres, upper_bounds):
    print(ll, mm, uu)
fig4 = figure(width=400, height=300, title=f'NSE vs. distance (N={len(df)})',
              y_range=(-5, 5), x_range=(0.1, len(dist_attrs))) #y_axis_type='log', x_axis_type='log')

fig4.circle(df['dist_measure'], df['nse_AR'], alpha=0.2)
fig4.line([0, len(dist_attrs)], [0., 0.], color='red', legend_label='0 NSE',line_width=3,
         line_dash='dashed') 
fig4.varea(bin_centres, lower_bounds, upper_bounds, color='green', legend_label='95%', fill_alpha=0.4) 

fig4.xaxis.axis_label = 'Scaled Attribute Distance'
fig4.yaxis.axis_label = 'Proxy vs. Target Difference'
fig4.legend.location='top_left'

show(row([fig1, fig3, fig4]))


-24646125.00953203 5.150732821800139 -0.0476548710894884
-79756148.23088038 7.731498728558098 -0.048424745175218
-100874.13422141866 8.352005268452452 0.0862422009086741
-27062.300195879438 8.552782878628914 0.1318447761741237
-91912.04644549338 8.654330650098892 0.1662666939773366
-122228.27281132588 8.733613944579954 0.1409807206478918
-187395.32029809823 8.798895800383754 0.1228339044324603
-201980.28045469028 8.859415175962955 0.0898638058205693
-223670.150749863 8.918824395564846 0.1780958126970604
-335234.1042304818 8.976523053015578 0.0900317483115712
-538232.445254267 9.033695956656668 0.1373883957913931
-434306.3021227637 9.090457816403529 0.147284095253056
-695644.8773054609 9.147616297412828 0.0829599490335756
-492158.03192464384 9.207077165238358 0.0969507792137626
-847921.0330774668 9.271541194526662 0.0421523711626204
-620557.1112863801 9.343315164927805 0.0251577438415713
-734801.2439939763 9.423053583420085 0.0182845078171893
-906354.2022501422 9.51867598416984 0.04291

In [51]:
figs = []
for m in ['nse', 'kge']:
    l1, l2 = f'{m}_AR', f'{m}_AR_exp'
    
    fig = figure(width=500, height=350, title=f'Exponent Optimization (N={len(df)})',
             x_range=(-1, 1), y_range=(-1, 1))
            #y_axis_type='log', x_axis_type='log')
    fig.circle(df[l1], df[l2], alpha=0.5)
    fig.line([0, 0], [-5E5, 1], legend_label='0 NSE',
            line_dash='dashed', color='red', line_width=3)
    fig.line([-5E5, 1], [0, 0], legend_label='0 KGE',
            line_dash='dotted', color='red', line_width=3)
    fig.line([-1, 1], [-1, 1], legend_label='1:1', line_dash='dashed', 
             width=3, color='black')
    fig.xaxis.axis_label = l1
    fig.yaxis.axis_label = l2
    fig.legend.location = 'top_left'
    figs.append(fig)
show(row(figs))