In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats

from bokeh.plotting import figure, show
from bokeh.layouts import row, column
from bokeh.io import output_notebook
from bokeh.palettes import Vibrant4

# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [2]:
def create_hhist():
    """Build a density histogram of ``source.data['x']`` that shares ``p``'s x range.

    Relies on three notebook-level names that must exist before the call:

    * ``source`` -- object whose ``.data['x']`` holds the values to bin,
    * ``NBINS``  -- bin specification forwarded to ``np.histogram``,
    * ``p``      -- the main figure; its ``width`` and ``x_range`` are reused
      so both plots line up and pan/zoom together.

    Returns:
        The Bokeh figure containing the histogram bars.
    """
    # `normed` was removed from np.histogram (NumPy 1.24); `density=True` is
    # the supported way to request a normalised histogram.
    hhist, hedges = np.histogram(source.data['x'], bins=NBINS, density=True)
    # Leave 10% headroom above the tallest bar.
    hmax = hhist.max() * 1.1

    # `width`/`height` match the spelling create_fig() already uses; the
    # older `plot_width`/`plot_height` names no longer exist in Bokeh 3.
    ph = figure(toolbar_location=None, width=p.width, height=200, x_range=p.x_range,
                y_range=(0, hmax), min_border=10, min_border_left=50, y_axis_location=None,
                lod_factor=10, x_axis_location=None)
    ph.xgrid.grid_line_color = None
    ph.yaxis.major_label_orientation = np.pi/4
    ph.background_fill_color = "#fafafa"

    # One bar per bin: consecutive edges give the left/right bounds.
    ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hhist,
            color="white", line_color="#3A5785")
    return ph

In [3]:
def create_fig(df, bitrate, methods=('uniform', 'equiprobable')):
    """Scatter Jensen-Shannon divergence against attribute distance, one series per method.

    For every method ``m`` in ``methods`` the columns
    ``f'{dist_measure}_attr_dist'`` (x) and ``f'{m}_jsd'`` (y) of ``df`` are
    drawn as one scatter series, and a least-squares line of y on x is fitted
    and summarised on stdout.

    ``dist_measure`` is read from the notebook's global namespace and must be
    defined before this function is called.

    Args:
        df: DataFrame holding the distance column and one ``*_jsd`` column
            per method.
        bitrate: Quantization bitrate; used only in the figure title and in
            the printed regression summary.
        methods: Method-name prefixes to plot. Defaults to the two methods
            this notebook has been plotting.

    Returns:
        The Bokeh figure.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """
    # Below this R^2 the fitted slope is reported as not meaningful.
    min_r2 = 0.1
    dist_col = f'{dist_measure}_attr_dist'
    palette = Vibrant4

    fig1 = figure(width=400, height=300,
                  title=f'{bitrate} bit compression comparison (N={len(df)})')

    for i, m in enumerate(methods):
        jsd_col = f'{m}_jsd'

        # scatter(size=...) is the supported replacement for the deprecated
        # circle(size=...); colours wrap around if more methods than colours.
        fig1.scatter(df[dist_col], df[jsd_col], color=palette[i % len(palette)],
                     legend_label=f'{m}', alpha=0.25, size=5)

        # Fit only on rows where both values are present.
        data = df[[dist_col, jsd_col]].dropna(how='any')
        if len(data) < 2:
            # linregress cannot fit a line through fewer than two points.
            print(f'{bitrate}bit {m}: fewer than two complete rows, no regression fitted.')
            continue

        slope, intercept, r_value, _p_value, _std_err = stats.linregress(data[dist_col], data[jsd_col])
        r2 = r_value**2
        # Label every summary with bitrate and method so lines are distinguishable.
        if not np.isfinite(r2) or r2 < min_r2:
            print(f'{bitrate}bit {m}: R^2={r2:.2f}, slope is meaningless.')
        else:
            print(f'{bitrate}bit {m}: Y={slope:.2f}x + {intercept:.2f} (R^2={r2:.2f})')

    fig1.xaxis.axis_label = 'Normalized Distance'
    fig1.yaxis.axis_label = 'Jensen-Shannon Divergence [bps]'
    fig1.legend.location = 'top_left'
    # Clicking a legend entry toggles that series.
    fig1.legend.click_policy = 'hide'
    fig1.legend.background_fill_alpha = 0.3
    return fig1

In [158]:
# Prefix of the distance column; create_fig() reads this global to find
# f'{dist_measure}_attr_dist'.
# NOTE(review): the column built below is a plain row-wise sum of min-max
# scaled attributes, not a Euclidean norm, despite the 'L2' label -- confirm
# which one is intended.
dist_measure = 'L2'
bitrates = [4, 5, 6, 7, 8]
fig_rows = []
# Attribute columns that get min-max scaled to [0, 1] before being combined.
attr_cols = [
    'Drainage_Area_km2', 'Elevation_m', 'Slope_deg', 'Gravelius', 'Perimeter',
    'Aspect_deg',
    'Land_Use_Forest_frac', 'Land_Use_Grass_frac', 'Land_Use_Wetland_frac',
    'Land_Use_Water_frac', 'Land_Use_Urban_frac', 'Land_Use_Shrubs_frac',
    'Land_Use_Crops_frac', 'Land_Use_Snow_Ice_frac',
    'Permeability_logk_m2', 'Porosity_frac',
    'tmax', 'tmin', 'prcp', 'srad', 'swe', 'vp',
    'high_prcp_freq', 'low_prcp_freq', 'high_prcp_duration', 'low_prcp_duration',
]

# Only the 6-bit results are processed for now; `bitrates` lists the other
# available settings.
for b in [6]:
    fname = f'compression_test_results_{b}bits_20240128.csv'
    df = pd.read_csv(fname)

    # Min-max scale the centroid distance into its own column.
    centroid = df['centroid_distance']
    df['dist_scaled'] = (centroid - centroid.min()) / (centroid.max() - centroid.min())

    # Min-max scale every attribute column in place.
    attrs = df[attr_cols]
    df[attr_cols] = (attrs - attrs.min()) / (attrs.max() - attrs.min())

    # From here on the scaled centroid distance counts as one more attribute.
    if 'dist_scaled' not in attr_cols:
        attr_cols.append('dist_scaled')

    # Columns left out of the combined distance.
    exclude_cols = ['Gravelius', 'Perimeter']
    dist_cols = [e for e in attr_cols if e not in exclude_cols]

    # Per-column weights; all equal to 1 unless overridden here,
    # e.g. weight_mapper['prcp'] = 2.
    weight_mapper = dict.fromkeys(dist_cols, 1)
    for k, v in weight_mapper.items():
        df[k] = df[k] * v

    # Combined distance: row-wise sum of the weighted, scaled columns.
    df[f'{dist_measure}_attr_dist'] = df[dist_cols].sum(axis=1)

    fig_row = create_fig(df, b)
    fig_rows.append(fig_row)

R^2=0.00, slope is meaningless.
R^2=0.00, slope is meaningless.


In [159]:
# Stack the collected figures (one per processed bitrate) in a single column and render it.
layout = column(fig_rows)
show(layout)