In [1]:
import os
import time

import pandas as pd
import numpy as np

import scipy.stats as st

from bokeh.io import output_notebook, show
from bokeh.models import ColorBar, LinearColorMapper, LogColorMapper, HoverTool
from bokeh.plotting import figure
from bokeh.transform import linear_cmap, log_cmap
from bokeh.util.hex import hexbin
from bokeh.layouts import column, row

output_notebook()

In [2]:
def create_band(rdf, param, n_bins):
       
    if param == 'similarity':
        domain = 'normed_distance'
    else:
        domain = 'similarity'
        
#     hist, bin_edges = np.histogram(rdf[domain], density=True, bin_edges=b_edges)
#     n_samples_per_bin = 200
#     n_bins_100_sample = int(len(rdf) / n_samples_per_bin)
#     b_edges1 = st.mstats.mquantiles(rdf[param], np.linspace(0., 1.0, n_bins_100_sample))
    b_edges = st.mstats.mquantiles(rdf[param], np.linspace(0., 1.0, n_bins))
#     print(b_edges)
    b_edges = np.where(np.isnan(b_edges), 1, b_edges)
#     print(b_edges)
    
    tdf = pd.DataFrame()
    tdf['bin_edges'] = b_edges

    median_vals, lbound1, hbound1 = [], [], []
    bin_vals, lbound2, hbound2 = [], [], []

    for i in range(1,len(tdf)):
        min_edge = tdf.loc[i-1, 'bin_edges']
        max_edge = tdf.loc[i, 'bin_edges']
        mid_bin = (min_edge + max_edge) / 2
        bin_vals.append(mid_bin)
        grouped_vals = rdf[(rdf[param] >= min_edge) & (rdf[param] < max_edge)][domain]
#         print(f'{len(grouped_vals)} values between {min_edge:.5f} and {max_edge:.5f}')
        
        if len(grouped_vals) < 1:
            median_vals.append(np.nan)
            lbound1.append(np.nan)
            hbound1.append(np.nan)
            lbound2.append(np.nan)
            hbound2.append(np.nan)

        else:
            median_vals.append(np.percentile(grouped_vals, 50))
            lbound1.append(np.percentile(grouped_vals, 5))
            hbound1.append(np.percentile(grouped_vals, 95))
            lbound2.append(np.percentile(grouped_vals, 33.33333))
            hbound2.append(np.percentile(grouped_vals, 66.666667))
            
            
    band = pd.DataFrame()
    band['edge'] = bin_vals

    band['median'] = median_vals
    band['lbnd1'] = lbound1
    band['hbnd1'] = hbound1

    band['lbnd2'] = lbound2
    band['hbnd2'] = hbound2
#     print('')
  


    return band
    

In [3]:
 #output to static HTML file
def create_fig(band, rdf, param):
    
    title = f"Expected similarity metric by equiprobable distance metric binning ({len(rdf)} basin pairs)"
    if param == 'similarity':
        title = f"Expected distance metric by equiprobable similarity metric binning ({len(rdf)} basin pairs)"

#     p = figure(plot_width=600, plot_height=550, tools="wheel_zoom,box_zoom,pan,reset,lasso_select",
#                title=title,
#               match_aspect=True, background_fill_color='#440154')

#     p.grid.visible = False
    
#     bins = hexbin(rdf['normed_dist'], rdf['similarity'], 0.05)
#     print(bins)
    
#     cmap = linear_cmap('counts', 'Viridis256', 0, max(bins.counts))

#     h = p.hex_tile(q="q", r="r", size=0.01, line_color=None, source=bins,
#                fill_color=cmap)
    
#     p.circle(rdf['normed_distance'], rdf['similarity'], color="white", size=1,
#             alpha=0.6)
    
    rdf.dropna(inplace=True)
    
    if param == 'similarity':

        domain = 'normed_distance'
        p.harea(band['lbnd1'], band['hbnd1'], 
                 band['edge'], alpha=0.3, color='white')
        p.harea(band['lbnd2'], band['hbnd2'], 
                 band['edge'], alpha=0.5, color='white')
        
        sim_all = np.percentile(rdf[domain], 50)
        sim_lb1 = np.percentile(rdf[domain], 5)
        sim_lb2 = np.percentile(rdf[domain], 33.33333)
        sim_hb1 = np.percentile(rdf[domain], 95)
        sim_hb1 = np.percentile(rdf[domain], 66.66667)
        
        y_vals = np.linspace(0, 1, 20)
        x_vals = [sim_all for _ in y_vals]
#         print(sim_all)
        p.line(x_vals, y_vals, color='#F012BE', line_width=3, 
              legend_label='EV (all data)', line_dash='dashed')
        
        p.line(band['median'], band['edge'], color='#F012BE', 
               line_width=2, legend_label='EV (bin)')
        
    else:

        domain = 'similarity'
#         p.varea(band['edge'], band['lbnd1'], band['hbnd1'], 
#                  alpha=0.3, color='white')
#         p.varea(band['edge'], band['lbnd2'], band['hbnd2'], 
#                  alpha=0.5, color='white')
        
        similarity_all = np.percentile(rdf['normed_distance'], 50)
        

        sim_all = np.percentile(rdf[domain], 50)
        sim_lb1 = np.percentile(rdf[domain], 5)
        sim_lb2 = np.percentile(rdf[domain], 33.33333)
        sim_hb1 = np.percentile(rdf[domain], 95)
        sim_hb1 = np.percentile(rdf[domain], 66.66667)
    
        x_vals = np.linspace(0, 1, 20)
        y_vals = [sim_all for _ in x_vals]
        

        
#         p.line(x_vals, y_vals, color='#F012BE', line_width=3,
#               legend_label='EV (all data)', line_dash='dashed')
        
#         p.line(band['edge'], band['median'], color='#F012BE',
#               line_width=2, legend_label='EV (bin)')

#     hover = HoverTool(
#         tooltips=[('distance', '@normed_distance'),
#                  ()],
#         mode='mouse',
#         point_policy='follow_mouse',
#         renderers=[h],
#     )
#     p.add_tools(hover)
        
#     min_d = rdf['normed_distance'].min()
#     max_d = rdf['normed_distance'].max()

#     p.xaxis.axis_label = f'Euclidean Distance of Parameter Differences (1 = most dissimilar)'
#     p.yaxis.axis_label = f'Similarity Metric (COD/R^2)'
    
#     return p


In [4]:
def create_layout(normed_df, n_bins, modes):
#     similarity_df = create_band(normed_df, 'similarity', n_bins).dropna()
    dist_df = create_band(normed_df, 'normed_distance', n_bins).dropna()
#     p1 = create_fig(similarity_df, normed_df, 'similarity')
#     p2 = create_fig(dist_df, normed_df, 'normed_distance')
    print(dist_df.head())
    
    return (dist_df['edge'].to_numpy(), dist_df['median'].to_numpy())
    

In [5]:
def import_results_and_normalize(mode):
    # rdf = pd.read_csv('results_allPairs_BCAB.csv')
    rdf = pd.read_csv(f'results/pca_result_{mode}_modes.csv')
    
    rdf.drop(columns=['Unnamed: 0'], inplace=True)
    
    normed_df = (rdf - rdf.min()) / (rdf.max() - rdf.min())
#     normed_df.dropna(inplace=True)
    
#     print(normed_df)
#     print(f'Analyzing {len(normed_df)} basin pairs.')
#     print(normed_df.head())
    print(f'Calculating results for mode {mode}')

    basin_characteristic_cols = [e for e in normed_df.columns if e != 'similarity']
#     print(distance_cols)
    # distance_cols = ['distance', 'Drainage_Area_km2', 'Elevation_m']
    eu_dist = np.sqrt((normed_df[basin_characteristic_cols]**2).sum(axis=1))
    # eu_dist1 = 
    normed_df['normed_distance'] = normed_df['pca_euclidean'].copy()#(eu_dist - eu_dist.min()) / (eu_dist.max() - eu_dist.min())
    
#     print(normed_df.head())
    
    return normed_df

    

In [16]:
results = {}
modes = []
n_bins = 50
for mode in range(1, 15):
    modes.append(mode)
    normed_df = import_results_and_normalize(mode)
#     print(normed_df.head())
    band = create_band(normed_df, 'normed_distance', n_bins).dropna()
    
    results[mode] = {'x': band['edge'].to_numpy(),
                    'y': band['median'].to_numpy(),
                    'mode': [mode for _ in band.to_numpy()]}
    

Calculating results for mode 1
Calculating results for mode 2
Calculating results for mode 3
Calculating results for mode 4
Calculating results for mode 5
Calculating results for mode 6
Calculating results for mode 7
Calculating results for mode 8
Calculating results for mode 9
Calculating results for mode 10
Calculating results for mode 11
Calculating results for mode 12
Calculating results for mode 13
Calculating results for mode 14


In [17]:
from bokeh.palettes import Viridis11 as pal
from bokeh.models import HoverTool
from bokeh.plotting import ColumnDataSource, figure, output_file, show


p = figure(plot_width=700, plot_height=500, 
           title=f"Example Similarity vs. Distance {len(normed_df)} basin pairs")

hover = HoverTool(tooltips=[
    ("mode", "@mode"),
])

for mode in list(results.keys())[:10]:
#     x = results[mode]['x']
#     y = results[mode]['y']
#     print(results[mode])
#     if (mode > 5) & (mode <7):
#         print(results[mode])
    source = ColumnDataSource(results[mode])
    p.line('x', 'y', color=pal[mode], legend_label=f'mode {mode}',
          line_width=2, source=source)

In [19]:
p.add_tools(hover)
p.xaxis.axis_label = 'Distance Metric'
p.yaxis.axis_label = 'Similarity Metric'
p.legend.click_policy = 'hide'

output_file('modes.html')
show(p)

In [9]:
layout = create_layout(import_results_and_normalize(2), 50)
show(layout)

       distance  Drainage_Area_km2  Elevation_m  Gravelius  Aspect_deg  \
0      0.009608           0.001667     0.081209   0.227740    0.010889   
1      0.009225           0.000225     0.065631   0.169879    0.267130   
2      0.021806           0.001913     0.049701   0.309460    0.061282   
3      0.010690           0.000721     0.044235   0.238488    0.064114   
4      0.029372           0.003021     0.133916   0.133235    0.441050   
...         ...                ...          ...        ...         ...   
59609  0.062529           0.013678     0.213134   0.343556    0.716205   
59610  0.630836           0.127745     0.048179   0.067127    0.257749   
59611  0.702875           0.125276     0.086987   0.112295    0.354239   
59612  0.622314           0.044231     0.041463   0.083434    0.297565   
59613  0.115744           0.002468     0.038808   0.179422    0.096479   

       Slope_deg  Land_Use_Forest_frac  Land_Use_Grass_frac  \
0       0.223527              0.345654          

TypeError: create_layout() missing 1 required positional argument: 'modes'

In [21]:
layout = create_layout(import_results_and_normalize(3), 50)
show(layout)

Analyzing 59614 basin pairs.


In [22]:
layout = create_layout(import_results_and_normalize(4), 50)
show(layout)

Analyzing 59614 basin pairs.


In [23]:
layout = create_layout(import_results_and_normalize(14), 50)
show(layout)

Analyzing 59614 basin pairs.


## Principal Component Analysis on Basin Attributes

