In [None]:
import os
import time

import pandas as pd
import numpy as np

import scipy.stats as st

from bokeh.io import output_notebook, show
from bokeh.models import ColorBar, LinearColorMapper, LogColorMapper, HoverTool
from bokeh.plotting import figure
from bokeh.transform import linear_cmap, log_cmap
from bokeh.util.hex import hexbin
from bokeh.layouts import column, row

# Configure Bokeh so that later `show()` calls render inline in the notebook.
output_notebook()

In [None]:
# Load the raw pairwise results and min-max scale every column onto [0, 1].
rdf = pd.read_csv('results1.csv')
normed_df = rdf.sub(rdf.min()).div(rdf.max() - rdf.min())
# Rows that contain NaN after scaling are discarded.
normed_df.dropna(inplace=True)
print(f'Analyzing {normed_df.shape[0]} basin pairs.')
print(normed_df.head())

# Every column other than the similarity score is used as a distance feature.
distance_cols = [col for col in normed_df.columns if col != 'similarity']

In [None]:
print(distance_cols)
# Row-wise Euclidean norm over the scaled distance columns.
eu_dist = np.sqrt(normed_df[distance_cols].pow(2).sum(axis=1))
# Rescale the norm itself onto [0, 1] and store it alongside the inputs.
normed_df['normed_distance'] = eu_dist.sub(eu_dist.min()).div(eu_dist.max() - eu_dist.min())
normed_df.head()

In [None]:
def create_band(rdf, param, n_bins):
    """Summarise one metric inside equiprobable (quantile) bins of another.

    Rows of ``rdf`` are grouped by quantile bins of ``rdf[param]`` and, for
    each bin, percentiles of the paired "domain" column are computed.  When
    ``param`` is ``'similarity'`` the domain column is ``'normed_distance'``;
    for any other ``param`` the domain column is ``'similarity'``.

    Parameters
    ----------
    rdf : pandas.DataFrame
        Must contain ``param`` and the matching domain column.
    param : str
        Name of the column whose quantiles define the bins.
    n_bins : int
        Number of quantile *edges* spread evenly over probabilities 0..1.
        The result therefore has ``n_bins - 1`` rows (one per interval).

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns ``edge`` (the bin midpoint), ``median``,
        ``lbnd1``/``hbnd1`` (5th/95th percentile) and ``lbnd2``/``hbnd2``
        (33.3rd/66.7th percentile) of the domain values in that bin.  Bins
        with no rows hold NaN in every statistic column.
    """
    domain = 'normed_distance' if param == 'similarity' else 'similarity'

    b_edges = st.mstats.mquantiles(rdf[param], np.linspace(0., 1.0, n_bins))
    # Any NaN edge is replaced with 1 (presumably the top of the normalised
    # range -- confirm against the caller's scaling).
    b_edges = np.where(np.isnan(b_edges), 1, b_edges)

    param_vals = rdf[param]
    last_bin = len(b_edges) - 2

    bin_vals, median_vals = [], []
    lbound1, hbound1 = [], []
    lbound2, hbound2 = [], []

    for i, (min_edge, max_edge) in enumerate(zip(b_edges[:-1], b_edges[1:])):
        bin_vals.append((min_edge + max_edge) / 2)

        # Bins are half-open [min, max) except the last one, which is closed
        # so that rows equal to the top edge (the column maximum) are counted
        # instead of being silently left out of every bin.
        in_bin = param_vals >= min_edge
        if i == last_bin:
            in_bin &= param_vals <= max_edge
        else:
            in_bin &= param_vals < max_edge
        grouped_vals = rdf.loc[in_bin, domain]

        if len(grouped_vals) < 1:
            # Empty bin (e.g. repeated quantile edges): record NaN placeholders
            # so every output column keeps one entry per bin.
            p50 = p05 = p95 = p33 = p67 = np.nan
        else:
            p50 = np.percentile(grouped_vals, 50)
            p05 = np.percentile(grouped_vals, 5)
            p95 = np.percentile(grouped_vals, 95)
            p33 = np.percentile(grouped_vals, 33.33333)
            p67 = np.percentile(grouped_vals, 66.666667)

        median_vals.append(p50)
        lbound1.append(p05)
        hbound1.append(p95)
        lbound2.append(p33)
        hbound2.append(p67)

    band = pd.DataFrame()
    band['edge'] = bin_vals
    band['median'] = median_vals
    band['lbnd1'] = lbound1
    band['hbnd1'] = hbound1
    band['lbnd2'] = lbound2
    band['hbnd2'] = hbound2
    return band
    

In [None]:
# Build the scatter-plus-percentile-band figure (displayed inline via output_notebook).
def create_fig(band, rdf, param):
    """Draw the basin-pair scatter with binned percentile bands overlaid.

    Parameters
    ----------
    band : pandas.DataFrame
        Per-bin summary with columns ``edge``, ``median``, ``lbnd1``,
        ``hbnd1``, ``lbnd2`` and ``hbnd2``.
    rdf : pandas.DataFrame
        Pairwise data with ``normed_distance`` (x axis) and ``similarity``
        (y axis) columns.  The frame is not modified.
    param : str
        Column that was binned to produce ``band``.  ``'similarity'`` means
        the bins run along the y axis, so the bands are drawn with ``harea``;
        any other value means the bins run along the x axis and the bands are
        drawn with ``varea``.

    Returns
    -------
    The Bokeh figure containing the scatter, two shaded percentile bands
    (5-95 and 33-67), the per-bin median line and a dashed line at the median
    of the whole data set.
    """
    binned_on_similarity = param == 'similarity'

    if binned_on_similarity:
        title = f"Expected distance metric by equiprobable similarity metric binning ({len(rdf)} basin pairs)"
    else:
        title = f"Expected similarity metric by equiprobable distance metric binning ({len(rdf)} basin pairs)"

    # NOTE(review): Bokeh 3.0 dropped plot_width/plot_height in favour of
    # width/height; switch the keywords when the environment is upgraded.
    p = figure(plot_width=600, plot_height=550, tools="wheel_zoom,box_zoom,pan,reset,lasso_select",
               title=title,
               match_aspect=True, background_fill_color='#440154')

    p.grid.visible = False

    p.circle(rdf['normed_distance'], rdf['similarity'], color="white", size=1,
             alpha=0.6)

    # Percentiles need NaN-free input.  Use a local copy rather than
    # dropna(inplace=True) so the caller's frame is left untouched and
    # repeated calls see the same data.
    complete = rdf.dropna()

    if binned_on_similarity:
        # Bins lie along y; the bands span distance (x) values.
        p.harea(band['lbnd1'], band['hbnd1'],
                band['edge'], alpha=0.3, color='white')
        p.harea(band['lbnd2'], band['hbnd2'],
                band['edge'], alpha=0.5, color='white')

        # Vertical dashed line at the median distance of the whole data set.
        overall_median = np.percentile(complete['normed_distance'], 50)
        y_vals = np.linspace(0, 1, 20)
        x_vals = [overall_median for _ in y_vals]
        p.line(x_vals, y_vals, color='#F012BE', line_width=3,
               legend_label='EV (all data)', line_dash='dashed')

        p.line(band['median'], band['edge'], color='#F012BE',
               line_width=2, legend_label='EV (bin)')
    else:
        # Bins lie along x; the bands span similarity (y) values.
        p.varea(band['edge'], band['lbnd1'], band['hbnd1'],
                alpha=0.3, color='white')
        p.varea(band['edge'], band['lbnd2'], band['hbnd2'],
                alpha=0.5, color='white')

        # Horizontal dashed line at the median similarity of the whole data set.
        overall_median = np.percentile(complete['similarity'], 50)
        x_vals = np.linspace(0, 1, 20)
        y_vals = [overall_median for _ in x_vals]
        p.line(x_vals, y_vals, color='#F012BE', line_width=3,
               legend_label='EV (all data)', line_dash='dashed')

        p.line(band['edge'], band['median'], color='#F012BE',
               line_width=2, legend_label='EV (bin)')

    p.xaxis.axis_label = 'Euclidean Distance of Parameter Differences (1 = most dissimilar)'
    p.yaxis.axis_label = 'Similarity Metric (COD/R^2)'
    return p


In [None]:
# Number of quantile edges handed to create_band for both binning directions.
n_bins = 50
# Bands binned on similarity, then on distance; bins with NaN statistics are dropped.
similarity_df = create_band(rdf=normed_df, param='similarity', n_bins=n_bins).dropna()
dist_df = create_band(rdf=normed_df, param='normed_distance', n_bins=n_bins).dropna()

In [None]:
# One figure per binning direction, both drawn over the same scatter data.
p1 = create_fig(band=similarity_df, rdf=normed_df, param='similarity')
p2 = create_fig(band=dist_df, rdf=normed_df, param='normed_distance')

In [None]:
# Place the two figures side by side and render them.
layout = row([p1, p2])
show(layout)