In [None]:
import os
import math

import pandas as pd
import numpy as np
import pygrib

from scipy import stats
import scipy.special

import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

from multiprocessing import Pool
import time

from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.tile_providers import Vendors, get_provider
from bokeh.layouts import gridplot
from bokeh.models import Band, ColumnDataSource
from bokeh.models import Label, LabelSet, Range1d
from bokeh.palettes import Plasma4, Plasma6, Plasma10
from bokeh.layouts import column, gridplot

import matplotlib.pyplot as plt

import geoviews as gv
import geoviews.feature as gf
import geoviews.tile_sources as gvts

from geoviews import opts

from cartopy import crs

import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

output_notebook()

# Exploring Datasets

Compare metadata from different datasets.


       
       'max_water_content', 'sand_frac',
       'silt_frac', 'clay_frac', 'water_frac', 'organic_frac', 'other_frac',
       'frac_forest', 'lai_max', 'lai_diff', 'gvf_max', 'gvf_diff',
       'dom_land_cover_frac', 'dom_land_cover', 'root_depth_50',
       'root_depth_99', 'p_mean', 'pet_mean', 'p_seasonality', 'frac_snow',
       'aridity', 'high_prec_freq', 'high_prec_dur', 'high_prec_timing',
       'low_prec_freq', 'low_prec_dur', 'low_prec_timing', 'geol_1st_class',
       'glim_1st_class_frac', 'geol_2nd_class', 'glim_2nd_class_frac',
       'carbonate_rocks_frac', 

| HYSETS Parameter Label | HYSETS | CAMELS | Notes |
|---|---|---|---|
| `Watershed_ID` | ✅ | ❌ | HYSETS-specific, simply numerical order | 
| `Source` | ✅ | ❌ | HYSETS is comprised of smaller datasets, source is the dataset origin. |
| `Name` | ✅ | ✅ | Station "Local" Name, (`gauge_name` in CAMELS) |
| `Official_ID` | ✅ | ✅ | ID specific to original dataset, `gauge_id` in CAMELS |
| `Centroid_Lat_deg_N` | ✅ | ❌ | CAMELS uses coords of **gauge location** (`gauge_lat`, `gauge_lon`) |
| `Centroid_Lon_deg_E` | ✅ | ❌ | **could generate centroid coords from polygons...** |
| `Drainage_Area_km2` | ✅ | ✅ | DA derived from basin polygons where available from origin agency. (CAMELS: `area_gages2`) |
| `Drainage_Area_GSIM_km2` | ✅ | ❓ | Global Streamflow Indices and Metadata (GSIM) (CAMELS uses `area_geospa_fabric`) |
| `Flag_GSIM_boundaries` | ✅ | ❌ | 1 (GSIM) 0 (official agencies) |
| `Flag_Artificial_boundaries` | ✅ | ❌ | Bounds/DAs for stations < 50 $km^2$ (inaccurate delineation) |
| `Elevation_m` | ✅ | ✅ | *mean* basin elevation (CAMELS: `elev_mean`) |
| `Slope_deg` | ✅ |  | mean elevation difference between DEM "tiles" (CAMELS: `slope_mean`)  |
| `Gravelius` | ✅ | ❌ | ratio of basin perimeter to a circle of the same area (large # indicates long catchment) |
| `Perimeter` | ✅ | ❌ | can be calculated with polygons |
| `Flag_Shape_Extraction` | ✅ | ❌ | ? |
| `Aspect_deg` | ✅ | ❌ | "main orientation" "where the average slope points towards" |
| `Flag_Terrain_Extraction` | ✅ | ❌ |  |
| `Land_Use_Forest_frac` | ✅ |  | |
| `Land_Use_Grass_frac` | ✅ |  | |
| `Land_Use_Wetland_frac` | ✅ |  | |
| `Land_Use_Water_frac` | ✅ | ✅ | |
| `Land_Use_Urban_frac` | ✅ |  | |
| `Land_Use_Shrubs_frac` | ✅ |  | |
| `Land_Use_Crops_frac` | ✅ |  | |
| `Land_Use_Snow_Ice_frac` | ✅ |  | |
| `Flag_Land_Use_Extraction` | ✅ |  | |
| `Permeability_logk_m2` | ✅ |  | subsurface permeability below soil horizon (arithmetic mean) (CAMELS: `geol_permeability`)|
| `Porosity_frac` | ✅ |  | soil porosity $\frac{V_v}{V_T}$ (geometric mean) (CAMELS: `geol_porosity`) |
| `Flag_Subsoil_Extraction` | ✅ |  | |
| `q_mean` | ❌ | ✅ | can be calculated for HYSETS |
| `runoff_ratio` | ❌ | ✅ | can be calculated for HYSETS |
| `slope_fdc` | ❌ | ✅ | ? can be calculated for HYSETS? |
| `baseflow_index` | ❌ | ✅ |? can be calculated for HYSETS? |
| `stream_elas` | ❌ | ✅ | ? can be calculated for HYSETS |
| `q5` | ❌ | ✅ | can be calculated for HYSETS |
| `q95` | ❌ | ✅ | can be calculated for HYSETS |
| `high_q_freq` | ❌ | ✅ | can be calculated for HYSETS |
| `high_q_dur` | ❌ | ✅ | can be calculated for HYSETS |
| `low_q_freq` | ❌ | ✅ | can be calculated for HYSETS |
| `low_q_dur` | ❌ | ✅ | can be calculated for HYSETS |
| `zero_q_freq` | ❌ | ✅ | can be calculated for HYSETS |
| `hfd_mean` | ❌ | ✅ | ? can be calculated for HYSETS |
| `huc_02` | ❌ | ✅ | ? |
| `soil_depth_pelletier` | ❌ | ✅ | ? |
| `soil_depth_statsgo` | ❌ | ✅ | ? |
| `tasmax/tasmin` | ✅ | ? | Daily Maximum/Minimum Near-Surface Air Temperature |
| `tasmin` | ❌ | ✅ | ? |
| `` | ❌ | ✅ | ? |

## CAMELS Classes

CAMELS groups catchment attributes in six classes: topography, climate, streamflow, land cover, soil, and geology.



In [None]:
# import hysets
# Load the semicolon-delimited HYSETS watershed attribute table. Official_ID is
# read as str so station identifiers are kept verbatim instead of being parsed
# as numbers.
hysets_df = pd.read_csv('data/HYSETS_watershed_properties.txt', sep=';', dtype={'Official_ID': str})

In [None]:
# Preview the first rows of the HYSETS attribute table.
hysets_df.head()


In [None]:
# Lookup table keyed by Official_ID; each value is a dict holding that station's
# Watershed_ID, Drainage_Area_km2 and Name. Used to select specific watersheds.
id_columns = ['Watershed_ID', 'Official_ID', 'Drainage_Area_km2', 'Name']
hysets_dict = (
    hysets_df[id_columns]
    .set_index('Official_ID')
    .to_dict(orient='index')
)


In [None]:
# Every Official_ID present in the lookup table, in dictionary order.
all_official_IDs = list(hysets_dict)

In [None]:
def create_geodf(tp1, tp2, lat_tag, lon_tag, df):
    """Build a GeoViews point layer from coordinate and attribute columns of `df`.

    Parameters
    ----------
    tp1, tp2 : str
        Names of two attribute columns in `df`. Both are copied into the point
        table, but only `tp1` is declared as a value dimension of the layer.
    lat_tag, lon_tag : str
        Names of the latitude and longitude columns in `df`.
    df : pandas.DataFrame
        Source table.

    Returns
    -------
    gv.Points
        Points keyed on ('longitude', 'latitude'), labelled 'DA'.
    """
    point_table = pd.DataFrame({
        'latitude': df[lat_tag].to_numpy(),
        'longitude': df[lon_tag].to_numpy(),
        tp1: df[tp1],
        tp2: df[tp2],
    })
    return gv.Points(point_table, kdims=['longitude', 'latitude'], vdims=[tp1], label='DA')

In [None]:
# Column names handed to create_geodf for the HYSETS map: tp1 is the value that
# gets mapped (drainage area), tp2 is copied into the point table alongside it.
# NOTE(review): tp1, tp2, cm_lat, cm_lon and n_watersheds are reassigned further
# down for the CAMELS map, so the HYSETS map cell must run before that cell.
tp1 = 'Drainage_Area_km2'
tp2 = 'Official_ID'
cm_lat = 'Centroid_Lat_deg_N'
cm_lon = 'Centroid_Lon_deg_E'
n_watersheds = len(hysets_df)

hysets_pts = create_geodf(tp1, tp2, cm_lat, cm_lon, hysets_df)

In [None]:
# Display options for the HYSETS point layer: points are coloured by `tp1` on a
# log colour scale and drawn over the Esri NatGeo tile source.
hysets_map_style = dict(
    title=f'HYSETS Watershed Dataset ({n_watersheds} watersheds)',
    logz=True,
    size=2,
    fill_color=tp1,
    line_color=None,
    cmap='Plasma',
    alpha=0.5,
    colorbar=True,
    clabel='km²',
    width=700,
    height=450,
    global_extent=False,
    tools=['hover'],
    show_legend=False,
)

gvts.EsriNatGeo * hysets_pts.opts(**hysets_map_style)


In [None]:
def _read_camels_table(stem):
    """Read one semicolon-delimited CAMELS attribute file, keeping gauge_id as str."""
    return pd.read_csv(f'data/camels_attributes_v2.0/camels_{stem}.txt',
                       sep=';', dtype={'gauge_id': str})


cm_hyd = _read_camels_table('hydro')
cm_names = _read_camels_table('name')
cm_topo = _read_camels_table('topo')
cm_soil = _read_camels_table('soil')
cm_vege = _read_camels_table('vege')
cm_clim = _read_camels_table('clim')
cm_geol = _read_camels_table('geol')

# Order matters: the hydrology table comes first and is the left side of the merge.
dfs = [cm_hyd, cm_names, cm_topo, cm_soil, cm_vege, cm_clim, cm_geol]

In [None]:
# map names to the cm_hyd file
from functools import reduce


def _merge_on_gauge_id(left, right):
    """Join two CAMELS attribute tables on their shared `gauge_id` column."""
    return pd.merge(left, right, on='gauge_id')


# Fold the attribute tables, left to right, into a single wide table.
camels_df = reduce(_merge_on_gauge_id, dfs)


In [None]:
# Report the number of CAMELS basins, then preview the merged table. The preview
# must be the last expression of the cell, otherwise the notebook discards it.
print(len(camels_df))
camels_df.head()

In [None]:
# Check overlap between CAMELS and HYSETS
cm = camels_df[['gauge_id', 'gauge_name']].copy()
hs = hysets_df[['Official_ID', 'Name']].copy()
cm.set_index('gauge_id', inplace=True)
hs.set_index('Official_ID', inplace=True)

overlap = pd.concat([cm, hs], join='inner', axis=1)
print(f'There are {len(overlap)} stations in common between CAMELS and HYSETS.')
overlap

In [None]:
# Column names handed to create_geodf for the CAMELS map: tp1 is the value that
# gets mapped, tp2 is copied into the point table alongside it.
# NOTE(review): this rebinds tp1, tp2, cm_lat, cm_lon and n_watersheds, which an
# earlier cell set for the HYSETS map; re-running the HYSETS map cell after this
# one would pick up the CAMELS values.
tp1 = 'area_gages2'
tp2 = 'area_geospa_fabric'
cm_lat = 'gauge_lat'
cm_lon = 'gauge_lon'
n_watersheds = len(camels_df)

camels_pts = create_geodf(tp1, tp2, cm_lat, cm_lon, camels_df)

In [None]:
# Display options for the CAMELS point layer: points are coloured by `tp1` on a
# log colour scale and drawn over the Esri NatGeo tile source.
# The title names CAMELS; it previously read "HYSETS", copied from the HYSETS map.
camels_map_style = dict(
    title=f'CAMELS Watershed Dataset ({n_watersheds} watersheds)',
    logz=True,
    size=4,
    fill_color=tp1,
    line_color=None,
    cmap='Plasma',
    alpha=0.5,
    colorbar=True,
    clabel='km²',
    width=700,
    height=450,
    global_extent=False,
    tools=['hover'],
    show_legend=False,
)

gvts.EsriNatGeo * camels_pts.opts(**camels_map_style)


## Import HYSETS Data

Below is a test demo to load the HYSETS database and extract the (non-null) timeseries data for a specific HYDAT timeseries by the `Official_ID` parameter, which is the ID used by the governing organization that manages the dataset.

### NOTE: When using `.sel()` on xarray, the watershedID in the HYSETS dict starts at 1, while the xarray dataset is zero indexed.

As a result, **subtract 1 from the watershedID when using `.sel()`**, i.e.:

>`data = ds.sel(watershed=ws['Watershed_ID']-1, drop=True)`

In [None]:
import xarray as xr
import os

# Two directory levels above the notebook's current working directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
# DB_DIR = os.path.dirname(os.path.abspath(os.getcwd()))
# Directory holding the HYSETS netCDF database (note the trailing slash).
DB_DIR = os.path.join(BASE_DIR, 'hysets_db/')

# File name of the HYSETS netCDF database inside DB_DIR.
hysets_filename = 'HYSETS_2020_NRCAN.nc'


In [None]:
# Open the HYSETS netCDF database and show its summary.
hysets_path = os.path.join(DB_DIR, hysets_filename)
ds = xr.open_dataset(hysets_path)
ds

In [None]:
# 08MH147 is Stave River above Stave Lake
# ws = hysets_dict['08MH147']
# print(ws)
# print(ws['Watershed_ID'])

In [None]:
def extract_streamflow_series(stn, output_dir='/media/danbot/T7 Touch/hysets_series'):
    """Write the non-null discharge series of one station to `<output_dir>/<stn>.csv`.

    Parameters
    ----------
    stn : str
        Official_ID of the station; must be a key of the module-level `hysets_dict`.
    output_dir : str, optional
        Directory that receives the CSV. The default is the original hard-coded
        location with the stray space after ``/media/`` removed.
        NOTE(review): confirm the mount point on the machine running this.

    Returns
    -------
    None
        The function is called for its side effect (one CSV file per station).

    Notes
    -----
    Reads the module-level xarray dataset `ds`.
    """
    ws = hysets_dict[stn]
    # Watershed_ID counts from 1 while the dataset's `watershed` coordinate is
    # zero-indexed, hence the -1.
    df = ds.sel(watershed=ws['Watershed_ID'] - 1, drop=True).to_dataframe()

    # Keep only the discharge column and drop days without an observation.
    df = df[['discharge']].dropna()

    df.to_csv(os.path.join(output_dir, f'{stn}.csv'))
    


In [None]:
from multiprocessing import Pool

# Export every station's discharge series in parallel, one task per Official_ID.
# NOTE(review): extract_streamflow_series reads the module-level `ds` and
# `hysets_dict`, so both must exist before the pool is created — TODO confirm the
# open dataset can be shared safely with worker processes on this platform.
with Pool() as pool:
    pool.map(extract_streamflow_series, all_official_IDs)


In [None]:
# filter the main dataframe for the watershed of interest
# 08MH147 is Stave River above Stave Lake. `ws` has to be defined here: it was
# previously only assigned in commented-out code, so this cell failed on a
# fresh kernel.
ws = hysets_dict['08MH147']
# Watershed_ID counts from 1 while the dataset's `watershed` coordinate is
# zero-indexed, hence the -1.
ds_filtered = ds.sel(watershed=ws['Watershed_ID']-1, drop=True)

In [None]:
# Flatten the selected watershed into a DataFrame and keep only the rows that
# have an observed discharge.
df = ds_filtered.to_dataframe()
has_flow = df['discharge'].notna()
df = df[has_flow]

# Use a datetime index so records can be selected by calendar year later on.
df.index = pd.to_datetime(df.index)
n_years = len(set(df.index.year))

# Quick sanity check: discharge range and number of observations.
discharge = df['discharge']
print(discharge.max(), discharge.min())
print(len(df))

### Compression Analysis

Quantize the entire dataset with n [equiprobable bins](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html).  This results in a uniform distribution, and no effect is introduced by tuning distribution parameters.

In [None]:
def generate_uniform_noise(n):
    """Return `n` random values drawn uniformly from [-1, 1) and divided by 100.

    Draws from NumPy's global random state, so results are not reproducible
    unless that state is seeded by the caller.
    """
    return np.random.uniform(-1, 1, n) / 100

In [None]:
# find the set of all repeated variables and add uniform noise
# to facilitate equiprobable bin creation.
# A single boolean mask marks every row whose discharge occurs more than once,
# replacing the previous per-value rescans of the whole frame.
repeated_mask = df['discharge'].duplicated(keep=False)

# One independent noise draw per tied observation.
# NOTE(review): the noise spans about +/-0.01, so values at or near zero can
# become slightly negative, and the draw is unseeded (results differ per run).
df.loc[repeated_mask, 'discharge'] += generate_uniform_noise(int(repeated_mask.sum()))

In [None]:
import scipy.stats as st


# Quantize the daily flow series
def derive_equiprobable_bin_edges(n_bins, df):
    """Return the `n_bins + 1` quantile edges that split `df['discharge']` into
    equiprobable bins.

    Parameters
    ----------
    n_bins : int
        Number of bins.
    df : pandas.DataFrame
        Table with a 'discharge' column.

    Returns
    -------
    array-like
        Edges at probabilities 0, 1/n_bins, ..., 1 as computed by
        `scipy.stats.mstats.mquantiles` with its default plotting positions.
    """
    probs = np.linspace(0.0, 1, n_bins + 1)
    return st.mstats.mquantiles(df['discharge'], prob=probs)

In [None]:
# Equiprobable edges for 8 bins over the full record, the bin number assigned to
# every observation, and the per-bin observation counts.
# NOTE(review): the bin count 8 is hard-coded here, while `n_bins` is only
# assigned in a later cell.
# NOTE(review): np.digitize with its default right=False presumably assigns an
# observation equal to the last edge to index len(bin_edges) — verify.
bin_edges = derive_equiprobable_bin_edges(8, df)
df['bin_no'] = np.digitize(df['discharge'], bin_edges)
hist, edges = np.histogram(df['discharge'], bin_edges, density=False)

In [None]:
def make_plot(title, hist, edges, x=None, pdf=None, cdf=None):
    """Draw a histogram as Bokeh quads and return the figure.

    Parameters
    ----------
    title : str
        Figure title.
    hist : sequence
        Bar heights.
    edges : sequence
        Bar boundaries; consecutive pairs give the left and right side of each bar.
    x, pdf, cdf : optional
        Accepted but not used by this function.

    Returns
    -------
    bokeh figure
    """
    plot = figure(title=title, background_fill_color="#fafafa",
                  width=600, height=450)

    left_edges, right_edges = edges[:-1], edges[1:]
    plot.quad(top=hist, bottom=0, left=left_edges, right=right_edges,
              fill_color="navy", line_color="white", alpha=0.5,
              legend_label='Test')

    plot.xaxis.axis_label = 'Q [cms]'
    plot.yaxis.axis_label = 'Pr(x)'
    plot.grid.grid_line_color = "white"
    plot.legend.location = "top_right"
    plot.legend.background_fill_color = "#fefefe"
    return plot

In [None]:
# Number of bins quoted in the title; the counts and edges come from the
# full-record binning computed in an earlier cell.
n_bins = 8
p = make_plot(title=f'Equiprobable Binning ({n_bins} bins)', hist=hist, edges=edges)
show(p)

In [None]:
# create an array of all combinations of periods 
# this will be used as input to filter the dataframe
# for sub-periods 
import random
import itertools

def format_input_data(df, max_period_len=4):
    """Enumerate every combination of record years for each sub-period length.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index exposes `.year` (e.g. a DatetimeIndex).
    max_period_len : int, optional
        Largest number of years per combination. Defaults to 4, the value that
        used to be hard-coded.

    Returns
    -------
    all_periods : list of tuple
        Year combinations, all 1-year tuples first, then 2-year tuples, etc.
    len_tracker : dict
        Maps each period length to the number of combinations of that length.
    """
    # Sorted so the order of the combinations is deterministic.
    years = sorted(set(df.index.year))

    all_periods = []
    len_tracker = {}
    for period_len in range(1, max_period_len + 1):
        samples = list(itertools.combinations(years, period_len))
        all_periods.extend(samples)
        len_tracker[period_len] = len(samples)
    return all_periods, len_tracker
        


In [None]:
def calculate_entropy(probs, bins, base=2):
    """Compute the entropy of a distribution given as counts.

    Parameters
    ----------
    probs : sequence of numbers
        Counts (or unnormalised weights) per bin; they are normalised by their
        sum before use. Zero entries are skipped.
    bins : int
        Accepted for interface compatibility; not used in the calculation.
    base : number, optional
        Logarithm base (default 2, i.e. bits per sample).

    Returns
    -------
    float
        The entropy plus a fixed offset of 10E-20, which is also the value
        returned when every entry is zero or `probs` is empty.
    """
    total = sum(probs)
    ent = 10E-20
    for count in probs:
        if count == 0:
            continue
        # convert the count to a probability
        share = count / total
        ent -= share * math.log(share, base)

    return ent


In [None]:
# calculate entropy
# Entropy of the full-record histogram counts, rounded to 3 decimals.
h_complete = round(calculate_entropy(hist, n_bins), 3)

print(f'The entropy of the entire distribution is {h_complete} bits')

### Calculate Entropy over (all) periods of varying length

Above, bin edges were determined for a full dataset based on equiprobable bins.  For periods from 1 to len(record) years, calculate the entropy of subsets of data using the same bin edges, and also collect the bin edges for subsets to develop a distribution of both bin edge locations and entropy for different sub-period lengths.

In [None]:
def entropy_const_edges(input_data):
    """Entropy of one multi-year subsample, binned with fixed (full-record) edges.

    Parameters
    ----------
    input_data : tuple
        ``(years_in_subsample, bin_edges, n_bins, data)`` where `data` is a
        DataFrame with a 'discharge' column and an index exposing `.year`.

    Returns
    -------
    tuple
        ``(number of years in the subsample, entropy of the subsample)``.

    Notes
    -----
    The counts are taken from 'discharge' using the supplied `bin_edges`.
    Previously the precomputed 'bin_no' column was histogrammed instead; that
    column is built once for 8 bins, so `bin_edges` was ignored and results for
    any other bin count were wrong.
    """
    years_in_subsample = input_data[0]
    bin_edges = np.asarray(input_data[1], dtype=float)
    n_bins = input_data[2]
    data = input_data[-1]

    sample_data = data[data.index.year.isin(years_in_subsample)]
    # np.histogram closes the last bin on the right, so the record maximum is
    # counted in the top bin.
    hist, _ = np.histogram(sample_data['discharge'], bins=bin_edges)
    sample_h = calculate_entropy(hist, bins=n_bins)
    return (len(years_in_subsample), sample_h)

def calculate_entropy_distributions(df, bin_edges, n_bins, all_periods):
    """Compute the fixed-edge entropy of every year combination in parallel.

    Parameters
    ----------
    df : pandas.DataFrame
        Full record, passed unchanged to each task.
    bin_edges : array-like
        Bin edges shared by all subsamples.
    n_bins : int
        Number of bins.
    all_periods : iterable of tuple
        Year combinations to evaluate.

    Returns
    -------
    list of tuple
        One ``(sample_len, entropy)`` pair per entry of `all_periods`, in the
        same order.
    """
    input_array = [(y, bin_edges, n_bins, df) for y in all_periods]
    t0 = time.time()
    # The context manager shuts the pool down even if a task raises.
    with Pool() as pool:
        result_const_bin_edges = pool.map(entropy_const_edges, input_array)

    t_tot = time.time() - t0
    print(f'for {n_bins} bins, time to calculate all scenarios: {t_tot:.1f}s')
    return result_const_bin_edges


In [None]:
def convert_output_to_dict(results):
    """Group `(sample_len, value)` pairs into ``{sample_len: [values...]}``.

    Values keep the order in which they appear in `results`.
    """
    grouped = {length: [] for length in set(entry[0] for entry in results)}
    for entry in results:
        grouped[entry[0]].append(entry[1])

    return grouped

In [None]:
def min_mean_max(results, n_bins, len_tracker):
    """Summarise the entropy samples per subsample length, normalised by log2(n_bins).

    Parameters
    ----------
    results : dict
        Maps sample length to a list of entropy values.
    n_bins : int
        Number of bins; log2(n_bins) is the normalising maximum entropy.
    len_tracker : dict
        Maps sample length to the number of samples of that length.

    Returns
    -------
    pandas.DataFrame
        One row per sample length with columns 'sample_len', 'mean', 'max'
        (95th percentile), 'min' (5th percentile), 'n_samples' (label text) and
        'max_entropy' (constant 1).
    """
    lengths = list(results.keys())
    scale = np.log2(n_bins)

    out_mean = [np.mean(results[k]) / scale for k in lengths]
    out_max = [np.percentile(results[k], 95) / scale for k in lengths]
    out_min = [np.percentile(results[k], 5) / scale for k in lengths]
    n_samples = [f'n={len_tracker[k]}' for k in lengths]

    print('    mean entropy: ')
    print(f'         {[e.round(2) for e in out_mean]}')

    return pd.DataFrame({
        'sample_len': lengths,
        'mean': out_mean,
        'max': out_max,
        'min': out_min,
        'n_samples': n_samples,
        'max_entropy': [1] * len(lengths),
    })

In [None]:
def entropy_main(df, n_bins):
    """Run the fixed-edge entropy analysis for one bin count.

    Derives equiprobable edges from the full record, enumerates the year
    combinations, computes the entropy of each and returns the per-length
    summary table produced by `min_mean_max`.
    """
    print(f'Starting entropy calculation for {n_bins} bins')
    # bin edges for the total sample, based on equiprobable bins
    full_record_edges = derive_equiprobable_bin_edges(n_bins, df)
    # year combinations to evaluate and their count per length
    candidate_periods, n_samples_by_len = format_input_data(df)
    # entropy of every subsample, grouped by subsample length
    raw = calculate_entropy_distributions(df, full_record_edges, n_bins, candidate_periods)
    summary = min_mean_max(convert_output_to_dict(raw), n_bins, n_samples_by_len)
    print('')
    return summary

In [None]:
# Summary table of the fixed-edge entropy analysis for 8 bins.
const_bin_results = entropy_main(df, 8)

In [None]:
source = ColumnDataSource(const_bin_results)

fig = figure(title="Compressibility Distribution by Sample Length (8 bins)",
             width=600, height=450, x_range=(0.8, 4.5), y_range=(0.2, 1.1))

fig.line(const_bin_results['sample_len'], const_bin_results['mean'], line_color="green", line_width=3,
         alpha=0.7, legend_label="Mean", line_dash='dashed')

# The 'min' and 'max' columns hold the 5th and 95th percentiles computed by
# min_mean_max, so the band is labelled as that range (it was labelled "2σ").
fig.varea(x='sample_len', y1='min', y2='max', source=source, level='underlay',
          fill_alpha=0.5, fill_color='dodgerblue', legend_label='5th–95th percentile')

fig.line([1, 4], [1, 1], color='firebrick', alpha=0.8, line_width=3,
         legend_label='Maximum Entropy', line_dash='dashed')

# add labels for sample size at each subset length
labels = LabelSet(x='sample_len', y='max_entropy', text='n_samples', level='glyph',
                  x_offset=-15, y_offset=5, source=source, render_mode='canvas')

fig.add_layout(labels)

fig.legend.location = "center_right"
fig.xaxis.axis_label = 'Subset length (n years)'
# Plotted values are entropies divided by log2(n_bins), i.e. dimensionless ratios.
fig.yaxis.axis_label = 'Temporal Compression Ratio [-]'

show(fig)

In [None]:
# Repeat the fixed-edge entropy analysis for several bin counts; keys are the
# bin counts, values the summary tables.
bin_comparison = {n_bins_trial: entropy_main(df, n_bins_trial)
                  for n_bins_trial in [6, 8, 10, 12, 14, 16]}

In [None]:
source = ColumnDataSource(const_bin_results)

fig = figure(title="Compressibility Distribution by Sample Length as a function of Number of Bins",
             width=600, height=450, x_range=(0.5, 4.5), y_range=(0., 1.02))

# One mean-compression curve per bin count, coloured in palette order.
for palette_idx, (bin_count, summary) in enumerate(bin_comparison.items()):
    fig.line(summary['sample_len'], summary['mean'],
             legend_label=f'{bin_count} bins', color=Plasma6[palette_idx], line_width=3)

fig.line([1, 4], [1, 1], color='firebrick', alpha=0.8, line_width=3,
         legend_label='Maximum Entropy', line_dash='dashed')

fig.legend.location = "bottom_right"
fig.xaxis.axis_label = 'Subset length (n years)'
fig.yaxis.axis_label = 'Temporal Compression Ratio [-]'

show(fig)

## Bin Edges

How sensitive are the bin edge definitions?  Instead of using constant bin edges defined by the entire record as above, determine the distribution of bin edge locations based on all the subsets.

In [None]:
# Bin count used for the bin-edge sensitivity analysis below.
n_bins = 8

In [None]:
def bin_edge_sensitivity(input_data):
    """Derive equiprobable bin edges from one multi-year subsample.

    Parameters
    ----------
    input_data : tuple
        ``(data, n_bins, sample_periods)`` where `data` is the full record and
        `sample_periods` the years that make up the subsample.

    Returns
    -------
    tuple
        ``(number of years in the subsample, bin edges of the subsample)``.
    """
    data, n_bins, sample_periods = input_data[0], input_data[1], input_data[2]
    in_sample = data.index.year.isin(sample_periods)
    edges = derive_equiprobable_bin_edges(n_bins, data[in_sample])
    return (len(sample_periods), edges)

In [None]:
# Derive subsample-specific bin edges for every year combination, in parallel.
all_periods, len_tracker = format_input_data(df)
input_array = [(df, n_bins, p) for p in all_periods]

n_workers = 12  # number of worker processes
t0 = time.time()
# The context manager shuts the pool down even if a task raises.
with Pool(n_workers) as pool:
    results_var_bin_edges = pool.map(bin_edge_sensitivity, input_array)

t1 = time.time()
t_tot = t1 - t0
print(f'for {n_bins} bins, time to calculate {len(all_periods)} scenarios: {t_tot:.1f}s')


In [None]:
# Group the per-subsample bin edges by subsample length (number of years).
edge_dict = {}
for n_sample_years, sample_edges in results_var_bin_edges:
    edge_dict.setdefault(n_sample_years, []).append(sample_edges)

In [None]:
# Per subsample length: mean, 5th and 95th percentile of every bin edge across
# all subsamples. Each statistic is computed from the raw samples only;
# previously the summary rows were appended to the sample table first, so the
# percentiles also included the 'Mean' (and '5pct') rows.
edge_sensitivity_dict = {}
for k in edge_dict.keys():
    edge_df = pd.DataFrame(edge_dict[k])
    edge_sensitivity_dict[k] = pd.DataFrame({
        'Mean': edge_df.mean(axis=0),
        '5pct': edge_df.quantile(.05, axis=0),
        '95pct': edge_df.quantile(.95, axis=0),
    })

In [None]:


# One figure per subsample length: a dashed line at each bin edge's mean and a
# shaded bar spanning the 5th-95th percentile width around it.
fig_row = []
for k in edge_dict.keys():
    fig = figure(title=f"Bin Edge Sensitivity ({k} years)",
                 width=700, height=450)
    edge_stats = edge_sensitivity_dict[k]
    for i, r in enumerate(edge_stats.index):
        # center of the vertical bar; selected as a scalar (label 'Mean', not
        # the list ['Mean'], which returned a one-element Series)
        x = float(edge_stats.loc[r, 'Mean'])
        # width of the vertical bar
        w = float(edge_stats.loc[r, '95pct'] - edge_stats.loc[r, '5pct'])
        fig.line([x, x], [0, 0.5], color=Plasma10[i], line_width=3, line_dash='dashed')

        fig.vbar(x=x, top=0.5, bottom=0, width=w, level='underlay',
                 fill_alpha=0.5, fill_color=Plasma10[i],
                 line_width=0)

    fig.xaxis.axis_label = 'Q [cms]'
    fig.yaxis.axis_label = 'P(X)'
    fig_row.append(fig)

# Two figures per row.
grid = gridplot([fig_row[:2], fig_row[2:]], plot_width=400, plot_height=350)

show(grid)

In [None]:
def make_plot(title, hist, edges, x, pdf, cdf):
    """Draw a histogram with PDF and CDF curves overlaid and return the figure.

    NOTE(review): this redefines the `make_plot` declared earlier in the
    notebook, but with `x`, `pdf` and `cdf` as required arguments; once this
    cell has run, the earlier call that omits them no longer works.

    Parameters
    ----------
    title : str
        Figure title.
    hist, edges : sequences
        Bar heights and bar boundaries of the histogram.
    x, pdf, cdf : sequences
        Abscissa and the two curves drawn against it.
    """
    plot = figure(title=title, tools='', background_fill_color="#fafafa")

    left_edges, right_edges = edges[:-1], edges[1:]
    plot.quad(top=hist, bottom=0, left=left_edges, right=right_edges,
              fill_color="navy", line_color="white", alpha=0.5)
    plot.line(x, pdf, line_color="#ff8888", line_width=4, alpha=0.7, legend_label="PDF")
    plot.line(x, cdf, line_color="orange", line_width=2, alpha=0.7, legend_label="CDF")

    plot.y_range.start = 0
    plot.xaxis.axis_label = 'x'
    plot.yaxis.axis_label = 'Pr(x)'
    plot.grid.grid_line_color = "white"
    plot.legend.location = "center_right"
    plot.legend.background_fill_color = "#fefefe"
    return plot


In [None]:
# Plot the bin edges as distributions
# One row of density histograms per subsample length: each histogram is offset
# vertically by the subsample length `l` and coloured by bin-edge position.
# The title no longer interpolates `k`, which was a loop variable left over
# from an earlier cell and named only one of the lengths shown here.
fig = figure(title="Distribution of Bin Edges by Sample Length",
             width=700, height=450)

for l in edge_dict.keys():
    edge_df = pd.DataFrame(edge_dict[l])
    for i, c in enumerate(edge_df.columns):
        hist, edges = np.histogram(edge_df[c], density=True)
        fig.quad(top=hist + l, bottom=l, left=edges[:-1], right=edges[1:],
                 fill_color=Plasma10[i], line_color=None, alpha=0.7)

fig.xaxis.axis_label = 'Distribution of Bin Edges (Q [cms])'
fig.yaxis.axis_label = 'Sample Length (Years)'

show(fig)