In [1]:
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd

import pandas as pd
import numpy as np

from shapely.geometry import Point

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import VArea, ColumnDataSource, ColorBar, ColorMapper, LinearColorMapper, EqHistColorMapper
from bokeh.transform import linear_cmap
from bokeh.palettes import Colorblind3
output_notebook()

## Compare perimeter measurement


Basin polygons are used to extract attributes from various geospatial layers.  Depending upon the resolution of the DEM from which polygons are derived, and depending upon whether polygons are manipulated (for example by simplifying), the pixels representing the basin will change and the length of the basin perimeter will change.  In `validation/Basin_perimeter_comparison.ipynb`, three different sources of basin polygons are compared to highlight the differences in boundaries.  

The effect of DEM resolution on basin attributes (slope) and the proportion of cells at the perimeter are considered in `validation/technical_validation.ipynb`.  Since basin perimeter is used in some Large Sample Hydrology (LSH) research, we show here why comparison of any length-based attribute must at least be from a consistent resolution input DEM and use the same methodology to derive basins.

Before running this notebook, the `basin_polygon_smoothing.py` script should be run to find the common stations between the HYSETS and (updated) WSC basin polygons.  Where basin polygons are found, the overlay accuracy is checked (intersection / union) and only those with > 90% accuracy are used in the comparison of polygon perimeter values.  The general process is as follows: 

* find the set of streamflow monitoring stations in HYSETS that has an updated polygon in the WSC basin update published in July 2022
* find the number of basins that saw significant changes to the station polygon, indicated by a low accuracy
* find pairs with >= 90% polygon match between the HYSETS and updated WSC polygon (intersected area / union area),
    * compare the perimeter length
    * in general the updated polygon has a much longer perimeter
* simplify the newer polygon iteratively with `shapely.simplify(tolerance)` until the perimeter matches the value published in HYSETS.
* when the perimeter values match, record the simplified polygon area and the final tolerance
* compare the final deviation in perimeter and check the effect of polygon simplification on basin area

    

In [2]:
# Load the per-station polygon comparison table.
# NOTE(review): presumably written by basin_polygon_smoothing.py, which the
# notes above say must be run first -- confirm.
df = pd.read_csv('data/polygon_simplified_results.csv')
# Row count = number of stations present in the results file.
print(df.shape[0])
df.head()

1755


Unnamed: 0,Official_ID,HYSETS_perimeter,HYSETS_area_km2,tolerances,original_area,simplified_area,updated_perimeter,smoothed_perimeter,flag,accuracy
0,01AD002,1194.505,14703.9211,70.0,14677.362,14677.47585,1547.64,1205.516263,False,0.992878
1,01AD003,269.164,1358.6435,80.0,1348.299,1348.0407,346.56,270.937044,False,0.967065
2,01AD015,381.994,2712.0,850.0,2707.6419,2712.51495,583.8,383.041819,False,0.957446
3,01AE001,413.839,2245.7638,45.0,2238.9498,2238.9354,526.38,415.45702,False,0.994227
4,01AF006,,,,,,,,,0.196332


In [3]:
# Count the basins whose updated polygon overlaps the baseline (HYSETS) polygon
# poorly.  'accuracy' is the overlay score described above (intersection area /
# union area), so with a threshold of 0.95 a basin counts as "deviated" when
# more than 5% of the union area is not shared by the two polygons.
# NOTE(review): the narrative above quotes a 90% match criterion while this
# cell uses 0.95 -- confirm which value is intended.
jaccard_threshold = 0.95

# Both summary values are statistics of the accuracy score itself.  The name
# `mean_deviation` is kept unchanged in case other cells reference it, but it
# holds the mean accuracy, so it is labelled as such in the message below.
mean_deviation = df['accuracy'].mean()
min_match = df['accuracy'].min()

# Rows with a missing accuracy compare as False and are therefore not counted.
deviated_basins = df[df['accuracy'] < jaccard_threshold].copy()
print(f'{len(deviated_basins)} deviated basins (accuracy: min = {min_match:.2f}, mean = {mean_deviation:.2f})')

1035 deviated basins (min = 0.00, mean deviation = 0.76


In [4]:
# Absolute difference between the HYSETS and the updated perimeter, expressed
# as a percentage of the HYSETS perimeter.
df['perimeter_diff_pct'] = 100*abs(df['HYSETS_perimeter'] - df['updated_perimeter']) / df['HYSETS_perimeter']

# Keep only unflagged basins whose accuracy meets the threshold.
# `== False` is deliberate: 'flag' has missing values for some stations, and
# those rows must compare as "not False" so that they are dropped here.
is_comparable = (df['flag'] == False) & (df['accuracy'] >= jaccard_threshold)  # noqa: E712

# Take an explicit copy so that columns added to `df` in later cells are
# written to an independent frame rather than to a slice of the loaded table
# (avoids pandas' SettingWithCopyWarning).
df = df[is_comparable].copy()
print(len(df))
df.head()

715


Unnamed: 0,Official_ID,HYSETS_perimeter,HYSETS_area_km2,tolerances,original_area,simplified_area,updated_perimeter,smoothed_perimeter,flag,accuracy,perimeter_diff_pct
0,01AD002,1194.505,14703.9211,70.0,14677.362,14677.47585,1547.64,1205.516263,False,0.992878,29.563292
1,01AD003,269.164,1358.6435,80.0,1348.299,1348.0407,346.56,270.937044,False,0.967065,28.754217
2,01AD015,381.994,2712.0,850.0,2707.6419,2712.51495,583.8,383.041819,False,0.957446,52.829626
3,01AE001,413.839,2245.7638,45.0,2238.9498,2238.9354,526.38,415.45702,False,0.994227,27.194392
5,01AF007,154.916,328.4387,60.0,325.1961,324.93915,201.96,155.952322,False,0.960974,30.367425


In [5]:
# Signed area difference (HYSETS area minus the updated polygon's original
# area) as a percentage of the HYSETS area.
hysets_area = df['HYSETS_area_km2']
df['area_diff_pct'] = 100 * (hysets_area - df['original_area']) / hysets_area

# Summary statistics of the signed percentage difference.
mean_a_diff = df['area_diff_pct'].mean()
max_a_diff = df['area_diff_pct'].max()
print(f'  mean area difference = {mean_a_diff:.1f}, max area difference = {max_a_diff:.1f}')

  mean area difference = 0.1, max area difference = 8.5


In [15]:
def binned_fig(df, param1, param2, samples_per_bin):
    """Scatter-plot one deviation column against another with zero-reference lines.

    Args:
        df: table holding the columns to plot; accessed as ``df[param1]`` and
            ``df[param2]``.
        param1: name of the column plotted on the x axis.
        param2: name of the column plotted on the y axis.
        samples_per_bin: currently unused -- no binning is performed.  The
            parameter is retained so existing calls keep working.

    Returns:
        The Bokeh figure.  Only the x-axis label is set here; the y-axis label
        is left for the caller to assign.
    """
    p = figure(width=550, height=400,
               toolbar_location='above')

    p.toolbar.autohide = True

    # scatter() draws circle markers by default; it replaces circle(), whose
    # use for screen-sized markers is deprecated in recent Bokeh releases.
    p.scatter(df[param1], df[param2], alpha=0.5, color=Colorblind3[0])

    # Reference lines through zero.  Their extents are fixed values, not
    # derived from the plotted data.
    p.line([0, 0], [0, 65], line_dash='dashed', color='red',
           legend_label='Equal Area', line_width=3)
    p.line([-8, 8], [0, 0], line_dash='dotted', color='red',
           legend_label='Equal Perimeter', line_width=3)

    p.xaxis.axis_label = 'Deviation from baseline area [%]'
    return p


In [16]:
# Number of rows currently held in `df`, i.e. the sample plotted below.
print(f'sample size = {df.shape[0]}')

sample size = 715


In [18]:
# Area deviation on the x axis against perimeter deviation on the y axis.
p1 = binned_fig(
    df,
    param1='area_diff_pct',
    param2='perimeter_diff_pct',
    samples_per_bin=100,
)
# binned_fig only labels the x axis, so the y-axis label is set here.
p1.yaxis.axis_label = 'Deviation from baseline perimeter [%] '
p1.toolbar.autohide = True
show(p1)