In [1]:
import os

import geopandas as gpd
import pandas as pd
import numpy as np

from shapely.wkb import loads

Compare perimeter qualities from three different sources:

1. **HYSETS**: polygons sourced from WSC and USGS. Where a polygon is unavailable from official sources, it is derived from the EarthEnv DEM90 or approximated as a square centred at the officially reported station location, with area equal to the reported drainage area.
2. **Updated WSC** Water Survey of Canada (WSC) updated over 8000 basin polygons in July 2022.  These can be downloaded [here](https://collaboration.cmc.ec.gc.ca/cmc/hydrometrics/www/HydrometricNetworkBasinPolygons/).
3. **BCUB**: the BCUB polygons were filtered for the best match, where a match is measured by the polygon overlap accuracy.  Accuracy is the intersected area expressed as a percentage of the reported station drainage area.

In [2]:
# The project root is taken to be the parent of the current working directory;
# input data are read from its 'input_data/' sub-folder.
_working_dir = os.getcwd()
BASE_DIR = os.path.dirname(_working_dir)
DATA_DIR = os.path.join(BASE_DIR, 'input_data/')

In [3]:
# Root folder where the updated (2022-07-21) WSC basin polygons are saved,
# with one sub-folder per station. Download source:
# https://collaboration.cmc.ec.gc.ca/cmc/hydrometrics/www/HydrometricNetworkBasinPolygons/
#
# Set the WSC_BASIN_DIR environment variable to point at a local copy; the
# default is the machine-specific location this notebook was developed with.
# (A DATA_DIR-relative assignment used to precede this one, but it was
# overwritten immediately and therefore never took effect.)
wsc_geom_folder = os.environ.get(
    'WSC_BASIN_DIR',
    '/home/danbot2/code_5820/large_sample_hydrology/common_data/WSC_data/WSC_basin_polygons/',
)
# Later cells append relative file paths to this string, so guarantee a
# trailing path separator regardless of how the folder was specified.
wsc_geom_folder = os.path.join(wsc_geom_folder, '')

In [4]:
# Station used throughout the notebook as the worked example.
example_stn = '07FD001'
# Basin polygon shapefile for the station, relative to the WSC polygon root.
fpath = f'{example_stn}/DrainageBasin/{example_stn}_DrainageBasin_BassinDeDrainage.shp'
# os.path.join works whether or not wsc_geom_folder ends with a separator.
latest_wsc_basin = gpd.read_file(os.path.join(wsc_geom_folder, fpath))
# Reproject to EPSG:3005, the CRS the other layers are converted to below.
latest_wsc_basin = latest_wsc_basin.to_crs(3005)
# Make sure the output folder exists on a fresh checkout before writing.
os.makedirs('data', exist_ok=True)
latest_wsc_basin.to_file(f'data/{example_stn}_WSC_updated.geojson')

In [5]:
# Retrieve the station (pour point) location published alongside the updated
# WSC basin polygon.
wsc_pour_point_fpath = f'{example_stn}/PourPoint/{example_stn}_PourPoint_PointExutoire.shp'
# os.path.join works whether or not wsc_geom_folder ends with a separator.
latest_wsc_ppt = gpd.read_file(
    os.path.join(wsc_geom_folder, wsc_pour_point_fpath),
    engine='pyogrio', use_arrow=True)
# Reproject to EPSG:3005 so distances to other pour points are in the same CRS.
latest_wsc_ppt = latest_wsc_ppt.to_crs(3005)
# Make sure the output folder exists on a fresh checkout before writing.
os.makedirs('data', exist_ok=True)
latest_wsc_ppt.to_file(f'data/{example_stn}_pour_point.geojson')

In [6]:
# Load the HYSETS watershed boundary shapefile and preview the first rows.
hysets_polygon_folder = '/home/danbot2/code_5820/large_sample_hydrology/common_data/HYSETS_data/HYSETS_watershed_boundaries'
hysets_shapefile = os.path.join(
    hysets_polygon_folder,
    'HYSETS_watershed_boundaries_20200730.shp',
)
hs_df = gpd.read_file(hysets_shapefile)
hs_df.head()

Unnamed: 0,features,Name,OfficialID,FlagPAVICS,Source,Area,geometry
0,1,SAINT JOHN RIVER AT FORT KENT,01AD002,0,HYDAT,14703.921076,"POLYGON ((-69.27594 47.81781, -69.27511 47.817..."
1,1,ST. FRANCIS RIVER AT OUTLET OF GLASIER LAKE,01AD003,0,HYDAT,1358.643465,"POLYGON ((-68.95533 47.20243, -68.95589 47.202..."
2,1,MADAWASKA (RIVIERE) A 6 KM EN AVAL DU BARRAGE ...,01AD015,1,HYDAT,2712.0,"POLYGON ((-68.30417 47.9875, -68.29805 47.9700..."
3,1,FISH RIVER NEAR FORT KENT,01AE001,0,HYDAT,2245.763823,"POLYGON ((-68.5905 47.23913, -68.58887 47.2389..."
4,1,BLACK BROOK NEAR ST-ANDRE-DE-MADAWASKA,01AF006,0,HYDAT,14.2,"POLYGON ((-67.75627 47.07445, -67.75627 47.108..."


In [7]:
# Select the HYSETS polygon for the example station (by its official ID).
hs_basin = hs_df[hs_df['OfficialID'] == example_stn].copy()
assert not hs_basin.empty, f'{example_stn} not found in the HYSETS boundaries'
# Declare the coordinates as EPSG:4326 before reprojecting. set_crs raises if
# the shapefile already declares a different CRS.
hs_basin = hs_basin.set_crs(4326)
hs_basin = hs_basin.to_crs(3005)
# Make sure the output folder exists on a fresh checkout before writing.
os.makedirs('data', exist_ok=True)
hs_basin.to_file(f'data/{example_stn}_HYSETS.geojson')

In [9]:
# Open the candidate polygons from BCUB. Every cell below uses `bc_df`, so this
# read must actually run for the notebook to work from a fresh kernel.
# NOTE(review): path taken from the previously commented-out code -- confirm
# it is still the intended input file.
bc_df = gpd.read_file('data/intersecting_polygons.geojson')
bc_df.crs

In [None]:
# Both layers must share a CRS before their geometries are intersected and
# their areas compared; report both CRSs if they differ.
assert bc_df.crs == latest_wsc_basin.crs, (
    f'CRS mismatch: BCUB polygons use {bc_df.crs}, '
    f'updated WSC basin uses {latest_wsc_basin.crs}'
)

In [None]:
def _matched_wsc_geometry(row):
    """Return the WSC basin geometry that `row` was spatially joined to."""
    return latest_wsc_basin.loc[row['index_right']].geometry


def _overlap_area(row):
    """Area shared by the BCUB polygon in `row` and its matched WSC basin."""
    return row['geometry'].intersection(_matched_wsc_geometry(row)).area


def _overlap_percentage(row):
    """Shared area as a percentage of the matched WSC basin polygon's area."""
    return (row['intersection_area'] / _matched_wsc_geometry(row).area) * 100


# Keep only the BCUB polygons that intersect the updated WSC basin polygon.
intersections = gpd.sjoin(bc_df, latest_wsc_basin, how='inner', predicate='intersects')

# Area of overlap between each BCUB polygon and the WSC polygon it matched.
intersections['intersection_area'] = intersections.apply(_overlap_area, axis=1)

# Overlap expressed relative to the area of the matched WSC polygon.
intersections['percentage'] = intersections.apply(_overlap_percentage, axis=1)


In [None]:
# Compute the distance from each BCUB pour point to the updated WSC pour point.

# The pour points arrive as hex-encoded WKB strings; decode them to shapely
# geometries. Values that are already decoded are left alone so the cell can be
# re-run without failing on its own output.
intersections['pour_pt'] = intersections['pour_pt'].apply(
    lambda x: loads(x, hex=True) if isinstance(x, str) else x)

# Measure against the single WSC pour point geometry. Passing the whole
# GeoSeries to `distance` does not give one scalar per row (depending on the
# Shapely version it yields an array-like per row or raises).
# NOTE(review): assumes the decoded pour points are in the same CRS as
# latest_wsc_ppt (EPSG:3005) -- the WKB itself carries no CRS; confirm.
wsc_ppt_geom = latest_wsc_ppt.geometry.iloc[0]
intersections['distance_to_wsc_ppt'] = intersections['pour_pt'].apply(
    lambda pt: pt.distance(wsc_ppt_geom))
intersections.head()

In [None]:
# List the columns available after the spatial join and the added metrics.
intersections.columns

In [None]:
# Keep only the columns needed to rank candidates, then order them from the
# largest to the smallest percentage of overlapping area.
ranking_columns = [
    'intersection_area',
    'percentage',
    'region_code',
    'geometry',
    'distance_to_wsc_ppt',
]
out_df = intersections[ranking_columns].sort_values('percentage', ascending=False)
# Show the 20 candidates with the highest overlap.
out_df.head(20)

The above result shows that there are many polygons that match very closely by percent overlap.  Now sort by the distance to the WSC pour point.

In [None]:
# Re-rank the candidates so the pour point nearest the WSC pour point is first.
out_df = out_df.sort_values(by='distance_to_wsc_ppt', ascending=True)
out_df.head()

In [None]:
# Index label of the BCUB polygon retained as the best match.
# NOTE(review): presumably chosen by inspecting the ranked tables above --
# confirm, and update if the input polygons change.
BEST_MATCH_INDEX = 88
out_df = out_df[out_df.index == BEST_MATCH_INDEX]
# The source comparison further down expects exactly one BCUB polygon.
assert len(out_df) == 1, (
    f'expected exactly one polygon with index {BEST_MATCH_INDEX}, found {len(out_df)}'
)
# Make sure the output folder exists on a fresh checkout before writing.
os.makedirs('data', exist_ok=True)
out_df.to_file('data/BCUB_polygon_best_overlap.geojson')
out_df

In [None]:
# Bokeh is used for the interactive outline comparison plot below.
# NOTE(review): ColumnDataSource is not used in the cells visible here.
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
# Configure Bokeh to render figures inline in the notebook output.
output_notebook()

In [None]:
def plot_all_geometries(gdf):
    """Draw the outline of every polygon in ``gdf`` on a single Bokeh figure.

    Each row is drawn with its own line-dash pattern and gets a legend entry
    taken from its ``'name'`` value. Rows whose geometry is neither a Polygon
    nor a MultiPolygon are skipped.

    Parameters
    ----------
    gdf : GeoDataFrame
        Must provide ``iterrows()`` whose rows expose ``'name'`` and
        ``'geometry'``. The axis labels assume geographic (lon/lat)
        coordinates.

    Returns
    -------
    The Bokeh figure, with the legend placed at the bottom right and an
    auto-hiding toolbar. The figure is returned, not shown.
    """
    p = figure(title="", x_axis_label='Longitude', y_axis_label='Latitude',
               width=600, height=400)

    lines = ['solid', 'dashed', 'dotted']
    for n, (_, row) in enumerate(gdf.iterrows()):
        name = row['name']
        geometry = row['geometry']
        # Cycle through the dash patterns so more than three rows can be drawn
        # without running off the end of the list.
        dash = lines[n % len(lines)]

        if geometry.geom_type == 'Polygon':
            x, y = geometry.exterior.xy
            p.line(x, y, line_width=3, color='black',
                   legend_label=name, line_dash=dash)
        elif geometry.geom_type == 'MultiPolygon':
            # Multi-part geometries are not iterable in Shapely 2.x; the
            # `.geoms` accessor works in both 1.x and 2.x.
            for poly in geometry.geoms:
                x, y = poly.exterior.xy
                p.line(x, y, line_width=2, legend_label=name, line_dash=dash)
        # Add more conditions for other geometry types if necessary

    p.legend.location = 'bottom_right'
    p.toolbar.autohide = True

    return p

In [None]:
# Stack the three versions of the basin outline into one frame for plotting.
# Each source is labelled BEFORE concatenation so the label always follows its
# own rows; assigning a fixed list of names afterwards would mislabel (or
# raise) whenever a source does not contribute exactly one row.
sources = [('HYSETS', hs_basin), ('WSC', latest_wsc_basin), ('BCUB', out_df)]
labelled = [src[['geometry']].assign(name=label) for label, src in sources]
comb_df = gpd.GeoDataFrame(pd.concat(labelled), crs='EPSG:3005')
# Convert to EPSG:4326 so the plot axes are longitude / latitude.
comb_df = comb_df[['name', 'geometry']].to_crs(4326)
# Make sure the output folder exists on a fresh checkout before writing.
os.makedirs('data', exist_ok=True)
comb_df.to_file(f'data/{example_stn}_source_comparison.geojson')

In [None]:
# Build the Bokeh figure comparing the outlines held in comb_df.
p1 = plot_all_geometries(comb_df)

In [None]:
# Render the comparison figure.
show(p1)