# 3. Pour Point Extraction

```{figure} img/pour_points.png
---
width: 600px
---
In this notebook, we will derive a set of pour points describing confluences that will be used to derive basins and extract basin attributes.
```

In this notebook, we'll use the stream network generated in the previous notebook to find all river confluences.  The set of confluences will be filtered using the [National Hydrographic Network](https://natural-resources.canada.ca/science-and-data/science-and-research/earth-sciences/geography/topographic-information/geobase-surface-water-program-geeau/national-hydrographic-network/21361) waterbodies geometry to remove spurious confluences within lakes.  The remaining points will serve as input for basin delineation.  

The following files were pre-processed for the purpose of demonstration since the [original files cover all of Canada and are as a result very large](https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_nhn_rhn/gpkg_en/CA/).  The files below (may) need to be downloaded and saved to `content/notebooks/data/region_polygons/`.  

* `Vancouver_Island.geojson`: this is the polygon describing Vancouver Island.  It was used to capture just the waterbody geometries on Vancouver Island.
* `Vancouver_Island_lakes.geojson`: the water bodies polygon set for Vancouver Island.


```{figure} img/filtered_pts_example.png
---
width: 600px
---
The steps in this notebook produce a set of river confluences (blue), with spurious points within lakes (red) filtered out.  Lake boundaries are traversed to find lake inflows.  
```

In [None]:
import os
from utilities import *
from shapely.geometry import Point, LineString, Polygon
import multiprocessing as mp
import geopandas as gpd
import time
import numpy as np
import pandas as pd
# Resolve project folders relative to the notebook's working directory:
# base_dir is the parent of the current working directory, and dem_folder
# is where the processed DEM rasters are read from.
base_dir = os.path.split(os.getcwd())[0]
dem_folder = os.path.join(base_dir, 'notebooks/data/processed_dem/')
# echo the base directory in the notebook output
base_dir

```{note}
For clarity, some functions have been relegated to a separate file.  To find more detail, see `utilities.py`.
```

In [None]:
# Create the folder where the pour point geometry information will be saved.
# makedirs(exist_ok=True) also creates any missing parent folders and does
# not fail if the folder already exists.
pour_pt_path = os.path.join(base_dir, 'notebooks/data/pour_points/')
os.makedirs(pour_pt_path, exist_ok=True)

## Import rasters (flow direction, accumulation, stream network)

In [None]:
# Build the paths of the four input rasters for the region of interest:
# D8 flow direction pointer, flow accumulation, stream mask and stream links.
region = 'Vancouver_Island'
d8_path, acc_path, stream_path, stream_link_path = (
    os.path.join(dem_folder, f'{region}_{suffix}.tif')
    for suffix in ('d8_pointer', 'acc', 'streams', 'stream_links')
)

Here we'll set a minimum threshold of 5 $km^2$ to limit the number of confluences for the sake of this demonstration. 

In [None]:
rt0 = time.time()

# retrieve_raster returns a 3-tuple; only the first element (the raster) is used here.
stream, stream_links, fdir, acc = (
    retrieve_raster(path)[0]
    for path in (stream_path, stream_link_path, d8_path, acc_path)
)

# cell size along each axis, as absolute values
resolution = stream.rio.resolution()
dx, dy = (abs(step) for step in resolution[:2])
print(f'Raster resolution is {dx:.0f}x{dy:.0f}m')

# get raster data in matrix form (first band of each raster)
S, F, A = (raster.data[0] for raster in (stream, fdir, acc))

# EPSG code of the stream raster, reused for the vector outputs
stream_crs = stream.rio.crs.to_epsg()

rt1 = time.time()
print(f'   ...time to load resources: {rt1-rt0:.1f}s.')

In [None]:
# minimum basin area, in km^2
min_basin_area = 5
# Convert the area threshold to a number of raster cells
# (km^2 -> m^2, divided by the area of a single cell).
cell_area = dx * dy
basin_threshold = int(min_basin_area * 1E6 / cell_area)

Create a list of coordinates representing all the stream cells.

In [None]:
# (row, column) index pairs of every cell flagged as stream (value 1) in S
stream_px = np.argwhere(np.equal(S, 1))

## Define confluence points in the stream network

Below we create a dictionary of potential pour points corresponding to confluences.  

We iterate through all the stream pixels, retrieve a 3x3 window of flow direction raster around each one, and check if it has more than one stream cell pointing towards it.

In [None]:
# Dictionary of candidate pour points keyed by the 'row,col' pixel index.
ppts = {}
nn = 0

for i, j in stream_px:
    key = f'{i},{j}'
    # reuse the record if an earlier iteration already created it
    record = ppts.setdefault(key, {})

    # flow accumulation at the focus cell
    record['acc'] = A[i, j]

    # The focus cell is already known to be a stream cell, so a flow
    # direction of 0 means there is no downstream direction: it is an outlet.
    # River outlets are treated as confluences by definition and are
    # especially prevalent in coastal regions.
    is_outlet = bool(F[i, j] == 0)
    record['OUTLET'] = is_outlet
    if is_outlet:
        record['CONF'] = True

    # Windows of the stream and d8 pointer rasters centred on the focus
    # cell (clipped at the top and left raster edges).
    rows = slice(max(0, i-1), i+2)
    cols = slice(max(0, j-1), j+2)
    stream_window = S[rows, cols].copy()
    fdir_window = F[rows, cols].copy()

    # boolean matrix for cells that flow into the focal cell
    # (per the original note; see mask_flow_direction in utilities.py)
    inflow_mask = mask_flow_direction(stream_window, fdir_window)

    # check if the cell is a stream confluence; the helper returns the
    # updated dictionary
    ppts = check_for_confluence(i, j, ppts, stream_window, inflow_mask)


Convert the dictionary of stream confluences to a geodataframe in the same CRS as our raster.

In [None]:
# Build the table of candidate pour points, or reuse a previously saved file.
output_ppt_path = os.path.join(pour_pt_path, f'{region}_pour_.geojson')
print(output_ppt_path)
if os.path.exists(output_ppt_path):
    print('existing file')
    ppt_df = gpd.read_file(output_ppt_path)
else:
    t0 = time.time()
    # one row per stream cell, with the 'row,col' key moved into a column
    ppt_df = (
        pd.DataFrame.from_dict(ppts, orient='index')
        .rename_axis('cell_idx')
        .reset_index()
    )

    # split the cell indices into columns and convert str-->int
    index_pairs = [cell.split(',') for cell in ppt_df['cell_idx']]
    ppt_df['ix'] = [int(pair[0]) for pair in index_pairs]
    ppt_df['jx'] = [int(pair[1]) for pair in index_pairs]

    # keep only the stream points that are an outlet or a confluence
    is_pour_point = ppt_df['OUTLET'].eq(True) | ppt_df['CONF'].eq(True)
    ppt_df = ppt_df[is_pour_point]
    print(f' There are {len(ppt_df)} confluences and outlets combined in the {region} region.')


In [None]:
# Summarise how many stream cells were flagged as confluences and outlets.
n_pts_tot = len(stream_px)

# .eq(True) treats missing flags as False, matching the filter used when
# the table was built.
is_conf = ppt_df['CONF'].eq(True)
is_outlet = ppt_df['OUTLET'].eq(True)

n_pts_conf = int(is_conf.sum())
n_pts_outlet = int(is_outlet.sum())
# Confluences that are not also outlets: the count and the percentage on
# the "stream confluences" line must describe the same set of points.
n_pts_conf_only = int((is_conf & ~is_outlet).sum())

# avoid a ZeroDivisionError when the stream raster has no stream cells
pct_conf_only = 100 * n_pts_conf_only / n_pts_tot if n_pts_tot else 0.0
pct_outlet = 100 * n_pts_outlet / n_pts_tot if n_pts_tot else 0.0

print(f'Of {n_pts_tot} total stream cells:')
print(f'    {n_pts_conf_only} ({pct_conf_only:.1f}%) are stream confluences,')
print(f'    {n_pts_outlet} ({pct_outlet:.1f}%) are stream outlets.')


```{note}
The pour points are thus far only described by their raster pixel indices; we still need to apply a transform to map the indices to projected coordinates.
```

In [None]:
# Convert the pixel-indexed pour point table into a GeoDataFrame.
# NOTE(review): create_pour_point_gdf is not defined in this notebook
# (presumably it lives in utilities.py); it receives the output path, so it
# presumably also writes the result to `output_ppt_path` -- confirm.
ppt_gdf = create_pour_point_gdf(region, stream, ppt_df, stream_crs, output_ppt_path)

In [None]:
# ta = time.time()
# polygon_path = os.path.join(base_dir, f'notebooks/data/region_polygons/{region}.geojson')
# region_polygon = gpd.read_file(polygon_path)
# # reproject to match nhn crs
# # region_polygon = region_polygon.to_crs(4617)
# tb = time.time()
# print(f'   ...region polygon opened in {tb-ta:.2f}s')


## Filter spurious confluences


One issue with the stream network algorithm is that it does not identify lakes.  There are many lakes on Vancouver Island, so we want to remove the spurious confluence points that fall within lakes and add points where rivers empty into lakes.  We can use hydrographic information from the [National Hydrographic Network](https://natural-resources.canada.ca/science-and-data/science-and-research/earth-sciences/geography/topographic-information/geobase-surface-water-program-geeau/national-hydrographic-network/21361) to do so.

```{tip}
Lake polygons for Vancouver Island are saved under `content/notebooks/data/region_polygons/Vancouver_Island_lakes.geojson`
```



### Get the water body geometries that contain confluence points

From the [NHN documentation](https://ftp.maps.canada.ca/pub/nrcan_rncan/vector/geobase_nhn_rhn/doc/GeoBase_nhn_en_Catalogue_1_2.pdf):

Permanency code:
* -1 unknown
* 0 no value available
* 1 permanent
* 2 intermittent

    
| water_definition | Label | Code Definition |
|------------------|-------|-----------------|
| None | 0 | No Waterbody Type value available. |
| Canal | 1 | An artificial watercourse serving as a navigable waterway or to channel water. |
| Conduit | 2 | An artificial system, such as an Aqueduct, Penstock, Flume, or Sluice, designed to carry water for purposes other than drainage. |
| Ditch | 3 | Small, open manmade channel constructed through earth or rock for the purpose of conveying water. |
| *Lake | 4 | An inland body of water of considerable area. |
| *Reservoir | 5 | A wholly or partially manmade feature for storing and/or regulating and controlling water. |
| Watercourse | 6 | A channel on or below the earth's surface through which water may flow. |
| Tidal River | 7 | A river in which flow and water surface elevation are affected by the tides. |
| *Liquid Waste | 8 | Liquid waste from an industrial complex. |

```{warning}
The label "10" also exists, though I have not found a corresponding definition.  From the image below, it appears they may represent seasonal channels.  Light blue regions are lakes (4) and watercourses (6).
```

```{figure} img/label_10.png
---
width: 400px
---
Darker grey polygons labeled with the code "10" appear to be seasonal channels.
```

In [None]:
# read the pre-processed lakes polygon file
region_lakes_path = os.path.join(base_dir, f'notebooks/data/region_polygons/{region}_lakes.geojson')
lakes_df = gpd.read_file(region_lakes_path)
# drop leftover spatial-join index columns if the file contains them
lakes_df = lakes_df.drop(columns=['index_right', 'index_left'], errors='ignore')

# The lake polygons must share the pour points' CRS for the spatial joins
# below.  Reproject instead of asserting, so that a lakes file stored in a
# different CRS is handled rather than aborting the notebook.
if lakes_df.crs is None:
    raise ValueError(f'{region_lakes_path} has no CRS; cannot align it with the pour points.')
if lakes_df.crs != ppt_gdf.crs:
    lakes_df = lakes_df.to_crs(ppt_gdf.crs)

```{warning}
Below we apply some subjective criteria to improve the performance of the lake inflow point discovery:
1. Remove lakes smaller than 0.01 $km^2$ to speed up the spatial join.
2. Only process lakes that contain confluence points in order to relocate points to river mouths.
3. Manipulate the lake polygons to smooth the edges -- Where the stream raster disagrees with the NHN polygons it tends to generate spurious inflow points and this step is to mitigate the issue. 
4. Require a minimum distance to existing confluence points (> 4 pixels).
```

In [None]:
# Reproject to the (projected) CRS of the pour points before calculating
# area, so the area is expressed in the units of that CRS.
lakes_df = lakes_df.to_crs(ppt_gdf.crs)
lakes_df['area'] = lakes_df.geometry.area
# keep the original row index as an identifier for each lake polygon
lakes_df['lake_id'] = lakes_df.index.to_numpy()

In [None]:
# filter lakes smaller than 0.01 km^2
# (assumes the CRS units are metres, so 10000 is in m^2 -- TODO confirm)
min_area = 10000
lakes_df = lakes_df[lakes_df['area'] > min_area]
lakes_df = lakes_df[['acquisition_technique', 'lake_id', 'area', 'water_definition', 'planimetric_accuracy', 'permanency', 'geometry']]
# filter by water_definition code
# get lakes and reservoirs (4 & 5)
lakes_df = lakes_df[lakes_df['water_definition'].isin([4, 5])]
# merge touching/overlapping polygons, then split back into single polygons
lakes_df = lakes_df.dissolve().explode(index_parts=False).reset_index(drop=True)
# find and fill holes in polygons
lakes_df.geometry = [Polygon(p.exterior) for p in lakes_df.geometry]
# dissolve() without a `by` key collapses everything into one row, so after
# explode() every polygon carries the attributes of the first input row.
# Recompute the per-polygon identifier and area so they describe the
# polygon they are attached to.
lakes_df['lake_id'] = lakes_df.index.values
lakes_df['area'] = lakes_df.geometry.area

### Simplify lake geometries

Misalignment of the derived stream network and the hydrographic information from the NHN produces spurious points when we try to find streams flowing into lakes.  Simplifying (smoothing) the lake polygons trims long narrow segments classified as lake, where feature misalignment is most likely to occur.

```{figure} img/simplified_polygon.png
---
width: 400px
---
A simplified polygon reduces the perimeter of the lake polygon in order to capture where stream lines cross the lake boundary.
```

In [None]:
def trim_appendages(row):
    """Return the largest single part of a row's geometry.

    Buffering a lake polygon inward can split it into several pieces.  When
    the geometry is a multi-part geometry with more than one part, only the
    part with the largest area is kept; otherwise the geometry is returned
    unchanged.

    Parameters
    ----------
    row : mapping
        A GeoDataFrame row (or any mapping) with a 'geometry' entry.

    Returns
    -------
    The largest part by area, or the original geometry when it has at most
    one part.
    """
    geometry = row['geometry']
    # Multi-part geometries expose their parts through `.geoms`; single
    # geometries do not.  Working on the parts directly avoids building a
    # GeoDataFrame (with a hard-coded CRS) for every row.
    parts = list(getattr(geometry, 'geoms', ()))
    if len(parts) > 1:
        # max() returns the first of equally large parts
        return max(parts, key=lambda part: part.area)
    return geometry
          

In [None]:
# Join lakes to pour points; lakes without any point get index_right == nan.
lakes_with_pts = gpd.sjoin(lakes_df, ppt_gdf, how='left', predicate='intersects')
has_points = lakes_with_pts['index_right'].notna()
lakes_with_pts = lakes_with_pts[has_points]
# a lake appears once per matched point: keep its first row only
first_occurrence = ~lakes_with_pts.index.duplicated(keep='first')
lakes_with_pts = lakes_with_pts[first_occurrence]
lakes_with_pts['area'] = lakes_with_pts.geometry.area

# Smooth each polygon with a negative buffer followed by a larger positive
# buffer, then simplify with a tolerance of one cell width.
distance = 100  # metres
shrunk = lakes_with_pts.buffer(-distance)
regrown = shrunk.buffer(distance * 1.5)
lakes_with_pts.geometry = regrown.simplify(dx)
# polygons that vanished under the negative buffer are dropped
still_present = ~lakes_with_pts.geometry.is_empty
lakes_with_pts = lakes_with_pts[still_present].copy()
# where smoothing split a lake into pieces, keep only the largest piece
lakes_with_pts['geometry'] = lakes_with_pts.apply(trim_appendages, axis=1)

lake_cols = [
    'acquisition_technique', 'lake_id', 'area',
    'water_definition', 'planimetric_accuracy', 'permanency', 'geometry',
]
lakes_with_pts = lakes_with_pts[lake_cols]
# remove empty geometries
lakes_with_pts = lakes_with_pts[~lakes_with_pts.geometry.is_empty]
simplified_lakes_path = os.path.join(
    base_dir, f'notebooks/data/region_polygons/{region}_lakes_simplified.geojson'
)
lakes_with_pts.to_file(simplified_lakes_path)

n_lakes = len(lakes_with_pts)
print(f'    {n_lakes} water body objects in {region} polygon.')

In [None]:
# Tag every pour point with the simplified lake (if any) that contains it.
lake_ppts = gpd.sjoin(ppt_gdf, lakes_with_pts, how='left', predicate='within')
# points with no matching lake (index_right is nan) are the ones to keep
outside_lakes = lake_ppts['index_right'].isna()
filtered_ppts = lake_ppts[outside_lakes]
print(f'    {len(filtered_ppts)}/{len(ppt_df)} confluence points are not in lakes ({len(ppt_df) - len(filtered_ppts)} points removed).')

### Find and add lake inflows

We'll only check lakes that have spurious confluences.  The general idea is to shift each in-lake confluence to the inflow location.  The method works best for large lake polygons and relatively smooth geometries where the stream network and NHN features align well, but it adds unnecessary points in other locations.  A few examples of good and bad behaviour are shown below.  

```{figure} img/lake_points_removed.png
---
width: 600px
---
Confluence points within lakes have been removed, while river mouths have been added.
```


In [None]:
def find_link_ids(target):
    """
    Find the stream link at, or next to, a point on a lake boundary.

    `target` is a tuple `(x, y, dx, dy, lake, stream_raster)`:
    the coordinates of a boundary cell, the half-width of the search window
    in x and y, the lake polygon, and the stream link raster.

    The raster is clipped to the window `[x-dx, x+dx] x [y-dy, y+dy]`
    (a 3x3 window when dx, dy equal the cell size).  If the target cell
    itself holds a (non-nan) link id, the point and that id are returned.
    Otherwise every cell of the window with a positive link id is checked,
    and the first one lying outside the lake polygon is returned.
    Link IDs are returned so the caller can keep each stream link uniquely
    represented.

    Returns a `(Point, link_id)` tuple, or None when the window holds no
    data or no suitable neighbouring stream cell is found.  The boundary
    points are interpolated along the shoreline, so not all lake-stream
    confluences will be captured.
    """
    x, y, dx, dy, lake, stream_raster = target

    W = stream_raster.rio.clip_box(x-dx, y-dy, x+dx, y+dy)
    if np.isnan(W.data).all():
        return None

    # value of the target cell itself
    link_id = W.sel(x=x, y=y).squeeze().compute().item()
    if not np.isnan(link_id):
        return (Point(x, y), link_id)

    # Keep only cells with a positive link id; everything else becomes nan.
    candidates = W.where(W > 0, drop=True)
    # Visit every (x, y) combination of the remaining window.  Zipping the
    # two coordinate vectors would only visit the diagonal pairs.
    for x1 in candidates.x.values:
        for y1 in candidates.y.values:
            link_id = candidates.sel(x=x1, y=y1).squeeze().item()
            if np.isnan(link_id):
                continue
            pt = Point(x1, y1)
            # the inflow point must lie outside the lake polygon
            if not lake.contains(pt):
                return (pt, link_id)

    return None

### Search along the boundary for lake-river confluences

This step takes **18 minutes** to process on a six core intel i7-8850H @2.6 GHz.

In [None]:
# Walk the shoreline of every lake that contains pour points and collect
# the stream links that cross it.
crs = stream.rio.crs.to_epsg()
t0 = time.time()
points_to_check = []

# A single worker pool is shared by all lakes; the context manager shuts
# the workers down once every lake has been processed.
with mp.Pool() as pool:
    for n, (_, row) in enumerate(lakes_with_pts.iterrows()):

        lake_geom = row['geometry']
        # resample the shoreline vector to prevent missing confluence points
        resampled_shoreline = redistribute_vertices(lake_geom.exterior, dx).coords.xy
        xs = np.array(resampled_shoreline[0].tolist())
        ys = np.array(resampled_shoreline[1].tolist())

        # Find the cells along the interpolated lake polygon boundary
        px_pts = stream_links.sel(x=xs, y=ys, method='nearest', tolerance=dx)
        coords = list(zip(px_pts['x'].values, px_pts['y'].values))

        # NOTE(review): `mask` does not look like a documented open_rasterio
        # argument (`masked=True` may have been intended) -- confirm.
        stream_raster = rxr.open_rasterio(stream_link_path, mask=lake_geom, lock=False)
        inputs = [(x, y, dx, dy, lake_geom, stream_raster) for x, y in coords]

        results = pool.map(find_link_ids, inputs)

        # find_link_ids returns None when nothing is found: skip those, and
        # drop non stream vals (link id <= 0)
        points_to_check += [r for r in results if r is not None and r[1] > 0]

        if n % 10 == 0:
            t1 = time.time()
            print(f'   Processing lake {n}/{len(lakes_with_pts)}.')
            print(f'      calculated nearest non-nan of {len(xs)} coordinates in {t1-t0:.1f}s ')

t1 = time.time()
ttot = (t1-t0) / 60
print(f'completed in {ttot:.1f} minutes') 

In [None]:
# Tabulate the candidate inflow points, keeping a single point (the first
# one found) for each stream link.
pt_df = (
    pd.DataFrame(points_to_check, columns=['geometry', 'link_id'])
    .drop_duplicates(subset='link_id', keep='first')
)
gdf = gpd.GeoDataFrame(pt_df)

### Check that found points are not too close to an existing point

In [None]:
# Minimum allowed distance (in m) between a new lake inflow point and an
# existing pour point; the original note puts 200 m at roughly 8 pixels.
min_spacing = 200

# Don't let adjacent points both be pour points, but avoid measuring
# distance to points within lakes: compare against the filtered set (the
# points that are kept), not against the full set that still contains the
# in-lake confluences.
existing_pts = filtered_ppts.geometry

all_pts = []
for n, (_, row) in enumerate(gdf.iterrows(), start=1):
    if n % 250 == 0:
        # report progress against the de-duplicated set actually iterated
        print(f'{n}/{len(gdf)} points checked.')

    candidate = row['geometry']
    if existing_pts.empty:
        # nothing to be too close to
        all_pts.append(candidate)
        continue

    nearest_neighbour = existing_pts.distance(candidate).min()
    # keep the point only if it is further than min_spacing from every
    # existing pour point
    if nearest_neighbour > min_spacing:
        all_pts.append(candidate)

# Format the river mouth points and append them to the filtered set; the
# attribute columns are empty (nan) for the new points.
new_pts = gpd.GeoDataFrame(geometry=all_pts, crs=f'EPSG:{crs}')
final_ppts = gpd.GeoDataFrame(pd.concat([filtered_ppts, new_pts], axis=0), crs=f'EPSG:{crs}')
final_ppts = final_ppts[['cell_idx', 'acc', 'OUTLET', 'CONF', 'ix', 'jx', 'geometry']]
final_ppts

Format the river mouth points into a geodataframe and append it to the filtered set.

Save the output

In [None]:
# new_pts = gpd.GeoDataFrame(geometry=all_pts_filtered, crs=f'EPSG:{stream_crs}')
# pour_points = gpd.GeoDataFrame(pd.concat([filtered_ppts, new_pts], axis=0), crs=f'EPSG:{stream_crs}')
# Write the lake inflow points to the pour points folder.
# NOTE(review): only `new_pts` (the added river mouth points) is written
# here, not the combined `final_ppts` set -- confirm this is intended.
filtered_output_path = os.path.join(
    base_dir, f'notebooks/data/pour_points/{region}_pour_points_filtered1.geojson'
)
new_pts.to_file(filtered_output_path)