In [None]:
import os
from time import time

import numpy as np
import pandas as pd
import geopandas as gpd
from osgeo import gdal

import multiprocessing as mp

import rioxarray as rxr
from numba import njit

from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, Whisker
from bokeh.io import output_notebook
from bokeh.palettes import Colorblind3
# from bokeh.layouts import gridplot, row, column
output_notebook()

import warnings
warnings.filterwarnings('ignore')

In [None]:
from whitebox.whitebox_tools import WhiteboxTools

# Single WhiteboxTools handle; its `slope` tool is used below to derive
# the region-wide slope rasters.
wbt = WhiteboxTools()
# Verbose output is switched off here; swap in the commented line to turn it on.
wbt.verbose = False
# wbt.verbose = True

## Evaluate the effect of dem resolution on basin representation and attribute estimation

As the basin scale decreases, methodological choices begin to have a significant impact on the number of cells captured and used to represent a basin. In addition, some attributes are affected by DEM resolution, in particular terrain attributes.  Here we investigate two examples. 

Consider a square grid intersected by an arbitrary closed curvilinear loop.  If we colour the grid cells intersected by the loop red, and we colour any cell lying entirely inside the loop blue, then the proportion of edge cells, $P_{edge}$, is the number of red cells divided by the number of red plus blue cells.

Now if we hold the loop constant and change the grid cell dimension $d_{grid}$, intuitively $P_{edge}$ will increase as the grid cell size increases since eventually the loop will be encompassed by a single grid cell.  $P_{edge}$ will approach 1 as $d_{grid}$ increases.

When we extract basin attributes from a geospatial raster using a basin polygon, at what point does a choice affecting the number of edge pixels become significant?  At some combination of raster resolution and basin size, the proportion of edge pixels will be significant.  The purpose of this exercise is to compare DEM datasets of two different resolutions to get a sense of the basin size at which the edge pixel method chosen becomes significant to the representativeness of the sample of raster pixels used to compute basin attributes.

## tl;dr

Compare 30m vs. 90m dem using a large sample of basin polygons over a wide scale to see when the basin is mostly edge pixels that are either included or excluded depending upon the clipping method.

Once the basin clipping is done, the mean slope is computed for each basin from two different DEM sources to compare the effect of DEM resolution on mean basin slope calculation.

## Method

The method is as follows:
1. Select a random sample of 10k basin polygons from the BCUB polygon set.
2. Use each polygon as a clipping mask and create a temporary clipped DEM to represent the basin pixels. Save these as temporary files because they will be ingested by the WhiteboxTools "Slope" tool in the subsequent slope comparison.
3. For each clipped raster:    
   a. Find all non-NaN indices in the clipped raster.  An edge pixel is one which has at least one NaN neighbour.  
   b. Count all edge pixels and divide by the total number of non-NaN indices.  
   c. The process assumes there are no data gaps in the basin prior to clipping.  This is checked by first counting NaN values for pixels within the polygon and asserting equal to zero.

In [None]:
# The project root is taken to be the parent of the current working directory,
# so the notebook is expected to be run from a sub-folder of the project.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, 'processed_data')
# HYSETS_DIR = os.path.join(BASE_DIR, '/home/danbot2/code_5820/large_sample_hydrology/common_data/HYSETS_data/'

# This assumes you've downloaded a basin polygon set 
# and have processed the DEM for the same region 
# with both the 3DEP and EENV DEM
# NOTE(review): this comment originally said "here we'll do it for 08D (central coast)",
# but the code selects 'FRA', which the raster functions in this notebook
# map to the 'Fraser' file-name prefix.
region_code = 'FRA'
BASIN_DIR = os.path.join(DATA_DIR, 'basin_attributes/polygons')

# update with the path of processed DEM
# these scripts would be already run in the dataset replication
# see the following scripts in setup_scripts/
#     * get_3DEP_DEM.py   <--retrieves the USGS 3DEP dem
#     * get_EENV_DEM.py   <--retrieves the EarthEnv 90m dem
#     * clip_region_DEM.py <-- takes the assembled tiles and creates the region DEM
#       (presumably the *_3005_res1.tif files read below -- TODO confirm)

# NOTE(review): absolute, machine-specific path; point this at your own
# processed DEM folder before running.
DEM_PATH = '/home/danbot2/code_5820/large_sample_hydrology/common_data/DEM_data/processed_dem/'


In [None]:
# Scratch area for the per-basin polygon files and the clipped rasters.
temp_folder = os.path.join(BASE_DIR, 'validation/tmp')
tmp_basins = os.path.join(temp_folder, 'basin_polygons/')

# exist_ok=True replaces the check-then-create pattern: the cell is safe to
# re-run and cannot fail if the folder appears between the check and the call.
os.makedirs(tmp_basins, exist_ok=True)

In [None]:
def compute_slope_on_region_raster(region_code, DEM_source):
    """
    Compute (or reuse) the slope raster of a full-region DEM.

    Parameters
    ----------
    region_code : str
        Region identifier; 'FRA' is mapped to the 'Fraser' file-name prefix.
    DEM_source : str
        Name of the DEM sub-folder under DEM_PATH (the notebook uses
        'EENV_DEM90' and 'USGS_3DEP').

    Returns
    -------
    str
        Path of the slope raster (input file name with 'res1.tif'
        replaced by 'slope.tif').  The WhiteboxTools slope tool is only
        run when that file does not exist yet.
    """
    if region_code == 'FRA':
        region_code = 'Fraser'

    # The file-name pattern depends on the DEM source.  Use the DEM_source
    # argument here: the previous version tested the notebook-global `src`,
    # which only worked when the caller's loop variable happened to be
    # called `src` and to hold the same value.
    if 'USGS' in DEM_source:
        src_dem_fname = f'{DEM_source}/{region_code}_{DEM_source}_3005_res1.tif'
    else:
        src_dem_fname = f'{DEM_source}/{region_code}_EENV_DEM_3005_res1.tif'

    output_fname = src_dem_fname.replace('res1.tif', 'slope.tif')
    output_fpath = os.path.join(DEM_PATH, output_fname)
    region_fpath = os.path.join(DEM_PATH, src_dem_fname)

    # the region-wide slope raster is cached on disk between runs
    if not os.path.exists(output_fpath):
        wbt.slope(
            region_fpath,
            output_fpath,
            zfactor=None,
            units="degrees",
            callback=None
        )

    return output_fpath

In [None]:
# One slope raster per DEM source, in the same order as the source names.
region_slope_paths = []
for src in ('EENV_DEM90', 'USGS_3DEP'):
    region_slope_paths.append(compute_slope_on_region_raster(region_code, src))

In [None]:
# Basin polygons for the selected region, stored as a GeoParquet file.
geom_fpath = f'{region_code}_basin_geometries.parquet'
basin_geometry_path = os.path.join(BASIN_DIR, geom_fpath)
df = gpd.read_parquet(basin_geometry_path)

In [None]:
# select a random subset of 10K basin polygons
select_new = False
# integer target so it can be passed straight to np.random.choice
sample_size = 10_000

if select_new:
    ids = [int(e) for e in np.random.choice(df['id'].values, size=sample_size, replace=False)]
    print(len(ids))
else:
    existing_basin_files = os.listdir(os.path.join(temp_folder, 'basin_polygons'))
    # File names look like basin_<id>_b<buffer>_<crs>.<ext>.  Parse the id back
    # to an integer: the zero-padded strings never match the integer ids that
    # the 'basin_{id:05d}' naming implies for df['id'], so both the `isin`
    # selection and the overlap check below silently did nothing with strings.
    ids = sorted({int(e.split('_')[1]) for e in existing_basin_files if e.startswith('basin_')})
    print(f'{len(ids)} existing basin ids to process')
    if len(ids) < sample_size:
        # top up the sample, drawing only from ids that are not already selected
        already_selected = set(ids)
        candidates = [int(e) for e in df['id'].values if int(e) not in already_selected]
        n_new = min(sample_size - len(ids), len(candidates))
        add_ids = [int(e) for e in np.random.choice(candidates, size=n_new, replace=False)]
        assert already_selected.isdisjoint(add_ids)
        ids += add_ids
        print(f'{len(ids)} basin ids to process (existing + newly sampled)')
        

## Clip the DEM with each polygon and create temporary raster files

In [None]:
def retrieve_raster(fpath):
    """
    Open a raster with rioxarray (masked=True, mask_and_scale=True) and
    return it together with its CRS and affine transform.

    Returns a tuple (raster, crs, affine).
    """
    raster = rxr.open_rasterio(fpath, masked=True, mask_and_scale=True)
    return raster, raster.rio.crs, raster.rio.transform(recalc=False)

In [None]:
def clip_rasters(input):
    """
    Clip the region DEM to one basin polygon and write it to a temporary GeoTIFF.

    Parameters
    ----------
    input : tuple
        (basin_id, polygon, DEM_source, buffer, region_code, temp_folder)
        * basin_id    -- integer basin identifier (used in the file names)
        * polygon     -- basin geometry (presumably a shapely geometry in EPSG:3005)
        * DEM_source  -- DEM sub-folder under DEM_PATH ('EENV_DEM90' or 'USGS_3DEP')
        * buffer      -- 0: no buffer, 1: buffer by the pixel diagonal,
                         2: buffer by the larger pixel dimension
        * region_code -- region identifier ('FRA' maps to the 'Fraser' prefix)
        * temp_folder -- folder receiving the polygon and raster files

    Returns
    -------
    str
        Path of the clipped raster.  Existing polygon and raster files are reused.

    Raises
    ------
    ValueError
        If `buffer` is not 0, 1 or 2.
    Exception
        If the geometry cannot be reduced to a single Polygon.
    """
    basin_id, polygon, DEM_source, buffer, region_code, temp_folder = input

    rc = 'Fraser' if region_code == 'FRA' else region_code
    # Choose the file-name pattern from the DEM_source argument; the previous
    # version read the notebook-global `src` here.
    if 'USGS' in DEM_source:
        src_dem_fname = f'{DEM_source}/{rc}_{DEM_source}_3005_res1.tif'
    else:
        src_dem_fname = f'{DEM_source}/{rc}_EENV_DEM_3005_res1.tif'
    input_raster_path = os.path.join(DEM_PATH, src_dem_fname)

    # if you want to test the effect of adding a buffer to the polygon
    if buffer != 0:
        if buffer not in (1, 2):
            raise ValueError(f'unsupported buffer option: {buffer}')
        # Take the pixel size from the raster that is actually being clipped.
        dem, _, _ = retrieve_raster(input_raster_path)
        res = dem.rio.resolution()
        if buffer == 1:
            buff = np.sqrt(res[0]**2 + res[1]**2)
        else:
            buff = max(abs(res[0]), abs(res[1]))
        # Buffer the geometry itself, before it is saved, so that the cutline
        # written below really carries the buffer.
        polygon = polygon.buffer(buff)

    # save the polygon to a temp file
    crs = 3005
    basin_fname = f'basin_polygons/basin_{basin_id:05d}_b{buffer}_{crs}.shp'
    basin_fpath = os.path.join(temp_folder, basin_fname)
    if not os.path.exists(basin_fpath):
        basin_data = {
            'id': [basin_id],
            'region_code': [region_code]
        }
        bdf = gpd.GeoDataFrame(basin_data, geometry=[polygon], crs=crs)
        geom_type = bdf.geometry.values[0].geom_type

        if geom_type != 'Polygon':
            print(f'geom is {geom_type}')
            if geom_type == 'GeometryCollection':
                bdf = gpd.GeoDataFrame(geometry=[polygon], crs=crs).dissolve()
            elif geom_type == 'MultiPolygon':
                print(f'multipolygon found: {basin_fpath}')
                bdf = gpd.GeoDataFrame(geometry=[polygon], crs=crs)
                bdf = bdf.explode(index_parts=False)
                # drop sliver parts below 2E3 (area units of the CRS)
                bdf['area'] = bdf.geometry.area
                bdf = bdf[bdf['area'] >= 2E3]
                if len(bdf) == 0:
                    print(f'no polygon found {basin_fpath}')
                    raise Exception('no geoms left!!')
                elif len(bdf) > 1:
                    raise Exception('too many geoms!!')
            geom_type = bdf.geometry.values[0].geom_type
            if geom_type != 'Polygon':
                raise Exception('fix geom type!!')

        # Write only after validation, so a rejected geometry never leaves a
        # shapefile behind that a later run would silently reuse.
        bdf.to_file(basin_fpath)

    # Clip the raster using the saved basin polygon
    raster_fname = f"{DEM_source}/{int(basin_id):05}_clipped_{buffer}mbuff.tif"
    fpath_out = os.path.join(temp_folder, raster_fname)
    if not os.path.exists(fpath_out):
        g = gdal.Warp(fpath_out, input_raster_path, format="GTiff",
                      cutlineDSName=basin_fpath,
                      cropToCutline=True)
        # dropping the reference closes the dataset and flushes it to disk
        g = None

    return (fpath_out)

In [None]:
tstart = time()
buffer = 0
n_procs = 6

# The basin selection does not depend on the DEM source, so build it once.
selected_df = df[df['id'].isin(ids)].copy()
print(f'{len(selected_df)} items selected for processing')
basin_rows = list(selected_df.itertuples(index=False, name=None))

for src in ['USGS_3DEP', 'EENV_DEM90']:
    tmp_raster_folder = os.path.join(temp_folder, src)
    os.makedirs(tmp_raster_folder, exist_ok=True)
    t1 = time()
    # additional input parameters
    inputs = [e + (src, buffer, region_code, temp_folder) for e in basin_rows]
    print(f'Processing {len(inputs)} with {src} DEM')

    # clip the dems to all basin polygons
    print('Starting raster clipping:')
    # The context manager shuts the worker processes down once map() has
    # returned; previously a new pool was created per source and never closed.
    with mp.Pool(n_procs) as p:
        clip_fpaths = p.map(clip_rasters, inputs)
    print(f'    {len(clip_fpaths)} {src} rasters clipped in {time() - t1:.0f}s')
    

## Count Perimeter Cells

In [None]:
@njit
def edge_pixel_proportion(m):
    """
    Proportion of basin cells that lie on the basin edge.

    The input m is a 2D matrix representing the clipped dem, where every
    cell outside the basin is NaN.  A basin (non-NaN) cell is an edge cell
    when at least one of its eight neighbours is NaN or falls outside the
    matrix.  Out-of-bounds neighbours count as "outside the basin", so basin
    cells on the array border are edge cells and a single-cell basin has a
    proportion of 1.

    Returns edge cells / basin cells as a float, or 0.0 when the matrix
    holds no basin cells.
    """
    n_rows, n_cols = m.shape

    dem_px = 0
    edge_px = 0

    for row in range(n_rows):
        for col in range(n_cols):
            if np.isnan(m[row, col]):
                continue
            dem_px += 1
            # scan the 3x3 neighbourhood, stopping at the first outside cell
            is_edge = False
            for dr in range(-1, 2):
                for dc in range(-1, 2):
                    if dr == 0 and dc == 0:
                        continue
                    r = row + dr
                    c = col + dc
                    if r < 0 or r >= n_rows or c < 0 or c >= n_cols:
                        is_edge = True
                    elif np.isnan(m[r, c]):
                        is_edge = True
                    if is_edge:
                        break
                if is_edge:
                    break
            if is_edge:
                edge_px += 1

    # Return the proportion of edge pixels
    if dem_px == 0:
        return 0.0
    return edge_px / dem_px

In [None]:
def process_basin_raster(f):
    """
    Compute the edge-cell proportion of one clipped basin raster.

    Parameters
    ----------
    f : str
        Path to a clipped raster whose file name starts with the basin id
        followed by an underscore (e.g. '00012_clipped_0mbuff.tif').

    Returns
    -------
    tuple
        (basin_id, pct_edge_px) with basin_id as an int.
    """
    # os.path.basename copes with the platform's path separator,
    # unlike splitting on a hard-coded '/'
    basin_id = int(os.path.basename(f).split('_')[0])
    raster, _, _ = retrieve_raster(f)
    # first band of the raster
    pct_edge_px = edge_pixel_proportion(raster.data[0])
    return (basin_id, pct_edge_px)

In [None]:
perimeter_results = {}
for src in ['EENV_DEM90', 'USGS_3DEP']:
    tmp_raster_folder = os.path.join(temp_folder, src)
    t1 = time()
    # Only pick up the clipped DEM rasters for the current buffer setting.
    # The same folder also receives the '*_clipped_slope_*' rasters written
    # further down, and may hold rasters from runs with another buffer.
    dem_suffix = f'_clipped_{buffer}mbuff.tif'
    raster_paths = sorted(
        os.path.join(tmp_raster_folder, r)
        for r in os.listdir(tmp_raster_folder)
        if r.endswith(dem_suffix)
    )

    # you may run into RAM issues if you parallelize this step
    perimeter_results[src] = [process_basin_raster(f) for f in raster_paths]

    t2 = time()
    # guard against a zero elapsed time when the folder is empty
    ut = len(raster_paths) / max(t2 - t1, 1e-9)
    print(f'{len(raster_paths)} {src} basins processed in {t2-t1:.0f}s ({ut:.1f}/s)')
    
        

In [None]:
# One frame per DEM source, indexed by basin id, holding that source's
# edge-cell proportion.
edge_dfs = []
for src in ('EENV_DEM90', 'USGS_3DEP'):
    edge_df = pd.DataFrame(perimeter_results[src])
    edge_df.columns = ['basin_id', f'pct_edge_cells_{src}']
    edge_df = edge_df.set_index('basin_id')
    edge_dfs.append(edge_df)

In [None]:
# Keep only the first row for any basin id that appears more than once.
for i, frame in enumerate(edge_dfs):
    is_repeat = frame.index.duplicated(keep='first')
    edge_dfs[i] = frame[~is_repeat]

In [None]:
# Keep only basins that have an edge proportion for both DEM sources.
result = pd.concat(edge_dfs, join='inner', axis=1)

# Basin area in km² (polygon area / 1e6), looked up for all basins at once
# instead of scanning the full geometry table once per basin.
basin_area_km2 = pd.Series(df['basin'].area.values / 1e6, index=df['id'].values)
# as before, the first geometry wins if an id occurs more than once
basin_area_km2 = basin_area_km2[~basin_area_km2.index.duplicated(keep='first')]
assert result.index.isin(basin_area_km2.index).all(), 'basin id missing from the geometry table'
result['area'] = basin_area_km2.reindex(result.index).values
print(len(result))
result.head()

## Plot Results

In [None]:
def equiprobable_binning(data, param1, param2, samples_per_bin):
    """
    Summarise `param2` within equal-count (quantile) bins of `param1`.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both columns; it is not modified.
    param1 : str
        Column used to form the bins.
    param2 : str
        Column whose distribution is summarised per bin.
    samples_per_bin : int
        Target number of rows per bin; the number of bins is
        int(len(data) / samples_per_bin).

    Returns
    -------
    evs : pandas.DataFrame
        One row per bin (index 'p_bin', starting at 0) with the bin means of
        both columns plus 'bin_width', 'bin_centre', 'ubnd' (95th pct),
        'lbnd' (5th pct), 'q1', 'q3', 'median' and 'mean' of `param2`.
    edges : numpy.ndarray
        The n_bins + 1 bin edges returned by pandas.qcut.

    Raises
    ------
    ValueError
        If `data` has fewer rows than `samples_per_bin`.
    """
    binned_df = data.copy()
    n_bins = int(len(binned_df) / samples_per_bin)
    if n_bins < 1:
        raise ValueError(
            f'need at least {samples_per_bin} samples to form one bin, got {len(binned_df)}'
        )
    print(f'   Creating {n_bins} bins of {samples_per_bin} samples/bin (N={n_bins* samples_per_bin})')

    _, edges = pd.qcut(binned_df[param1], q=n_bins, precision=3, retbins=True)

    # Digitising against the upper edges (right-inclusive) labels the bins
    # 0 .. n_bins-1, with the minimum value falling in bin 0.
    binned_df['p_bin'] = np.digitize(binned_df[param1], bins=edges[1:], right=True)

    grouped = binned_df[[param1, param2, 'p_bin']].groupby('p_bin')
    evs = grouped.mean()

    evs['bin_width'] = np.diff(edges)
    evs['bin_centre'] = (edges[1:] + edges[:-1]) / 2

    values = grouped[param2]
    evs['ubnd'] = values.quantile(0.95)
    evs['lbnd'] = values.quantile(0.05)
    evs['q1'] = values.quantile(0.25)
    evs['q3'] = values.quantile(0.75)
    evs['median'] = values.quantile(0.5)
    evs['mean'] = values.mean()

    # The first bin's whisker bounds are replaced by the second bin's.
    # NOTE(review): kept from the original; the reason is not documented.
    # Skipped when there is only one bin, which used to raise a KeyError.
    if len(evs) > 1:
        evs.loc[0, 'ubnd'] = evs.loc[1, 'ubnd']
        evs.loc[0, 'lbnd'] = evs.loc[1, 'lbnd']
    return evs, edges

In [None]:
def binned_fig(fig, src, df, samples_per_bin):
    """
    Add the binned edge-cell proportion of one DEM source to a bokeh figure.

    Parameters
    ----------
    fig : bokeh figure
        Figure to draw on; it is modified and returned.
    src : str
        DEM source name; selects the 'pct_edge_cells_{src}' column and the
        styling ('USGS_3DEP' is drawn as black open circles labelled '30m',
        anything else as coloured squares labelled '90m').
    df : pandas.DataFrame
        Table with an 'area' column and the 'pct_edge_cells_{src}' column.
    samples_per_bin : int
        Number of basins per equal-count area bin.

    Returns
    -------
    The same figure, with bin medians and 5-95% whiskers added.
    """
    edge_pct = f'pct_edge_cells_{src}'
    # honour the caller's bin size (this used to be hard-coded to 600)
    evs, edges = equiprobable_binning(df, 'area', edge_pct, samples_per_bin)

    if src == 'USGS_3DEP':
        label, color = '30m', 'black'
    else:
        label, color = '90m', Colorblind3[0]

    fig.toolbar.autohide = True

    source = ColumnDataSource(data=dict(
        base=evs['bin_centre'].values,
        upper=evs['ubnd'].values,
        lower=evs['lbnd'].values,
    ))

    # whiskers span the 5th to 95th percentile of each bin
    w = Whisker(base='base', upper="upper", lower="lower", source=source,
                line_color=color, line_alpha=0.8, line_width=1)
    w.upper_head.line_color = color
    w.lower_head.line_color = color

    if src == 'USGS_3DEP':
        fig.circle(evs['bin_centre'], evs['median'],
                   color=color, size=5, fill_alpha=0,
                   legend_label=f'bin median {label}')
    else:
        fig.square(evs['bin_centre'], evs['median'],
                   color=color, size=5,
                   legend_label=f'bin median {label}')

    fig.add_layout(w)
    return fig

In [None]:
# Edge-cell proportion against drainage area (log x-axis), one series per DEM source.
fig = figure(
    width=500,
    height=400,
    title="",
    y_range=(0, 0.5),
    toolbar_location='above',
    x_axis_type='log',
)

for src in ('EENV_DEM90', 'USGS_3DEP'):
    fig = binned_fig(fig, src, result, 600)

fig.xaxis.axis_label = 'Drainage Area [km²]'
fig.yaxis.axis_label = 'Proportion of cells at edge'
show(fig)

## Compute mean basin slope

At the beginning of this notebook we processed the slope for the full region.  Here we compute the mean slope for each basin using basin polygons as clipping masks and compare 30 and 90m dem sources to see the effect on a large sample of basins.

**NOTE**

In this example we use a random sample of 10K basins from the Fraser basin, extracted from the basin geometry file.  Note that the figure in the associated paper draws a random sample of 10K basins from the full study region.  In hindsight, the random sample should have been seeded so that the figure could be replicated precisely.  This was not done; however, a text output of the list of basin IDs is provided at the bottom of this notebook so that the figure below can at least be replicated. 

The distribution of basin slopes will vary based on the random sample drawn, but the point of the exercise is to show that the lower resolution DEM tends to compute lower slopes for the same basin, and this trend holds across all samples.  One limitation of the comparison is that we use the basin polygon derived from the higher resolution DEM, and we do not check whether the lower resolution DEM yields the same stream network.

In [None]:
def clip_slope_raster(input):
    """
    Clip the region slope raster to one basin polygon.

    Parameters
    ----------
    input : tuple
        (basin_id, buffer, crs, input_raster_fpath, DEM_source)
        * basin_id           -- integer basin identifier
        * buffer             -- buffer option encoded in the file names
        * crs                -- EPSG code encoded in the polygon file name
        * input_raster_fpath -- path of the region-wide slope raster
        * DEM_source         -- sub-folder of temp_folder receiving the output

    Returns
    -------
    str
        Path of the clipped slope raster; an existing file is reused.
    """
    basin_id, buffer, crs, input_raster_fpath, DEM_source = input
    basin_fname = f'basin_polygons/basin_{basin_id:05d}_b{buffer}_{crs}.shp'
    basin_fpath = os.path.join(temp_folder, basin_fname)
    if not os.path.exists(basin_fpath):
        # f-prefix added: the message used to print the literal braces
        print(f'{os.path.basename(basin_fpath)} not found.  Saving polygon.')
        # fall back to the geometry in the notebook-level basin table
        basin = df[df['id'] == basin_id].copy()
        basin.to_file(basin_fpath)
        print('    ...saved')

    # Clip the raster using the saved basin polygon
    raster_fname = f"{DEM_source}/{int(basin_id):05}_clipped_slope_{buffer}mbuff.tif"
    fpath_out = os.path.join(temp_folder, raster_fname)
    if not os.path.exists(fpath_out):
        g = gdal.Warp(fpath_out, input_raster_fpath, format="GTiff",
                      cutlineDSName=basin_fpath,
                      cropToCutline=True)
        # dropping the reference closes the dataset and flushes it to disk
        g = None
    return fpath_out

In [None]:
# ','.join(ids)

In [None]:
# Mean basin slope per DEM source, collected first and assembled into a
# frame once at the end instead of growing the frame cell by cell.
slope_means = {}
for src in ['EENV_DEM90', 'USGS_3DEP']:
    input_raster_fpath = [e for e in region_slope_paths if src in e][0]
    slope_inputs = [(int(i), buffer, 3005, input_raster_fpath, src) for i in ids]

    # clip the region slope raster to every basin; the context manager shuts
    # the worker processes down once map() has returned
    with mp.Pool(n_procs) as p:
        slope_paths = p.map(clip_slope_raster, slope_inputs)

    means = {}
    for n, path in enumerate(slope_paths):
        # retrieve the slope raster and compute mean basin slope
        raster, _, _ = retrieve_raster(path)
        # the basin id (zero-padded string) is the leading part of the file name
        basin_id = os.path.basename(path).split('_')[0]
        means[basin_id] = np.nanmean(raster.data[0])
        if n % 500 == 0:
            print(f'processed {n}/{len(ids)} basins')
    slope_means[src] = pd.Series(means, dtype=float)

# basin id is the index, one column per DEM source
slope_df = pd.DataFrame(slope_means)

In [None]:
# Preview the mean basin slopes: one row per basin id, one column per DEM source.
slope_df.head()

In [None]:
# Basins lacking a mean slope for either DEM source are set aside in `foo`
# for inspection, then dropped so both columns cover the same basins.
missing_slope = slope_df[['EENV_DEM90', 'USGS_3DEP']].isna().any(axis=1)
foo = slope_df[missing_slope]

slope_df.dropna(how='any', inplace=True)

In [None]:
# Overlaid, density-normalised histograms of mean basin slope for the two DEM sources.
ph1 = figure(title='', toolbar_location=None, width=400, height=300)

# The 30m histogram reuses the 90m bin edges so the bars line up.
hhist1, hedges1 = np.histogram(slope_df['EENV_DEM90'].values, bins=20, density=True)
hhist2, hedges2 = np.histogram(slope_df['USGS_3DEP'].values, bins=hedges1, density=True)

ph1.xgrid.grid_line_color = None
ph1.yaxis.major_label_orientation = np.pi/4
ph1.background_fill_color = "#fafafa"
ph1.yaxis.axis_label = 'P(X)'
ph1.xaxis.axis_label = 'Mean Slope [deg]'

histogram_layers = (
    (hhist1, hedges1, '90m', Colorblind3[0]),
    (hhist2, hedges2, '30m', Colorblind3[2]),
)
for heights, bin_edges, legend, fill in histogram_layers:
    ph1.quad(bottom=0, left=bin_edges[:-1], right=bin_edges[1:], top=heights,
             legend_label=legend, line_alpha=0.6, fill_alpha=0.5,
             color=fill, line_color="#3A5785")

ph1.toolbar_location = 'right'
ph1.toolbar.autohide = True

In [None]:
# Display the slope histogram built in the previous cell.
# layout = column(ph1)
show(ph1)