In [1]:
import os
from time import time

import numpy as np
import pandas as pd
import geopandas as gpd
from osgeo import gdal

import multiprocessing as mp

import rioxarray as rxr
from numba import njit

from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, Whisker
from bokeh.io import output_notebook
from bokeh.palettes import Colorblind3
# from bokeh.layouts import gridplot, row, column
output_notebook()

In [110]:
from whitebox.whitebox_tools import WhiteboxTools

wbt = WhiteboxTools()
wbt.verbose = False
# wbt.verbose = True

## Evaluate the effect of dem resolution on basin representation and attribute estimation

As the basin scale decreases, methodological choices begin to have a significant impact on the number of cells captured and used to represent a basin. In addition, some attributes are affected by DEM resolution, in particular terrain attributes.  Here we investigate two examples. 

Consider a square grid intersected by an arbitrary curvilinear loop.  If we color the grid cells intersected by the line red, and we colour any cell inside the closed loop blue, let $P_{edge}$ the percentage of edge cells be the number of red cells divided by the number of red plus blue cells.

Now if we hold the loop constant and change the grid cell dimension $d_{grid}$, intuitively $P_{edge}$ will increase as the grid cell size increases since eventually the loop will be encompassed by a single grid cell.  $P_{edge}$ will approach 1 as $d_{grid}$ increases.

When we extract basin attributes from a geospatial raster using a basin polygon, at what point does a choice affecting the number of edge pixels become significant?  At some combination of raster resolution and basin size, the proportion of edge pixels will be significant.  The purpose of this exercise is to compare DEM datasets of two different resolutions to get a sense of the basin size at which the edge pixel method chosen becomes significant to the representativeness of the sample of raster pixels used to compute basin attributes.

## tl;dr

Compare 30m vs. 90m dem using a large sample of basin polygons over a wide scale to see when the basin is mostly edge pixels that are either included or excluded depending upon the clipping method.

Once the basin clipping is done, the mean slope is computed for each basin from two different DEM sources to compare the effect of DEM resolution on mean basin slope calculation.

## Method

The method is as follows:
1. Select a random sample of 10k basin polygons from the BCUB polygon set.
2. Use each polygon as a clipping mask and create a temporary clipped DEM to represent the basin pixels. Save these as temporary files because they will be ingested by Whiteboxtools "Slope" tool in the subsequent slope comparison.
3. For each clipped raster:    
   a. Find all non-Nan indices in the clipped raster.  An edge pixel is one which has at least one NaN neighbour.  
   b. Count all edge pixels and divide by the total number of non-NaN indices.  
   c. The process assumes there are no data gaps in the basin prior to clipping.  This is checked by first counting NaN values for pixels within the polygon and asserting equal to zero.

In [2]:
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, 'processed_data')
# HYSETS_DIR = os.path.join(BASE_DIR, '/home/danbot2/code_5820/large_sample_hydrology/common_data/HYSETS_data/'

# This assumes you've downloaded a basin polygon set 
# and have processed the DEM for the same region 
# with both the 3DEP and EENV DEM
# here we'll do it for 08D (central coast)
region_code = 'FRA'
BASIN_DIR = os.path.join(DATA_DIR, 'basin_attributes/polygons')

# update with the path of processed DEM
# these scripts would be already run in the dataset replication
# see the following scripts in setup_scripts/
#     * get_3DEP_DEM.py   <--retrieves the USGS 3DEP dem
#     * get_EENV_DEM.py   <--retrieves the EarthEnv 90m dem
#     * clip_region_DEM.py <-- takes the assembled tiles and creates 

DEM_PATH = '/home/danbot2/code_5820/large_sample_hydrology/common_data/DEM_data/processed_dem/'


In [17]:
geom_fpath = f'{region_code}_basin_geometries.parquet'
df = gpd.read_parquet(os.path.join(BASIN_DIR, geom_fpath))

In [5]:
# select a random subset of 10K basin polygons
sample_size  = 1e4
ids = np.random.choice(df['id'].values, size=int(sample_size), replace=False)
print(len(ids))

10000


## Clip the DEM with each polygon and create temporary raster files

In [5]:
def retrieve_raster(fpath):
    rds = rxr.open_rasterio(fpath, masked=True, mask_and_scale=True)
    crs = rds.rio.crs
    affine = rds.rio.transform(recalc=False)
    return rds, crs, affine

In [6]:
def clip_rasters(input):
    basin_id, polygon, DEM_source, buffer, region_code, temp_folder = input
    
    dem_fpath = os.path.join(DEM_PATH, f'{DEM_source}/{region_code}_USGS_3DEP_3005_res1.tif')
    rc = region_code
    if region_code == 'FRA':
        rc = 'Fraser'
    src_dem_fname = f'{DEM_source}/{rc}_EENV_DEM_3005_res1.tif'
    if 'USGS' in src:
        src_dem_fname = f'{DEM_source}/{rc}_{DEM_source}_3005_res1.tif'
        
    input_raster_path = os.path.join(DEM_PATH, src_dem_fname)
    # if you want to test the effect of adding a buffer to the polygon
    if buffer != 0:
        dem, crs_obj, affine = retrieve_raster(dem_fpath)
        res = dem.rio.resolution()
        if buffer == 1:
            buff = np.sqrt(res[0]**2 + res[1]**2)
        elif buffer == 2:
            buff = max(abs(res[0]), abs(res[1]))
        bdf = bdf.buffer(buff)

    # save the polygon to a temp file
    crs = 3005
    basin_fname = f'basin_polygons/basin_{basin_id:05d}_b{buffer}_{crs}.shp'
    basin_fpath = os.path.join(temp_folder, basin_fname)
    if not os.path.exists(basin_fpath):  
        basin_data = {
            'id': [basin_id], 
            'region_code': [region_code]
        }
        bdf = gpd.GeoDataFrame(basin_data, geometry=[polygon], crs=crs)   
        geom_type = bdf.geometry.values[0].geom_type

        if geom_type != 'Polygon':
            print(f'geom is {geom_type}')
            if geom_type == 'GeometryCollection':
                foo = gpd.GeoDataFrame(geometry=[polygon], crs=3005)
                bdf = foo.dissolve()                
                geom_type = bdf.geometry.values[0].geom_type
                bdf.to_file(basin_fpath)
            elif geom_type == 'MultiPolygon':
                print(f'multipolygon found: {basin_fpath}')
                bdf = gpd.GeoDataFrame(geometry=[polygon], crs=3005)
                bdf = bdf.explode(index_parts=False)
                bdf['area'] = bdf.geometry.area
                bdf = bdf[bdf['area'] >= 2E3]
                if len(bdf) == 0:
                    print(f'no polygon found {basin_fpath}')
                    raise Exception('no geoms left!!')      
                elif len(bdf) == 1:
                    bdf.to_file(basin_fpath)
                else:
                    raise Exception('too many geoms!!')  
                geom_type = bdf.geometry.values[0].geom_type
            if geom_type != 'Polygon':
                raise Exception('fix geom type!!')
        else:
            bdf.to_file(basin_fpath)
        
    # New filename. Assumes input raster file has '.tif' extension
    # Might need to change how you build the output filename    
    raster_fname = f"{DEM_source}/{int(basin_id):05}_clipped_{buffer}mbuff.tif"
    fpath_out = os.path.join(temp_folder, raster_fname)
    # Clip the raster using the saved basin polygon
    if not os.path.exists(fpath_out):
        g = gdal.Warp(fpath_out, input_raster_path, format="GTiff",
                      cutlineDSName=basin_fpath,
                      cropToCutline=True)
    g = None    
    return fpath_out

In [7]:
temp_folder = os.path.join(BASE_DIR, 'validation/tmp')
tmp_basins = os.path.join(temp_folder, 'basin_polygons/')

if not os.path.exists(tmp_basins):
    os.makedirs(tmp_basins)

In [14]:
tstart = time()
buffer = 0
# get all the ids
failed_ids = []
edge_dict = {}

n = 0
for src in ['USGS_3DEP', 'EENV_DEM90']:
    tmp_raster_folder = os.path.join(temp_folder, src)
    if not os.path.exists(tmp_raster_folder):
        os.mkdir(tmp_raster_folder)
    t1 = time()
    selected_df = df[df['id'].isin(ids)].copy()
    print(f'{len(selected_df)} items selected for processing')
    inputs = list(selected_df.itertuples(index=False, name=None))
    # additional input parameters
    inputs = [e + (src, buffer, region_code, temp_folder) for e in inputs]
    print(f'Processing {len(inputs)} with {src} DEM')

    # clip the dems to all basin polygons
    n_procs = 12
    p = mp.Pool(n_procs)
    print('Starting raster clipping:')
    clipped_raster_paths = p.map(clip_rasters, inputs)
    

10000 items selected for processing
Processing 10000 with USGS_3DEP DEM
Starting raster clipping:
10000 items selected for processing
Processing 10000 with EENV_DEM90 DEM
Starting raster clipping:


## Count Perimeter Cells

In [8]:
@njit
def edge_pixel_proportion(m):
    """
    The input m is a matrix representing the dem
    where the only nonzero values are indices that
    lie inside the basin.  
    Count the cells that have at least one NaN neighbour
    and compare to the total number of numeric cells.
    """
    
    # Get the number of rows and columns in the matrices
    # rows, cols = m1.shape
    (r1, c1) = m.shape
    # print(r1, c1)

    dem_px = 0
    edge_px = 0

    # Count non-zero elements in matrix
    for row in range(r1):
        for col in range(c1):
            if ~np.isnan(m[row, col]):
                dem_px += 1
                # check all neighbouring cells if any are nan
                indices = [(row - 1, col - 1), (row - 1, col), (row - 1, col + 1),
                           (row, col-1), (row, col + 1),
                           (row + 1, col-1), (row + 1, col), (row + 1, col + 1)]
                nan_nbr = False
                for r, c in indices:
                    if (r <= r1 - 1) & (r >= 0) & (c >= 0) & (c <= c1 - 1):
                        if np.isnan(m[r, c]):
                            edge_px += 1
                            break
    
    # Return the proportion of edge pixels
    if dem_px == 0:
        return 0
    return edge_px / dem_px

In [13]:
def process_basin_raster(f):
    basin_id = int(f.split('/')[-1].split('_')[0])
    raster, crs, affine = retrieve_raster(f)
    data = raster.data[0]
    pct_edge_px = edge_pixel_proportion(raster.data[0])
    return (basin_id, pct_edge_px)

In [39]:
perimeter_results = {}
for src in ['EENV_DEM90', 'USGS_3DEP']:
    tmp_raster_folder = os.path.join(temp_folder, src)
    t1 = time()
    # additional input parameters
    raster_files = os.listdir(os.path.join(BASE_DIR, f'validation/tmp/{src}'))
    raster_paths = [os.path.join(tmp_raster_folder, r) for r in raster_files]

    # don't parallelize this step!
    perimeter_results[src] = []
    n = 0
    for f in sorted(raster_paths):
        n += 1
        result = process_basin_raster(f)
        perimeter_results[src].append(result)    

    t2 = time()
    ut = len(raster_paths) / (t2-t1)
    print(f'{len(raster_paths)} {src} basins processed in {t2-t1:.0f}s ({ut:.1f}/s)')
    
        

19383 EENV_DEM90 basins processed in 1069s (18.1/s)
10000 USGS_3DEP basins processed in 3801s (2.6/s)


In [71]:
edge_dfs = []
for src in ['EENV_DEM90', 'USGS_3DEP']:
    edge_df = pd.DataFrame(perimeter_results[src])
    edge_df.columns = ['basin_id', f'pct_edge_cells_{src}']
    edge_df.set_index('basin_id', inplace=True)
    edge_dfs.append(edge_df)

In [72]:
result = pd.concat(edge_dfs, join='inner', axis=1)
result['area'] = [df.loc[df['id'] == i, 'basin'].area.values[0] / 1e6 for i in result.index.values]
print(len(result))
result.head()

10000


Unnamed: 0_level_0,pct_edge_cells_EENV_DEM90,pct_edge_cells_USGS_3DEP,area
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
138018,0.252427,0.103082,1.672744
138043,0.08775,0.030553,21.346657
138068,0.407143,0.152783,1.501448
138071,0.009707,0.003313,5976.846482
138096,0.112007,0.038879,11.92545


## Plot Results

In [96]:
def equiprobable_binning(data, param1, param2, samples_per_bin):
    # group deviation values by perimeter bin number
    df = data.copy()
    n_bins = int(len(df)/samples_per_bin)
    print(f'   Creating {n_bins} bins of {samples_per_bin} samples/bin (N={n_bins* samples_per_bin})')
    
    qc, edges = pd.qcut(df[param1], q=n_bins, precision=3, retbins=True)
    edges1 = edges[1:]
    
    df['p_bin'] = np.digitize(df[param1], bins=edges1, right=True)
    
    bin_widths = [j-i for i, j in zip(edges[:-1], edges[1:])] 
    bin_centres = [(j+i)/2 for i, j in zip(edges[:-1], edges[1:])]
    evs = df[[param1, param2, 'p_bin']].groupby('p_bin').mean()

    evs['bin_width'] = bin_widths
    evs['bin_centre'] = (edges[1:] + edges[:-1]) / 2
    # evs['edges'] = edges
    evs['ubnd'] = df[[param1, param2, 'p_bin']].groupby('p_bin').quantile(0.95)[param2]
    evs['lbnd'] = df[[param1, param2, 'p_bin']].groupby('p_bin').quantile(0.05)[param2]
    evs['q1'] = df[[param1, param2, 'p_bin']].groupby('p_bin').quantile(0.25)[param2]
    evs['q3'] = df[[param1, param2, 'p_bin']].groupby('p_bin').quantile(0.75)[param2]
    evs['median'] = df[[param1, param2, 'p_bin']].groupby('p_bin').quantile(0.5)[param2]
    evs['mean'] = df[[param1, param2, 'p_bin']].groupby('p_bin').mean()[param2]
    evs.loc[0, 'ubnd'] = evs.loc[1, 'ubnd']
    evs.loc[0, 'lbnd'] = evs.loc[1, 'lbnd']        
    return evs, edges

In [103]:
def binned_fig(fig, src, df, samples_per_bin):
    edge_pct = f'pct_edge_cells_{src}'
    evs, edges = equiprobable_binning(df, 'area', edge_pct, 600)
    
    linetype = 'dashed'
    color = Colorblind3[0]
    color2 = Colorblind3[0]
    label = '90m'
    if src == 'USGS_3DEP':
        label = '30m'
        color = Colorblind3[2]
        linetype = 'dotted'
        color2 = Colorblind3[2]
    
    fig.toolbar.autohide = True
    
    classes = evs['bin_centre'].values
    ub = evs['ubnd'].values
    lb = evs['lbnd'].values
    source = ColumnDataSource(data=dict(base=classes, upper=ub, lower=lb))
    
    # outlier range
    w = Whisker(base='base', upper="upper", lower="lower", source=source,
                     line_color=color, line_alpha=0.8, line_width=1)
    w.upper_head.line_color = color
    w.lower_head.line_color = color

    if src == 'USGS_3DEP':
        fig.circle(evs['bin_centre'], evs['median'], 
                color=color2, size=5,
                   legend_label=f'bin median {label}')    
    else:
        fig.square(evs['bin_centre'], evs['median'], 
                color=color2, size=5,
                   legend_label=f'bin median {label}')    
        
    fig.add_layout(w)
    return fig

In [104]:
fig = figure(width=500, height=400, title=f"", y_range=(0, 0.5),
           toolbar_location='above', x_axis_type='log')

for src in ['EENV_DEM90', 'USGS_3DEP']:
    fig = binned_fig(fig, src, result, 600)
    
fig.xaxis.axis_label = 'Area [km²]'
fig.yaxis.axis_label = 'Proportion of cells at edge'
show(fig)

   Creating 16 bins of 600 samples/bin (N=9600)
   Creating 16 bins of 600 samples/bin (N=9600)


## Compute mean basin slope

Above we created temporary raster files for each basin clipped from the larger dem raster.  Here we compute the mean slope for each basin and compare 30 and 90m dem sources to see the effect on a large sample of basins.

In [None]:
for src in ['EENV_DEM90', 'USGS_3DEP']:
    tmp_raster_folder = os.path.join(temp_folder, src)
    t1 = time()
    # raster_files = os.listdir(os.path.join(BASE_DIR, f'validation/tmp/{src}'))
    # raster_paths = [os.path.join(tmp_raster_folder, r) for r in raster_files]
    ids = result.index.values
    n = 1
    for bid in ids:
        dem_fpath = os.path.join(tmp_raster_folder, f'{bid}_clipped_0mbuff.tif')
        slope_fpath = os.path.join(tmp_raster_folder, f'{bid}_slope_fromclip_{buffer}mbuff.tif')
        if not os.path.exists(slope_fpath):
            # print(f'computing slope on {dem_fpath}')
            wbt.slope(
                dem_fpath, 
                slope_fpath, 
                zfactor=None,
                units='degrees',
                # callback=default_callback
            )
        # retrieve the slope raster and compute mean basin slope
        raster, _, _ = retrieve_raster(slope_fpath)
        mean_slope = np.nanmean(raster.data[0])
        # basin id is the index in the result dataframe, so update a 'mean slope' column
        result.loc[bid, f'mean_slope_{src}'] = mean_slope
        if n % 500 == 0:            
            print(f'processed {n}/{len(ids)} basins')

In [None]:
result.head()

In [None]:
sdf = all_df.copy().dropna(subset=[c for c in all_df if 'slope' in c])
print(len(sdf))
src = 'EENV'
hhist1, hedges1 = np.histogram(sdf[f'{src}_mean_slope'].values, bins=20, density=True)

src = '3DEP'
hhist2, hedges2 = np.histogram(sdf[f'{src}_mean_slope'].values, bins=hedges1, density=True)
# hzeros1 = np.zeros(len(hedges1)-1)
# hzeros2= np.zeros(len(hedges2)-1)
# hmax = max(hhist1) * 1.01

ph1 = figure(title=f'', toolbar_location=None, 
             width=400, height=fig3.height, x_range=fig1.x_range)

ph1.xgrid.grid_line_color = None
ph1.yaxis.major_label_orientation = np.pi/4
ph1.background_fill_color = "#fafafa"
ph1.yaxis.axis_label = 'P(X)'
ph1.xaxis.axis_label = 'Mean Slope [deg]'

ph1.quad(bottom=0, left=hedges1[:-1], right=hedges1[1:], top=hhist1, legend_label='90m', 
                  line_alpha=0.6, fill_alpha=0.5, color=Colorblind3[0], line_color="#3A5785")
hh1 = ph1.quad(bottom=0, left=hedges1[:-1], right=hedges1[1:], top=hzeros1, alpha=0.5, **LINE_ARGS)

ph1.quad(bottom=0, left=hedges2[:-1], right=hedges2[1:], top=hhist2, legend_label='30m',
                  line_alpha=0.6, fill_alpha=0.5, color=Colorblind3[2], line_color="#3A5785")
hh11 = ph1.quad(bottom=0, left=hedges2[:-1], right=hedges2[1:], top=hzeros2, alpha=0.5, **LINE_ARGS)
# hh2 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)
# hh2 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)
ph1.toolbar_location = 'right'
ph1.toolbar.autohide = True

In [None]:
# layout = column(ph1)
show(ph1)