In [1]:
import os

import numpy as np
import pandas as pd
import geopandas as gpd

import shapely
from shapely.geometry import Point, mapping, Polygon

from bokeh.layouts import gridplot
from bokeh.models import BoxSelectTool, LassoSelectTool
from bokeh.plotting import figure, show, save, row, ColumnDataSource#, curdoc
from bokeh.io import output_notebook

output_notebook()



In [2]:
BASE_DIR = os.getcwd()

HYSETS_DIR = os.path.join(BASE_DIR, 'source_data/HYSETS_data/')

source_DEM = 'EENV_DEM' # EENV_DEM or USGS_3DEP

In [3]:
hysets_df = pd.read_csv(os.path.join(HYSETS_DIR, 'HYSETS_watershed_properties.txt'), sep=';')
hysets_locs = [Point(x, y) for x, y in zip(hysets_df['Centroid_Lon_deg_E'].values, hysets_df['Centroid_Lat_deg_N'])]
hysets_df = gpd.GeoDataFrame(hysets_df, geometry=hysets_locs, crs='EPSG:4326')

In [4]:
def get_region_polygon(region):
    region_polygon_path = 'processed_data/merged_basin_groups/region_polygons/'
    region_polygon_fpath = os.path.join(BASE_DIR, region_polygon_path)
    region_polygon_fnames = os.listdir(region_polygon_fpath)
    region_polygon_files = [e for e in region_polygon_fnames if e.startswith(region)]
    assert len(region_polygon_files) == 1
    region_polygon_fname = region_polygon_files[0]

    return gpd.read_file(os.path.join(region_polygon_fpath, region_polygon_fname))

In [5]:
def retrieve_results(region, param):
    region_polygon = get_region_polygon(region)        
    
    # basin characteristics derived independently using derived basin polygons
    sample_fname = f'processed_data/basin_properties/{region}_basin_sample_properties.csv'
    sample_fpath = os.path.join(BASE_DIR, sample_fname)
    sample_df = pd.read_csv(sample_fpath)
    sample_df = sample_df[[c for c in sample_df.columns if 'Unnamed' not in c]]
    
    # find all hysets basins falling inside the region polygon
    polygon = region_polygon.geometry.values[0]
    assert region_polygon.crs == hysets_df.crs
    hysets_within = hysets_df[hysets_df.within(polygon)]
    print(f'There are {len(hysets_within)} HYSETS stations within {region} region.')
    
    sample_param = param
    if param == 'Elevation_m':
        sample_param = 'median_el'
        
    s_data = sample_df[sample_param].values
    
    h_data = hysets_within[param].values
    data = {
        'x1': np.sort(s_data),
        'y1': np.arange(len(s_data)) / float(len(s_data)),
        'x2': np.sort(h_data),
        'y2': np.arange(len(h_data)) / float(len(h_data)),
    }
    return data
    

In [6]:
def make_ecdf_plot(region, param, data):
    
    x1, y1 = data['x1'], data['y1'] # population estimate
    x2, y2 = data['x2'], data['y2'] # Hysets stations
    
    n_pts_pop = len(data['x2'])
    n_pts_hysets = len(data['x1'])
    
    p = figure(width=400, height=300, title=f'{region} CDF')
    p.circle(x1, y1, 
             legend_label=f'Pop est. N={n_pts_pop}',
             color='dodgerblue')
    p.circle(x2, y2, 
             legend_label=f'Hysets N={n_pts_hysets}', 
             color='firebrick')
    
    p.legend.location = 'bottom_right'
    p.xaxis.axis_label = f'{param}'
    p.yaxis.axis_label = 'P(X)'
    
    return p
    

In [9]:
# params = ['Drainage_Area_km2', 'Elevation_m', 'Perimeter',
#        'Aspect_deg', 'Gravelius', 'Slope_deg', 'Permeability_logk_m2',
#        'Porosity_frac', 'Land_Use_Forest_frac', 'Land_Use_Shrubs_frac',
#        'Land_Use_Grass_frac', 'Land_Use_Wetland_frac', 'Land_Use_Crops_frac',
#        'Land_Use_Urban_frac', 'Land_Use_Water_frac', 'Land_Use_Snow_Ice_frac']
params = ['Drainage_Area_km2', 'Elevation_m', 'gravelius', 'perimeter', 'slope', 'aspect']

# basin characteristics derived independently using derived basin polygons
sample_folder = f'processed_data/basin_properties/'
sample_fpath = os.path.join(BASE_DIR, sample_folder)
processed_regions = sorted([e.split('_')[0] for e in os.listdir(sample_fpath)])


In [10]:
for param in params:
    print('')
    print(f'Processing {param}')
    plots = []
    for region in processed_regions:
        # pdf = 1/(sigma * np.sqrt(2*np.pi)) * np.exp(-(x-mu)**2 / (2*sigma**2))
        # cdf = (1+scipy.special.erf((x-mu)/np.sqrt(2*sigma**2)))/2
        data = retrieve_results(region, param)
        p1 = make_ecdf_plot(region, param, data)
        plots.append(p1)
    grid = gridplot(plots, width=300, height=250, ncols=3)
    output_path = os.path.join(BASE_DIR, f'processed_data/plots/{param}_CDF_Plots.html')
    save(grid, filename=output_path, title=f'{param} CDF')


Processing Drainage_Area_km2
There are 10 HYSETS stations within 07G region.
There are 2 HYSETS stations within 07O region.
There are 1 HYSETS stations within 07U region.
There are 9 HYSETS stations within 08A region.
There are 43 HYSETS stations within 08B region.
There are 22 HYSETS stations within 08C region.
There are 36 HYSETS stations within 08D region.
There are 33 HYSETS stations within 08E region.
There are 18 HYSETS stations within 08F region.
There are 31 HYSETS stations within 08G region.
There are 63 HYSETS stations within 08H region.


  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")



Processing Elevation_m
There are 10 HYSETS stations within 07G region.
There are 2 HYSETS stations within 07O region.
There are 1 HYSETS stations within 07U region.
There are 9 HYSETS stations within 08A region.
There are 43 HYSETS stations within 08B region.
There are 22 HYSETS stations within 08C region.
There are 36 HYSETS stations within 08D region.
There are 33 HYSETS stations within 08E region.
There are 18 HYSETS stations within 08F region.
There are 31 HYSETS stations within 08G region.
There are 63 HYSETS stations within 08H region.


  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")



Processing gravelius
There are 10 HYSETS stations within 07G region.


KeyError: 'gravelius'