In [1]:
import os

import numpy as np
import pandas as pd
import geopandas as gpd

import shapely
from shapely.geometry import Point, mapping, Polygon

from bokeh.layouts import gridplot
from bokeh.models import BoxSelectTool, LassoSelectTool
from bokeh.plotting import figure, show, save, row, ColumnDataSource#, curdoc
from bokeh.io import output_notebook

output_notebook()



In [2]:
# Project root: every input/output path in this notebook is built relative to
# the current working directory at the time this cell runs.
BASE_DIR = os.getcwd()

# Folder holding the HYSETS source files (watershed property table).
HYSETS_DIR = os.path.join(BASE_DIR, 'source_data/HYSETS_data/')

# NOTE(review): not referenced by any other cell visible here -- confirm it is still needed.
source_DEM = 'EENV_DEM' # EENV_DEM or USGS_3DEP

In [3]:
# Read the semicolon-delimited HYSETS watershed property table and turn it into
# a GeoDataFrame whose geometry is the (lon, lat) centroid of each watershed.
hysets_props_path = os.path.join(HYSETS_DIR, 'HYSETS_watershed_properties.txt')
hysets_df = pd.read_csv(hysets_props_path, sep=';')

hysets_locs = [
    Point(lon, lat)
    for lon, lat in zip(
        hysets_df['Centroid_Lon_deg_E'].values,
        hysets_df['Centroid_Lat_deg_N'],
    )
]

hysets_df = gpd.GeoDataFrame(hysets_df, geometry=hysets_locs, crs='EPSG:4326')

In [4]:
# Show which attribute columns the HYSETS table provides.
hysets_df.columns

Index(['Watershed_ID', 'Source', 'Name', 'Official_ID', 'Centroid_Lat_deg_N',
       'Centroid_Lon_deg_E', 'Drainage_Area_km2', 'Drainage_Area_GSIM_km2',
       'Flag_GSIM_boundaries', 'Flag_Artificial_Boundaries', 'Elevation_m',
       'Slope_deg', 'Gravelius', 'Perimeter', 'Flag_Shape_Extraction',
       'Aspect_deg', 'Flag_Terrain_Extraction', 'Land_Use_Forest_frac',
       'Land_Use_Grass_frac', 'Land_Use_Wetland_frac', 'Land_Use_Water_frac',
       'Land_Use_Urban_frac', 'Land_Use_Shrubs_frac', 'Land_Use_Crops_frac',
       'Land_Use_Snow_Ice_frac', 'Flag_Land_Use_Extraction',
       'Permeability_logk_m2', 'Porosity_frac', 'Flag_Subsoil_Extraction',
       'geometry'],
      dtype='object')

In [5]:
def get_region_polygon(region):
    """Load the polygon file of a region.

    Looks in ``processed_data/merged_basin_groups/region_polygons/`` (under
    ``BASE_DIR``) for the file whose name starts with ``region`` and reads it
    with geopandas.

    Parameters
    ----------
    region : str
        File-name prefix identifying the region.

    Returns
    -------
    Whatever ``gpd.read_file`` returns for the matching file.

    Raises
    ------
    AssertionError
        If the folder does not contain exactly one file starting with ``region``.
    """
    polygon_dir = os.path.join(
        BASE_DIR, 'processed_data/merged_basin_groups/region_polygons/')

    matches = []
    for candidate in os.listdir(polygon_dir):
        if candidate.startswith(region):
            matches.append(candidate)

    # Exactly one polygon file is expected per region prefix.
    assert len(matches) == 1

    return gpd.read_file(os.path.join(polygon_dir, matches[0]))

In [6]:
def retrieve_results(region, param, method):
    """Build empirical-CDF coordinates for one attribute in one region.

    Two samples are compared: the basin properties derived with ``method``
    (read from the region's ``*_basin_properties_{method}.csv``) and the HYSETS
    rows whose geometry lies within the region polygon.

    Parameters
    ----------
    region : str
        Region code used to locate the polygon and the properties CSV.
    param : str
        Column to extract from both tables.
    method : str
        Suffix of the properties CSV to read.

    Returns
    -------
    dict
        ``x1``/``y1``: sorted derived values and their rank fractions;
        ``x2``/``y2``: the same for the HYSETS rows.
    """
    def _ecdf_coords(values):
        # Sorted values paired with i/n for i = 0..n-1.
        n_obs = len(values)
        return np.sort(values), np.arange(n_obs) / float(n_obs)

    region_gdf = get_region_polygon(region)

    # basin characteristics derived independently using derived basin polygons
    props_path = os.path.join(
        BASE_DIR,
        f'processed_data/basin_properties/{region}/{region}_basin_properties_{method}.csv',
    )
    derived_df = pd.read_csv(props_path)
    kept_cols = [col for col in derived_df.columns if 'Unnamed' not in col]
    derived_df = derived_df[kept_cols]

    # find all hysets basins falling inside the region polygon
    region_geom = region_gdf.geometry.values[0]
    assert region_gdf.crs == hysets_df.crs

    inside_mask = hysets_df.within(region_geom)
    hysets_within = hysets_df[inside_mask]
    print(f'    There are {len(hysets_within)} HYSETS stations within {region} region.')

    derived_vals = derived_df[param].values
    hysets_vals = hysets_within[param].values

    x1, y1 = _ecdf_coords(derived_vals)
    x2, y2 = _ecdf_coords(hysets_vals)
    return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
    

In [7]:
def results_df_all_methods(region, param):
    """Collect a region's derived basin properties, grouped by attribute.

    Reads the three per-method CSVs of ``region`` (``RND``, ``NBR``, ``ACC``)
    and regroups them so that each attribute gets its own table with one
    column per method.

    Parameters
    ----------
    region : str
        Region code; selects the folder and file prefix of the CSVs.
    param : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{region: {attribute: DataFrame}}`` where each DataFrame has the
        columns ``RND``, ``NBR`` and ``ACC`` (in reading order). Rows follow the
        index of the first file read; later files are aligned on that index.
    """
    tables = {}

    for method in ['RND', 'NBR', 'ACC']:
        # basin characteristics derived independently using derived basin polygons
        sample_fname = f'processed_data/basin_properties/{region}/{region}_basin_properties_{method}.csv'
        sample_fpath = os.path.join(BASE_DIR, sample_fname)
        sample_df = pd.read_csv(sample_fpath)
        # Drop the index columns that a previous to_csv() left behind.
        sample_df = sample_df[[c for c in sample_df.columns if 'Unnamed' not in c]]

        for c in sample_df.columns:
            if c not in tables:
                # to_frame() gives an independent one-column table named after
                # the method, instead of renaming/extending a slice of
                # sample_df (which triggers SettingWithCopyWarning).
                tables[c] = sample_df[c].to_frame(name=method)
            else:
                tables[c][method] = sample_df[c]

    return {region: tables}
    

In [8]:
def make_ecdf_plot(region, param, data):
    """Plot the empirical CDFs of the population estimate and of HYSETS.

    Parameters
    ----------
    region : str
        Region code shown in the plot title.
    param : str
        Attribute name used as the x-axis label.
    data : dict
        Mapping with keys ``x1``/``y1`` (population estimate) and ``x2``/``y2``
        (HYSETS stations), as returned by ``retrieve_results``.

    Returns
    -------
    The bokeh figure holding both scatter series.
    """
    x1, y1 = data['x1'], data['y1'] # population estimate
    x2, y2 = data['x2'], data['y2'] # Hysets stations

    # Each count must come from the series it labels: x1 is the population
    # estimate, x2 the HYSETS sample (these were previously swapped).
    n_pts_pop = len(x1)
    n_pts_hysets = len(x2)

    p = figure(width=400, height=300, title=f'{region} CDF')
    p.circle(x1, y1,
             legend_label=f'Pop est. N={n_pts_pop}',
             color='dodgerblue')
    p.circle(x2, y2,
             legend_label=f'Hysets N={n_pts_hysets}',
             color='firebrick')

    p.legend.location = 'bottom_right'
    p.xaxis.axis_label = f'{param}'
    p.yaxis.axis_label = 'P(X)'

    return p
    

In [14]:
def method_comparison_plot(region, param, data, methods):
    """Overlay the empirical CDF of one attribute for several methods.

    Parameters
    ----------
    region : str
        Region code; first-level key into ``data`` and part of the title.
    param : str
        Attribute name; second-level key into ``data``. Values of
        ``Drainage_Area_km2`` and ``Perimeter`` are natural-log transformed
        before plotting and the axis label gets a ``log_`` prefix.
    data : dict
        Nested mapping where ``data[region][param]`` is a DataFrame with one
        column per method.
    methods : sequence of str
        Columns of that DataFrame to draw, one scatter series each. May be
        empty, in which case an empty, labelled figure is returned.

    Returns
    -------
    The bokeh figure holding one scatter series per method.
    """
    df = data[region][param].copy()
    n_pts = len(df)

    # Decided once up front so the axis label exists even when `methods` is empty.
    use_log = param in ['Drainage_Area_km2', 'Perimeter']
    x_label = 'log_' + param if use_log else param

    p = figure(width=400, height=300, title=f'{region} CDF N={n_pts}')

    cs = ['firebrick', 'dodgerblue', 'gold']
    # Every series has n_pts rows, so the rank fractions are shared.
    y = np.arange(n_pts) / float(n_pts)

    for i, method in enumerate(methods):
        if param == 'Aspect_deg':
            print(df[method].describe())
        values = df[method].values
        x = np.sort(np.log(values)) if use_log else np.sort(values)
        p.circle(x, y,
                 legend_label=f'{method}',
                 alpha=0.5,
                 # Wrap around the palette instead of failing on a 4th method.
                 color=cs[i % len(cs)])

    p.legend.location = 'bottom_right'
    p.xaxis.axis_label = f'{x_label}'
    p.yaxis.axis_label = 'P(X)'

    return p

In [10]:
# Full list of attributes available for comparison (kept for reference):
# params = ['Drainage_Area_km2', 'Elevation_m', 'Perimeter',
#        'Aspect_deg', 'Gravelius', 'Slope_deg', 'Permeability_logk_m2',
#        'Porosity_frac', 'Land_Use_Forest_frac', 'Land_Use_Shrubs_frac',
#        'Land_Use_Grass_frac', 'Land_Use_Wetland_frac', 'Land_Use_Crops_frac',
#        'Land_Use_Urban_frac', 'Land_Use_Water_frac', 'Land_Use_Snow_Ice_frac']

# Subset of attributes actually plotted below.
params = [
    'Drainage_Area_km2',
    'Elevation_m',
    'Gravelius',
    'Perimeter',
    'Slope_deg',
    'Aspect_deg',
]

# basin characteristics derived independently using derived basin polygons
sample_folder = 'processed_data/basin_properties/'
sample_fpath = os.path.join(BASE_DIR, sample_folder)

# Each non-CSV entry of the folder is taken as one processed region; the region
# code is the part of its name before the first underscore.
region_entries = [entry for entry in os.listdir(sample_fpath) if not entry.endswith('.csv')]
processed_regions = sorted(entry.split('_')[0] for entry in region_entries)
processed_regions

['07G', '07U', '08A', '08E', '08G', '08H', '08O', '08P']

In [12]:
# Method used by the single-method HYSETS comparison (commented out below).
# Valid choices: 'RND', 'NBR', 'ACC'.
method = 'ACC'
# Methods overlaid in the method-comparison plots.
methods = ['RND', 'NBR', 'ACC']

# for param in params:
#     print('')
#     print(f'Processing {param}')
#     plots = []
#     for region in processed_regions:
#         # pdf = 1/(sigma * np.sqrt(2*np.pi)) * np.exp(-(x-mu)**2 / (2*sigma**2))
#         # cdf = (1+scipy.special.erf((x-mu)/np.sqrt(2*sigma**2)))/2
#         data = retrieve_results(region, param, method)
#         p1 = make_ecdf_plot(region, param, data)
#         plots.append(p1)
#     grid = gridplot(plots, width=300, height=250, ncols=3)
#     output_path = os.path.join(BASE_DIR, f'processed_data/plots/{param}_CDF_Plots_{method}.html')
#     save(grid, filename=output_path, title=f'{param} CDF')

In [16]:
# For every attribute, draw one method-comparison CDF plot per region, arrange
# the plots in a 3-column grid and save the grid as a standalone HTML file.

# Make sure the output folder exists before anything is written into it.
os.makedirs(os.path.join(BASE_DIR, 'processed_data/plots/'), exist_ok=True)

# results_df_all_methods() never reads its `param` argument, so the tables it
# returns are identical for every attribute: read each region's CSVs once and
# reuse them instead of re-reading them for every (param, region) pair.
region_tables = {}

for param in params:
    print('')
    print(f'Processing {param}')
    plots = []
    for region in processed_regions:
        print('')
        print(region)
        # pdf = 1/(sigma * np.sqrt(2*np.pi)) * np.exp(-(x-mu)**2 / (2*sigma**2))
        # cdf = (1+scipy.special.erf((x-mu)/np.sqrt(2*sigma**2)))/2
        if region not in region_tables:
            region_tables[region] = results_df_all_methods(region, param)
        data = region_tables[region]
        p1 = method_comparison_plot(region, param, data, methods)
        plots.append(p1)
    grid = gridplot(plots, width=300, height=250, ncols=3)
    output_path = os.path.join(BASE_DIR, f'processed_data/plots/{param}_CDF_Plots_method_comparison.html')
    save(grid, filename=output_path, title=f'{param} CDF')


Processing Drainage_Area_km2

07G

07U

08A

08E

08G

08H

08O

08P

Processing Elevation_m

07G

07U

08A

08E

08G

08H

08O

08P

Processing Gravelius

07G

07U

08A

08E

08G

08H

08O

08P

Processing Perimeter

07G

07U

08A

08E

08G

08H

08O

08P

Processing Slope_deg

07G

07U

08A

08E

08G

08H

08O

08P

Processing Aspect_deg

07G
count    2464.000000
mean      169.612599
std        29.706757
min        69.137878
25%       153.608765
50%       171.872925
75%       182.223907
max       283.310059
Name: RND, dtype: float64
count    2464.000000
mean      169.057660
std        29.132927
min        68.993843
25%       154.103268
50%       170.784149
75%       180.904579
max       298.190674
Name: NBR, dtype: float64
count    2464.000000
mean      169.642475
std        30.708929
min        63.376900
25%       154.754139
50%       172.004639
75%       183.497395
max       296.854767
Name: ACC, dtype: float64

07U
count    1640.000000
mean      180.016210
std        27.329686
mi