# Russian River Step 1 -- download and clean raw geometry data

In this section, we choose the basin, the streams to be included in the stream-aligned mesh, and make sure that all are resolved discretely at appropriate length scales for this work.

In [None]:
# these can be turned on for development work
%load_ext autoreload
%autoreload 2

In [None]:
# setting up logging first or else it gets preempted by another package
import watershed_workflow.ui
# NOTE(review): the meaning of the argument 1 (presumably a verbosity level) is not
# visible in this notebook -- confirm against watershed_workflow.ui.setup_logging
watershed_workflow.ui.setup_logging(1)

In [None]:
import os,sys
import logging
import numpy as np
from matplotlib import pyplot as plt
import shapely
import pandas as pd
import geopandas as gpd
pd.options.display.max_columns = None
import pickle

import watershed_workflow 
import watershed_workflow.config
import watershed_workflow.sources
import watershed_workflow.sources.standard_names as names

# set the default figure size for notebooks
plt.rcParams["figure.figsize"] = (8, 6)


## Input: Parameters and other source data

Note, this section will need to be modified for other runs of this workflow in other regions.

In [None]:
# Force Watershed Workflow to pull data from this directory rather than a shared data directory.
# Datasets already staged in this directory are picked up from here, which avoids repeating
# large file downloads.
#
def splitPathFull(path):
    """
    Break a path into the list of its individual components.

    The components are ordered root-first, such that
    os.path.join(*splitPathFull(path)) == path
    """
    # components are collected leaf-first and flipped once at the end
    reversed_parts = []
    remaining = path
    while True:
        head, tail = os.path.split(remaining)
        if head == remaining:
            # nothing left to strip: the root on Unix, or a drive letter
            # with backslash on Windows (e.g., C:\)
            reversed_parts.append(head)
            break
        if tail == remaining:
            # a lone file or directory name with no parent
            reversed_parts.append(tail)
            break
        reversed_parts.append(tail)
        remaining = head
    return reversed_parts[::-1]

# This notebook must be run from a directory named 'workflow'; cwd is then
# trimmed to the components of that directory's parent.
_cwd_parts = splitPathFull(os.getcwd())
assert _cwd_parts[-1] == 'workflow'
cwd = _cwd_parts[:-1]

# Note, this directory is where downloaded data will be put as well
data_dir = os.path.join(*cwd, 'input_data')
def toInput(filename):
    """Return the path of `filename` inside the input data directory, `data_dir`."""
    input_path = os.path.join(data_dir, filename)
    return input_path

# Output files go in 'output_data'; output_filenames records, by role, the
# name of each file registered through toOutput().
output_dir = os.path.join(*cwd, 'output_data')
output_filenames = {}
def fromOutput(filename):
    """Return the path of `filename` inside the output directory, `output_dir`."""
    output_path = os.path.join(output_dir, filename)
    return output_path

def toOutput(role, filename):
    """Record `filename` as the output for `role` and return its full path.

    The role -> filename pair is stored in the module-level
    `output_filenames` dictionary before the path is built.
    """
    output_filenames.update({role: filename})
    full_path = fromOutput(filename)
    return full_path

# make sure both the input and the output directory are present
for _needed_dir in (data_dir, output_dir):
    if not os.path.isdir(_needed_dir):
        os.makedirs(_needed_dir, exist_ok=True)
       

In [None]:
# Set the data directory to the local space to get the locally downloaded files.
# data_dir is the 'input_data' directory that was built from the working directory.
watershed_workflow.config.setDataDirectory(data_dir)


In [None]:
## Parameters cell -- this provides all parameters that can be changed via pipelining to generate a new watershed.
# name of this watershed
name = 'RussianRiver'
hucs = ['18010110'] # a list of HUCs to run

# Geometric parameters
# -- parameters to clean and reduce the river network prior to meshing
prune_by_area = 10               # km^2; passed to reduceRivers when the network is pruned
simplify = 125                   # length scale to target average edge

# -- mesh triangle refinement control
# NOTE(review): refine_d0/refine_d1 are not used in this notebook; presumably they are
# distances paired with the lengths L0/L1 below -- confirm units and meaning where
# they are consumed.
refine_d0 = 200
refine_d1 = 600

# NOTE(review): presumably target edge lengths, in the same units as `simplify` -- confirm
refine_L0 = 125
refine_L1 = 300

# areas derived from the lengths above: L**2 / 2 is the area of a right triangle
# whose two legs both have length L
refine_A0 = refine_L0**2 / 2
refine_A1 = refine_L1**2 / 2

# Note that, by default, we tend to work in the DayMet CRS because this allows us to avoid
# reprojecting meteorological forcing datasets.
crs = watershed_workflow.crs.default_crs


In [None]:
# set up a dictionary of source objects
#
# Data sources, also called managers, deal with downloading and parsing data files from a variety of online APIs.
# The dictionary is indexed by role below, e.g. sources['HUC'] and sources['hydrography'].
sources = watershed_workflow.sources.getDefaultSources()

# log the sources that will be used here
watershed_workflow.sources.logSources(sources)


In [None]:
# get the shape and crs of the shape
# -- the requested CRS is printed first, then the CRS of the returned shapes, so the two can be compared
print(crs)
# request the shapes of the HUCs listed in `hucs`, asking for them in `crs`
watershed_shapes = sources['HUC'].getShapesByID(hucs, out_crs=crs)
print(watershed_shapes.crs)

## the Watershed

In [None]:
# Construct and plot the WW object used for storing watersheds.
# It is built from the HUC shapes fetched into `watershed_shapes`.
watershed = watershed_workflow.split_hucs.SplitHUCs(watershed_shapes)
watershed.plot()

## Gage Data

In [None]:
# find all gages in the river
import pygeohydro

# Initialize NWIS
nwis = pygeohydro.NWIS()

# Bounding box, in lat/lon, of ALL the watershed shapes.  total_bounds is used
# rather than the bounds of the geometry labeled 0 so that the query covers the
# whole domain when more than one HUC is requested, and so that it does not
# depend on how the shapes happen to be indexed.
bounds = watershed_shapes.to_crs(watershed_workflow.crs.latlon_crs).total_bounds
bbox = ','.join(f"{b:.06f}" for b in bounds)
print(bbox)

query = {
    "bBox": bbox,
    "siteType": "ST",  # stream types
    "parameterCd": "00060,00065", # Discharge and Gage height
    "hasDataTypeCd": "dv",  # Daily values
    "outputDataTypeCd": "dv",  # Output as daily values
}

#
# Fetch all available gages within the bounding box
sites = nwis.get_info(query).to_crs(crs)

#
# Spatial join: keep only sites that fall inside the watershed, not the bounds
sites = gpd.sjoin(sites, watershed_shapes, how="inner", predicate="intersects")

#
# limit to sites with end_date after 2000 -- they MIGHT have good data
sites = sites[sites['end_date'] > '2000-01-01']

sites


In [None]:
#
# do they actually have data?  download discharge
#
# the window requested for every candidate site: (start, end)
dates = ('2000-01-01', '2026-01-01')
# NOTE(review): mmd=True presumably asks for discharge normalized to mm/day -- confirm
# against the pygeohydro documentation
qobs = nwis.get_streamflow(sites.site_no.to_list(), dates, mmd=True)
qobs

In [None]:
#
# Many gages have data (36 when this was written)... how many have lots of data?
# Let's say at least 10 years of daily data since 2000...
#
qobs_10yrs_k = [k for k in qobs.keys() if qobs[k].count() > 10*365]
# print the count explicitly: a bare expression in the middle of a cell is never displayed
print(len(qobs_10yrs_k))

# these are the gages (24 when this was written) that we will mesh into the domain...
qobs_10yrs = qobs[qobs_10yrs_k]
qobs_10yrs

In [None]:
#
# Build a fresh sites dataframe with one row of metadata per continuous-record site,
# instead of the repeated site_nos associated with different ranges
#
_station_ids = list(qobs_10yrs.keys())
_station_meta = qobs_10yrs.attrs

# one lat/lon point per station, taken from its metadata
geom = [shapely.geometry.Point(_station_meta[k]['dec_long_va'], _station_meta[k]['dec_lat_va'])
        for k in _station_ids]
sites_10yrs = gpd.GeoDataFrame(geometry=geom, crs=watershed_workflow.crs.latlon_crs)

# every metadata field found on the first station becomes a column
for col in _station_meta[_station_ids[0]].keys():
    sites_10yrs[col] = [_station_meta[k][col] for k in _station_ids]

# the station key and its number of non-missing records
sites_10yrs['ID'] = _station_ids
sites_10yrs['count'] = [qobs_10yrs[k].count() for k in _station_ids]

# move from lat/lon into the working CRS
sites_10yrs = sites_10yrs.to_crs(crs)
sites_10yrs

In [None]:
import pynhd
nldi = pynhd.NLDI()

# look up the NLDI feature of each gage from its site ID
features = nldi.getfeature_byid("nwissite", sites_10yrs['ID'])

# pull a few things over from the NLDI features
# NOTE(review): assigning a Series aligns it to sites_10yrs by index label, so this
# presumably relies on `features` coming back with one row per site, in the same
# order and with a matching index -- confirm against pynhd.
sites_10yrs['comid'] = features['comid'].astype(int)
sites_10yrs['measure'] = features['measure'] # in % of total length, the distance up the reach from the downstream point to the gage
sites_10yrs

In [None]:
# no reach (comid) may carry more than one gage -- this would break our subdomain decomposition
assert sites_10yrs['comid'].is_unique

In [None]:
# check if there are pairs of points that are super close

def checkTooClose(pts, geom='geometry', threshold=1000):
    """Find pairs of points that are very close together or share a comid.

    Every unordered pair of rows is examined once.  Both kinds of pairs are
    printed and returned.

    Parameters
    ----------
    pts : DataFrame
        Must have a 'comid' column and a column named `geom` whose entries
        provide a ``distance(other)`` method.
    geom : str, optional
        Name of the column holding the point geometries.
    threshold : float, optional
        Pairs whose distance is less than or equal to this value are
        reported as close.  Defaults to 1000; the unit is that of
        ``distance`` (m when the geometries are in a metric CRS).

    Returns
    -------
    pairs_dist : list of (label_i, label_j, distance)
        Index labels and distance of each close pair.
    pairs_comid : list of (label_i, label_j, comid)
        Index labels and the shared comid of each pair on the same reach.
    """
    from itertools import combinations

    # pull the columns out once rather than building a row per comparison
    labels = list(pts.index)
    shapes = list(pts[geom])
    comids = list(pts['comid'])

    pairs_dist = []
    pairs_comid = []
    for i, j in combinations(range(len(labels)), 2):
        d = shapes[i].distance(shapes[j])
        if d <= threshold:
            pairs_dist.append((labels[i], labels[j], d))

        if comids[i] == comids[j]:
            pairs_comid.append((labels[i], labels[j], comids[j]))

    print('Close pairs:')
    for i, j, d in pairs_dist:
        print(f'point {i} ({pts.loc[i, geom]}) and point {j} ({pts.loc[j, geom]}) are within {d} m')

    print('Shared comid:')
    for i, j, c in pairs_comid:
        print(f'point {i} ({pts.loc[i, geom]}) and point {j} ({pts.loc[j, geom]}) share comid {c}')
    return pairs_dist, pairs_comid

# run the check on the candidate gages; only the close-by-distance pairs are kept here
pairs_dist, _ = checkTooClose(sites_10yrs)

In [None]:
# Let's remove the gage with fewer records from each too-close pair.  The labels
# are collected first and dropped together, so that every pair is resolved (not
# only the last one) and nothing fails when there are no close pairs at all.
to_drop = []
for i, j, d in pairs_dist:
    # on a tie in record count, the second gage of the pair is the one dropped
    fewer = i if sites_10yrs.loc[i, 'count'] < sites_10yrs.loc[j, 'count'] else j
    if fewer not in to_drop:
        to_drop.append(fewer)

sites_10yrs = sites_10yrs.drop(index=to_drop)

In [None]:
#
# A first pass through this workflow showed that two more gages --
#   USGS-11463980
#   USGS-11464000
# sit rather close to each other, leaving a small subcatchment between them.  The
# subcatchments downstream of 11464000 and upstream of 11463980 then touch each other
# on either side of that small subcatchment, which breaks the partitioning algorithm.
#
# We'll remove the upstream one, as it has fewer records.
_keep = sites_10yrs['ID'] != 'USGS-11463980'
sites_10yrs = sites_10yrs[_keep]


## the Rivers 

In [None]:
# download/collect the river network within that shape's bounds
reaches = sources['hydrography'].getShapesByGeometry(watershed.df, out_crs=crs)

# remove coastlines
reaches = reaches[reaches.ftype != 'Coastline']
# sanity check: CRS of the reaches
print(reaches.crs)

# construct rivers
rivers = watershed_workflow.river_tree.createRivers(reaches, method='hydroseq')
# sanity check: CRS of the first river's dataframe
print(rivers[0].df.crs)

reaches

In [None]:
#
# is the reach of every gage present in the full set of reaches?
_gage_in_reaches = sites_10yrs['comid'].isin(reaches['comid'])
print(sum(_gage_in_reaches), ' of ', len(sites_10yrs), ' are in the set of ALL reaches')

In [None]:
def plot(ws, rivs, ax=None):
    """Draw a watershed and a collection of rivers on one set of axes.

    Parameters
    ----------
    ws : object
        Plotted first, through its ``plot`` method, in black with '+' markers.
    rivs : iterable
        Each entry is plotted through its ``plot`` method with 'x' markers.
    ax : optional
        Axes to draw on; a new figure and axes are created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    target = ax
    if target is None:
        _, target = plt.subplots()

    ws.plot(color='k', marker='+', markersize=10, ax=target)
    for river in rivs:
        river.plot(marker='x', markersize=10, ax=target)
    return target

# plot the watershed together with the full (unpruned) set of rivers
ax = plot(watershed, rivers)

    

In [None]:
# the outlet here needs to be modified thanks to the coastline reaches

# move the endpoint to the boundary
# op: the last coordinate of the first river's linestring
op = rivers[0].linestring.coords[-1]
# cp: the first of the two points returned by nearest_points, i.e. the one lying on
# watershed.exterior.exterior (presumably the outer boundary ring of the domain -- confirm)
cp = shapely.ops.nearest_points(watershed.exterior.exterior, rivers[0].linestring)[0]
print(op, cp)

# NOTE(review): presumably replaces the river's last coordinate with cp -- confirm
# that moveCoordinate accepts a shapely Point here
rivers[0].moveCoordinate(-1, cp)

In [None]:
# prune
# prune_by_area comes from the Parameters cell (documented there as km^2)
rivers = watershed_workflow.reduceRivers(rivers, 
                                         prune_by_area=prune_by_area,
                                         remove_diversions=True,
                                         remove_braided_divergences=True)

# reset each river's dataframe after the reduction
for river in rivers:
    river.resetDataFrame()

# one table holding the reaches of every remaining river
reduced_reaches = pd.concat([r.df for r in rivers])

In [None]:
# is the reach of every gage STILL present after the network was reduced?
_gage_in_reduced = sites_10yrs['comid'].isin(reduced_reaches['comid'])
print(sum(_gage_in_reduced), ' of ', len(sites_10yrs), ' are in the set of REDUCED reaches')

## Put the gages on the reaches, and use these to define subcatchments

We don't have a good way of splitting the catchment of a reach at the gage's measure of the reach.  If we had this, we could split both the reach and the catchment at the actual gage location, and make subcatchments that respect this.  Let's try to assign the gage to an upstream-most or downstream-most point on the reach instead.



In [None]:
# add another "gage" point -- the outlet of the full domain
# (only a single river is supported: its last coordinate is used as the outlet)
assert len(rivers) == 1
# renumber the rows; reset_index() keeps the old labels in an 'index' column,
# which is removed again from `sites` below
sites_10yrs = sites_10yrs.reset_index()

# append one extra row for the outlet: it takes the comid of rivers[0], measure 0,
# and the label that follows the last gage row
sites = gpd.GeoDataFrame(
    pd.concat([sites_10yrs,
           gpd.GeoDataFrame({'comid' : rivers[0]['comid'],
                             'measure' : 0.,
                             'station_nm' : 'Russian River Outlet',
                             'ID' : 'RR-outlet',
                            },
                            index=[len(sites_10yrs),],
                            geometry=[shapely.geometry.Point(rivers[0].linestring.coords[-1]),],
                            crs=sites_10yrs.crs),
          ]), crs=sites_10yrs.crs)

# use the ID as the standard name column as well
sites[names.NAME] = sites[names.ID]
# drop the 'index' column left behind by reset_index()
sites.pop('index')
sites


In [None]:
sites

In [None]:
#
# first map from the site locations to the end of the reach we wish to map the gage onto
#
# the reach of each site is identified by its comid, stored as a string
sites['reach_ID'] = sites['comid'].astype(str)
# NOTE(review): the result is used below with 'true_geometry' and 'geometry' columns --
# confirm the full set of columns against determineOutletToReachMap
watershed_polys = watershed_workflow.river_tree.determineOutletToReachMap(rivers, sites)



In [None]:
# did moving the gages to outlets leave our set of points intact?
c1, c2 = checkTooClose(watershed_polys, 'true_geometry')
assert not c1  # no two points may be reported as too close
assert not c2  # no two points may share a comid

In [None]:
# now accumulate incremental catchments
# NOTE(review): the 'incremental_catchment' column read in the next cell is presumably
# added by this call -- confirm
watershed_polys = watershed_workflow.river_tree.accumulateIncrementalCatchments(rivers, watershed_polys)
watershed_polys


In [None]:
# remap some of the geometry fields to make life a little simpler
# -- keep the gage's true location, and the previous 'geometry' column, under their own names
watershed_polys['true_gage_geometry'] = watershed_polys['true_geometry']
watershed_polys['outlet'] = watershed_polys['geometry']
# -- then make the incremental catchment the active geometry, keeping the same CRS
watershed_polys['geometry'] = watershed_polys['incremental_catchment']
watershed_polys = watershed_polys.set_geometry('geometry', crs=watershed_polys.crs)
print(watershed_polys.crs)


In [None]:
# A catchment may come back as a MultiPolygon: keep only the part chosen by
# findBiggest, and leave every other geometry untouched.
_single_part = []
for _shape in watershed_polys.geometry:
    if isinstance(_shape, shapely.geometry.MultiPolygon):
        _shape = watershed_workflow.split_hucs.findBiggest(_shape.geoms)
    _single_part.append(_shape)

watershed_polys['geometry'] = _single_part
print(watershed_polys.crs)
watershed_polys


In [None]:
# every catchment must have strictly positive area
print(watershed_polys.area)
_smallest_area = min(watershed_polys.area)
assert _smallest_area > 0.

# number of catchments, then number of sites, for comparison by eye
print(len(watershed_polys))
print(len(sites))

In [None]:
# Pairwise check of the catchment polygons: report any polygon that contains another,
# and any pair whose intersection is more than 1% of the area of either polygon.
_polys = list(watershed_polys.geometry)

for i, geo_i in enumerate(_polys):
    # only pairs with j > i are visited, so each pair is examined once
    for j, geo_j in enumerate(_polys[i + 1:], start=i + 1):
        if geo_i.contains(geo_j):
            print(f"CONTAINS: {watershed_polys.index[i]} comid {watershed_polys.iloc[i]['comid']} contains {watershed_polys.index[j]} comid {watershed_polys.iloc[j]['comid']}")
        if geo_j.contains(geo_i):
            print(f"CONTAINS: {watershed_polys.index[j]} comid {watershed_polys.iloc[j]['comid']} contains {watershed_polys.index[i]} comid {watershed_polys.iloc[i]['comid']}")

        # fraction of each polygon that is covered by the shared region
        area = geo_i.intersection(geo_j).area
        af_i = area / geo_i.area
        af_j = area / geo_j.area
        if af_i > .01 or af_j > .01:
            print('large intersection:', watershed_polys.index[i], watershed_polys.index[j], 'area_frac =', af_i, af_j)

In [None]:
# save these results to disk: parquet for the geometries, CSV for the discharge.
# Each call to toOutput() also records the filename under its role in output_filenames.
watershed_polys.to_parquet(toOutput('watershed_polys', '01_watershed_polys.parquet'))

# one dataframe holding every river, in the working CRS
river_df = gpd.GeoDataFrame(pd.concat([r.to_dataframe() for r in rivers]), crs=crs)
river_df.to_parquet(toOutput('rivers', '01_rivers.parquet'))

# NOTE(review): this writes the full `qobs` table, not the `qobs_10yrs` subset whose
# gages were used to build the catchments -- confirm this is intended
qobs.to_csv(toOutput('evaluation_discharge', '01_discharge_observations.csv'))


In [None]:
# save output filenames
# The role -> filename dictionary is pickled; because toOutput() registers this file
# before the dump, the dictionary saved here includes the entry for this file itself.
# NOTE(review): the role key '04_output_filenames' does not match the '01_' prefix of
# this step's files, and the file is a binary pickle despite its .txt extension --
# confirm both against whatever reads this file.
with open(toOutput('04_output_filenames', '01_output_filenames.txt'), 'wb') as fid:
    pickle.dump(output_filenames, fid)