In [36]:
import geopandas as gpd
import os
import pandas as pd
import pyarrow as pa

In [2]:
## Set-up
data_path = '/nas/cee-water/cjgleason/craig/CONUS_ephemeral_data/'
mdata_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/data/'
save_path = '../narrow_rivers_PIXC_data/NHD_prepped/'
# Max binsize of 1000 is plenty for CONUS w/o lakes
# Mississippi is ~472 m wide at mouth as calculated here
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 500, 1000]

In [3]:
# Define dtypes for lookup tables to preserve leading zeros
dtype_dic= {'HUC4': str, 'HUC2': str, 'toBasin': str, 'level': str}
# Read in HUC lookup table
lookup = pd.read_csv(os.path.join(mdata_path,
                                  'HUC4_lookup_no_great_lakes.csv'),
                     dtype=dtype_dic)

In [4]:
# Get slurm job index
i = 4

In [5]:
# Get current HUC2 and HUC4 IDs
huc2 = 'HUC2_' + lookup.loc[i,'HUC4'][0:2]
huc4 = 'NHDPLUS_H_' + lookup.loc[i,'HUC4'] + '_HU4_GDB'

In [6]:
# Set data filepath
file_path = os.path.join(data_path, huc2, huc4, huc4 + '.gdb')

In [7]:
# Set write filepath
save_path = os.path.join(save_path, huc2)
save_file = huc4 + '_prepped.gpkg' ## CHANGE

In [8]:
## Prep Physiographic Regions
# https://www.sciencebase.gov/catalog/item/631405bbd34e36012efa304e
physio = gpd.read_file(filename=os.path.join(data_path,
                                             'other_shapefiles/physio.shp'),
                       engine='pyogrio')
# Set CRS to Web Mercator
physio = physio.to_crs(epsg=3857)
# Dissolve provinces by division
physio = physio.dissolve(by='DIVISION').reset_index()
# Drop all columns besides division and geometry
physio = physio[['DIVISION', 'geometry']]

In [9]:
## Get bankfull width coefficients from Bieber et al. 2015, Table 3
bankfull = pd.read_csv(os.path.join(mdata_path,
                                    'bieger_2015_bankfull_width.csv'))

In [11]:
## Merging
# Read in NHD flowlines
basin = gpd.read_file(filename=file_path, layer='NHDFlowline', engine='pyogrio')
# Set CRS to Pseudo-Mercator https://epsg.io/3857
basin = basin.to_crs(epsg=3857)

  return ogr_read(


In [12]:
# Read in VAA
vaa = gpd.read_file(filename=file_path, layer='NHDPlusFlowlineVAA', engine='pyogrio')
# Merge on VAA
basin = basin.merge(right=vaa, how='inner', on=['NHDPlusID', 'VPUID', 'ReachCode'])
# Read in EROMMA
eromma = gpd.read_file(filename=file_path, layer='NHDPlusEROMMA', engine='pyogrio')
# Merge on EROMMA
basin = basin.merge(right=eromma, how='inner', on=['NHDPlusID', 'VPUID'])

In [13]:
## Filtering
# Read in NHD Waterbody polygons
area = gpd.read_file(filename=file_path, layer='NHDWaterbody',
                     columns=['NHDPlusID', 'geometry'], engine='pyogrio')
# Set CRS to Pseudo-Mercator https://epsg.io/3857
area = area.to_crs(epsg=3857)

In [14]:
# Find all flowlines within waterbodies
subset = basin.sjoin(df=area, how='inner', predicate='within')
# Get IDs of these flowlines
ids = subset.NHDPlusID_left.to_list()

In [15]:
# Drop reaches within waterbodies
basin = basin[~basin.NHDPlusID.isin(ids)]
# Drop reaches that aren't stream types or artificial path
basin = basin.loc[(basin.FType == 460) | (basin.FType == 558)]
# Drop reaches that are terminal paths
basin = basin.loc[basin.TerminalFl == 0]
# Drop reaches with discharge of zero
basin = basin.loc[basin.QBMA > 0]
# Drop reaches with stream order of zero
basin = basin.loc[basin.StreamOrde > 0]

In [16]:
## Find the physiographic division each reach is within
# Note: there is broken topology even after the dissolve,
# and neither shapely nor sf fully repaired it
basin = basin.sjoin(df=physio, how='left',
                    predicate='intersects').drop(columns='index_right')
# Drop all reaches where DIVISION == NaN (in Canada and at the coasts)
basin = basin[~basin.DIVISION.isnull()]

In [18]:
## Get bankfull widths
# Merge on bankfull width coefficient
basin = basin.merge(bankfull, on='DIVISION', how='left')
# Calculate width from cumulative drainage area
basin['WidthM'] = basin.a*basin.TotDASqKm**basin.b

In [24]:
### NEW
# Calculate the multiplicative factor from the standard error
# of estimate for each physiographic division (SEEpd) by exponentiation
basin['mul_factor'] = 10**basin.see_phys
# Calculate the min width of each reach
basin['WidthM_Min'] = basin.WidthM/basin.mul_factor
# Calculate the max width of each basin
basin['WidthM_Max'] = basin.WidthM*basin.mul_factor

In [39]:
# Drop reaches that are shorter than their width
basin = basin[basin['LengthKM']*1000 > basin['WidthM']]

In [43]:
## Bin reaches by width, set to string for parquet
basin['Bin'] = pd.cut(basin['WidthM'], bins).astype(str)
basin['Bin_Min'] = pd.cut(basin['WidthM_Min'], bins).astype(str)
basin['Bin_Max'] = pd.cut(basin['WidthM_Max'], bins).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [34]:
pd.set_option('display.max_rows', 95)

In [45]:
basin.dtypes

Permanent_Identifier                 object
FDate                   datetime64[ms, UTC]
Resolution                            int32
GNIS_ID                              object
GNIS_Name                            object
                               ...         
WidthM                              float64
mul_factor                          float64
WidthM_Min                          float64
WidthM_Max                          float64
Bin                                  object
Length: 96, dtype: object