In [1]:
import os
import time

import dask.dataframe as dd
import geopandas as gpd
import matplotlib.cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from reaches import *

In [2]:
def read_output(data_path):

    files = []
    for file in os.listdir(data_path):
        if not file.startswith('.'):
            if 'min' in file:
                files.append(os.path.join(data_path, file))
                
    # Make df dataframe
    d = []

    for file in files:
        data = pd.read_parquet(file)
        data = data.sort_values(by='NHDPlusID').iloc[::10]
        data['file'] = file
        d.append(data)
        
    df = pd.concat(d, axis=0, ignore_index=True)
    return df

In [3]:
# Order of all possible bins for plotting
order = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]',
         '(50, 60]', '(60, 70]', '(70, 80]', '(80, 90]', '(90, 100]',
         '(100, 150]', '(150, 200]', '(200, 500]']

In [None]:
# USE THIS ONCE I RERUN AND WRITE OUT REACHES NOT NODES FOR MIN

# # Get list of files    
# files = []
# for file in os.listdir(data_path):
#     if not file.startswith('.'):
#         if 'min' in file:
#             files.append(os.path.join(data_path, file))

# ddf_ne = dd.read_parquet(files)

In [4]:
pd.options.display.float_format = '{:.2f}'.format

## For plotting

In [None]:
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    new_cmap = colors.LinearSegmentedColormap.from_list(
        'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
        cmap(np.linspace(minval, maxval, n)))
    return new_cmap

In [None]:
cmap = plt.get_cmap('cubehelix', 5)
new_cmap = truncate_colormap(cmap, 0.2, 0.75)

In [None]:
color = []

for i in np.linspace(0, 1, 3):
    color.append(colors.rgb2hex(new_cmap(i)))

In [None]:
palette = sns.color_palette(color)

In [None]:
# Order of all possible bins for plotting
order = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]',
         '(50, 60]', '(60, 70]', '(70, 80]', '(80, 90]', '(90, 100]',
         '(100, 150]', '(150, 200]', '(200, 500]']

In [None]:
# save_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/figures_for_2025_03_07/'

## Minimum Coverage

In [5]:
narrow = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]']

### HUC2_01, Northeast

In [6]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '01'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [7]:
df_ne = read_output(data_path=data_path)

In [8]:
df_ne['coverage'] = df_ne['coverage']*100

In [9]:
df_ne['granule'] = df_ne['file'].str.slice(63, -20)

In [None]:
# df_ne[df_ne['granule'].str.contains('PIC2')]

In [15]:
summary = pd.DataFrame(df_ne.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [16]:
summary['ten']  = df_ne[df_ne.coverage > 10].groupby('NHDPlusID').count()['coverage']
summary['twenty']  = df_ne[df_ne.coverage > 20].groupby('NHDPlusID').count()['coverage']
summary['thirty']  = df_ne[df_ne.coverage > 30].groupby('NHDPlusID').count()['coverage']
summary['forty']  = df_ne[df_ne.coverage > 40].groupby('NHDPlusID').count()['coverage']
summary['fifty']  = df_ne[df_ne.coverage > 50].groupby('NHDPlusID').count()['coverage']
summary['sixty']  = df_ne[df_ne.coverage > 60].groupby('NHDPlusID').count()['coverage']
summary['seventy']  = df_ne[df_ne.coverage > 70].groupby('NHDPlusID').count()['coverage']
summary['eighty']  = df_ne[df_ne.coverage > 80].groupby('NHDPlusID').count()['coverage']
summary['ninety']  = df_ne[df_ne.coverage > 90].groupby('NHDPlusID').count()['coverage']

In [17]:
summary = pd.merge(left=summary, right=df_ne.drop_duplicates(subset='NHDPlusID')[['NHDPlusID', 'Bin']], on='NHDPlusID', how='left')

In [26]:
summary

Unnamed: 0,NHDPlusID,observations,ten,twenty,thirty,forty,fifty,sixty,seventy,eighty,ninety,Bin
0,5000100000001.00,22,3.00,1.00,,,,,,,,"(10, 20]"
1,5000100000006.00,23,1.00,,,,,,,,,"(0, 10]"
2,5000100000007.00,50,,,,,,,,,,"(10, 20]"
3,5000100000008.00,90,5.00,1.00,,,,,,,,"(0, 10]"
4,5000100000009.00,26,,,,,,,,,,"(0, 10]"
...,...,...,...,...,...,...,...,...,...,...,...,...
272738,10000900099059.00,69,,,,,,,,,,"(0, 10]"
272739,10000900099062.00,69,,,,,,,,,,"(0, 10]"
272740,10000900099065.00,69,,,,,,,,,,"(0, 10]"
272741,10000900099066.00,69,,,,,,,,,,"(0, 10]"


In [None]:
passed_ne = summary[(summary.twenty/summary.observations > 0.5) & summary.Bin.isin(narrow)].sort_values(by='observations', ascending=False).reset_index().drop(columns='index')

In [None]:
passed_ne

In [None]:
# passed_ne.to_csv('passed_ne.csv')

In [None]:
ne_ids = passed_ne['NHDPlusID'].to_list()

In [None]:
len(ne_ids)

In [None]:
seen = df_ne[df_ne['NHDPlusID'].isin(ne_ids)].drop(columns='file')

In [None]:
good_ne = seen[seen['coverage'] > 20].sort_values(by=['NHDPlusID', 'coverage'], ascending=[True, False]).reset_index().drop(columns='index')

In [None]:
good_ne

In [None]:
# good_ne.to_csv('good_ne.csv')

In [None]:
# seen[(seen['NHDPlusID'] == 10000900048960.00) & (seen['coverage'] > 30)]

### HUC2_15, Southwest

In [19]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '15'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [20]:
df_sw = read_output(data_path=data_path)

In [21]:
df_sw['granule'] = df_sw['file'].str.slice(63, -20)

In [22]:
df_sw['coverage'] = df_sw['coverage']*100

In [23]:
summary_sw = pd.DataFrame(df_sw.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [24]:
summary_sw['ten']  = df_sw[df_sw.coverage > 10].groupby('NHDPlusID').count()['coverage']
summary_sw['twenty']  = df_sw[df_sw.coverage > 20].groupby('NHDPlusID').count()['coverage']
summary_sw['thirty']  = df_sw[df_sw.coverage > 30].groupby('NHDPlusID').count()['coverage']
summary_sw['forty']  = df_sw[df_sw.coverage > 40].groupby('NHDPlusID').count()['coverage']
summary_sw['fifty']  = df_sw[df_sw.coverage > 50].groupby('NHDPlusID').count()['coverage']
summary_sw['sixty']  = df_sw[df_sw.coverage > 60].groupby('NHDPlusID').count()['coverage']
summary_sw['seventy']  = df_sw[df_sw.coverage > 70].groupby('NHDPlusID').count()['coverage']
summary_sw['eighty']  = df_sw[df_sw.coverage > 80].groupby('NHDPlusID').count()['coverage']
summary_sw['ninety']  = df_sw[df_sw.coverage > 90].groupby('NHDPlusID').count()['coverage']

In [None]:
# summary_sw = summary_sw.reset_index()

In [25]:
summary_sw = pd.merge(left=summary_sw, right=df_sw.drop_duplicates(subset='NHDPlusID')[['NHDPlusID', 'Bin', 'granule']], on='NHDPlusID', how='left')

In [None]:
passed_sw = summary_sw[(summary_sw.twenty/summary_sw.observations > 0.5) & summary_sw.Bin.isin(narrow)].sort_values(by='observations', ascending=False).reset_index().drop(columns='index')

In [None]:
passed_sw

In [None]:
# passed_sw.to_csv('passed_sw.csv')

In [None]:
sw_ids = passed_sw['NHDPlusID'].to_list()

In [None]:
len(sw_ids)

In [None]:
seen_sw = df_sw[df_sw['NHDPlusID'].isin(sw_ids)].drop(columns='file')

In [None]:
good_sw = seen_sw[seen_sw['coverage'] > 20].sort_values(by=['NHDPlusID', 'coverage'], ascending=[True, False]).reset_index().drop(columns='index')

In [None]:
good_sw

In [None]:
# good_sw.to_csv('good_sw.csv')

### Testing

In [None]:
# test = ddf[ddf['file'].str.contains('313_233R')].compute()

In [None]:
# test

In [None]:
# len(test.NHDPlusID.unique())

In [None]:
# summary = pd.DataFrame(test.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [None]:
# summary['ten']  = test[test.coverage > 0.1].groupby('NHDPlusID').count()['coverage']
# summary['twenty']  = test[test.coverage > 0.2].groupby('NHDPlusID').count()['coverage']
# summary['thirty']  = test[test.coverage > 0.3].groupby('NHDPlusID').count()['coverage']
# summary['forty']  = test[test.coverage > 0.4].groupby('NHDPlusID').count()['coverage']
# summary['fifty']  = test[test.coverage > 0.5].groupby('NHDPlusID').count()['coverage']
# summary['sixty']  = test[test.coverage > 0.6].groupby('NHDPlusID').count()['coverage']
# summary['seventy']  = test[test.coverage > 0.7].groupby('NHDPlusID').count()['coverage']
# summary['eighty']  = test[test.coverage > 0.8].groupby('NHDPlusID').count()['coverage']
# summary['ninety']  = test[test.coverage > 0.9].groupby('NHDPlusID').count()['coverage']

In [None]:
# summary = pd.merge(left=summary, right=test, on='NHDPlusID', how='left')

In [None]:
# summary[summary.forty/summary.observations > 0.75]

In [None]:
# ddf.groupby('NHDPlusID').count().compute()

In [None]:
# ddf.groupby('NHDPlusID').apply(lambda group: (group['coverage'] > 0.5).count()).compute()

In [None]:
# test = dd.read_parquet(files[0])

In [None]:
# len(test_min.NHDPlusID.unique())

In [None]:
# len(test.sort_values(by='NHDPlusID').compute().iloc[::10].NHDPlusID.unique())

In [None]:
# len(test_min.NHDPlusID.unique())

In [None]:
# len(test_min.sort_values(by='NHDPlusID')[::10].NHDPlusID.unique())

In [None]:
# ddf_ne = ddf_ne.drop(columns='NHDPlusID')

In [None]:
# ddf_ne['coverage'] = ddf_ne['coverage']*100

In [None]:
# fort = ddf_ne[ddf_ne['NHDPlusID'] == 10000900004534].compute()

In [None]:
# fort[fort['coverage'] > 0]

## Find centroids

In [None]:
mdata_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/data/'

In [None]:
# Get NHD index metadata
# Define dtypes for lookup tables to preserve leading zeros
dtype_dic= {'HUC4': str, 'HUC2': str, 'toBasin': str, 'level': str}
# Read in HUC lookup table
huc_lookup = pd.read_csv(os.path.join(mdata_path,
                                  'HUC4_lookup_no_great_lakes.csv'),
                     dtype=dtype_dic)

In [None]:
# Read in HUC4 boundaries (no Great Lakes)
data_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_data/all_wbd_no_great_lakes.parquet'
wbd = gpd.read_parquet(path=data_path)

In [None]:
# Project CRS
# wbd = wbd.to_crs(epsg=3857)

In [None]:
hucs_01 = wbd[wbd['huc4'].str.startswith('01')]['huc4'].to_list()

In [None]:
hucs_15 = wbd[wbd['huc4'].str.startswith('15')]['huc4'].to_list()

In [None]:
# Extract indices for read-in
indices_01 = list(huc_lookup[huc_lookup['HUC4'].isin(hucs_01)]['slurm_index'])
indices_15 = list(huc_lookup[huc_lookup['HUC4'].isin(hucs_15)]['slurm_index'])

## NHD reaches with SWORD matches

In [31]:
data_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_data/'

In [74]:
sword = gpd.read_file(os.path.join(data_path, 'fiona_reach_set_sword_v18.gpkg'))

In [55]:
# sword_ids = sword.fid.unique()

In [56]:
matches = pd.read_csv(os.path.join(data_path, 'narrow_candidate_centroids_20_30_sword_v18.csv'))

In [81]:
bins = matches.Bin.unique()

In [68]:
ne_match_ids = list(matches[matches.huc2 == 1]['NHDPlusID'])

In [70]:
sw_match_ids = list(matches[matches.huc2 == 15]['NHDPlusID'])

In [5]:
narrow = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]']

### HUC2_01, Northeast

In [6]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '01'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [7]:
df_ne = read_output(data_path=data_path)

In [8]:
df_ne['coverage'] = df_ne['coverage']*100

In [9]:
df_ne['granule'] = df_ne['file'].str.slice(63, -20)

### HUC2_15, Southwest

In [19]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '15'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [20]:
df_sw = read_output(data_path=data_path)

In [21]:
df_sw['granule'] = df_sw['file'].str.slice(63, -20)

In [22]:
df_sw['coverage'] = df_sw['coverage']*100

### Find granules

In [104]:
ne_granules = df_ne[(df_ne.NHDPlusID.isin(ne_match_ids)) & (df_ne.coverage > 30)].reset_index().drop(columns='index')

In [105]:
sw_granules = df_sw[(df_sw.NHDPlusID.isin(sw_match_ids)) & (df_sw.coverage > 30)].reset_index().drop(columns='index')

In [106]:
df = pd.concat((ne_granules, sw_granules))

In [109]:
df = df.sort_values(by=['NHDPlusID', 'coverage'], ascending=[True, False]).reset_index().drop(columns=['index', 'file'])

In [113]:
df = df.rename(columns={'coverage': 'min_coverage'})

In [118]:
df.to_csv('granules_for_reaches_with_v18_matches.csv', index=False)

In [117]:
cd ../narrow_rivers_PIXC_output/SWORDv18_matches/

/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_output/SWORDv18_matches


## Analysis

In [None]:
# Create merged dataframe of all flowlines intersected
# Initialize lists
d = []

# Loop through indices and store in lists
for i in indices_01:
    # Read prepped NHD
    flowlines, huc4, huc2 = readNHD(index=i)
    flowlines['huc4'] = huc4[10:14]
    flowlines['huc2'] = huc2[-2:]
    # Append to lists
    d.append(flowlines)
# Merge GeoDataFrames
flowlines_01 = pd.concat(d)

In [None]:
# Create merged dataframe of all flowlines intersected
# Initialize lists
d = []

# Loop through indices and store in lists
for i in indices_15:
    # Read prepped NHD
    flowlines, huc4, huc2 = readNHD(index=i)
    flowlines['huc4'] = huc4[10:14]
    flowlines['huc2'] = huc2[-2:]
    # Append to lists
    d.append(flowlines)
# Merge GeoDataFrames
flowlines_15 = pd.concat(d)

In [None]:
temp_01 = flowlines_01[['NHDPlusID', 'GNIS_Name', 'Bin', 'huc2', 'huc4', 'geometry']]
temp_15 = flowlines_15[['NHDPlusID', 'GNIS_Name', 'Bin', 'huc2', 'huc4', 'geometry']]

In [None]:
both = pd.concat(objs=[temp_01, temp_15]).reset_index().drop(columns='index')

In [None]:
both['Bin'] = pd.Categorical(both['Bin'], order)

In [None]:
all_ids = ne_ids + sw_ids

In [None]:
for_geom = both[both['NHDPlusID'].isin(all_ids)].reset_index().drop(columns='index')

In [None]:
for_geom = for_geom.to_crs(epsg=4326)

In [None]:
for_geom['centroid'] = for_geom['geometry'].centroid

In [None]:
for_geom[for_geom['huc2'] == '15'].plot()
# plt.xlim(-110.8,-110.4)
# plt.ylim(33.6,34)

In [None]:
for_geom.to_parquet('narrow_candidate_lines_and_centroids_20_50.parquet')

In [None]:
center = for_geom.copy()

In [None]:
center = center.set_geometry('centroid').drop(columns='geometry')

In [None]:
center.to_file('narrow_candidate_centroids_20_30.gpkg')

In [None]:
center

In [None]:
fig, ax = plt.subplots()

for_geom.iloc[[0]].plot(ax=ax)
for_geom['centroid'].iloc[[0]].plot(ax=ax)