In [1]:
import os
import time

import dask.dataframe as dd
import matplotlib.cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
def read_output(data_path):

    files = []
    for file in os.listdir(data_path):
        if not file.startswith('.'):
            if 'min' in file:
                files.append(os.path.join(data_path, file))
                
    # Make df dataframe
    d = []

    for file in files:
        data = pd.read_parquet(file)
        data = data.sort_values(by='NHDPlusID').iloc[::10]
        data['file'] = file
        d.append(data)
        
    df = pd.concat(d, axis=0, ignore_index=True)
    return df

In [9]:
# USE THIS ONCE I RERUN AND WRITE OUT REACHES NOT NODES FOR MIN

# # Get list of files    
# files = []
# for file in os.listdir(data_path):
#     if not file.startswith('.'):
#         if 'min' in file:
#             files.append(os.path.join(data_path, file))

# ddf_ne = dd.read_parquet(files)

In [26]:
pd.options.display.float_format = '{:.2f}'.format

## For plotting

In [2]:
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    new_cmap = colors.LinearSegmentedColormap.from_list(
        'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
        cmap(np.linspace(minval, maxval, n)))
    return new_cmap

In [3]:
cmap = plt.get_cmap('cubehelix', 5)
new_cmap = truncate_colormap(cmap, 0.2, 0.75)

In [4]:
color = []

for i in np.linspace(0, 1, 3):
    color.append(colors.rgb2hex(new_cmap(i)))

In [5]:
palette = sns.color_palette(color)

In [6]:
# Order of all possible bins for plotting
order = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]',
         '(50, 60]', '(60, 70]', '(70, 80]', '(80, 90]', '(90, 100]',
         '(100, 150]', '(150, 200]', '(200, 500]']

In [7]:
# save_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/figures_for_2025_03_07/'

## Minimum Coverage

In [3]:
narrow = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]']

### HUC2_01, Northeast

In [4]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '01'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [5]:
df_ne = read_output(data_path=data_path)

In [6]:
df_ne['coverage'] = df_ne['coverage']*100

In [44]:
df_ne['granule'] = df_ne['file'].str.slice(63, -20)

In [76]:
# df_ne[df_ne['granule'].str.contains('PIC2')]

In [109]:
summary = pd.DataFrame(df_ne.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [110]:
summary['ten']  = df_ne[df_ne.coverage > 10].groupby('NHDPlusID').count()['coverage']
summary['twenty']  = df_ne[df_ne.coverage > 20].groupby('NHDPlusID').count()['coverage']
summary['thirty']  = df_ne[df_ne.coverage > 30].groupby('NHDPlusID').count()['coverage']
summary['forty']  = df_ne[df_ne.coverage > 40].groupby('NHDPlusID').count()['coverage']
summary['fifty']  = df_ne[df_ne.coverage > 50].groupby('NHDPlusID').count()['coverage']
summary['sixty']  = df_ne[df_ne.coverage > 60].groupby('NHDPlusID').count()['coverage']
summary['seventy']  = df_ne[df_ne.coverage > 70].groupby('NHDPlusID').count()['coverage']
summary['eighty']  = df_ne[df_ne.coverage > 80].groupby('NHDPlusID').count()['coverage']
summary['ninety']  = df_ne[df_ne.coverage > 90].groupby('NHDPlusID').count()['coverage']

In [111]:
summary = pd.merge(left=summary, right=df_ne.drop_duplicates(subset='NHDPlusID')[['NHDPlusID', 'Bin']], on='NHDPlusID', how='left')

In [112]:
passed_ne = summary[(summary.thirty/summary.observations > 0.75) & summary.Bin.isin(narrow)].sort_values(by='observations', ascending=False).reset_index().drop(columns='index')

In [135]:
# passed_ne

In [114]:
passed_ne.to_csv('passed_ne.csv')

In [115]:
ne_nids = summary[(summary.thirty/summary.observations > 0.75) & summary.Bin.isin(narrow)].sort_values(by='observations', ascending=False).sample(12)['NHDPlusID'].to_list()

In [116]:
ne_nids

[5000700072109.0,
 5000600024538.0,
 5000200005654.0,
 5000600016172.0,
 5000700072447.0,
 5000600020340.0,
 10000800066692.0,
 5000600032907.0,
 5000700071109.0,
 10000900092908.0,
 10000800047656.0,
 5000200018595.0]

In [119]:
seen = df_ne[df_ne['NHDPlusID'].isin(ne_nids)].drop(columns='file')

In [132]:
good_ne = seen[seen['coverage'] > 30].sort_values(by=['NHDPlusID', 'coverage'], ascending=[True, False]).reset_index().drop(columns='index')

In [133]:
good_ne

Unnamed: 0,NHDPlusID,coverage,Bin,granule
0,5000200005654.00,47.95,"(10, 20]",SWOT_L2_HR_PIXC_020_007_233R_20240820T213742_2...
1,5000200005654.00,47.91,"(10, 20]",SWOT_L2_HR_PIXC_008_520_076R_20240101T200254_2...
2,5000200005654.00,47.08,"(10, 20]",SWOT_L2_HR_PIXC_002_007_233R_20230811T080618_2...
3,5000200005654.00,46.55,"(10, 20]",SWOT_L2_HR_PIXC_022_007_233R_20241001T150756_2...
4,5000200005654.00,45.39,"(10, 20]",SWOT_L2_HR_PIXC_027_520_076R_20250201T061923_2...
...,...,...,...,...
346,10000900092908.00,33.16,"(0, 10]",SWOT_L2_HR_PIXC_028_298_082L_20250214T044413_2...
347,10000900092908.00,32.92,"(0, 10]",SWOT_L2_HR_PIXC_027_035_227L_20250114T225252_2...
348,10000900092908.00,32.69,"(0, 10]",SWOT_L2_HR_PIXC_002_298_082L_20230821T171210_2...
349,10000900092908.00,30.96,"(0, 10]",SWOT_L2_HR_PIXC_028_035_227L_20250204T193756_2...


In [134]:
good_ne.to_csv('good_ne.csv')

In [131]:
# seen[(seen['NHDPlusID'] == 10000900048960.00) & (seen['coverage'] > 30)]

### HUC2_15, Southwest

In [136]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '15'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [137]:
df_sw = read_output(data_path=data_path)

In [138]:
df_sw['granule'] = df_sw['file'].str.slice(63, -20)

In [139]:
df_sw['coverage'] = df_sw['coverage']*100

In [140]:
summary_sw = pd.DataFrame(df_sw.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [141]:
summary_sw['ten']  = df_sw[df_sw.coverage > 10].groupby('NHDPlusID').count()['coverage']
summary_sw['twenty']  = df_sw[df_sw.coverage > 20].groupby('NHDPlusID').count()['coverage']
summary_sw['thirty']  = df_sw[df_sw.coverage > 30].groupby('NHDPlusID').count()['coverage']
summary_sw['forty']  = df_sw[df_sw.coverage > 40].groupby('NHDPlusID').count()['coverage']
summary_sw['fifty']  = df_sw[df_sw.coverage > 50].groupby('NHDPlusID').count()['coverage']
summary_sw['sixty']  = df_sw[df_sw.coverage > 60].groupby('NHDPlusID').count()['coverage']
summary_sw['seventy']  = df_sw[df_sw.coverage > 70].groupby('NHDPlusID').count()['coverage']
summary_sw['eighty']  = df_sw[df_sw.coverage > 80].groupby('NHDPlusID').count()['coverage']
summary_sw['ninety']  = df_sw[df_sw.coverage > 90].groupby('NHDPlusID').count()['coverage']

In [143]:
summary_sw = pd.merge(left=summary_sw, right=df_sw.drop_duplicates(subset='NHDPlusID')[['NHDPlusID', 'Bin', 'granule']], on='NHDPlusID', how='left')

In [146]:
summary_sw

Unnamed: 0,NHDPlusID,observations,ten,twenty,thirty,forty,fifty,sixty,seventy,eighty,ninety,Bin,granule
0,40000100000018.00,46,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_002_496_098R_20230828T190121_2...
1,40000100000019.00,23,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_002_496_098R_20230828T190121_2...
2,40000100000021.00,47,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_002_483_211R_20230828T081126_2...
3,40000100000026.00,47,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_002_483_211R_20230828T081126_2...
4,40000100000033.00,4,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_003_177_210L_20230907T063334_2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
623213,40000800098769.00,50,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_001_427_215R_20230805T112559_2...
623214,40000800098770.00,50,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_001_427_215R_20230805T112559_2...
623215,40000800098771.00,50,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_001_427_215R_20230805T112559_2...
623216,40000800098772.00,12,,,,,,,,,,"(0, 10]",SWOT_L2_HR_PIXC_005_134_093R_20231017T105154_2...


In [145]:
# summary_sw[(summary_sw.thirty/summary_sw.observations > 0.75) & summary_sw.Bin.isin(narrow)].sort_values(by='observations', ascending=False)

In [147]:
passed_sw = summary_sw[(summary_sw.thirty/summary_sw.observations > 0.75) & summary_sw.Bin.isin(narrow)].sort_values(by='observations', ascending=False).reset_index().drop(columns='index')

In [149]:
passed_sw.to_csv('passed_sw.csv')

In [150]:
sw_nids = summary_sw[(summary_sw.thirty/summary_sw.observations > 0.75) & summary_sw.Bin.isin(narrow)].sort_values(by='observations', ascending=False).sample(12)['NHDPlusID'].to_list()

In [151]:
sw_nids

[40000300031638.0,
 40000300038034.0,
 40000300075622.0,
 40000600048401.0,
 40000400106243.0,
 40000600035936.0,
 40000300075623.0,
 40000300025412.0,
 40000300000677.0,
 40000600042938.0,
 40000300013045.0,
 40000600010826.0]

In [152]:
seen_sw = df_sw[df_sw['NHDPlusID'].isin(sw_nids)].drop(columns='file')

In [153]:
good_sw = seen_sw[seen_sw['coverage'] > 30].sort_values(by=['NHDPlusID', 'coverage'], ascending=[True, False]).reset_index().drop(columns='index')

In [154]:
good_sw

Unnamed: 0,NHDPlusID,coverage,Bin,granule
0,40000300000677.00,62.77,"(20, 30]",SWOT_L2_HR_PIXC_003_496_097R_20230918T154617_2...
1,40000300000677.00,61.95,"(20, 30]",SWOT_L2_HR_PIXC_026_496_097R_20250110T130304_2...
2,40000300000677.00,61.35,"(20, 30]",SWOT_L2_HR_PIXC_015_496_097R_20240526T004712_2...
3,40000300000677.00,60.81,"(20, 30]",SWOT_L2_HR_PIXC_008_483_212R_20231231T124205_2...
4,40000300000677.00,59.98,"(20, 30]",SWOT_L2_HR_PIXC_012_483_212R_20240323T234223_2...
...,...,...,...,...
588,40000600048401.00,33.06,"(40, 50]",SWOT_L2_HR_PIXC_022_455_214R_20241017T151302_2...
589,40000600048401.00,32.62,"(40, 50]",SWOT_L2_HR_PIXC_027_455_214R_20250129T225824_2...
590,40000600048401.00,31.99,"(40, 50]",SWOT_L2_HR_PIXC_007_455_214R_20231209T155648_2...
591,40000600048401.00,31.27,"(40, 50]",SWOT_L2_HR_PIXC_021_455_214R_20240926T182757_2...


In [155]:
good_sw.to_csv('good_sw.csv')

### Testing

In [140]:
# test = ddf[ddf['file'].str.contains('313_233R')].compute()

In [139]:
# test

In [141]:
# len(test.NHDPlusID.unique())

In [142]:
# summary = pd.DataFrame(test.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [143]:
# summary['ten']  = test[test.coverage > 0.1].groupby('NHDPlusID').count()['coverage']
# summary['twenty']  = test[test.coverage > 0.2].groupby('NHDPlusID').count()['coverage']
# summary['thirty']  = test[test.coverage > 0.3].groupby('NHDPlusID').count()['coverage']
# summary['forty']  = test[test.coverage > 0.4].groupby('NHDPlusID').count()['coverage']
# summary['fifty']  = test[test.coverage > 0.5].groupby('NHDPlusID').count()['coverage']
# summary['sixty']  = test[test.coverage > 0.6].groupby('NHDPlusID').count()['coverage']
# summary['seventy']  = test[test.coverage > 0.7].groupby('NHDPlusID').count()['coverage']
# summary['eighty']  = test[test.coverage > 0.8].groupby('NHDPlusID').count()['coverage']
# summary['ninety']  = test[test.coverage > 0.9].groupby('NHDPlusID').count()['coverage']

In [144]:
# summary = pd.merge(left=summary, right=test, on='NHDPlusID', how='left')

In [145]:
# summary[summary.forty/summary.observations > 0.75]

In [146]:
# ddf.groupby('NHDPlusID').count().compute()

In [147]:
# ddf.groupby('NHDPlusID').apply(lambda group: (group['coverage'] > 0.5).count()).compute()

In [13]:
# test = dd.read_parquet(files[0])

In [None]:
# len(test_min.NHDPlusID.unique())

In [34]:
# len(test.sort_values(by='NHDPlusID').compute().iloc[::10].NHDPlusID.unique())

In [7]:
# len(test_min.NHDPlusID.unique())

In [8]:
# len(test_min.sort_values(by='NHDPlusID')[::10].NHDPlusID.unique())

In [7]:
# ddf_ne = ddf_ne.drop(columns='NHDPlusID')

In [156]:
# ddf_ne['coverage'] = ddf_ne['coverage']*100

In [10]:
# fort = ddf_ne[ddf_ne['NHDPlusID'] == 10000900004534].compute()

In [11]:
# fort[fort['coverage'] > 0]