In [2]:
import os
import time

import dask.dataframe as dd
import matplotlib.cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [119]:
def read_output(data_path):

    files = []
    for file in os.listdir(data_path):
        if not file.startswith('.'):
            if 'min' in file:
                files.append(os.path.join(data_path, file))
                
    # Make df dataframe
    d = []

    for file in files:
        data = pd.read_parquet(file)
        data = data.sort_values(by='NHDPlusID').iloc[::10]
        data['file'] = file
        d.append(data)
        
    df = pd.concat(d, axis=0, ignore_index=True)
    return df

In [9]:
# USE THIS ONCE I RERUN AND WRITE OUT REACHES NOT NODES FOR MIN

# # Get list of files    
# files = []
# for file in os.listdir(data_path):
#     if not file.startswith('.'):
#         if 'min' in file:
#             files.append(os.path.join(data_path, file))

# ddf_ne = dd.read_parquet(files)

## For plotting

In [2]:
def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    new_cmap = colors.LinearSegmentedColormap.from_list(
        'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
        cmap(np.linspace(minval, maxval, n)))
    return new_cmap

In [3]:
cmap = plt.get_cmap('cubehelix', 5)
new_cmap = truncate_colormap(cmap, 0.2, 0.75)

In [4]:
color = []

for i in np.linspace(0, 1, 3):
    color.append(colors.rgb2hex(new_cmap(i)))

In [5]:
palette = sns.color_palette(color)

In [6]:
# Order of all possible bins for plotting
order = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]',
         '(50, 60]', '(60, 70]', '(70, 80]', '(80, 90]', '(90, 100]',
         '(100, 150]', '(150, 200]', '(200, 500]']

In [7]:
# save_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/figures_for_2025_03_07/'

## Minimum Coverage

In [149]:
narrow = ['(0, 10]', '(10, 20]', '(20, 30]', '(30, 40]', '(40, 50]']

### HUC2_01, Northeast

In [3]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '01'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [120]:
df_ne = read_output(data_path=data_path)

In [158]:
df_ne['coverage'] = df_ne['coverage']*100

In [204]:
summary = pd.DataFrame(df_ne.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [205]:
summary['ten']  = df_ne[df_ne.coverage > 10].groupby('NHDPlusID').count()['coverage']
summary['twenty']  = df_ne[df_ne.coverage > 20].groupby('NHDPlusID').count()['coverage']
summary['thirty']  = df_ne[df_ne.coverage > 30].groupby('NHDPlusID').count()['coverage']
summary['forty']  = df_ne[df_ne.coverage > 40].groupby('NHDPlusID').count()['coverage']
summary['fifty']  = df_ne[df_ne.coverage > 50].groupby('NHDPlusID').count()['coverage']
summary['sixty']  = df_ne[df_ne.coverage > 60].groupby('NHDPlusID').count()['coverage']
summary['seventy']  = df_ne[df_ne.coverage > 70].groupby('NHDPlusID').count()['coverage']
summary['eighty']  = df_ne[df_ne.coverage > 80].groupby('NHDPlusID').count()['coverage']
summary['ninety']  = df_ne[df_ne.coverage > 90].groupby('NHDPlusID').count()['coverage']

In [206]:
summary

Unnamed: 0_level_0,observations,ten,twenty,thirty,forty,fifty,sixty,seventy,eighty,ninety
NHDPlusID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5.000100e+12,22,3.0,1.0,,,,,,,
5.000100e+12,23,1.0,,,,,,,,
5.000100e+12,50,,,,,,,,,
5.000100e+12,90,5.0,1.0,,,,,,,
5.000100e+12,26,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
1.000090e+13,69,,,,,,,,,
1.000090e+13,69,,,,,,,,,
1.000090e+13,69,,,,,,,,,
1.000090e+13,69,,,,,,,,,


In [216]:
summary = pd.merge(left=summary, right=df_ne.drop_duplicates(subset='NHDPlusID')[['NHDPlusID', 'Bin']], on='NHDPlusID', how='left')

In [217]:
summary

Unnamed: 0,NHDPlusID,observations,ten,twenty,thirty,forty,fifty,sixty,seventy,eighty,ninety,Bin
0,5.000100e+12,22,3.0,1.0,,,,,,,,"(10, 20]"
1,5.000100e+12,23,1.0,,,,,,,,,"(0, 10]"
2,5.000100e+12,50,,,,,,,,,,"(10, 20]"
3,5.000100e+12,90,5.0,1.0,,,,,,,,"(0, 10]"
4,5.000100e+12,26,,,,,,,,,,"(0, 10]"
...,...,...,...,...,...,...,...,...,...,...,...,...
272738,1.000090e+13,69,,,,,,,,,,"(0, 10]"
272739,1.000090e+13,69,,,,,,,,,,"(0, 10]"
272740,1.000090e+13,69,,,,,,,,,,"(0, 10]"
272741,1.000090e+13,69,,,,,,,,,,"(0, 10]"


In [252]:
summary[(summary.sixty/summary.observations > 0.5) & summary.Bin.isin(narrow)].sort_values(by='observations', ascending=False)

Unnamed: 0,NHDPlusID,observations,ten,twenty,thirty,forty,fifty,sixty,seventy,eighty,ninety,Bin


### HUC2_15, Southwest

In [169]:
## Setup
bwidth = 'mean'
bbin = 'Bin'
huc2 = '15'
data_path = '../narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_' + huc2 + '_2025_03_06_' + bwidth

In [171]:
df_sw = read_output(data_path=data_path)

In [172]:
df_sw['coverage'] = df_sw['coverage']*100

In [229]:
summary_sw = pd.DataFrame(df_sw.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [230]:
summary_sw

Unnamed: 0_level_0,observations
NHDPlusID,Unnamed: 1_level_1
4.000010e+13,46
4.000010e+13,23
4.000010e+13,47
4.000010e+13,47
4.000010e+13,4
...,...
4.000080e+13,50
4.000080e+13,50
4.000080e+13,50
4.000080e+13,12


In [231]:
summary_sw['ten']  = df_sw[df_sw.coverage > 10].groupby('NHDPlusID').count()['coverage']
summary_sw['twenty']  = df_sw[df_sw.coverage > 20].groupby('NHDPlusID').count()['coverage']
summary_sw['thirty']  = df_sw[df_sw.coverage > 30].groupby('NHDPlusID').count()['coverage']
summary_sw['forty']  = df_sw[df_sw.coverage > 40].groupby('NHDPlusID').count()['coverage']
summary_sw['fifty']  = df_sw[df_sw.coverage > 50].groupby('NHDPlusID').count()['coverage']
summary_sw['sixty']  = df_sw[df_sw.coverage > 60].groupby('NHDPlusID').count()['coverage']
summary_sw['seventy']  = df_sw[df_sw.coverage > 70].groupby('NHDPlusID').count()['coverage']
summary_sw['eighty']  = df_sw[df_sw.coverage > 80].groupby('NHDPlusID').count()['coverage']
summary_sw['ninety']  = df_sw[df_sw.coverage > 90].groupby('NHDPlusID').count()['coverage']

In [232]:
summary_sw = pd.merge(left=summary_sw, right=df_sw.drop_duplicates(subset='NHDPlusID')[['NHDPlusID', 'Bin']], on='NHDPlusID', how='left')

In [233]:
summary_sw

Unnamed: 0,NHDPlusID,observations,ten,twenty,thirty,forty,fifty,sixty,seventy,eighty,ninety,Bin
0,4.000010e+13,46,,,,,,,,,,"(0, 10]"
1,4.000010e+13,23,,,,,,,,,,"(0, 10]"
2,4.000010e+13,47,,,,,,,,,,"(0, 10]"
3,4.000010e+13,47,,,,,,,,,,"(0, 10]"
4,4.000010e+13,4,,,,,,,,,,"(0, 10]"
...,...,...,...,...,...,...,...,...,...,...,...,...
623213,4.000080e+13,50,,,,,,,,,,"(0, 10]"
623214,4.000080e+13,50,,,,,,,,,,"(0, 10]"
623215,4.000080e+13,50,,,,,,,,,,"(0, 10]"
623216,4.000080e+13,12,,,,,,,,,,"(0, 10]"


In [254]:
summary_sw[(summary_sw.seventy/summary_sw.observations > 0.5) & summary_sw.Bin.isin(narrow)].sort_values(by='observations', ascending=False).shape

(1, 12)

### Testing

In [140]:
# test = ddf[ddf['file'].str.contains('313_233R')].compute()

In [139]:
# test

In [141]:
# len(test.NHDPlusID.unique())

In [142]:
# summary = pd.DataFrame(test.groupby('NHDPlusID').count()['coverage']).rename(columns={'coverage': 'observations'})

In [143]:
# summary['ten']  = test[test.coverage > 0.1].groupby('NHDPlusID').count()['coverage']
# summary['twenty']  = test[test.coverage > 0.2].groupby('NHDPlusID').count()['coverage']
# summary['thirty']  = test[test.coverage > 0.3].groupby('NHDPlusID').count()['coverage']
# summary['forty']  = test[test.coverage > 0.4].groupby('NHDPlusID').count()['coverage']
# summary['fifty']  = test[test.coverage > 0.5].groupby('NHDPlusID').count()['coverage']
# summary['sixty']  = test[test.coverage > 0.6].groupby('NHDPlusID').count()['coverage']
# summary['seventy']  = test[test.coverage > 0.7].groupby('NHDPlusID').count()['coverage']
# summary['eighty']  = test[test.coverage > 0.8].groupby('NHDPlusID').count()['coverage']
# summary['ninety']  = test[test.coverage > 0.9].groupby('NHDPlusID').count()['coverage']

In [144]:
# summary = pd.merge(left=summary, right=test, on='NHDPlusID', how='left')

In [145]:
# summary[summary.forty/summary.observations > 0.75]

In [146]:
# ddf.groupby('NHDPlusID').count().compute()

In [147]:
# ddf.groupby('NHDPlusID').apply(lambda group: (group['coverage'] > 0.5).count()).compute()

In [13]:
# test = dd.read_parquet(files[0])

In [None]:
# len(test_min.NHDPlusID.unique())

In [34]:
# len(test.sort_values(by='NHDPlusID').compute().iloc[::10].NHDPlusID.unique())

In [7]:
# len(test_min.NHDPlusID.unique())

In [8]:
# len(test_min.sort_values(by='NHDPlusID')[::10].NHDPlusID.unique())

In [7]:
# ddf_ne = ddf_ne.drop(columns='NHDPlusID')

In [156]:
# ddf_ne['coverage'] = ddf_ne['coverage']*100

In [10]:
# fort = ddf_ne[ddf_ne['NHDPlusID'] == 10000900004534].compute()

In [11]:
# fort[fort['coverage'] > 0]