### Check that prepNHD worked and all files are present

In [3]:
import geopandas as gpd
import os
import pandas as pd
import time

In [1]:
def get_paths(filepath, hucs, codes):
    for i in range(len(codes)):
        # Get all HUC4 GBD paths for current HUC2 (exclude WBD)
        sub_paths = [fn for fn in os.listdir(os.path.join(filepath, 'HUC2_' + codes[i]))
                     if fn.startswith('NHD')]

        for j in sub_paths:
            # path = os.path.join(filepath, 'HUC2_' + codes[i],
            #                     j, j + '.gdb')
            huc = j
            hucs.append(huc)
    return hucs

In [4]:
# Define dtypes for lookup tables to preserve leading zeros
dtype_dic= {'HUC4': str, 'HUC2': str, 'toBasin': str, 'level': str}
# Read in HUC lookup table
lookup = pd.read_csv('/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/data/HUC4_lookup_no_great_lakes.csv',
                     dtype=dtype_dic)

In [5]:
datapath = '/nas/cee-water/cjgleason/craig/CONUS_ephemeral_data/'
preppath = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_data/NHD_prepped_segmented/'
codes_huc2 = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18']

In [6]:
# Get the original files
original = get_paths(filepath=datapath, hucs=[], codes=codes_huc2)
len(original)

211

In [15]:
# Get the prepped files
prepped = get_paths(filepath=preppath, hucs=[], codes=codes_huc2)
len(prepped)

202

In [17]:
# Clean the prepped paths to match original
# prepped = [x[:-16] for x in prepped]
prepped = [x[:-26] for x in prepped]
# prepped

In [20]:
# Get the missing filenames (0431 is fine, it's Canada)
# 0418, 0419, 0424, 0426, and 0428 are the Great Lakes.
missing = list(set(original) - set(prepped))
missing

['NHDPLUS_H_0428_HU4_GDB',
 'NHDPLUS_H_0419_HU4_GDB',
 'NHDPLUS_H_1710_HU4_GDB',
 'NHDPLUS_H_0424_HU4_GDB',
 'NHDPLUS_H_0512_HU4_GDB',
 'NHDPLUS_H_0418_HU4_GDB',
 'NHDPLUS_H_1802_HU4_GDB',
 'NHDPLUS_H_0426_HU4_GDB',
 'NHDPLUS_H_0431_HU4_GDB']

In [21]:
# Get the missing hucs
missingHUCS= [x[10:14] for x in missing]
missingHUCS

['0428', '0419', '1710', '0424', '0512', '0418', '1802', '0426', '0431']

In [29]:
lookup[lookup['HUC4'] == '0431']

Unnamed: 0,slurm_index,HUC4,HUC2,toBasin,level


In [None]:
# Find the indices of the missing HUCS for slurm
lookup.loc[lookup['HUC4'].isin(missingHUCS), 'slurm_index']

### Check to make sure the max width fits in the bins I set

In [1]:
import geopandas as gpd
import os
import pandas as pd

In [2]:
## Set-up
mdata_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/data/'
prep_path = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_data/NHD_prepped/'

In [3]:
# Define dtypes for lookup tables to preserve leading zeros
dtype_dic= {'HUC4': str, 'HUC2': str, 'toBasin': str, 'level': str}
# Read in HUC lookup table
lookup = pd.read_csv(os.path.join(mdata_path, 'HUC4_lookup_no_great_lakes.csv'), dtype=dtype_dic)

In [25]:
tf = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_data/NHD_prepped/HUC2_08/NHDPLUS_H_0809_HU4_GDB_prepped.gpkg'

In [26]:
test = gpd.read_file(filename=tf, engine='pyogrio')

In [27]:
test.WidthM.to_numpy().max()

np.float64(472.15431976542897)

In [None]:
# Get slurm job index
print('FILE\t\t\tMAX WIDTH')
# for i in range(36,100):
i = 142
# Get current HUC2 and HUC4 IDs
hu2 = 'HUC2_' + lookup.loc[i,'HUC4'][0:2]
hu4 = 'NHDPLUS_H_' + lookup.loc[i,'HUC4'] + '_HU4_GDB'

# Set data filepath
file_path = os.path.join(prep_path, hu2, hu4 + '_prepped.gpkg')

# Get max width value for given HUC4
maxw = gpd.read_file(filename=file_path, columns=['WidthM'],
                      engine='pyogrio').drop(columns='geometry').to_numpy().max()

print(hu4 + ':\t' + str(maxw))

# Remove vars (speeds up runtime considerably)
del hu2, hu4, file_path, maxw

In [None]:
i

In [None]:


# Get current HUC2 and HUC4 IDs
hu2 = 'HUC2_' + lookup.loc[i,'HUC4'][0:2]
hu4 = 'NHDPLUS_H_' + lookup.loc[i,'HUC4'] + '_HU4_GDB'

In [None]:
# Set data filepath
file_path = os.path.join(data_path, hu2, hu4 + '_prepped.gpkg')

In [None]:
# fiona.listlayers(file_path)

In [None]:
# Read in NHD flowlines
miss = gpd.read_file(filename=file_path, columns=['WidthM', 'Bin'], engine='pyogrio')

In [None]:
miss.columns

In [None]:
miss.WidthM.max()

In [None]:
hu4

In [None]:
miss.Bin.unique()

### See if HUC filtering matches a priori table

In [1]:
import os

import pandas as pd

In [2]:
## Min
directory = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_01_2025_02_04_min'

files_min = []
for file in os.listdir(directory):
    if not file.startswith('.'):
        files_min.append(file)

# files = files[1:]
print(f"There are {str(len(files_min))} original files in directory.")

There are 3400 original files in directory.


In [26]:
## Mean
directory = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_01_2025_02_04_mean'

files_mean = []
for file in os.listdir(directory):
    if not file.startswith('.'):
        files_mean.append(file)

# files = files[1:]
print(f"There are {str(len(files_mean))} original files in directory.")

There are 3387 original files in directory.


In [29]:
## Max
directory = '/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC_output/PIXC_v2_0_HUC2_01_2025_02_04_max'

files_max = []
for file in os.listdir(directory):
    if not file.startswith('.'):
        files_max.append(file)

# files = files[1:]
print(f"There are {str(len(files_max))} original files in directory.")

There are 3388 original files in directory.


In [32]:
files_common = list(set(files_min) & set(files_mean) & set(files_max))

In [34]:
len(files_common)

3366

### All errors

In [2]:
import os

In [5]:
def search_log(file_path, search_string):
    missing = []
    
    for file_name in os.listdir(file_path):
        if not file_name.startswith('.'):
            with open(os.path.join(file_path, file_name), 'r') as file:
                contains = False
                
                for line in file:
                    if search_string in line:
                            missing.append(file)
                        # contains = True
                    # if 'TIME LIMIT' in line:
                    #     print(file)

                # if contains == False:
                #     print(line)
    return missing

In [6]:
# Min
file_path ='/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/log_2025_03_02_min/'
missing = search_log(file_path, 'TIME LIMIT')

In [7]:
missing

[]

### Min/mean/max log files

In [2]:
def search_log(file_path, search_string):
    missing = []
    
    for file_name in os.listdir(file_path):
        if not file_name.startswith('.'):
            with open(os.path.join(file_path, file_name), 'r') as file:
                contains = False
                
                for line in file:
                    if search_string in line:
                        contains = True
                
                # print(file_name + ': ' + str(contains))
                
                if contains == True: #False:
                    missing.append(file_name)
    
    return missing

In [27]:
# Min
file_path ='/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/log_2025_02_12_min/'
missing = search_log(file_path, 'OOM')

In [28]:
# now 28939417
missing

['log_28620430_530.log',
 'log_28620430_729.log',
 'log_28620430_880.log',
 'log_28620430_1305.log',
 'log_28620430_1275.log',
 'log_28619350_2587.log',
 'log_28619350_2588.log',
 'log_28620430_1478.log',
 'log_28620430_1699.log',
 'log_28620430_1711.log',
 'log_28624693_3203.log',
 'log_28624693_3199.log',
 'log_28624693_3586.log',
 'log_28624693_3728.log']

In [26]:
[x[-8:-4] for x in missing]

['_530',
 '_729',
 '_880',
 '1305',
 '1275',
 '2587',
 '2588',
 '1478',
 '1699',
 '1711',
 '3203',
 '3199',
 '3586',
 '3728']

In [21]:
# Mean
file_path ='/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/log_2025_02_12_mean/'
missing = search_log(file_path, 'OOM')

In [22]:
# now 28938858
missing

['log_28918975_156.log',
 'log_28918975_151.log',
 'log_28918975_73.log',
 'log_28918975_119.log',
 'log_28918975_729.log',
 'log_28918975_837.log',
 'log_28918975_888.log',
 'log_28918975_1498.log',
 'log_28918975_1563.log',
 'log_28918975_1793.log',
 'log_28918975_1794.log',
 'log_28918975_1800.log',
 'log_28921985_2090.log',
 'log_28921985_2159.log',
 'log_28921985_2084.log',
 'log_28921985_2163.log',
 'log_28921985_2471.log',
 'log_28921985_2690.log',
 'log_28921985_2687.log',
 'log_28921985_3041.log',
 'log_28921985_3391.log',
 'log_28921985_3397.log',
 'log_28921985_3498.log',
 'log_28921985_3503.log',
 'log_28921985_3554.log',
 'log_28921985_3586.log',
 'log_28921985_3645.log']

In [23]:
[x[-8:-4] for x in missing if '28938858' in x]

[]

In [None]:
# Max
file_path ='/nas/cee-water/cjgleason/fiona/narrow_rivers_PIXC/log_2025_02_12_max/'
missing = search_log(file_path, 'OOM')

In [7]:
missing

['log_28930033_156.log',
 'log_28930033_1460.log',
 'log_28930033_1615.log',
 'log_28930033_1699.log',
 'log_28930033_1949.log',
 'log_28930033_1975.log',
 'log_28932140_2163.log',
 'log_28932140_2479.log',
 'log_28932140_3158.log',
 'log_28932140_3665.log']

In [13]:
[x[-8:-4] for x in missing]

['_156',
 '1460',
 '1615',
 '1699',
 '1949',
 '1975',
 '2163',
 '2479',
 '3158',
 '3665']