In [1]:
import os
import requests
import xarray as xr
import rioxarray as rxr
import geopandas as gpd
import numpy as np
import tempfile
from datetime import datetime, timedelta

In [2]:
def download_stageiv_data(date, region='conus', timeout=60):
    """
    Download Stage IV precipitation data for a given date and region.

    Parameters
    ----------
    date : datetime.datetime or datetime.date
        Day to download; only its ``strftime`` method is used.
    region : str, optional
        Region tag embedded in the remote file name (default ``'conus'``).
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        server cannot hang a long multi-day download loop.

    Returns
    -------
    str or None
        Path to a temporary ``.nc`` file holding the download, or ``None``
        when the server answers with anything other than HTTP 200.  The
        caller is responsible for deleting the returned file.

    Raises
    ------
    Exception
        Errors raised by ``requests`` or while writing the file propagate
        to the caller; the temporary file is removed first.
    """
    base_url = "https://water.noaa.gov/resources/downloads/precip/stageIV/"
    date_str = date.strftime("%Y/%m/%d")
    file_name = f"nws_precip_1day_{date.strftime('%Y%m%d')}_{region}.nc"
    url = f"{base_url}{date_str}/{file_name}"

    # Only reserve a unique path here; the file is reopened for writing below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".nc") as temp_file:
        temp_path = temp_file.name

    downloaded = False
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code == 200:
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                downloaded = True
    finally:
        # Never leave an empty or partial file behind, whether the server
        # returned an error status or an exception interrupted the transfer.
        if not downloaded:
            os.unlink(temp_path)

    return temp_path if downloaded else None

def clip_to_shapefile(nc_file, shapefile):
    """
    Clip the downloaded NetCDF file to the bounding box of a shapefile.

    The dataset is tagged with a hard-coded polar-stereographic PROJ string,
    reprojected to the shapefile's CRS, and then cropped to the shapefile's
    total bounds (a rectangular box, not the polygon outline).

    Parameters
    ----------
    nc_file : str or None
        Path to the NetCDF file to clip.  ``None`` short-circuits to ``None``.
    shapefile : str
        Path to a vector file readable by ``geopandas.read_file``.

    Returns
    -------
    xarray.Dataset or None
        The clipped dataset, fully loaded into memory so that it stays usable
        after the source file is closed and deleted; ``None`` if ``nc_file``
        is ``None``.
    """
    # Nothing to clip: bail out before paying for the shapefile read.
    if nc_file is None:
        return None

    gdf = gpd.read_file(shapefile)
    minx, miny, maxx, maxy = gdf.geometry.total_bounds

    # Close the file handle deterministically instead of leaking one per call.
    with xr.open_dataset(nc_file) as raw:
        # add crs
        projected = raw.rio.write_crs("+proj=stere +lat_0=90 +lat_ts=60 +lon_0=-105 +x_0=0 +y_0=0 +a=6371200 +b=6371200 +units=m +no_defs")
        projected = projected.rio.reproject(gdf.crs)
        clipped = projected.rio.clip_box(minx=minx, miny=miny,
                                         maxx=maxx, maxy=maxy, crs=gdf.crs)
        # Pull any still-lazy variables into memory before the file closes.
        return clipped.load()

def build_annual_dataset(water_year, shapefile, region='conus', test=False):
    """
    Download and process daily Stage IV precipitation data for an entire water year (Oct 1 - Sep 30).

    Parameters
    ----------
    water_year : int
        Water year to build; it runs from Oct 1 of ``water_year - 1`` through
        Sep 30 of ``water_year``.
    shapefile : str
        Path to the shapefile used for clipping and for the output CRS.
    region : str, optional
        Region tag forwarded to ``download_stageiv_data``.
    test : bool, optional
        If True, only Jan 1 - Jan 5 of ``water_year`` is processed.

    Returns
    -------
    xarray.Dataset
        Daily datasets concatenated along ``time``, reprojected to the
        shapefile CRS, with ``observation``, ``normal`` and
        ``departure_from_normal`` multiplied by 25.4 and tagged ``units='mm'``.

    Raises
    ------
    ValueError
        If not a single day could be downloaded and clipped.

    Notes
    -----
    Days with no file on the server, or whose processing fails, are reported
    and left out of the result; they are NOT filled with NaN.
    NOTE(review): if a gap-free daily axis is required, reindex the result
    afterwards - how ``time`` is encoded in the source files is not visible here.
    """
    if test:
        start_date = datetime(water_year, 1, 1)
        end_date = datetime(water_year, 1, 5)
    else:
        start_date = datetime(water_year - 1, 10, 1)  # Water year starts on Oct 1 of previous year
        end_date = datetime(water_year, 9, 30)  # Ends on Sep 30 of current year

    # Read the shapefile once up front: only its CRS is needed here, and a bad
    # path should fail immediately rather than once per day.
    gdf = gpd.read_file(shapefile)

    datasets = []
    date = start_date
    while date <= end_date:
        day_label = date.strftime('%Y-%m-%d')
        nc_file = None
        try:
            nc_file = download_stageiv_data(date, region)
            clipped_ds = clip_to_shapefile(nc_file, shapefile)
            if clipped_ds is None:
                # No file for this day (non-200 response from the server).
                print(f"Skipping {day_label}: no Stage IV file available")
            else:
                datasets.append(clipped_ds)
        except Exception as e:
            print(f"Skipping {day_label}: {e}")
        finally:
            # Clear temporary file from disk, even when clipping failed.
            if nc_file:
                try:
                    os.unlink(nc_file)
                except OSError as e:
                    print(f"Could not remove temporary file {nc_file}: {e}")

        date += timedelta(days=1)

    if not datasets:
        raise ValueError(
            f"No Stage IV data could be retrieved between "
            f"{start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}"
        )

    # Post-processing runs once, after every day has been collected.
    # concatenate the datasets
    ds = xr.concat(datasets, dim='time')
    # convert to crs of shapefile
    ds = ds.rio.reproject(gdf.crs)
    # convert from inches to mm for observation, normal, and departure from normal,
    # and change the units attribute to match
    for name in ('observation', 'normal', 'departure_from_normal'):
        ds[name] = ds[name] * 25.4
        ds[name].attrs['units'] = 'mm'

    return ds

def test_build_annual_dataset(
    water_year=2023,
    shapefile="/storage/dlhogan/summa_modeling_data/domain_EastRiver/shapefiles/catchment/EastRiver.shp",
):
    """
    Test function to check if build_annual_dataset works correctly on a small subset of data.

    Parameters
    ----------
    water_year : int, optional
        Water year passed to ``build_annual_dataset``; with ``test=True`` only
        Jan 1 - Jan 5 of that year is processed.
    shapefile : str, optional
        Catchment shapefile used for clipping.  Defaults to the East River
        catchment so that calling the function with no arguments behaves as
        before.

    Returns
    -------
    xarray.Dataset
        The dataset produced by the test run, for interactive inspection.

    Raises
    ------
    AssertionError
        If the result is not an ``xarray.Dataset`` or has no ``time`` dimension.
    """
    test_ds = build_annual_dataset(water_year, shapefile, test=True)

    assert isinstance(test_ds, xr.Dataset), "Output is not an xarray Dataset"
    assert 'time' in test_ds.dims, "Time dimension missing in output dataset"
    print("Test passed: build_annual_dataset produces valid output.")
    return test_ds


In [3]:
# Smoke-test the download/clip/concat pipeline on the short test window
# (Jan 1 - Jan 5 of the test water year) and keep the result for inspection.
test_ds = test_build_annual_dataset()

Test passed: build_annual_dataset produces valid output.


In [8]:
# Build one clipped Stage IV file per water year, skipping years already on disk.
wys = [2022, 2023]
basin = 'TuolumneRiver'

# These do not depend on the water year, so define them once outside the loop
# (this also keeps `basin` defined for later cells even if `wys` is empty).
shapefile = f"/storage/dlhogan/summa_modeling_data/domain_{basin}/shapefiles/catchment/{basin}.shp"
stageiv_dir = f"/storage/dlhogan/summa_modeling_data/domain_{basin}/forcing/1_raw_data/stageIV"

# Make sure the output directory exists so a full-year download is not lost
# to a failing to_netcdf call at the very end.
os.makedirs(stageiv_dir, exist_ok=True)

for wy in wys:
    # Single source of truth for the path used by both the check and the write.
    out_path = f"{stageiv_dir}/stageiv_{wy}_{basin}.nc"
    if os.path.exists(out_path):
        print(f"Stage IV data for water year {wy} already exists.")
        continue

    ds = build_annual_dataset(wy, shapefile)
    ds.to_netcdf(out_path)
    print(f"Saved stage IV data for water year {wy}")

Stage IV data for water year 2022 already exists.
Saved stage IV data for water year 2023


In [15]:
# Merge the per-water-year files into a single dataset (relies on `basin` and
# `wys` defined in the per-year download cell above).
stageiv_dir = f"/storage/dlhogan/summa_modeling_data/domain_{basin}/forcing/1_raw_data/stageIV"
merged_path = f"{stageiv_dir}/stageiv_{basin}_{wys[0]}_{wys[1]}.nc"

if os.path.exists(merged_path):
    print("Merged Stage IV data already exists.")
else:
    # Open only the per-year files for the requested water years.  A "*.nc"
    # glob would also pick up previously merged files and unrelated years
    # that live in the same directory, duplicating data in the merge.
    yearly_paths = [f"{stageiv_dir}/stageiv_{wy}_{basin}.nc" for wy in wys]

    # merge the datasets
    ds = xr.open_mfdataset(yearly_paths)

    # save the merged dataset
    ds.to_netcdf(merged_path)