In [1]:
import numpy as np
import pandas as pd
import xarray as xr

import regionmask

from glob import glob
import os

import dask

In [2]:
# ------------------------------------------------------------------
# Filesystem locations used throughout this notebook
# ------------------------------------------------------------------
# Project-specific data root (WBM geo inputs and precalibration outputs live below it)
project_data_path = "/storage/group/pches/default/users/dcl5300/wbm_soilM_crop_uc_lafferty-etal-2024-tbd_DATA"
# Directory for log files
log_path = "/storage/home/dcl5300/work/current_projects/wbm_soilM_crop_uc_lafferty-etal-2024-tbd/code/logs"

# Shared public archives read by the processing functions
smap_path = "/storage/group/pches/default/public/SMAP/"
nldas_path = "/storage/group/pches/default/public/NLDAS/"

In [3]:
#######################
# Function definitions
#######################

# Subsetting function
def _subset_states(ds, list_of_states):
    """
    Subsets a netCDF file to a list of states using regionmask
    """
    if list_of_states == None:
        return ds
    # Subset
    subset_index = regionmask.defined_regions.natural_earth_v5_0_0.us_states_50.map_keys(list_of_states)
    subset_mask = regionmask.defined_regions.natural_earth_v5_0_0.us_states_50.mask(ds)
    ds_subset = ds.where(subset_mask.isin(subset_index), drop=True)
    # Return
    return ds_subset

# SMAP processing
def process_smap(subset_name, list_of_states):
    """
    Grabs SMAP outputs and stores them as one netCDF file (plus a numpy copy)
    after subsetting to list_of_states.

    Parameters
    ----------
    subset_name : str
        Name of the output sub-directory under ``WBM/precalibration``.
    list_of_states : list of str or None
        States to keep (see ``_subset_states``); ``None`` keeps the full grid.

    Writes ``SMAP_validation.nc`` and ``SMAP_validation.npy`` into
    ``{project_data_path}/WBM/precalibration/{subset_name}/SMAP``.
    Does nothing except print a message when the netCDF file already exists.

    Raises
    ------
    FileNotFoundError
        If no SMAP input file matches the expected pattern.
    """
    out_dir = f'{project_data_path}/WBM/precalibration/{subset_name}/SMAP'
    nc_path = f'{out_dir}/SMAP_validation.nc'

    if os.path.isfile(nc_path):
        print('SMAP already processed')
        return

    # glob() returns paths in filesystem-dependent order; sort so that the
    # concatenation along time is deterministic
    # (assumes file names sort chronologically -- TODO confirm)
    pattern = f'{smap_path}/processed_nldas_grid/SMAP_L4_SM_gph_all_nldas_*.nc'
    files = sorted(glob(pattern))
    if not files:
        raise FileNotFoundError(f'No SMAP files match {pattern}')

    # Read all
    ds_smap = xr.concat([_subset_states(xr.open_dataset(file)['sm_rootzone'], list_of_states) for file in files], dim='time')

    # 365 day calendar
    ds_smap = ds_smap.convert_calendar(calendar='noleap', dim='time')

    # Merge and store (and change units to kg/m3)
    ds_out = xr.Dataset({'soilMoist':1000*ds_smap})
    ds_out.attrs['units'] = 'kg/m3'
    ds_out.to_netcdf(nc_path)

    # Also store numpy array for quicker evaluations; axes are reversed
    # (presumably (time, lat, lon) -> (lon, lat, time) -- TODO confirm)
    npy_out = np.transpose(ds_out['soilMoist'].to_numpy(), (2,1,0))
    np.save(f'{out_dir}/SMAP_validation.npy', npy_out)

# NLDAS processing
def process_nldas(subset_name, list_of_states):
    """
    Grabs NLDAS outputs and stores them as one netCDF file per model (plus a
    numpy copy) after subsetting to list_of_states.

    Parameters
    ----------
    subset_name : str
        Name of the output sub-directory under ``WBM/precalibration``.
    list_of_states : list of str or None
        States to keep (see ``_subset_states``); ``None`` keeps the full grid.

    For each of VIC, NOAH and MOSAIC writes ``{model}_validation.nc`` and
    ``{model}_validation.npy`` into
    ``{project_data_path}/WBM/precalibration/{subset_name}/{model}``.
    A model whose netCDF file already exists is skipped.

    Raises
    ------
    FileNotFoundError
        If no daily input file is found for a model that needs processing.
    """
    # Soil-moisture variable name in each model's files
    nldas_dict = {'VIC':'SOILM0_100cm', 'NOAH':'SOILM', 'MOSAIC':'SOILM'}

    # Loop through each
    for model, var_id in nldas_dict.items():
        out_dir = f'{project_data_path}/WBM/precalibration/{subset_name}/{model}'
        nc_path = f'{out_dir}/{model}_validation.nc'
        if os.path.isfile(nc_path):
            continue

        # glob() returns paths in filesystem-dependent order; sort so that the
        # concatenation along time is deterministic
        # (assumes file names sort chronologically -- TODO confirm)
        pattern = f'{nldas_path}/{model}/daily/*.nc'
        files = sorted(glob(pattern))
        if not files:
            raise FileNotFoundError(f'No {model} files match {pattern}')

        # Read all
        ds_nldas = xr.concat([_subset_states(xr.open_dataset(file)[var_id], list_of_states) for file in files], dim='time')

        # 365 day calendar
        ds_nldas = ds_nldas.convert_calendar(calendar='noleap', dim='time')

        # Select correct depth: index 1 for MOSAIC/NOAH, index 0 for VIC
        depth_index = 1 if model in ['MOSAIC', 'NOAH'] else 0
        ds_nldas = ds_nldas.isel(depth=depth_index)

        # Merge and store
        ds_out = xr.Dataset({'soilMoist':ds_nldas})
        ds_out.attrs['units'] = 'kg/m3'
        ds_out.to_netcdf(nc_path)

        # Also store numpy array for quicker evaluations; axes are reversed
        # (presumably (time, lat, lon) -> (lon, lat, time) -- TODO confirm)
        npy_out = np.transpose(ds_out['soilMoist'].to_numpy(), (2,1,0))
        np.save(f'{out_dir}/{model}_validation.npy', npy_out)

# Forcing processing
def process_forcing(subset_name, list_of_states):
    """
    Grabs all forcing inputs and stores them as a single numpy npz file per
    validation product.
    SMAP and NLDAS handled separately since meteo forcing is different.

    Parameters
    ----------
    subset_name : str
        Name of the output sub-directory under ``WBM/precalibration``.
    list_of_states : list of str or None
        States to keep (see ``_subset_states``); ``None`` keeps the full grid.

    For each of MOSAIC, NOAH, VIC and SMAP writes ``inputs.npz`` into
    ``{project_data_path}/WBM/precalibration/{subset_name}/{obs}``; a product
    whose ``inputs.npz`` already exists is skipped. The initial soil moisture
    is read from ``{obs}_validation.nc`` in the same directory, so that file
    must already exist.

    Raises
    ------
    FileNotFoundError
        If no forcing file matches the expected pattern.
    """
    geo_dir = f'{project_data_path}/WBM/geo_inputs'

    for obs in ['MOSAIC', 'NOAH', 'VIC', 'SMAP']:
        obs_dir = f'{project_data_path}/WBM/precalibration/{subset_name}/{obs}'
        npz_path = f'{obs_dir}/inputs.npz'
        if os.path.isfile(npz_path):
            continue

        # Climate drivers
        if obs == "SMAP":
            pattern = f'{smap_path}/processed_nldas_grid/SMAP_L4_SM_gph_all_nldas_*.nc'
        else:
            pattern = f'{nldas_path}/forcing/daily/NLDAS_FORA0125_H.A*.nc'

        # glob() returns paths in filesystem-dependent order; sort so that the
        # concatenation along time is deterministic
        # (assumes file names sort chronologically -- TODO confirm)
        files = sorted(glob(pattern))
        if not files:
            raise FileNotFoundError(f'No forcing files match {pattern}')
        ds_forcing = xr.concat([_subset_states(xr.open_dataset(file), list_of_states) for file in files], dim='time')

        # 365 day calendar
        ds_forcing = ds_forcing.convert_calendar(calendar='noleap', dim='time')

        # Geophysical inputs
        ds_lai = _subset_states(xr.open_dataset(f'{geo_dir}/LAI_GLDAS_clima_NLDASgrid.nc'), list_of_states)
        if obs == 'VIC': # VIC does not provide awCap, wiltingp, so use NOAH
            obs_soilp = 'NOAH'
        else:
            obs_soilp = obs
        ds_awCap = _subset_states(xr.open_dataset(f'{geo_dir}/{obs_soilp}_awCap.nc'), list_of_states)
        ds_wiltingp = _subset_states(xr.open_dataset(f'{geo_dir}/{obs_soilp}_wiltingp.nc'), list_of_states)

        ds_clayfrac = _subset_states(xr.open_dataset(f'{geo_dir}/clayfrac_NLDASgrid.nc'), list_of_states)
        ds_sandfrac = _subset_states(xr.open_dataset(f'{geo_dir}/sandfrac_NLDASgrid.nc'), list_of_states)
        ds_siltfrac = _subset_states(xr.open_dataset(f'{geo_dir}/siltfrac_NLDASgrid.nc'), list_of_states)

        # Initial conditions: first time step of the stored validation product
        ds_init = _subset_states(xr.open_dataset(f'{obs_dir}/{obs}_validation.nc'), list_of_states).isel(time=0)

        # Numpy arrays in correct order
        lats = ds_lai.lat.to_numpy()
        lons = ds_lai.lon.to_numpy()

        # 3-D fields get their axes reversed via (2,1,0)
        # (presumably (time, lat, lon) -> (lon, lat, time) -- TODO confirm)
        if obs == "SMAP":
            # presumably K -> degC, and a per-second flux -> per-day total -- TODO confirm units
            tas = np.transpose(ds_forcing['temp_lowatmmodlay'].to_numpy() - 273.15, (2,1,0))
            prcp = np.transpose(ds_forcing['precipitation_total_surface_flux'].to_numpy() * 86400, (2,1,0))
        else:
            # presumably K -> degC; APCP is used without rescaling
            tas = np.transpose(ds_forcing['TMP'].to_numpy() - 273.15, (2,1,0))
            prcp = np.transpose(ds_forcing['APCP'].to_numpy(), (2,1,0))

        lai = np.transpose(ds_lai['LAI'].to_numpy(), (2,1,0))
        # np.transpose without axes reverses the axis order of these fields
        awCap = np.transpose(ds_awCap['awCap'].to_numpy())
        wiltingp = np.transpose(ds_wiltingp['wiltingp'].to_numpy())

        clayfrac = np.transpose(ds_clayfrac['clayfrac'].to_numpy() / 100) # percentage -> fraction
        sandfrac = np.transpose(ds_sandfrac['sandfrac'].to_numpy() / 100) # percentage -> fraction
        siltfrac = np.transpose(ds_siltfrac['siltfrac'].to_numpy() / 100) # percentage -> fraction

        soilMoist_init = np.transpose(ds_init['soilMoist'].to_numpy())

        # Store numpy for easy access
        np.savez(npz_path,
                 tas=tas, prcp=prcp,
                 lai=lai, awCap=awCap, wiltingp=wiltingp,
                 lats=lats, lons=lons,
                 clayfrac=clayfrac, sandfrac=sandfrac, siltfrac=siltfrac,
                 soilMoist_init=soilMoist_init)

### Entire domain

In [4]:
# Full domain: a None state list makes _subset_states return its input unchanged
list_of_states = None
subset_name = "CONUS"

In [5]:
# Make directories
output_path = f"{project_data_path}/WBM/precalibration/{subset_name}"

# Main (makedirs also creates missing parents and is a no-op if the directory exists,
# so there is no window between an existence check and the creation)
os.makedirs(output_path, exist_ok=True)

# Subs: one directory per validation product
for sub in ["SMAP", "VIC", "NOAH", "MOSAIC"]:
    os.makedirs(f"{output_path}/{sub}", exist_ok=True)

In [6]:
%%time
# SMAP: write the validation netCDF/npy for this subset (no-op if the netCDF already exists)
process_smap(subset_name=subset_name, list_of_states=list_of_states)

SMAP already processed
CPU times: user 694 µs, sys: 0 ns, total: 694 µs
Wall time: 775 µs


In [7]:
%%time
# NLDAS: write the validation netCDF/npy for VIC, NOAH and MOSAIC (models whose netCDF exists are skipped)
process_nldas(subset_name=subset_name, list_of_states=list_of_states)

CPU times: user 1min 1s, sys: 23.5 s, total: 1min 24s
Wall time: 2min 25s


In [8]:
%%time
# Forcing: write inputs.npz for MOSAIC, NOAH, VIC and SMAP (products whose npz exists are skipped)
process_forcing(subset_name=subset_name, list_of_states=list_of_states)

CPU times: user 3min 18s, sys: 1min 3s, total: 4min 22s
Wall time: 10min 30s


### Central US

In [4]:
# States that make up the central-US test subset
list_of_states = [
    "Illinois",
    "Iowa",
    "Wisconsin",
    "Minnesota",
    "North Dakota",
    "South Dakota",
    "Nebraska",
    "Kansas",
    "Missouri",
    "Indiana",
    "Ohio",
    "Michigan",
    "Kentucky",
]
subset_name = "centralUS_test"

In [5]:
# Make directories
output_path = f"{project_data_path}/WBM/precalibration/{subset_name}"

# Main (makedirs also creates missing parents and is a no-op if the directory exists,
# so there is no window between an existence check and the creation)
os.makedirs(output_path, exist_ok=True)

# Subs: one directory per validation product
for sub in ["SMAP", "VIC", "NOAH", "MOSAIC"]:
    os.makedirs(f"{output_path}/{sub}", exist_ok=True)

In [16]:
%%time
# SMAP: write the validation netCDF/npy for this subset (no-op if the netCDF already exists)
process_smap(subset_name=subset_name, list_of_states=list_of_states)

CPU times: user 1min 44s, sys: 6.74 s, total: 1min 51s
Wall time: 2min 51s


In [17]:
%%time
# NLDAS: write the validation netCDF/npy for VIC, NOAH and MOSAIC (models whose netCDF exists are skipped)
process_nldas(subset_name=subset_name, list_of_states=list_of_states)

CPU times: user 5min 54s, sys: 25.7 s, total: 6min 19s
Wall time: 9min 39s


In [6]:
%%time
# Forcing: write inputs.npz for MOSAIC, NOAH, VIC and SMAP (products whose npz exists are skipped)
process_forcing(subset_name=subset_name, list_of_states=list_of_states)

CPU times: user 7min 37s, sys: 26.1 s, total: 8min 3s
Wall time: 10min 42s
