In [1]:
import os 
from glob import glob
from functools import reduce 

import numpy as np

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

import xarray as xr
import pandas as pd
import dask

from SALib.analyze import delta

plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = "Arial"
plt.rcParams["font.size"] = 12
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["axes.linewidth"] = 1.50

from matplotlib.colors import LinearSegmentedColormap, ListedColormap

cm_data = np.loadtxt("./utils/colormaps/batlow.txt")[::-1]
sc_cmap = LinearSegmentedColormap.from_list("cmap", cm_data, N=10)

import cartopy.crs as ccrs
import cartopy.feature as cfeature

from utils.global_paths import project_data_path, project_code_path, loca_path
from utils.constants import obs_names

In [9]:
############
### Dask ###
############
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(
    # account="pches",
    account="open",
    cores=1,
    memory="30GiB",
    walltime="06:00:00"
)
cluster.scale(jobs=20)  # ask for jobs

from dask.distributed import Client
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.0.155:37407,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Preliminaries

In [2]:
# Info 
subset_name = 'eCONUS'

# Metrics
soil_metrics_raw = ['mean', '5dmin', '5dmax']
soil_metrics_change = ['mean-change', '5dmin-change', '5dmax-change']

# Soil labels for plots
soil_labels = {"mean-anom": "Annual Average \nSoil Moisture Anomaly", 
               "5dmin-anom": "Soil Moisture Anomaly \nDuring Driest 5 Days",
               "5dmax-anom": "Soil Moisture Anomaly \nDuring Wettest 5 Days",
               "mean-change": "Annual Average \nSoil Moisture Change", 
               "5dmin-change": "Soil Moisture Change \nDuring Driest 5 Days",
               "5dmax-change": "Soil Moisture Change \nDuring Wettest 5 Days",
               "mean": "Annual Average \nSoil Moisture", 
               "5dmin": "Soil Moisture \nDuring Driest 5 Days",
               "5dmax": "Soil Moisture \nDuring Wettest 5 Days"}

# Time slices to analyze
time_slices = [[2030, 2039], [2050,2059], [2080,2089]]

# SSPs
ssps = ['ssp245', 'ssp370']

In [3]:
# Models 
models = os.listdir(f"{loca_path}/")
models.remove('training_data')
models.remove('scripts')

loca_all = {}

# Loop through models
for model in models:
    loca_all[model] = {}
    # Loop through members
    members = os.listdir(f"{loca_path}/{model}/0p0625deg/")
    for member in members:
        # Append SSPs
        ssps = os.listdir(f"{loca_path}/{model}/0p0625deg/{member}/")
        loca_all[model][member] = ssps

# Matches website (https://loca.ucsd.edu/loca-version-2-for-north-america-ca-jan-2023/) as of Jan 2023
print(f"# models: {len(models)}")
print(f"# model/expts: {np.sum([len(np.unique([item for row in [loca_all[model][member] for member in loca_all[model].keys()] for item in row])) for model in models])}")
print(f"# model/expts/ens: {np.sum([len(loca_all[model][ssp]) for model in models for ssp in loca_all[model]])}")
print(f"# model/expts/ens (not including historical): {np.sum([len([ssp for ssp in loca_all[model][member] if ssp != 'historical']) for model in models for member in loca_all[model]])}")

# models: 27
# model/expts: 99
# model/expts/ens: 329
# model/expts/ens (not including historical): 221


In [4]:
# Read all soil moisture metrics
def read_all(subset_name, soil_metric, ssps):
    # For all
    ds_all = []

    # Loop through models
    for model in models:
        # Take first member only
        for member in list(loca_all[model].keys())[:1]:
            # Loop through SSPs
            for ssp in loca_all[model][member]:
                if ssp in ssps:
                    projection_id = f"{model}_{member}_{ssp}"
                    ds_proj = []
                    # Loop through obs
                    for obs_name in obs_names:
                        # Concat along loss metrics
                        ds = xr.open_mfdataset(f"{project_data_path}/projections/{subset_name}/metrics/{soil_metric}/{projection_id}_{obs_name}_*.nc",
                                                combine="nested", concat_dim = "loss_metric")
                        # Append
                        ds_proj.append(ds)
                    # Concat along obs
                    ds_proj = xr.concat(ds_proj, dim="obs_name")
                    ds_all.append(ds_proj)
                    
    # Concat along climate
    ds_all = xr.concat(ds_all, dim="projection_id")

    # Fix time dim
    ds_all['time'] = ds_all['time'].dt.year

    return ds_all

# Delta SA

## Anomaly/changes

### Functions

In [10]:
# Reads all .nc projections and stores as parquet file
def store_as_df(subset_name, soil_metric, time_slice):
    # Save path
    save_path = f'{project_data_path}/projections/{subset_name}/metrics_df/{soil_metric}_{time_slice[0]}-{time_slice[1]}'
    
    # Check if done
    if os.path.exists(save_path):
        print(f'{soil_metric} {time_slice} already done!')
    else:
        # Read all (all SSPs)
        ssps = ['ssp245', 'ssp370']
        ds_all = read_all(subset_name, soil_metric, ssps)

        # Select 
        vars_to_drop = ['projection_id', 'member', 'soil_id']
        with dask.config.set(**{'array.slicing.split_large_chunks': False}):
            df = ds_all.sel(time=slice(time_slice[0], time_slice[1])).drop_vars(vars_to_drop).to_dask_dataframe()

        # Store
        df.to_parquet(save_path, append=False, overwrite=True, compute=True, write_index=False)
        print(f'{soil_metric} {time_slice} done')

In [11]:
# Read specified lat/lon and perfrom delta SA
def get_delta(subset_name, soil_metric, time_slice, lat, lon, sa_factors):
    # Read
    sel = [("lat", "==", lat), ("lon", "==", lon)]
    file_path = f'{project_data_path}/projections/{subset_name}/metrics_df/{soil_metric}_{time_slice[0]}-{time_slice[1]}'
    df_loc = pd.read_parquet(file_path, filters=sel, engine="pyarrow")

    # If needed
    df_loc['soil_id'] = df_loc['obs_name'] + "_" + df_loc['loss_metric']

    # Shuffle for good measure
    df = df_loc.sample(frac=1)

    # Skip if all NaN
    if df.isnull().values.any():
        return None

    # Problem defn
    n_factors = len(sa_factors)
    problem = {
        'num_vars': n_factors,
        'names': sa_factors,
    }

    # Perform SA
    X = df[sa_factors].to_numpy()
    Y = df[soil_metric].to_numpy()
    
    Si = delta.analyze(problem, X, Y, num_resamples=2).to_df()
    Si['lat'] = lat
    Si['lon'] = lon

    Si = Si.reset_index().pivot(index=['lat','lon'], columns='index', values='delta')
    
    return Si

In [12]:
# Perform gridpoint-level delta SA on saved parquet file
def perform_delta_sa(subset_name, soil_metric, time_slice, sa_factors, save_name):
    # Check if done
    save_path = f'{project_data_path}/projections/{subset_name}/sa_results/{soil_metric}_{time_slice[0]}-{time_slice[1]}_delta-sa_{save_name}.nc'
    if os.path.exists(save_path):
        print(f'{soil_metric} {time_slice} already done!')
        return None
        
    # Get non-NaN locs
    locs = np.load(f"{project_code_path}/code/utils/grids/{subset_name}_non_nans.npy", allow_pickle=True)

    # Loop over all with dask.delayed
    delayed = []

    for loc in locs:
        lat, lon = loc
        df_tmp = dask.delayed(get_delta)(subset_name, soil_metric, time_slice, lat, lon, sa_factors)
        delayed.append(df_tmp)
    
    # Compute
    delayed_out = dask.compute(*delayed)

    # Pandas dataframe 
    df = pd.concat(delayed_out)
    df = df.rename_axis(None, axis=1)

    # Create a complete grid of lat, lon values
    all_lats = np.load(f"{project_code_path}/code/utils/grids/{subset_name}_lat.npy")
    all_lons = np.load(f"{project_code_path}/code/utils/grids/{subset_name}_lon.npy")

    lon, lat = np.meshgrid(all_lons, all_lats)
    lon_lat_index = pd.MultiIndex.from_arrays([lat.flatten(), lon.flatten()], names=['lat', 'lon'])

    # Reindex to include all lat, lon combinations, filling missing ones with NaN
    df_reindexed = df.reindex(lon_lat_index)

    # Convert the reindexed DataFrame to an xarray Dataset
    ds = xr.Dataset.from_dataframe(df_reindexed)

    # Store
    ds.to_netcdf(save_path)
    print(f'{soil_metric} {time_slice} done')

In [13]:
# Store non-nan coords for subset if not done already
file_path = f"{project_code_path}/code/utils/grids/{subset_name}_non_nans.npy"

if not os.path.exists(file_path):
    # Read all 
    ds_all = read_all(subset_name, soil_metrics[0], ssps)

    # Get non-nans
    ds_all_stacked = ds_all.stack(loc=['lat','lon']).isel(time=10, projection_id=10, obs_name=0, loss_metric=0)[soil_metrics[0]]
    locs = ds_all_stacked[ds_all_stacked.notnull().compute()]['loc'].to_numpy()

    # Save
    np.save(file_path, locs)

### Calculations

In [20]:
# Store as df
for soil_metric in soil_metrics_change:
    for time_slice in time_slices:
        store_as_df(subset_name, soil_metric, time_slice)

mean-change [2030, 2039] done
mean-change [2050, 2059] done
mean-change [2080, 2089] done
5dmin-change [2030, 2039] done
5dmin-change [2050, 2059] done
5dmin-change [2080, 2089] done
5dmax-change [2030, 2039] done
5dmax-change [2050, 2059] done
5dmax-change [2080, 2089] done


In [16]:
%%time
# Get non-NaN locs
locs = np.load(f"{project_code_path}/code/utils/grids/{subset_name}_non_nans.npy", allow_pickle=True)

for loc in locs[:1]:
    lat, lon = loc
    df_tmp = get_delta(subset_name, soil_metric, time_slice, lat, lon, sa_factors)

ValueError: zero-size array to reduction operation minimum which has no identity

In [None]:
# Read
sel = [("lat", "==", lat), ("lon", "==", lon)]
file_path = f'{project_data_path}/projections/{subset_name}/metrics_df/{soil_metric}_{time_slice[0]}-{time_slice[1]}'
df_loc = pd.read_parquet(file_path, filters=sel, engine="pyarrow")

In [None]:
df_loc

In [15]:
soil_metrics_change

['mean-change', '5dmin-change', '5dmax-change']

In [14]:
%%time
# Perform SA
for soil_metric in soil_metrics_change:
    for time_slice in time_slices:
        # All
        sa_factors = ['ssp', 'model', 'time', 'obs_name', 'loss_metric']
        save_name = 'all'
        perform_delta_sa(subset_name, soil_metric, time_slice, sa_factors, save_name)
    
        # Soil grouped
        sa_factors = ['ssp', 'model', 'time', 'soil_id']
        save_name = 'soil_grouped'
        perform_delta_sa(subset_name, soil_metric, time_slice, sa_factors, save_name)

ValueError: zero-size array to reduction operation minimum which has no identity

## Trends

### Calculation

In [22]:
# Linear regression function
def linear_regression(X, y):
    if np.isfinite(y).all() == False:
        return np.array([np.nan, np.nan])
    else:
        return np.polyfit(X, y, 1)

In [23]:
%%time
# SSPs
ssps = ['ssp245', 'ssp370']

# Loop through metrics
for soil_metric in ['mean', '5dmin', '5dmax']:
    # Check if done
    save_path = f'{project_data_path}/projections/eCONUS/metrics/{soil_metric}_trend.nc'
    if not os.path.exists(save_path):
        # Real all
        ds = read_all(subset_name, soil_metric, ssps)

        # Linear trend
        result = xr.apply_ufunc(
            linear_regression,
            ds['time'],  # input x data
            ds[soil_metric],  # input y data
            input_core_dims=[['time'], ['time']],  # specify core dimensions for inputs
            output_core_dims=[['coef']],  # specify core dimensions for output
            vectorize=True,  # apply function element-wise
            dask='parallelized',  # enable parallelization with dask
            output_dtypes=[float],  # specify output data type
            dask_gufunc_kwargs={"output_sizes": {"coef": 2}},
        ).compute()
    
        # Store
        ds_result = xr.Dataset({'result': result})
        ds_result['coef'] = ['trend', 'intcp']
        ds_result.to_netcdf(save_path)
    else:
        print(f"{soil_metric} already done!")

CPU times: user 2min 45s, sys: 32 s, total: 3min 17s
Wall time: 24min 9s


### SA

In [24]:
# Read specified lat/lon and perfrom delta SA
def get_delta(ds, lat, lon, sa_factors):
    # Select
    df_loc = ds.sel(coef='trend').sel(lat=lat, lon=lon).to_dataframe().reset_index()

    # If needed
    df_loc['soil_id'] = df_loc['obs_name'] + "_" + df_loc['loss_metric']

    # Shuffle for good measure
    df = df_loc.sample(frac=1)

    # Skip if all NaN
    if df.isnull().values.any():
        return None

    # Problem defn
    n_factors = len(sa_factors)
    problem = {
        'num_vars': n_factors,
        'names': sa_factors,
    }

    # Perform SA
    X = df[sa_factors].to_numpy()
    Y = df['result'].to_numpy()
    
    Si = delta.analyze(problem, X, Y, num_resamples=2).to_df()
    Si['lat'] = lat
    Si['lon'] = lon

    Si = Si.reset_index().pivot(index=['lat','lon'], columns='index', values='delta')
    
    return Si

In [25]:
%%time

# Choose SA grouping
# sa_factors = ['soil_id', 'ssp', 'model']
# save_name = 'soil_grouped'

sa_factors = ['ssp', 'model', 'obs_name', 'loss_metric']
save_name = 'all'

# Loop through metrics
for soil_metric in ['mean', '5dmin', '5dmax']:
    save_path = f'{project_data_path}/projections/{subset_name}/sa_results/{soil_metric}_trends_delta-sa_{save_name}.nc'
    if os.path.exists(save_path):
        print(f'{soil_metric} already done!')

    else:
        # Read trends
        ds = xr.open_dataset(f'{project_data_path}/projections/{subset_name}/metrics/{soil_metric}_trend.nc')
    
        # Get non-NaN locs
        locs = np.load(f"{project_code_path}/code/utils/grids/{subset_name}_non_nans.npy", allow_pickle=True)
    
        # Loop over all with dask.delayed
        delayed = []
        for loc in locs:
            lat, lon = loc
            df_tmp = dask.delayed(get_delta)(ds, lat, lon, sa_factors)
            delayed.append(df_tmp)
    
        # Compute
        delayed_out = dask.compute(*delayed)
    
        # Pandas dataframe 
        df = pd.concat(delayed_out)
        df = df.rename_axis(None, axis=1)
        
        # Create a complete grid of lat, lon values
        all_lats = np.load(f"{project_code_path}/code/utils/grids/{subset_name}_lat.npy")
        all_lons = np.load(f"{project_code_path}/code/utils/grids/{subset_name}_lon.npy")
        
        lon, lat = np.meshgrid(all_lons, all_lats)
        lon_lat_index = pd.MultiIndex.from_arrays([lat.flatten(), lon.flatten()], names=['lat', 'lon'])
        
        # Reindex to include all lat, lon combinations, filling missing ones with NaN
        df_reindexed = df.reindex(lon_lat_index)
        
        # Convert the reindexed DataFrame to an xarray Dataset
        ds_out = xr.Dataset.from_dataframe(df_reindexed)
    
        # Store
        ds_out.to_netcdf(save_path)

CPU times: user 5min 7s, sys: 9.1 s, total: 5min 16s
Wall time: 26min 15s
