In [1]:
import os
from glob import glob

import dask
import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe

from utils import city_list

## Preliminaries

In [2]:
# ------------------------------------------------------------------
# Paths -- edit these when reproducing on another system
# ------------------------------------------------------------------

# Root directory under which project data (regridded metrics, summaries) is written
project_data_path = "/storage/group/pches/default/users/dcl5300/conus_comparison_lafferty-etal-2024/"
# Root directory of the project code (used to locate code/utils files)
project_code_path = "/storage/home/dcl5300/work/current_projects/conus_comparison_lafferty-etal-2024/"
# GARD-LENS summary stats
gard_path = "/storage/group/pches/default/public/GARD-LENS"
# GCM labels of the GARD-LENS ensemble
gard_gcms = ["canesm5", "cesmlens2", "ecearth3"]

In [3]:
# Check dims: tally how many entries carry each GCM label (precipitation stats file)
ds = xr.open_dataset(f'{gard_path}/GARDLENS_pcp_stats_CONUS.nc', chunks='auto')
gcms = ds.isel(year=100)['gcm'].to_numpy()
unique, counts = np.unique(gcms, return_counts=True)
{gcm_label: n_entries for gcm_label, n_entries in zip(unique, counts)}

{np.str_('canesm5'): np.int64(50),
 np.str_('cesmlens2'): np.int64(100),
 np.str_('ecearth3'): np.int64(50)}

In [4]:
# Check dims: tally how many entries carry each GCM label (mean temperature stats file)
ds = xr.open_dataset(f'{gard_path}/GARDLENS_t_mean_stats_CONUS.nc', chunks='auto')
gcms = ds.isel(year=100)['gcm'].to_numpy()
unique, counts = np.unique(gcms, return_counts=True)
{gcm_label: n_entries for gcm_label, n_entries in zip(unique, counts)}

{np.str_('canesm5'): np.int64(50),
 np.str_('cesmlens2'): np.int64(100),
 np.str_('ecearth3'): np.int64(50)}

In [5]:
# Get unique combinations: map each GCM label to the member labels available for it
ds = xr.open_dataset(f'{gard_path}/GARDLENS_t_mean_stats_CONUS.nc').isel(year=0)
gard_info = {}
# Read the per-entry GCM labels from this dataset (rather than relying on a
# variable left over from an earlier cell) and de-duplicate them, keeping
# first-seen order, so the masked selection below runs once per GCM instead
# of once per ensemble entry.
for gcm in dict.fromkeys(ds['gcm'].to_numpy()):
    gard_info.update({gcm: ds.where(ds.gcm == gcm, drop=True)['ens'].to_numpy()})

In [6]:
# ------------------------------------------------------------------
# Dask: workers are launched as SLURM jobs
# ------------------------------------------------------------------
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Per-job resources: 1 core, 30 GiB memory, 1 hour walltime
cluster = SLURMCluster(
    cores=1,
    memory="30GiB",
    walltime="01:00:00",
    account="open",  # alternative account: "pches"
)

# Ask for 20 jobs
cluster.scale(jobs=20)

client = Client(cluster)

client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.0.156:38211,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


# Calculate metrics 

In [7]:
####################
# ALREADY DONE
####################

# Regrid

In [8]:
# Target grid: the LOCA grid, described by its 1-D lat/lon axes
loca_lat_grid = np.linspace(23.90625, 53.46875, 474)
loca_lon_grid = np.linspace(234.53125, 293.46875, 944)

ds_out = xr.Dataset(
    dict(
        lat=(["lat"], loca_lat_grid,
             {"standard_name": "latitude", "units": "degrees_north"}),
        lon=(["lon"], loca_lon_grid,
             {"standard_name": "longitude", "units": "degrees_east"}),
    )
)

# Output mask is the inverse of the stored LOCA NaN array
# (presumably True where LOCA has valid data -- confirm against the .npy file)
loca_nans = np.load(f'{project_code_path}/code/utils/LOCA2_NaNs.npy')
ds_out["mask"] = xr.DataArray(~loca_nans, dims=['lat','lon'])

# Source grid: one (year, ensemble entry) slice of a GARD-LENS file is enough
# to describe the grid to the regridder
example_file = f'{gard_path}/GARDLENS_t_mean_stats_CONUS.nc'
ds_in = xr.open_dataset(example_file).isel(year=0, n_ens=0)

# Conservative regridder from the GARD-LENS grid to the LOCA grid
conservative_regridder = xe.Regridder(ds_in, ds_out, method="conservative")
# Unused alternative: xe.Regridder(ds_in, ds_out, "nearest_s2d")

In [9]:
# Some small preprocessing for GARD-LENS
def _preprocess(ds, gard_stat_id, metric_id):
    """
    Tidy a GARD-LENS stats dataset.

    The `n_ens` axis is indexed by (`gcm`, `scen`, `ens`) and unstacked into
    separate dimensions, which are renamed to `model`, `ssp` and `member`.
    The statistic `gard_stat_id` is then renamed to `metric_id` and returned
    as the dataset's only variable.
    """
    dim_names = {'gcm': 'model', 'scen': 'ssp', 'ens': 'member'}

    # Re-index with dask's `array.slicing.split_large_chunks` option turned off
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        unstacked = ds.set_index(n_ens=list(dim_names)).unstack('n_ens')
        unstacked = unstacked.rename(dim_names)

    # Keep only the requested statistic, under its metric name
    renamed = unstacked.rename({gard_stat_id: metric_id})
    return renamed[[metric_id]]

# Regridding function
def regrid_gard(ds_in, gard_var_id, gard_stat_id, model, member, metric_id, regridder, regridder_name, out_path):
    """
    Regrid one GARD-LENS (model, member) combination and store it as netCDF.

    The result is written to
    `{out_path}/{regridder_name}/{metric_id}_{model}_{member}_ssp370.nc`.
    If that file already exists nothing is done, so the function can be
    re-run safely after an interruption.

    Parameters
    ----------
    ds_in : GARD-LENS stats dataset exposing `gcm` and `ens` labels
    gard_var_id : GARD-LENS variable name (not used inside this function)
    gard_stat_id : name of the statistic in `ds_in` to regrid
    model, member : labels selecting the GCM and ensemble member
    metric_id : name given to the statistic in the output
    regridder : callable applied as `regridder(ds, skipna=True, na_thres=0.99)`
    regridder_name : sub-directory of `out_path` the file is written to
    out_path : root output directory
    """
    out_dir = f"{out_path}/{regridder_name}"
    out_file = f"{out_dir}/{metric_id}_{model}_{member}_ssp370.nc"

    # Check if done
    if os.path.exists(out_file):
        return

    # Select GCM and member
    ds_sel = ds_in.where((ds_in.gcm == model) & (ds_in.ens == member), drop=True)

    # Tidy
    ds_sel = _preprocess(ds_sel, gard_stat_id, metric_id)

    # Regrid
    # NOTE: use high NaN threshold to try to not introduce NaNs
    # not already present in the LOCA2 grid
    ds_out = regridder(ds_sel, skipna=True, na_thres=0.99)

    # Store; create the output directory first so a fresh reproduction
    # does not fail on a missing folder
    os.makedirs(out_dir, exist_ok=True)
    ds_out.to_netcdf(out_file)

In [10]:
# Run it
out_path = f"{project_data_path}/metrics_regridded/GARD-LENS/"

# Each entry: (GARD-LENS variable, GARD-LENS statistic, output metric name)
for gard_var_id, gard_stat_id, metric_id in [
    ('t_mean', 'mean', 'avg_tas'),  # avg tas
    ('pcp', 'max', 'max_pr'),       # max precip
    ('pcp', 'sum', 'sum_pr'),       # sum precip
]:
    ds_in = xr.open_dataset(f'{gard_path}/GARDLENS_{gard_var_id}_stats_CONUS.nc')

    # One output file per (model, member) combination
    for model in gard_info.keys():
        for member in gard_info[model]:
            regrid_gard(
                ds_in=ds_in,
                gard_var_id=gard_var_id,
                gard_stat_id=gard_stat_id,
                model=model,
                member=member,
                metric_id=metric_id,
                regridder=conservative_regridder,
                regridder_name='conservative',
                out_path=out_path,
            )

# Summaries

## Indices

In [17]:
# Calculates summary indices for the GARD-LENS ensemble of a given GCM over a given window of years
def get_summary_indices(metric_id, model, years, out_path, out_str):
    """
    Compute pooled summary indices for one metric and GCM and store them.

    Current summary indices calculated: mean (`mean`), 0.99 quantile (`q99`),
    and the range between the 0.005 and 0.995 quantiles (`99range`).
    `years` define the window over which all outputs are pooled: every
    member and every year inside `[years[0], years[1]]` enters each index.

    The result is written to `{out_path}/{out_str}.nc`; if that file already
    exists nothing is done.
    """
    out_file = f"{out_path}/{out_str}.nc"

    # Check if done
    if os.path.isfile(out_file):
        return

    # Read all members of this model
    ds = xr.open_mfdataset(f"{project_data_path}/metrics_regridded/GARD-LENS/conservative/{metric_id}_{model}_*.nc", chunks='auto')

    # Time slice
    ds = ds.rename({'year':'time'})
    ds_sel = ds.sel(time=slice(years[0],years[1]))

    pooled_dims = ['member', 'time']

    ## Summary indices
    # Mean
    ds_mean = ds_sel.mean(dim=pooled_dims).assign_coords(indice = 'mean')

    # Quantiles
    # The quantile reduces over both pooled dims at once, so on dask-backed
    # data each of them has to sit in a single chunk. Rechunk once (both
    # dims, not only `member`) and reuse the result for all three quantiles.
    ds_pooled = ds_sel.chunk({dim: -1 for dim in pooled_dims})
    ds_qlow = ds_pooled.quantile(0.005, dim=pooled_dims)
    ds_qhigh = ds_pooled.quantile(0.995, dim=pooled_dims)
    ds_qrange = (ds_qhigh - ds_qlow).assign_coords(indice = '99range')

    ds_q99 = ds_pooled.quantile(0.99, dim=pooled_dims).assign_coords(indice = 'q99')

    # Store
    ds_out = xr.concat([ds_mean, ds_qrange, ds_q99], dim='indice')
    ds_out.to_netcdf(out_file)

In [None]:
%%time
ssp = 'ssp370'

# One summary file per (year window, metric, GCM)
for years in [[2020,2040], [2050,2070], [2080,2100]]:
    for metric_id in ['avg_tas', 'max_pr', 'sum_pr']:
        for model in gard_info:
            get_summary_indices(
                metric_id=metric_id,
                model=model,
                years=years,
                out_path=f"{project_data_path}/summary_indices",
                out_str=f"GARD-LENS_{model}_{ssp}_{years[0]}-{years[1]}_{metric_id}",
            )

## Timeseries

### Raw

In [6]:
# Some small preprocessing for GARD-LENS
def _preprocess(ds, gard_stat_id, metric_id):
    """
    Reshape a GARD-LENS stats dataset into (model, ssp, member) form.

    Same definition as the `_preprocess` in the Regrid section of this
    notebook, repeated here so this section does not depend on that cell.

    `n_ens` is indexed by `gcm`/`scen`/`ens` and unstacked; the new
    dimensions are renamed to `model`/`ssp`/`member`. Only the statistic
    `gard_stat_id` is kept, renamed to `metric_id`.
    """
    # Re-index (dask's `array.slicing.split_large_chunks` option is switched off here)
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ds = ds.set_index(n_ens=['gcm', 'scen', 'ens'])
        ds = ds.unstack('n_ens')
        ds = ds.rename({'gcm':'model', 'scen':'ssp', 'ens':'member'})

    # Rename the statistic and drop every other variable
    return ds.rename({gard_stat_id: metric_id})[[metric_id]]

In [50]:
# Extracts the GARD-LENS ensemble timeseries (original grid) at a single location for a given GCM
def get_raw_data(ds, gard_stat_id, metric_id, model, years, lat, lon, out_path, out_str):
    """
    Write the timeseries of one GARD-LENS statistic at one location to CSV.

    All entries of `ds` belonging to `model` are selected, the grid cell
    nearest to (`lat`, `lon`) is extracted, and the result is stored as
    `{out_path}/{out_str}.csv` with `ssp` and `model` columns added. Rows
    containing NaN are dropped. If the CSV already exists nothing is done.

    Parameters
    ----------
    ds : GARD-LENS stats dataset exposing a `gcm` label and `lat`/`lon`/`year` axes
    gard_stat_id : name of the statistic in `ds` to extract
    metric_id : name given to that statistic in the output
    model : GCM label to select
    years : `[first, last]` window of years to keep, or None to keep all
    lat, lon : location of interest
    out_path, out_str : output directory and file stem
    """
    out_file = f"{out_path}/{out_str}.csv"

    # Check if done
    if os.path.isfile(out_file):
        return

    # Select GCM
    ds_model = ds.where((ds.gcm == model), drop=True)

    # Location selection first
    # NOTE(review): lon is passed through unchanged, whereas the regridded
    # variant of this function converts negative longitudes to 0-360;
    # confirm the longitude convention of the original grid.
    ds_sel = ds_model.sel(lat=lat, lon=lon, method='nearest')

    # Tidy
    ds_sel = _preprocess(ds_sel, gard_stat_id, metric_id)

    # Time slice
    # The time axis of the GARD-LENS stats files is called `year` and is not
    # renamed by `_preprocess`, so the slice has to be taken on `year`.
    if years is not None:
        ds_sel = ds_sel.sel(year=slice(years[0], years[1]))

    # Construct dataframe
    df_out = ds_sel.to_dataframe().reset_index().dropna().drop(columns=["lat", "lon"])
    df_out["ssp"] = 'ssp370'
    df_out["model"] = model

    # Store
    df_out.to_csv(out_file, index=False)

In [51]:
# Run it
out_path = f"{project_data_path}/summary_raw_original_grid/"

# Each entry: (GARD-LENS variable, GARD-LENS statistic, output metric name)
# NOTE(review): 'max_tas' is the 'max' statistic of the 't_mean' file --
# confirm this is the intended definition of max_tas.
for gard_var_id, gard_stat_id, metric_id in [
    ('t_mean', 'mean', 'avg_tas'),  # avg tas
    ('t_mean', 'max', 'max_tas'),   # max tas
    ('pcp', 'max', 'max_pr'),       # max precip
    ('pcp', 'sum', 'sum_pr'),       # sum precip
]:
    ds_in = xr.open_dataset(f'{gard_path}/GARDLENS_{gard_var_id}_stats_CONUS.nc')

    # One CSV per (model, city); all years are kept (years=None)
    for model in gard_info.keys():
        for city in ['chicago', 'nyc', 'denver']:
            lat, lon = city_list[city]
            get_raw_data(
                ds=ds_in,
                gard_stat_id=gard_stat_id,
                metric_id=metric_id,
                model=model,
                years=None,
                lat=lat, lon=lon,
                out_path=out_path,
                out_str=f"{city}_GARD-LENS_{model}_ssp370_{metric_id}",
            )

### Regridded

In [24]:
# Extracts the regridded GARD-LENS ensemble timeseries at a single location for a given GCM
def get_raw_data(metric_id, model, years, lat, lon, out_path, out_str):
    """
    Write the regridded timeseries of one metric at one location to CSV.

    Reads every conservatively regridded file of `metric_id` for `model`,
    keeps the years in `[years[0], years[1]]`, extracts the grid cell nearest
    to (`lat`, `lon`) and stores the result as `{out_path}/{out_str}.csv`
    with `ssp` and `model` columns added. If the CSV already exists nothing
    is done.
    """
    csv_file = f"{out_path}/{out_str}.csv"

    # Nothing to do when the output is already there
    if os.path.isfile(csv_file):
        return

    # Read all members, then restrict to the requested window
    ds = xr.open_mfdataset(f"{project_data_path}/metrics_regridded/GARD-LENS/conservative/{metric_id}_{model}_*.nc", chunks='auto')
    ds_sel = ds.rename({'year':'time'}).sel(time=slice(years[0],years[1]))

    # Negative longitudes are shifted into the 0-360 range before the lookup
    lon_360 = 360 + lon if lon < 0 else lon
    ds_sel = ds_sel.sel(lat=lat, lon=lon_360, method='nearest')

    # Flatten to a table and tag it
    df_out = ds_sel.to_dataframe().drop(columns=["lat", "lon"]).reset_index()
    df_out["ssp"] = 'ssp370'
    df_out["model"] = model

    df_out.to_csv(csv_file, index=False)

In [29]:
%%time
# One CSV per (city, year window, metric, GCM)
for city, (lat, lon) in city_list.items():
    for years in [[2020,2040], [2050,2070], [2080,2100]]:
        for metric_id in ['avg_tas', 'sum_pr', 'max_pr']:
            for model in gard_info:
                get_raw_data(
                    metric_id=metric_id,
                    model=model,
                    years=years,
                    lat=lat, lon=lon,
                    out_path=f"{project_data_path}/summary_raw",
                    out_str=f"{city}_GARD-LENS_{model}_ssp370_{years[0]}-{years[1]}_{metric_id}",
                )

CPU times: user 3min 23s, sys: 31.6 s, total: 3min 55s
Wall time: 16min 26s
