In [17]:
import numpy as np
import xarray as xr
import dask
import os
from glob import glob

import xclim
xclim.set_options(cf_compliance="log");

### Preliminaries

In [18]:
###############################
# Set paths
# UPDATE THIS FOR REPRODUCTION
###############################
# The cells below build file names from these by plain string concatenation,
# so each path must keep its trailing '/'.
# Directory searched for the daily ISIMIP3b model input files
in_path = '/gpfs/group/kaf26/default/dcl5300/ISIMIP3b_input_climate_data/files/'
# Root for results; cells write to out_path + 'native_grid/<metric>/<model>...nc'
out_path = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/metrics/isimip3b/'
# Directory holding the GMFD / ERA5 quantile files read by the quantile-based metrics
quantile_path = '/gpfs/group/kaf26/default/dcl5300/lafferty-sriver_inprep_tbh_DATA/quantiles/'

In [19]:
###################
# Models
###################
from utils import isimip_ssp_dict

# isimip_ssp_dict is keyed by model name; the value for a model is the
# collection of SSPs that the cells below iterate over for that model.
models = [model_name for model_name in isimip_ssp_dict.keys()]

In [20]:
####################
# File year layout
####################
# File names carry a first year and a last year (= first year + step):
# one short 2015-2020 file, then files starting in 2021, 2031, ..., 2091
# that each span nine further years.
start_years = [2015, *range(2021, 2101, 10)]
year_steps = [5] + [9] * (len(start_years) - 1)

In [21]:
###################
# Model details
###################
# Map each model name to the part of its file names that sits between
# '<model>_' and '_w5e5_...'. It is recovered from the ssp126 precipitation
# file of the first period (2015-2020).
probe_suffix = '_w5e5_ssp126_pr_global_daily_2015_2020.nc'

model_info = {}
for model in models:
    file_prefix = in_path + model.lower() + '_'

    # Anchor the pattern on '<model>_' (the same prefix that is stripped below)
    # so that a model whose name is a prefix of another model's name cannot
    # pick up the other model's file. Sorting makes the pick deterministic
    # if several files match.
    matches = sorted(glob(file_prefix + '*' + probe_suffix))
    if not matches:
        raise FileNotFoundError('No input file found for ' + model + ' matching '
                                + file_prefix + '*' + probe_suffix)

    model_info[model] = matches[0].replace(file_prefix, '').replace(probe_suffix, '')

In [22]:
############
# Dask
############
from dask_jobqueue import PBSCluster
# Cluster configuration: cores=1 and memory='30GB' per job, 90 minutes of walltime.
# NOTE(review): worker_extra_args is documented as extra arguments for the dask
# worker command, whereas '#PBS -l feature=rhel7' reads like a scheduler
# directive; job_extra_directives may have been intended. Confirm that the
# submitted job script really requests the rhel7 feature.
cluster = PBSCluster(cores=1, resource_spec='pmem=30GB', memory='30GB',
                     worker_extra_args= ['#PBS -l feature=rhel7'],
                     walltime = '01:30:00')

# Request 25 jobs; the summary below can still show 0 workers right after submission
cluster.scale(jobs=25)  # ask for jobs

# Connect this notebook to the cluster so dask.compute() in later cells runs on it
from dask.distributed import Client
client = Client(cluster)

# Bare expression: display the client summary as the cell output
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.102.201.236:42512,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Simple metrics (no historical quantiles required)

In [7]:
########################################################
# Calculate the metric for a single model file period
# (one start year), including all SSPs and variables
########################################################
def model_year_metric(path, model, model_vers, ssps, var_ids, year, year_step, metric):
    """
    Read the daily files of one model for the period ``year`` to
    ``year + year_step`` and reduce them to annual values of ``metric``.

    Parameters
    ----------
    path : str
        Directory of the input files, including the trailing separator.
    model, model_vers : str
        First two underscore-separated parts of the input file names.
    ssps : iterable of str
        Scenarios to read; they become the ``ssp`` dimension of the result.
    var_ids : iterable of str
        Variables to read (one file per variable and scenario). The 'dry'
        metric renames the ``pr`` variable, so it expects ``pr`` only.
    year, year_step : int
        First year of the file and offset to its last year, as they appear
        in the file name.
    metric : {'avg', 'max', 'dry', 'max5d'}
        'avg' / 'max': annual mean / maximum of every variable.
        'dry': annual count and longest run of days with pr == 0 and pr < 1.
        'max5d': xclim's icclim RX5day indicator at yearly frequency.

    Returns
    -------
    xarray.Dataset
        Annual results of all variables, concatenated along ``ssp``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names (previously the
        unreduced daily data were passed through silently).
    """
    known_metrics = ('avg', 'max', 'dry', 'max5d')
    if metric not in known_metrics:
        raise ValueError('Unknown metric ' + repr(metric) + '; expected one of ' + repr(known_metrics))

    # Function for longest consecutive spell if needed
    def n_longest_consecutive(ds, dim='time'):
        running = ds.cumsum(dim=dim)
        # Running total as of the most recent False; subtracting it restarts
        # the count after every break in the spell.
        at_last_break = running.where(ds == 0).ffill(dim=dim).fillna(0)
        return (running - at_last_break).max(dim=dim)

    # One merged dataset per SSP
    ds_per_ssp = []
    for ssp in ssps:
        # Temporary list for each SSP
        ds_list = []
        for var in var_ids:
            file_name = (path + model + '_' + model_vers + '_w5e5_' +
                         ssp + '_' + var + '_global_daily_' + str(year)
                         + '_' + str(year + year_step) + '.nc')

            # The context manager closes the file once the metric is in memory
            with xr.open_dataset(file_name) as ds_in:
                ## Convert units
                # temperature: K -> C
                if var in ('tas', 'tasmax', 'tasmin') and ds_in[var].attrs['units'] == 'K':
                    ds_in[var] = ds_in[var] - 273.15

                # precip: kg m-2 s-1 -> mm day-1
                if var == 'pr' and ds_in.pr.attrs['units'] == 'kg m-2 s-1':
                    ds_in['pr'] = ds_in['pr'] * 86400
                    # The multiplication drops the attributes; restore the units
                    ds_in.pr.attrs['units'] = 'mm/day'

                # Calculate metric
                if metric == 'avg':
                    ds_metric = ds_in.resample(time='1Y').mean()
                elif metric == 'max':
                    ds_metric = ds_in.resample(time='1Y').max()
                elif metric == 'dry':
                    is_zero = ds_in == 0.   # 0mm
                    below_1 = ds_in < 1.    # less than 1mm
                    ds_metric = xr.merge([
                        # Number of dry days / longest consecutive dry day streak
                        is_zero.resample(time='1Y').sum().rename({'pr': 'count_eq_0'}),
                        is_zero.resample(time='1Y').map(n_longest_consecutive).rename({'pr': 'streak_eq_0'}),
                        below_1.resample(time='1Y').sum().rename({'pr': 'count_lt_1'}),
                        below_1.resample(time='1Y').map(n_longest_consecutive).rename({'pr': 'streak_lt_1'}),
                    ])
                else:  # metric == 'max5d'
                    rx5day = xclim.indicators.icclim.RX5day(ds=ds_in, freq='Y')
                    ds_metric = xr.Dataset({'RX5day': rx5day})

                # Make sure nothing still points at the file before it is closed
                ds_list.append(ds_metric.load())

        # Merge the variables of this SSP and label the result
        ds_per_ssp.append(xr.merge(ds_list).assign_coords(ssp=ssp))

    # Concat along ssp dimension and return
    return xr.concat(ds_per_ssp, dim='ssp')

### Annual averages

In [22]:
# Loop through models: RUNTIME IS ~10 MINS PER MODEL WITH 9 DASK WORKERS
metric = 'avg'

# All variables
var_ids = ['tas', 'tasmin', 'tasmax', 'pr']

for model in models:
    out_file = out_path + 'native_grid/' + metric + '/' + model + '.nc'

    # Skip models whose output file already exists
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # One delayed task per input file period, in chronological order
    delayed_res = [
        dask.delayed(model_year_metric)(path=in_path,
                                        model=model.lower(),
                                        model_vers=model_info[model],
                                        ssps=isimip_ssp_dict[model],
                                        var_ids=var_ids,
                                        year=year,
                                        year_step=year_step,
                                        metric=metric)
        for year, year_step in zip(start_years, year_steps)
    ]

    # Run the tasks on the dask cluster
    res = dask.compute(*delayed_res)

    # Stitch the periods together by their coordinates and store
    df_final = xr.combine_by_coords(res)
    df_final.to_netcdf(out_file)

    print(model)

CanESM5 already done
CNRM-CM6-1 already done
CNRM-ESM2-1 already done
EC-Earth3 already done
GFDL-ESM4 already done
IPSL-CM6A-LR already done
MIROC6 already done
MPI-ESM1-2-HR already done
MRI-ESM2-0 already done
UKESM1-0-LL already done


### 1-day max

In [23]:
metric = 'max'

# All variables
var_ids = ['tas', 'tasmin', 'tasmax', 'pr']

for model in models:
    out_file = out_path + 'native_grid/' + metric + '/' + model + '.nc'

    # Nothing to do if this model was already written
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # Build one delayed call per file period so dask can run them in parallel
    delayed_res = []
    for year, year_step in zip(start_years, year_steps):
        delayed_res.append(
            dask.delayed(model_year_metric)(path=in_path,
                                            model=model.lower(),
                                            model_vers=model_info[model],
                                            ssps=isimip_ssp_dict[model],
                                            var_ids=var_ids,
                                            year=year,
                                            year_step=year_step,
                                            metric=metric)
        )

    # Compute
    res = dask.compute(*delayed_res)

    # Combine the periods by their coordinates and store
    df_final = xr.combine_by_coords(res)
    df_final.to_netcdf(out_file)

    print(model)

CanESM5 already done
CNRM-CM6-1 already done
CNRM-ESM2-1 already done
EC-Earth3 already done
GFDL-ESM4 already done
IPSL-CM6A-LR already done
MIROC6 already done
MPI-ESM1-2-HR already done
MRI-ESM2-0 already done
UKESM1-0-LL already done


### 5-day max

In [None]:
# Loop through models: RUNTIME IS ~10 MINS PER MODEL WITH 9 DASK WORKERS
metric = 'max5d'

# Precip only
var_ids = ['pr']

for model in models:
    out_file = out_path + 'native_grid/' + metric + '/' + model + '.nc'

    # Skip models whose output file already exists
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # One delayed task per input file period, in chronological order
    delayed_res = [
        dask.delayed(model_year_metric)(path=in_path,
                                        model=model.lower(),
                                        model_vers=model_info[model],
                                        ssps=isimip_ssp_dict[model],
                                        var_ids=var_ids,
                                        year=year,
                                        year_step=year_step,
                                        metric=metric)
        for year, year_step in zip(start_years, year_steps)
    ]

    # Run the tasks on the dask cluster
    res = dask.compute(*delayed_res)

    # Results come back in task order, so joining along time keeps the years ordered
    df_final = xr.concat(res, dim='time')
    df_final.to_netcdf(out_file)

    print(model)

### Dry days

In [8]:
# Loop through models: RUNTIME IS ~10 MINS PER MODEL WITH 9 DASK WORKERS
metric = 'dry'

# Precip only
var_ids = ['pr']

for model in models:
    out_file = out_path + 'native_grid/' + metric + '/' + model + '.nc'

    # Nothing to do if this model was already written
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # Build one delayed call per file period so dask can run them in parallel
    delayed_res = []
    for year, year_step in zip(start_years, year_steps):
        delayed_res.append(
            dask.delayed(model_year_metric)(path=in_path,
                                            model=model.lower(),
                                            model_vers=model_info[model],
                                            ssps=isimip_ssp_dict[model],
                                            var_ids=var_ids,
                                            year=year,
                                            year_step=year_step,
                                            metric=metric)
        )

    # Compute
    res = dask.compute(*delayed_res)

    # Join the periods along time (results are returned in task order) and store
    df_final = xr.concat(res, dim='time')
    df_final.to_netcdf(out_file)

    print(model)

CanESM5 already done
CNRM-CM6-1 already done
CNRM-ESM2-1 already done
EC-Earth3 already done
GFDL-ESM4
IPSL-CM6A-LR
MIROC6
MPI-ESM1-2-HR
MRI-ESM2-0
UKESM1-0-LL


## Less simple metrics (historical quantiles required)

In [7]:
def model_year_ssp_metric(model_path, quantile_path, model, model_vers, ssp, var_id, year, year_step, obs):
    """
    Reads the ISIMIP model output of one variable for a given ssp and file period and,
    for each threshold ('q99', 'rp10') of each requested observational product, calculates
    per year the number of days above the threshold and the longest consecutive streak of
    such days. This function will be wrapped in dask distributed.

    Parameters
    ----------
    model_path, quantile_path : str
        Directories of the model files and of the quantile files (with trailing separator).
    model, model_vers, ssp : str
        Parts of the model file name.
    var_id : {'tas', 'tasmax', 'tasmin', 'pr'}
        Variable to read; selects the temperature or the precipitation quantile files.
    year, year_step : int
        First year of the file and offset to its last year, as in the file name.
    obs : container of str
        Observational products to compare against; 'gmfd' and 'era5' are recognised.

    Returns
    -------
    xarray.Dataset
        Variables named var_id + '_' + threshold + product + '_count' / '_streak',
        with a scalar ``ssp`` coordinate.

    Raises
    ------
    ValueError
        If ``var_id`` is not one of the supported variables.
    """
    if var_id in ['tasmax', 'tasmin', 'tas']:
        is_temperature = True
        quantile_kind = 'temperature'
    elif var_id == 'pr':
        is_temperature = False
        quantile_kind = 'precip'
    else:
        # Previously this only failed later, with an unrelated NameError
        raise ValueError('Unsupported var_id ' + repr(var_id))

    # Subfunction to calculate longest consecutive spell
    def n_longest_consecutive(ds, dim='time'):
        running = ds.cumsum(dim=dim)
        # Running total as of the most recent False; subtracting it restarts
        # the count after every break in the spell.
        at_last_break = running.where(ds == 0).ffill(dim=dim).fillna(0)
        return (running - at_last_break).max(dim=dim)

    # Every dataset opened here is closed again before returning
    opened = []

    def open_tracked(*args, **kwargs):
        ds = xr.open_dataset(*args, **kwargs)
        opened.append(ds)
        return ds

    try:
        # Read historical quantiles (insertion order fixes the output order: gmfd, era5)
        ds_q = {}
        if 'gmfd' in obs:
            ds_q['gmfd'] = open_tracked(quantile_path + 'gmfd_' + quantile_kind + '_quantiles_isimip.nc')
        if 'era5' in obs:
            ds_q['era5'] = open_tracked(quantile_path + 'era5_' + quantile_kind + '_quantiles_isimip', engine='zarr')

        # Read model file
        ds_model = open_tracked(model_path + model + '_' + model_vers + '_w5e5_' +
                                ssp + '_' + var_id + '_global_daily_' + str(year)
                                + '_' + str(year + year_step) + '.nc')

        da = ds_model[var_id].sel(lat=slice(90, -60))  # both obs extend only to 60S

        if is_temperature:
            # Temperature: K -> C
            if da.attrs['units'] == 'K':
                da = da - 273.15
        elif da.attrs['units'] == 'kg m-2 s-1':
            # Precip: kg m-2 s-1 -> mm day-1
            da = da * 86400

        # Calculate metrics
        ds_tmp_out = []
        for rp in ['q99', 'rp10']:
            for src, ds_src in ds_q.items():
                # Above/below binary
                exceeds = da > ds_src[var_id + '_' + rp]
                # NOTE(review): there is no separator between the threshold and the
                # product name (e.g. 'pr_q99gmfd_count'); kept unchanged because
                # previously written files presumably use these names.
                name_stem = var_id + '_' + rp + src
                # Count
                ds_tmp_out.append(xr.Dataset({name_stem + '_count': exceeds.resample(time='1Y').sum()}))
                # Streak
                ds_tmp_out.append(xr.Dataset({name_stem + '_streak': exceeds.resample(time='1Y').map(n_longest_consecutive)}))

        # Merge, label, and pull everything into memory before the files are closed
        ds_out = xr.merge(ds_tmp_out).assign_coords(ssp=ssp).load()
    finally:
        for ds in opened:
            ds.close()

    return ds_out

### Wet days

In [8]:
%%time
# Loop through models
metric = 'wet'
var_id = 'pr'

for model in models:
    out_file = out_path + 'native_grid/' + metric + '/' + model + '.nc'

    # Skip models whose output file already exists
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # One delayed task per (ssp, file period), ssp-major and chronological within each ssp
    delayed_res = [
        dask.delayed(model_year_ssp_metric)(in_path,
                                            quantile_path,
                                            model.lower(),
                                            model_info[model],
                                            ssp,
                                            var_id,
                                            year,
                                            year_step,
                                            ['gmfd', 'era5'])
        for ssp in isimip_ssp_dict[model]
        for year, year_step in zip(start_years, year_steps)
    ]

    # Run the tasks on the dask cluster
    res = dask.compute(*delayed_res)

    # Combine in correct order along ssp, year: pick each ssp's periods via the
    # scalar ssp coordinate, join them along time, then stack the ssps
    df_final = xr.concat(
        [
            xr.concat([ds for ds in res if ds.ssp == ssp], dim='time')
            for ssp in isimip_ssp_dict[model]
        ],
        dim='ssp',
    )
    del res

    # Store, then release the combined dataset before the next model
    df_final.to_netcdf(out_file)
    del df_final

    print(model)

CanESM5
CNRM-CM6-1
CNRM-ESM2-1
EC-Earth3
GFDL-ESM4
IPSL-CM6A-LR
MIROC6
MPI-ESM1-2-HR
MRI-ESM2-0
UKESM1-0-LL
CPU times: user 13min 11s, sys: 1min 24s, total: 14min 35s
Wall time: 1h 12min 50s


### Hot days

In [9]:
# Loop through models (around 3.5 hours)
metric = 'hot'

for model in models:
    for var_id in ['tasmin', 'tasmax', 'tas']:
        out_file = out_path + 'native_grid/' + metric + '/' + model + '_' + var_id + '.nc'

        # Skip model/variable pairs whose output file already exists
        if os.path.isfile(out_file):
            print(model + ' ' + var_id + ' already done')
            continue

        # One delayed task per (ssp, file period), ssp-major and chronological within each ssp
        delayed_res = [
            dask.delayed(model_year_ssp_metric)(in_path,
                                                quantile_path,
                                                model.lower(),
                                                model_info[model],
                                                ssp,
                                                var_id,
                                                year,
                                                year_step,
                                                ['gmfd', 'era5'])
            for ssp in isimip_ssp_dict[model]
            for year, year_step in zip(start_years, year_steps)
        ]

        # Run the tasks on the dask cluster
        res = dask.compute(*delayed_res)

        # Combine in correct order along ssp, year: pick each ssp's periods via the
        # scalar ssp coordinate, join them along time, then stack the ssps
        df_final = xr.concat(
            [
                xr.concat([ds for ds in res if ds.ssp == ssp], dim='time')
                for ssp in isimip_ssp_dict[model]
            ],
            dim='ssp',
        )
        del res

        # Store
        df_final.to_netcdf(out_file)
        print(model + ' ' + var_id)

CanESM5 tasmin
CanESM5 tasmax
CanESM5 tas
CNRM-CM6-1 tasmin
CNRM-CM6-1 tasmax
CNRM-CM6-1 tas
CNRM-ESM2-1 tasmin
CNRM-ESM2-1 tasmax
CNRM-ESM2-1 tas
EC-Earth3 tasmin
EC-Earth3 tasmax
EC-Earth3 tas
GFDL-ESM4 tasmin
GFDL-ESM4 tasmax
GFDL-ESM4 tas
IPSL-CM6A-LR tasmin
IPSL-CM6A-LR tasmax
IPSL-CM6A-LR tas
MIROC6 tasmin
MIROC6 tasmax
MIROC6 tas
MPI-ESM1-2-HR tasmin
MPI-ESM1-2-HR tasmax
MPI-ESM1-2-HR tas
MRI-ESM2-0 tasmin
MRI-ESM2-0 tasmax
MRI-ESM2-0 tas
UKESM1-0-LL tasmin
UKESM1-0-LL tasmax
UKESM1-0-LL tas


## Multivariate metrics (historical quantiles required)

### Hot and dry days

In [23]:
def model_year_ssp_hotdry(model_path, quantile_path, model, model_vers, ssp, year, year_step, obs):
    """
    Reads the ISIMIP tasmax and pr output for a given ssp and file period and, for each
    temperature threshold ('q99', 'rp10') of each requested observational product, calculates
    per year the number of hot+dry days (tasmax above the threshold and pr below 1) and the
    longest consecutive hot+dry day streak. This function will be wrapped in dask distributed.

    Parameters
    ----------
    model_path, quantile_path : str
        Directories of the model files and of the quantile files (with trailing separator).
    model, model_vers, ssp : str
        Parts of the model file names.
    year, year_step : int
        First year of the files and offset to their last year, as in the file names.
    obs : container of str
        Observational products to compare against; 'gmfd' and 'era5' are recognised.

    Returns
    -------
    xarray.Dataset
        Variables named 'hotdry_' + threshold + product + '_count' / '_streak',
        with a scalar ``ssp`` coordinate.
    """

    # Subfunction to calculate longest consecutive spell
    def n_longest_consecutive(ds, dim='time'):
        running = ds.cumsum(dim=dim)
        # Running total as of the most recent False; subtracting it restarts
        # the count after every break in the spell.
        at_last_break = running.where(ds == 0).ffill(dim=dim).fillna(0)
        return (running - at_last_break).max(dim=dim)

    # Every dataset opened here is closed again before returning
    opened = []

    def open_tracked(*args, **kwargs):
        ds = xr.open_dataset(*args, **kwargs)
        opened.append(ds)
        return ds

    def model_file(var):
        return (model_path + model + '_' + model_vers + '_w5e5_' +
                ssp + '_' + var + '_global_daily_' + str(year)
                + '_' + str(year + year_step) + '.nc')

    try:
        # Read historical quantiles (insertion order fixes the output order: gmfd, era5)
        ds_q = {}
        if 'gmfd' in obs:
            ds_q['gmfd'] = open_tracked(quantile_path + 'gmfd_temperature_quantiles_isimip.nc')
        if 'era5' in obs:
            ds_q['era5'] = open_tracked(quantile_path + 'era5_temperature_quantiles_isimip', engine='zarr')

        # Read model files; both obs extend only to 60S
        tasmax = open_tracked(model_file('tasmax'))['tasmax'].sel(lat=slice(90, -60))
        pr = open_tracked(model_file('pr'))['pr'].sel(lat=slice(90, -60))

        # Temperature: K -> C
        if tasmax.attrs['units'] == 'K':
            tasmax = tasmax - 273.15

        # Precip: kg m-2 s-1 -> mm day-1
        if pr.attrs['units'] == 'kg m-2 s-1':
            pr = pr * 86400

        # The dry condition does not depend on the threshold or the product
        is_dry = pr < 1.

        # Calculate metrics
        ds_tmp_out = []
        for rp in ['q99', 'rp10']:
            for src, ds_src in ds_q.items():
                # Above/below binary
                hot_and_dry = (tasmax > ds_src['tasmax_' + rp]) & is_dry
                # NOTE(review): there is no separator between the threshold and the
                # product name (e.g. 'hotdry_q99gmfd_count'); kept unchanged because
                # previously written files presumably use these names.
                name_stem = 'hotdry_' + rp + src
                # Count
                ds_tmp_out.append(xr.Dataset({name_stem + '_count': hot_and_dry.resample(time='1Y').sum()}))
                # Streak
                ds_tmp_out.append(xr.Dataset({name_stem + '_streak': hot_and_dry.resample(time='1Y').map(n_longest_consecutive)}))

        # Merge, label, and pull everything into memory before the files are closed
        ds_out = xr.merge(ds_tmp_out).assign_coords(ssp=ssp).load()
    finally:
        for ds in opened:
            ds.close()

    return ds_out

In [None]:
%%time
# Loop through models: RUNTIME IS ~20 MINS PER MODEL WITH 25 DASK WORKERS
metric = 'hotdry'

# NOTE(review): only the first five models are processed here; the remaining
# models need a further run with a different slice -- confirm this is intended.
for model in models[:5]:
    out_file = out_path + 'native_grid/' + metric + '/' + model + '.nc'

    # Skip models whose output file already exists
    if os.path.isfile(out_file):
        print(model + ' already done')
        continue

    # One delayed task per (ssp, file period), ssp-major and chronological within each ssp
    delayed_res = [
        dask.delayed(model_year_ssp_hotdry)(in_path,
                                            quantile_path,
                                            model.lower(),
                                            model_info[model],
                                            ssp,
                                            year,
                                            year_step,
                                            ['gmfd','era5'])
        for ssp in isimip_ssp_dict[model]
        for year, year_step in zip(start_years, year_steps)
    ]

    # Run the tasks on the dask cluster
    res = dask.compute(*delayed_res)

    # Combine in correct order along ssp, year: pick each ssp's periods via the
    # scalar ssp coordinate, join them along time, then stack the ssps
    df_final = xr.concat(
        [
            xr.concat([ds for ds in res if ds.ssp == ssp], dim='time')
            for ssp in isimip_ssp_dict[model]
        ],
        dim='ssp',
    )
    del res

    # Store, then release the combined dataset before the next model
    df_final.to_netcdf(out_file)
    del df_final

    print(model)

CanESM5 already done
CNRM-CM6-1 already done
