In [2]:
# NOTE: may need to run the following command in terminal to install regionmask:
# mamba install -c conda-forge regionmask cartopy pygeos
# mamba install -c anaconda cryptography==38.0.4

In [1]:
import numpy as np
import pandas as pd
# import regionmask
import xarray as xr
import io
import os
import intake

import getpass
import azure.storage.blob
import zarr

### Preliminaries

In [2]:
# ------------------------------------------------------------------
# Azure blob storage
# ------------------------------------------------------------------
# The connection string is typed in at a hidden prompt rather than written
# into the notebook. It can be copied from the Azure web portal: select the
# storage account, then "Access keys".
connection_string = getpass.getpass()

# Client for the "mpctransfer" container
container_client = azure.storage.blob.ContainerClient.from_connection_string(
    conn_str=connection_string,
    container_name="mpctransfer",
)

 ········


In [3]:
###################
# Models
###################
from utils import gardsv_ssp_dict, gardsv_var_dict, deepsdbc_dict

In [4]:
#################
# Data access
#################

# Open the complete intake-esm catalog; `cat` is queried by grab_model
catalog_url = "https://cpdataeuwest.blob.core.windows.net/cp-cmip/version1/catalogs/global-downscaled-cmip6.json"
cat = intake.open_esm_datastore(catalog_url)

# Function to grab all variables and SSPs for a single model/method
def grab_model(method, model, scenarios):
    """Load the daily data of one model/method for the requested scenarios.

    The module-level catalog ``cat`` is searched for daily-timescale entries
    matching ``method``, ``model`` and ``scenarios``. Each resulting dataset is
    labelled with its scenario and all of them are concatenated along a new
    ``ssp`` dimension.

    Parameters
    ----------
    method : value passed to the catalog search as ``method``.
    model : value passed to the catalog search as ``source_id``.
    scenarios : value(s) passed to the catalog search as ``experiment_id``.

    Returns
    -------
    The concatenated dataset, chunked with one scenario per chunk, 360x360
    lat/lon tiles and one time chunk per calendar year.

    Raises
    ------
    ValueError
        If the catalog search returns no dataset.

    Notes
    -----
    When a dataset has no ``pr`` variable, an all-NaN ``pr`` shaped like
    ``tasmax`` is added and tagged with ``units == 'NaN'`` so that callers can
    detect the placeholder.
    """
    # Search catalogue for method, model, all SSPs, all vars
    dsets = cat.search(
        method=method,
        source_id=model,
        experiment_id=scenarios,
        timescale='day'
    ).to_dataset_dict()

    ds_ssp = []
    for ds_tmp in dsets.values():
        # Label the dataset with its scenario; the attribute name differs
        # between datasets, so fall back to the intake-esm prefixed one
        attrs = ds_tmp.attrs
        if 'experiment_id' in attrs:
            ssp = attrs['experiment_id']
        else:
            ssp = attrs['intake_esm_attrs:experiment_id']
        ds_tmp = ds_tmp.assign_coords(ssp=ssp)

        # For some models/methods we are missing precip, so fill with NaNs.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        if 'pr' not in ds_tmp.data_vars:
            ds_tmp['pr'] = xr.full_like(ds_tmp['tasmax'], np.nan)
            ds_tmp['pr'].attrs = {'units': 'NaN'}

        ds_ssp.append(ds_tmp)

    if not ds_ssp:
        raise ValueError(
            'No datasets found for method=%r, model=%r, scenarios=%r'
            % (method, model, scenarios)
        )

    # Concatenate along the SSP dimension and rechunk for faster computations:
    # the time chunks hold exactly one calendar year each
    ds_out = xr.concat(ds_ssp, dim='ssp')
    days_per_year = tuple(ds_out.time.groupby(ds_out.time.dt.year).count().to_numpy())
    ds_out = ds_out.chunk({'ssp': 1, 'lat': 360, 'lon': 360, 'time': days_per_year})
    return ds_out

In [5]:
# Longest consecutive spell along a dimension
def n_longest_consecutive(ds, dim='time'):
    """Return the longest run of consecutive non-zero entries along ``dim``.

    ``ds`` is expected to hold 0/1 (or boolean) flags. The running total of
    the flags, minus the running total recorded at the most recent zero,
    gives the length of the current run at each position; the maximum of that
    quantity along ``dim`` is returned.
    """
    running_total = ds.cumsum(dim=dim)
    # Running total frozen at the latest zero entry (0 before the first zero)
    total_at_last_zero = running_total.where(ds == 0).ffill(dim=dim).fillna(0)
    current_run = running_total - total_at_last_zero
    return current_run.max(dim=dim)

In [6]:
##################
# Convert units
##################
def convert_units(ds):
    """Convert temperatures from K to degC and precipitation flux to mm/day.

    ``tasmax`` and ``tasmin`` are shifted by -273.15 when their ``units``
    attribute is ``'K'``; ``pr`` is multiplied by 86400 when its ``units``
    attribute is ``'kg m-2 s-1'``. Variables that are absent, have no
    ``units`` attribute, or carry any other unit are left untouched.

    The arithmetic would otherwise drop the variable's attributes, so they
    are re-attached with ``units`` updated to the new unit. This keeps later
    ``attrs['units']`` look-ups working and makes a second call a no-op.

    Parameters
    ----------
    ds : dataset-like object exposing ``data_vars`` and item get/set.

    Returns
    -------
    The same ``ds`` object, modified in place.
    """
    def _convert(name, from_units, to_units, transform):
        # Convert one variable if it exists and is tagged with `from_units`
        if name not in ds.data_vars:
            return
        attrs = dict(ds[name].attrs)
        if attrs.get('units') != from_units:
            return
        converted = transform(ds[name])
        attrs['units'] = to_units
        converted.attrs = attrs
        ds[name] = converted

    _convert('tasmax', 'K', 'degC', lambda da: da - 273.15)
    _convert('tasmin', 'K', 'degC', lambda da: da - 273.15)
    _convert('pr', 'kg m-2 s-1', 'mm/day', lambda da: da * 86400)

    return ds

In [7]:
#########
# Dask
#########
import dask_gateway

# Connect to the Dask gateway
gateway = dask_gateway.Gateway()

# Resources requested for each worker
cluster_options = gateway.cluster_options()
cluster_options["worker_memory"] = 24
cluster_options["worker_cores"] = 1

# Start the cluster, attach a client and request 45 workers
cluster = gateway.new_cluster(cluster_options)
client = cluster.get_client()
cluster.scale(45)

# Link to the dashboard for monitoring progress
print(cluster.dashboard_link)

https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.090757d24ea24fae965d7db5a677b2e4/status


# GARD-SV

In [8]:
# Downscaling method handled in this section and the models available for it
method = 'GARD-SV'
models = list(gardsv_ssp_dict)

# The SSP and variable lookup tables must list the same models in the same order
assert models == list(gardsv_var_dict)

## Simple metrics

### Annual avgs

In [33]:
# Annual means for every model.
# RUNTIME IS AROUND 5 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Daily data for all scenarios with converted units, plus mean temperature
    ds = convert_units(grab_model(method, model, gardsv_ssp_dict[model]))
    ds['tas'] = (ds['tasmin'] + ds['tasmax']) / 2.

    # Annual averages, rechunked for storage
    ds_final = (
        ds.resample(time='1Y')
        .mean()
        .chunk({'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440})
    )

    # Every output variable is compressed with zstd
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {name: {'compressor': compressor} for name in ds_final.data_vars}

    # Destination inside the Azure container
    azure_prefix = f'carbonplan/{method}/avg/{model}'
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # Write, replacing any previous output for this model
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


BCC-CSM2-MR

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MIROC6

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MPI-ESM1-2-HR


### 1-day max

In [34]:
# Annual maxima of the daily values for every model.
# RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Daily data for all scenarios with converted units, plus mean temperature
    ds = convert_units(grab_model(method, model, gardsv_ssp_dict[model]))
    ds['tas'] = (ds['tasmin'] + ds['tasmax']) / 2.

    # Annual maxima, rechunked for storage
    ds_final = (
        ds.resample(time='1Y')
        .max()
        .chunk({'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440})
    )

    # Every output variable is compressed with zstd
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {name: {'compressor': compressor} for name in ds_final.data_vars}

    # Destination inside the Azure container
    azure_prefix = f'carbonplan/{method}/max/{model}'
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # Write, replacing any previous output for this model
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


BCC-CSM2-MR

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MIROC6

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MPI-ESM1-2-HR


### 5-day max

In [9]:
# xclim provides the RX5day indicator used in the 5-day max cell below
import xclim
# NOTE(review): presumably makes xclim log CF-compliance problems instead of
# warning or raising - confirm against the xclim documentation.
# The trailing ';' suppresses the cell's displayed return value.
xclim.set_options(cf_compliance="log");

In [10]:
# Debug inspection: shows which models the slice `models[2:]` selects
models[2:]

['MIROC6', 'MPI-ESM1-2-HR']

In [11]:
# Debug inspection: shows the model at index 2
models[2]

'MIROC6'

In [10]:
# 5-day maxima for every model.
# RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
# The loop covers all models (previously `models[2:]`, a leftover from resuming
# an interrupted run) so that a fresh run reproduces every output.
for model in models:
    # Grab model
    ds = grab_model(method, model, gardsv_ssp_dict[model])
    ds = convert_units(ds)
    ds['tas'] = (ds['tasmin'] + ds['tasmax']) / 2.
    # xclim needs a units attribute on precipitation
    ds['pr'].attrs['units'] = 'mm/day'

    # Annual maximum of 5-day precipitation
    ds_RX5day = xclim.indicators.icclim.RX5day(ds=ds[['pr']], freq='Y')

    # Annual maximum of the 5-day rolling mean temperature.
    # convert_units already subtracts 273.15 when the units attribute is 'K',
    # so no further K -> C shift is applied here; the former extra
    # `-= 273.15` shifted the temperatures twice.
    ds_temp5day = ds[['tas', 'tasmin', 'tasmax']].rolling(time=5).mean().resample(time='1Y').max()

    # Storage options
    ds_final = xr.merge([ds_temp5day, ds_RX5day])
    # Keep the first ensemble member and remove the leftover scalar coordinate
    ds_final = ds_final.isel(member_id=0).drop_vars('member_id')
    ds_final = ds_final.chunk({'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440})

    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}

    azure_prefix = 'carbonplan/' + method + '/max5d/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


Exception in callback None()
handle: <Handle cancelled>
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 1391, in _do_ssl_handshake
    self.socket.do_handshake()
  File "/srv/conda/envs/notebook/lib/python3.10/ssl.py", line 1342, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:997)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 189, in _handle_events
    handler_func(fileobj, events)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 696, in _handle_events
    self._handle_read()
  File "/srv/conda/envs/notebook/lib/python3.10

BCC-CSM2-MR

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


Exception in callback None()
handle: <Handle cancelled>
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 1391, in _do_ssl_handshake
    self.socket.do_handshake()
  File "/srv/conda/envs/notebook/lib/python3.10/ssl.py", line 1342, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:997)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 189, in _handle_events
    handler_func(fileobj, events)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 696, in _handle_events
    self._handle_read()
  File "/srv/conda/envs/notebook/lib/python3.10

CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


Exception in callback None()
handle: <Handle cancelled>
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 1391, in _do_ssl_handshake
    self.socket.do_handshake()
  File "/srv/conda/envs/notebook/lib/python3.10/ssl.py", line 1342, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:997)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 189, in _handle_events
    handler_func(fileobj, events)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/tornado/iostream.py", line 696, in _handle_events
    self._handle_read()
  File "/srv/conda/envs/notebook/lib/python3.10

CancelledError: ('store-map-a9ba9cf6fc8fd729e12ce8ac82ed1ec5', 0, 2, 0, 0)

### Dry days

In [12]:
# Function for longest consecutive spell
def n_longest_consecutive(ds, dim='time'):
    """Return the longest run of consecutive non-zero entries along ``dim``.

    ``ds`` is expected to hold 0/1 (or boolean) flags. The running total of
    the flags, minus the running total recorded at the most recent zero,
    gives the length of the current run at each position; the maximum of that
    quantity along ``dim`` is returned.

    ``dim`` defaults to ``'time'`` so existing single-argument calls behave as
    before; the parameter matches the first definition of this function
    earlier in the notebook.
    """
    running_total = ds.cumsum(dim=dim)
    # Running total frozen at the latest zero entry (0 before the first zero)
    total_at_last_zero = running_total.where(ds == 0).ffill(dim=dim).fillna(0)
    return (running_total - total_at_last_zero).max(dim=dim)

In [17]:
# Dry-day counts and streaks for every model.
# RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model, gardsv_ssp_dict[model])
    ds = convert_units(ds)

    # grab_model tags NaN-filled placeholder precipitation with units 'NaN';
    # .get() avoids a KeyError when the units attribute is absent
    if ds['pr'].attrs.get('units') == 'NaN':
        print(model + ' contains no precip')
        continue

    # Select only precip (drop_vars replaces the deprecated Dataset.drop)
    ds = ds.drop_vars(['tasmin', 'tasmax'])

    # Two definitions of a dry day: exactly 0mm, and less than 1mm
    dry_day_flags = {'eq_0': ds == 0., 'lt_1': ds < 1.}

    # For each definition: annual number of dry days, then the longest
    # consecutive dry-day streak within each year
    metrics = []
    for label, is_dry in dry_day_flags.items():
        count = is_dry.resample(time='1Y').sum()
        streak = is_dry.resample(time='1Y').map(n_longest_consecutive)
        metrics.append(count.rename({'pr': 'count_' + label}))
        metrics.append(streak.rename({'pr': 'streak_' + label}))

    # Merge
    ds_final = xr.merge(metrics)

    # storage options
    ds_final = ds_final.chunk({'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440})

    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}

    azure_prefix = 'carbonplan/' + method + '/dry/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


BCC-CSM2-MR contains no precip

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MIROC6 contains no precip

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MPI-ESM1-2-HR


## Less simple metrics

### Wet days

In [18]:
%%time
    
# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_precip_quantiles_gardsv.nc')
ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_precip_quantiles_gardsv.nc')
    
# loop through models: RUNTIME IS AROUND 20 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model, gardsv_ssp_dict[model])
    ds = convert_units(ds)
    
    if ds['pr'].attrs['units'] == 'NaN':
        print(model + ' contains no precip')
        continue
        
    # Select only precip
    ds = ds.drop(['tasmin', 'tasmax'])
    
    ## Calculate metrics
    var_id = 'pr'
    ds_tmp_out = []
    for rp in ['q99', 'rp10']:
        # Get above/below binary
        ds_tmp_q_era5 = ds[var_id] > ds_q_era5[var_id + '_' + rp]
        ds_tmp_q_gmfd = ds[var_id] > ds_q_gmfd[var_id + '_' + rp]
        
        # Count of hot days
        ds_tmp_q_era5_count = ds_tmp_q_era5.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_count': ds_tmp_q_era5_count}))
        ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_count': ds_tmp_q_gmfd_count}))
        
        # Longest consecutive hot day streak
        ds_tmp_q_era5_streak = ds_tmp_q_era5.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_streak': ds_tmp_q_era5_streak}))
        ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_streak': ds_tmp_q_gmfd_streak}))
        
    # Merge metrics and append
    ds_final = xr.merge(ds_tmp_out)
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':20, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'carbonplan/' + method + '/wet/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


BCC-CSM2-MR contains no precip

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MIROC6 contains no precip

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MPI-ESM1-2-HR
CPU times: user 4min 24s, sys: 2.49 s, total: 4min 27s
Wall time: 29min 1s


### Hot days

In [20]:
%%time
    
# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_temperature_quantiles_gardsv.nc')
ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_temperature_quantiles_gardsv.nc')
    
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model, gardsv_ssp_dict[model])
    ds = convert_units(ds)
    ds['tas'] = (ds['tasmax'] + ds['tasmin']) / 2.
    
    # Drop precip
    ds = ds.drop(['pr'])
    
    ## Calculate metrics
    ds_tmp_final = []
    for var_id in ['tasmin','tasmax','tas']:
        ds_tmp_out = []
        for rp in ['q99', 'rp10']:
            # Get above/below binary
            ds_tmp_q_era5 = ds[var_id] > ds_q_era5[var_id + '_' + rp]
            ds_tmp_q_gmfd = ds[var_id] > ds_q_gmfd[var_id + '_' + rp]
        
            # Count of hot days
            ds_tmp_q_era5_count = ds_tmp_q_era5.resample(time='1Y').sum()
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_count': ds_tmp_q_era5_count}))
            ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_count': ds_tmp_q_gmfd_count}))
        
            # Longest consecutive hot day streak
            ds_tmp_q_era5_streak = ds_tmp_q_era5.resample(time='1Y').apply(n_longest_consecutive)
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_streak': ds_tmp_q_era5_streak}))
            ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_streak': ds_tmp_q_gmfd_streak}))
        
        # Merge RPs and append
        ds_out = xr.merge(ds_tmp_out)
        ds_tmp_final.append(ds_out)
    
    # Merge variables
    ds_final = xr.merge(ds_tmp_final)
    
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':20, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'carbonplan/' + method + '/hot/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


BCC-CSM2-MR

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MIROC6

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MPI-ESM1-2-HR
CPU times: user 42min 53s, sys: 12.1 s, total: 43min 6s
Wall time: 3h 14min 15s


## Multivariate metrics

### Hot and dry days

In [15]:
%%time
    
# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_temperature_quantiles_gardsv.nc')
ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_temperature_quantiles_gardsv.nc')
    
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model, gardsv_ssp_dict[model])
    ds = convert_units(ds)
    
    if ds['pr'].attrs['units'] == 'NaN':
        print(model + ' contains no precip')
        continue
    
    # Drop tasmin
    ds = ds.drop(['tasmin'])
    
    ## Calculate metrics
    ds_tmp_out = []
    for rp in ['q99', 'rp10']:
        # Get above/below binary
        ds_tmp_q_gmfd = (ds['tasmax'] > ds_q_gmfd['tasmax_' + rp]) & (ds['pr'] < 1.)
        ds_tmp_q_era5 = (ds['tasmax'] > ds_q_era5['tasmax_' + rp]) & (ds['pr'] < 1.)
        
        # Count of hot+dry days
        ds_tmp_q_era5_count = ds_tmp_q_era5.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'era5_count': ds_tmp_q_era5_count}))
        ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'gmfd_count': ds_tmp_q_gmfd_count}))
        
        # Longest consecutive hot+dry day streak
        ds_tmp_q_era5_streak = ds_tmp_q_era5.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'era5_streak': ds_tmp_q_era5_streak}))
        ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'gmfd_streak': ds_tmp_q_gmfd_streak}))
        
    # Merge metrics and append
    ds_final = xr.merge(ds_tmp_out)
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':10, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'carbonplan/' + method + '/hotdry/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


BCC-CSM2-MR contains no precip

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MIROC6 contains no precip

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MPI-ESM1-2-HR
CPU times: user 3min 53s, sys: 4.83 s, total: 3min 58s
Wall time: 1h 2min 55s


## Spatially compounding metrics (historical quantiles required)

### Hot days

In [92]:
%%time

area_frac = 0.5
var_id = 'tasmax'

# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_temperature_quantiles_gardsv.nc')
ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_temperature_quantiles_gardsv.nc')
    
# loop through models: RUNTIME IS AROUND 20 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model, gardsv_ssp_dict[model])
    ds = convert_units(ds)
    # ds['tas'] = (ds['tasmax'] + ds['tasmin']) / 2.
        
    # Drop precip and tasmin
    ds = ds.drop(['pr', 'tasmin'])
    
    # Calculate metrics
    gmfd_tmp_out = []
    era5_tmp_out = []
    for rp in ['q99', 'rp10']:
        ## GMFD
        # Above/below binary
        ds_tmp_q_gmfd = (ds[var_id] > ds_q_gmfd[var_id + '_' + rp]).persist()
        # Mask
        mask = regionmask.defined_regions.ar6.land.mask(ds_tmp_q_gmfd)
        # Loop through regions
        for region in regionmask.defined_regions.ar6.land.abbrevs:
            region_index = regionmask.defined_regions.ar6.land.map_keys(region)
            
            ds_tmp_q_masked = ds_tmp_q_gmfd.where(mask == region_index, drop=True)
            ds_tmp_region_q_masked = ds_tmp_q_masked.mean(dim=['lat','lon'], skipna=True)
            
            # Count
            out_name = var_id + '_' + rp + 'gmfd_count'
            tmp_q_count = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').sum()
            
            tmp_q_count = pd.wide_to_long(tmp_q_count.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_count = tmp_q_count.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_count['year'] = tmp_q_count['year'].dt.year
            tmp_q_count['ssp'] = tmp_q_count['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Streak
            out_name = var_id + '_' + rp + 'gmfd_streak'
            tmp_q_streak = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').apply(n_longest_consecutive)
            
            tmp_q_streak = pd.wide_to_long(tmp_q_streak.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_streak = tmp_q_streak.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_streak['year'] = tmp_q_streak['year'].dt.year
            tmp_q_streak['ssp'] = tmp_q_streak['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Merge
            tmp_q_out = pd.merge(tmp_q_count, tmp_q_streak, on=['year', 'ssp'])
            tmp_q_out['region'] = region
            gmfd_tmp_out.append(tmp_q_out)
            
        ## ERA5
        # Above/below binary
        ds_tmp_q_era5 = (ds[var_id] > ds_q_era5[var_id + '_' + rp]).persist()
        # Mask
        mask = regionmask.defined_regions.ar6.land.mask(ds_tmp_q_era5)
        # Loop through regions
        for region in regionmask.defined_regions.ar6.land.abbrevs:
            region_index = regionmask.defined_regions.ar6.land.map_keys(region)
            
            ds_tmp_q_masked = ds_tmp_q_era5.where(mask == region_index, drop=True)
            ds_tmp_region_q_masked = ds_tmp_q_masked.mean(dim=['lat','lon'], skipna=True)
            
            # Count
            out_name = var_id + '_' + rp + 'era5_count'
            tmp_q_count = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').sum()
            
            tmp_q_count = pd.wide_to_long(tmp_q_count.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_count = tmp_q_count.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_count['year'] = tmp_q_count['year'].dt.year
            tmp_q_count['ssp'] = tmp_q_count['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Streak
            out_name = var_id + '_' + rp + 'era5_streak'
            tmp_q_streak = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').apply(n_longest_consecutive)
            
            tmp_q_streak = pd.wide_to_long(tmp_q_streak.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_streak = tmp_q_streak.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_streak['year'] = tmp_q_streak['year'].dt.year
            tmp_q_streak['ssp'] = tmp_q_streak['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Merge
            tmp_q_out = pd.merge(tmp_q_count, tmp_q_streak, on=['year', 'ssp'])
            tmp_q_out['region'] = region
            era5_tmp_out.append(tmp_q_out)
    
    # Merge and remove NaNs
    df_out = pd.merge(pd.concat(gmfd_tmp_out), pd.concat(era5_tmp_out), on=['region', 'ssp', 'year'])
    df_out = df_out.groupby(['region', 'ssp', 'year']).max()
    
    # Upload to azure
    with io.BytesIO() as buffer:
        df_out.to_csv(buffer)
        buffer.seek(0)
        blob_client = container_client.get_blob_client('carbonplan/' + method + '/hot_spatial/' + model + '_' + var_id + '.csv')
        blob_client.upload_blob(buffer, overwrite=True)
    
    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


BCC-CSM2-MR

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MIROC6

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MPI-ESM1-2-HR
CPU times: user 44min 10s, sys: 58.9 s, total: 45min 9s
Wall time: 2h 24min 53s


# DeepSD-BC

In [11]:
# Downscaling method handled in this section and the models available for it
method = 'DeepSD-BC'
models = list(deepsdbc_dict)

## Simple metrics

### Annual avgs

In [9]:
# Annual means for every model.
# RUNTIME IS AROUND 20 MINS PER MODEL WITH 40 DASK WORKERS
# The loop covers all models (previously `models[1:]`), matching the other
# DeepSD-BC cells, so that a fresh run reproduces every output.
for model in models:
    # Grab model for every SSP listed for it
    ds = grab_model(method, model, list(deepsdbc_dict[model]))

    ds = convert_units(ds)
    ds['tas'] = (ds['tasmin'] + ds['tasmax']) / 2.

    # Annual averages
    ds_final = ds.resample(time='1Y').mean()

    # storage options
    ds_final = ds_final.chunk({'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440})

    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}

    azure_prefix = 'carbonplan/' + method + '/avg/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MRI-ESM2-0


### 1-day max

In [10]:
# Annual maxima of the daily values for every model.
# RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Daily data for every SSP listed for the model, with converted units
    ds = convert_units(grab_model(method, model, list(deepsdbc_dict[model])))
    ds['tas'] = (ds['tasmin'] + ds['tasmax']) / 2.

    # Annual maxima, rechunked for storage
    ds_final = (
        ds.resample(time='1Y')
        .max()
        .chunk({'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440})
    )

    # Every output variable is compressed with zstd
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {name: {'compressor': compressor} for name in ds_final.data_vars}

    # Destination inside the Azure container
    azure_prefix = f'carbonplan/{method}/max/{model}'
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # Write, replacing any previous output for this model
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MRI-ESM2-0


### 5-day max

In [21]:
# xclim supplies the RX5day indicator used by the 5-day max loop below
import xclim
# Set xclim's cf_compliance option to "log" for the indicator calls that follow
xclim.set_options(cf_compliance="log");

In [None]:
# Annual maximum of 5-day statistics (RX5day for precip, 5-day rolling mean for
# temperature), written to one zarr store per model.
# RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Fetch every SSP listed for this model, then harmonise units
    ds = convert_units(grab_model(method, model, list(deepsdbc_dict[model])))
    ds = ds.assign(tas=(ds['tasmin'] + ds['tasmax']) / 2.)
    # Units attribute is set explicitly before handing precip to xclim
    ds['pr'].attrs['units'] = 'mm/day'

    # Precip: xclim RX5day indicator at yearly frequency
    ds_RX5day = xclim.indicators.icclim.RX5day(ds=ds[['pr']], freq='Y')

    # Temperature: 5-day rolling mean, then the yearly maximum of that mean
    temps_5day_mean = ds[['tas','tasmin','tasmax']].rolling(time=5).mean()
    ds_temp5day = temps_5day_mean.resample(time='1Y').max()
    # NOTE(review): assumes convert_units leaves temperatures in K; if it already
    # returns degrees C this offset is applied twice -- confirm.
    ds_temp5day -= 273.15 # K -> C

    # Combine both results and rechunk for storage
    ds_final = xr.merge([ds_temp5day, ds_RX5day]).chunk(
        {'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440}
    )

    # Same zstd compressor for every output variable
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = dict((vname, {'compressor': compressor}) for vname in ds_final.data_vars)

    # Destination inside the Azure container
    azure_prefix = '/'.join(['carbonplan', method, 'max5d', model])
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # mode='w' overwrites whatever is already stored under this prefix
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


### Dry days

In [17]:
# Function for longest consecutive spell if needed
def n_longest_consecutive(ds):
    """Return the length of the longest unbroken run of truthy values along ``time``.

    Parameters
    ----------
    ds : xarray object with a ``time`` dimension
        Boolean (or 0/1) indicator; in this notebook it is the per-year group
        handed over by ``resample(time='1Y').apply(...)``.

    Returns
    -------
    The input reduced over ``time``: for each remaining position, the maximum
    number of consecutive non-zero entries.
    """
    # Running count of truthy entries; computed once and reused below
    running_total = ds.cumsum(dim='time')
    # Running count as it stood at the most recent zero entry (0 before the first
    # zero), so the difference restarts from 0 after every break in the streak
    total_at_last_break = running_total.where(ds == 0).ffill(dim='time').fillna(0)
    return (running_total - total_at_last_break).max(dim='time')

In [18]:
# Dry-day counts and longest dry streaks per year, one zarr store per model.
# RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Only request the SSPs for which this model lists precipitation
    ssps_with_pr = [ssp for ssp, variables in deepsdbc_dict[model].items() if 'pr' in variables]
    ds = convert_units(grab_model(method, model, ssps_with_pr))

    # Keep precipitation only
    ds = ds.drop(['tasmin', 'tasmax'])

    # Daily dry-day indicators for the two thresholds
    is_zero = (ds == 0.)   # exactly 0mm
    is_lt_1 = (ds < 1.)    # less than 1mm

    # Yearly number of dry days
    ds_tmp_0 = is_zero.resample(time='1Y').sum()
    ds_tmp_1 = is_lt_1.resample(time='1Y').sum()
    # Yearly longest consecutive dry-day streak
    ds_tmp_0c = is_zero.resample(time='1Y').apply(n_longest_consecutive)
    ds_tmp_1c = is_lt_1.resample(time='1Y').apply(n_longest_consecutive)

    # Give each metric its own variable name and combine, then rechunk for storage
    ds_final = xr.merge([
        ds_tmp_0.rename({'pr':'count_eq_0'}),
        ds_tmp_0c.rename({'pr':'streak_eq_0'}),
        ds_tmp_1.rename({'pr':'count_lt_1'}),
        ds_tmp_1c.rename({'pr':'streak_lt_1'}),
    ]).chunk({'ssp': 1, 'time': 30, 'lat': 720, 'lon': 1440})

    # Same zstd compressor for every output variable
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = dict((vname, {'compressor': compressor}) for vname in ds_final.data_vars)

    # Destination inside the Azure container
    azure_prefix = '/'.join(['carbonplan', method, 'dry', model])
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # mode='w' overwrites whatever is already stored under this prefix
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')

    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MRI-ESM2-0


## Less simple metrics

### Wet days

In [23]:
%%time
    
# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_precip_quantiles_nex-cil-deepsd.nc')
ds_q_era5['lon'] = np.where(ds_q_era5['lon'] > 180, ds_q_era5['lon'] - 360, ds_q_era5['lon'])
ds_q_era5 = ds_q_era5.sortby('lon')

ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_precip_quantiles_nex-cil-deepsd.nc')
ds_q_gmfd['lon'] = np.where(ds_q_gmfd['lon'] > 180, ds_q_gmfd['lon'] - 360, ds_q_gmfd['lon'])
ds_q_gmfd = ds_q_gmfd.sortby('lon')
    
# Loop through models: RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model,
                    [ssp for ssp in list(deepsdbc_dict[model].keys()) if 'pr' in deepsdbc_dict[model][ssp]])
    ds = convert_units(ds)
    
    if ds['pr'].attrs['units'] == 'NaN':
        print(model + ' contains no precip')
        continue
    
    # Select only precip
    ds = ds.drop(['tasmin', 'tasmax'])
    
    ## Calculate metrics
    var_id = 'pr'
    ds_tmp_out = []
    for rp in ['q99', 'rp10']:
        # Get above/below binary
        ds_tmp_q_era5 = ds[var_id] > ds_q_era5[var_id + '_' + rp]
        ds_tmp_q_gmfd = ds[var_id] > ds_q_gmfd[var_id + '_' + rp]
        
        # Count of hot days
        ds_tmp_q_era5_count = ds_tmp_q_era5.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_count': ds_tmp_q_era5_count}))
        ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_count': ds_tmp_q_gmfd_count}))
        
        # Longest consecutive hot day streak
        ds_tmp_q_era5_streak = ds_tmp_q_era5.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_streak': ds_tmp_q_era5_streak}))
        ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_streak': ds_tmp_q_gmfd_streak}))
        
    # Merge metrics and append
    ds_final = xr.merge(ds_tmp_out)
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':20, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'carbonplan/' + method + '/wet/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MRI-ESM2-0
CPU times: user 1min 53s, sys: 921 ms, total: 1min 54s
Wall time: 19min 51s


### Hot days

In [None]:
%%time
    
# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_temperature_quantiles_nex-cil-deepsd.nc')
ds_q_era5['lon'] = np.where(ds_q_era5['lon'] > 180, ds_q_era5['lon'] - 360, ds_q_era5['lon'])
ds_q_era5 = ds_q_era5.sortby('lon')

ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_temperature_quantiles_nex-cil-deepsd.nc')
ds_q_gmfd['lon'] = np.where(ds_q_gmfd['lon'] > 180, ds_q_gmfd['lon'] - 360, ds_q_gmfd['lon'])
ds_q_gmfd = ds_q_gmfd.sortby('lon')
    
# Loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model, list(deepsdbc_dict[model].keys()))
    ds = convert_units(ds)
    ds['tas'] = (ds['tasmax'] + ds['tasmin']) / 2.
    
    # Drop precip
    ds = ds.drop(['pr'])
    
    ## Calculate metrics
    ds_tmp_final = []
    for var_id in ['tasmin', 'tasmax', 'tas']:
        ds_tmp_out = []
        for rp in ['q99', 'rp10']:
            # Get above/below binary
            ds_tmp_q_era5 = ds[var_id] > ds_q_era5[var_id + '_' + rp]
            ds_tmp_q_gmfd = ds[var_id] > ds_q_gmfd[var_id + '_' + rp]
        
            # Count of hot days
            ds_tmp_q_era5_count = ds_tmp_q_era5.resample(time='1Y').sum()
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_count': ds_tmp_q_era5_count}))
            ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_count': ds_tmp_q_gmfd_count}))
        
            # Longest consecutive hot day streak
            ds_tmp_q_era5_streak = ds_tmp_q_era5.resample(time='1Y').apply(n_longest_consecutive)
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'era5_streak': ds_tmp_q_era5_streak}))
            ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_streak': ds_tmp_q_gmfd_streak}))
            
        # Merge RPs and append
        ds_out = xr.merge(ds_tmp_out)
        ds_tmp_final.append(ds_out)
    
    # Merge variables
    ds_final = xr.merge(ds_tmp_final)
    
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':20, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'carbonplan/' + method + '/hot/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)

## Multivariate metrics

### Hot and dry days

In [18]:
%%time
    
# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_temperature_quantiles_nex-cil-deepsd.nc')
ds_q_era5['lon'] = np.where(ds_q_era5['lon'] > 180, ds_q_era5['lon'] - 360, ds_q_era5['lon'])
ds_q_era5 = ds_q_era5.sortby('lon')

ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_temperature_quantiles_nex-cil-deepsd.nc')
ds_q_gmfd['lon'] = np.where(ds_q_gmfd['lon'] > 180, ds_q_gmfd['lon'] - 360, ds_q_gmfd['lon'])
ds_q_gmfd = ds_q_gmfd.sortby('lon')
    
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model,
                    [ssp for ssp in list(deepsdbc_dict[model].keys()) if 'pr' in deepsdbc_dict[model][ssp]])
    ds = convert_units(ds)
    
    # Drop tasmin
    ds = ds.drop(['tasmin'])
    
    ## Calculate metrics
    ds_tmp_out = []
    for rp in ['q99', 'rp10']:
        # Get above/below binary
        ds_tmp_q_gmfd = (ds['tasmax'] > ds_q_gmfd['tasmax_' + rp]) & (ds['pr'] < 1.)
        ds_tmp_q_era5 = (ds['tasmax'] > ds_q_era5['tasmax_' + rp]) & (ds['pr'] < 1.)
        
        # Count of hot+dry days
        ds_tmp_q_era5_count = ds_tmp_q_era5.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'era5_count': ds_tmp_q_era5_count}))
        ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'gmfd_count': ds_tmp_q_gmfd_count}))
        
        # Longest consecutive hot+dry day streak
        ds_tmp_q_era5_streak = ds_tmp_q_era5.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'era5_streak': ds_tmp_q_era5_streak}))
        ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(xr.Dataset({'hotdry_' + rp + 'gmfd_streak': ds_tmp_q_gmfd_streak}))
        
    # Merge metrics and append
    ds_final = xr.merge(ds_tmp_out)
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':20, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'carbonplan/' + method + '/hotdry/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MRI-ESM2-0
CPU times: user 2min 15s, sys: 2.12 s, total: 2min 17s
Wall time: 43min 18s


## Spatially compounding metrics (historical quantiles required)

### Hot days

In [94]:
%%time

area_frac = 0.5
var_id = 'tasmax'

# Load quantiles
ds_q_era5 = xr.open_dataset('../data/quantiles/era5_temperature_quantiles_nex-cil-deepsd.nc')
ds_q_era5['lon'] = np.where(ds_q_era5['lon'] > 180, ds_q_era5['lon'] - 360, ds_q_era5['lon'])
ds_q_era5 = ds_q_era5.sortby('lon')

ds_q_gmfd = xr.open_dataset('../data/quantiles/gmfd_temperature_quantiles_nex-cil-deepsd.nc')
ds_q_gmfd['lon'] = np.where(ds_q_gmfd['lon'] > 180, ds_q_gmfd['lon'] - 360, ds_q_gmfd['lon'])
ds_q_gmfd = ds_q_gmfd.sortby('lon')
    
# loop through models: RUNTIME IS AROUND 20 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Grab model
    ds = grab_model(method, model, list(deepsdbc_dict[model].keys()))
    ds = convert_units(ds)
    # ds['tas'] = (ds['tasmax'] + ds['tasmin']) / 2.
        
    # Drop precip and tasmin
    ds = ds.drop(['pr', 'tasmin'])
    
    # Calculate metrics
    gmfd_tmp_out = []
    era5_tmp_out = []
    for rp in ['q99', 'rp10']:
        ## GMFD
        # Above/below binary
        ds_tmp_q_gmfd = (ds[var_id] > ds_q_gmfd[var_id + '_' + rp]).persist()
        # Mask
        mask = regionmask.defined_regions.ar6.land.mask(ds_tmp_q_gmfd)
        # Loop through regions
        for region in regionmask.defined_regions.ar6.land.abbrevs:
            region_index = regionmask.defined_regions.ar6.land.map_keys(region)
            
            ds_tmp_q_masked = ds_tmp_q_gmfd.where(mask == region_index, drop=True)
            ds_tmp_region_q_masked = ds_tmp_q_masked.mean(dim=['lat','lon'], skipna=True)
            
            # Count
            out_name = var_id + '_' + rp + 'gmfd_count'
            tmp_q_count = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').sum()
            
            tmp_q_count = pd.wide_to_long(tmp_q_count.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_count = tmp_q_count.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_count['year'] = tmp_q_count['year'].dt.year
            tmp_q_count['ssp'] = tmp_q_count['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Streak
            out_name = var_id + '_' + rp + 'gmfd_streak'
            tmp_q_streak = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').apply(n_longest_consecutive)
            
            tmp_q_streak = pd.wide_to_long(tmp_q_streak.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_streak = tmp_q_streak.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_streak['year'] = tmp_q_streak['year'].dt.year
            tmp_q_streak['ssp'] = tmp_q_streak['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Merge
            tmp_q_out = pd.merge(tmp_q_count, tmp_q_streak, on=['year', 'ssp'])
            tmp_q_out['region'] = region
            gmfd_tmp_out.append(tmp_q_out)
            
        ## ERA5
        # Above/below binary
        ds_tmp_q_era5 = (ds[var_id] > ds_q_era5[var_id + '_' + rp]).persist()
        # Mask
        mask = regionmask.defined_regions.ar6.land.mask(ds_tmp_q_era5)
        # Loop through regions
        for region in regionmask.defined_regions.ar6.land.abbrevs:
            region_index = regionmask.defined_regions.ar6.land.map_keys(region)
            
            ds_tmp_q_masked = ds_tmp_q_era5.where(mask == region_index, drop=True)
            ds_tmp_region_q_masked = ds_tmp_q_masked.mean(dim=['lat','lon'], skipna=True)
            
            # Count
            out_name = var_id + '_' + rp + 'era5_count'
            tmp_q_count = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').sum()
            
            tmp_q_count = pd.wide_to_long(tmp_q_count.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_count = tmp_q_count.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_count['year'] = tmp_q_count['year'].dt.year
            tmp_q_count['ssp'] = tmp_q_count['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Streak
            out_name = var_id + '_' + rp + 'era5_streak'
            tmp_q_streak = (ds_tmp_region_q_masked > area_frac).resample(time='1Y').apply(n_longest_consecutive)
            
            tmp_q_streak = pd.wide_to_long(tmp_q_streak.isel(member_id=0).to_pandas().T.reset_index(),
                                          stubnames='ssp', i='time', j=out_name).reset_index()
            tmp_q_streak = tmp_q_streak.rename(columns={out_name:'ssp', 'ssp':out_name, 'time':'year'})
            tmp_q_streak['year'] = tmp_q_streak['year'].dt.year
            tmp_q_streak['ssp'] = tmp_q_streak['ssp'].apply(lambda x: 'ssp' + str(x))
            
            # Merge
            tmp_q_out = pd.merge(tmp_q_count, tmp_q_streak, on=['year', 'ssp'])
            tmp_q_out['region'] = region
            era5_tmp_out.append(tmp_q_out)
      
    # Merge and remove NaNs
    df_out = pd.merge(pd.concat(gmfd_tmp_out), pd.concat(era5_tmp_out), on=['region', 'ssp', 'year'])
    df_out = df_out.groupby(['region', 'ssp', 'year']).max()
    
    # Upload to azure
    with io.BytesIO() as buffer:
        df_out.to_csv(buffer)
        buffer.seek(0)
        blob_client = container_client.get_blob_client('carbonplan/' + method + '/hot_spatial/' + model + '_' + var_id + '.csv')
        blob_client.upload_blob(buffer, overwrite=True)
    
    print(model)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


CanESM5

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


MRI-ESM2-0
CPU times: user 21min 16s, sys: 28.1 s, total: 21min 44s
Wall time: 1h 5min 54s
