In [1]:
###############################################
### TO RUN ON MICROSOFT PLANETARY COMPUTER ####
###############################################

In [1]:
import planetary_computer
import pystac_client
import pystac

import numpy as np
import xarray as xr
import pandas as pd

import collections
import fsspec
import requests

import getpass
import azure.storage.blob
import zarr

### Preliminaries

In [2]:
######################
# Azure blob storage
######################
# connection string (from azure web login, select your storage account, then "Access keys")
connection_string = getpass.getpass()

    
# format storage
container_client = azure.storage.blob.ContainerClient.from_connection_string(
    connection_string, container_name="mpctransfer")

 ········


In [3]:
###################
# Models
###################
from utils import cil_ssp_dict

models = list(cil_ssp_dict.keys())

In [4]:
#################
# Data access
#################

# Complete catalog
catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

# function to grab variables and SSPs for singe model
def grab_model(model_id, vars_to_grab):
    # Search across all licences in CIL-GDPCIR
    search = catalog.search(
        collections=["cil-gdpcir-cc0", "cil-gdpcir-cc-by", "cil-gdpcir-cc-by-sa"],
        query={"cmip6:source_id" : {"eq": model_id},
               "cmip6:experiment_id": {"neq": "historical"}} # omit historical
    )
    ensemble = search.get_all_items()
    
    # grab all into one dataset
    ds_ssp = []

    for item in ensemble:
        signed = planetary_computer.sign(item)
        ds_vars = []
        for variable_id in vars_to_grab:
            asset = signed.assets[variable_id]
            ds_tmp = xr.open_dataset(asset.href, **asset.extra_fields["xarray:open_kwargs"])
            ds_tmp = ds_tmp.assign_coords(ssp = ds_tmp.attrs['experiment_id'])
            ds_vars.append(ds_tmp)
        ds_ssp.append(xr.merge(ds_vars))

    ds_out = xr.concat(ds_ssp, dim='ssp')
    
    return ds_out

In [5]:
#########
# Dask
#########
import dask_gateway
gateway = dask_gateway.Gateway()

# cluster options
cluster_options = gateway.cluster_options()
cluster_options["worker_memory"] = 16
cluster_options["worker_cores"] = 1

# start cluster
cluster = gateway.new_cluster(cluster_options)
client = cluster.get_client()
cluster.scale(50)

# dashboard link
print(cluster.dashboard_link)

https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.9137e99390d54e7eb85502a9fd7c7196/status


# Simple metrics

## Annual averages

In [None]:
%%time
# loop through models: RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # load data (lazy)
    ds = grab_model(model, ['tasmin', 'tasmax', 'pr'])
    
    # compute
    ds['tas'] = (ds['tasmax'] + ds['tasmin']) / 2.
    ds_final = ds.resample(time='1Y').mean()
    
    # unit conversions
    ds_final['tas'] = ds_final['tas'] - 273.15 # K -> C
    ds_final['tasmax'] = ds_final['tasmax'] - 273.15 # K -> C
    ds_final['tasmin'] = ds_final['tasmin'] - 273.15 # K -> C
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':10, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'cil-gdpcir/avg/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)

## Annual maxima

In [11]:
%%time
# loop through models: RUNTIME IS AROUND 15 MINS PER MODEL WITH 40 DASK WORKERS
for model in models[9:]:
    # load data (lazy)
    ds = grab_model(model, ['tasmin', 'tasmax', 'pr'])
    
    # compute
    ds['tas'] = (ds['tasmax'] + ds['tasmin']) / 2.
    ds_final = ds.resample(time='1Y').max()
    
    # unit conversions
    ds_final['tas'] = ds_final['tas'] - 273.15 # K -> C
    ds_final['tasmax'] = ds_final['tasmax'] - 273.15 # K -> C
    ds_final['tasmin'] = ds_final['tasmin'] - 273.15 # K -> C
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':10, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'cil-gdpcir/max/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)

INM-CM5-0
MIROC-ES2L
MIROC6
MPI-ESM1-2-LR
NESM3
NorESM2-LM
NorESM2-MM
UKESM1-0-LL
CPU times: user 3min 33s, sys: 6.36 s, total: 3min 39s
Wall time: 1h 46min 43s


## Dry days

In [9]:
# Function for longest consecutive spell if needed
def n_longest_consecutive(ds, dim='time'):
    ds = ds.cumsum(dim=dim) - ds.cumsum(dim=dim).where(ds == 0).ffill(dim=dim).fillna(0)
    return ds.max(dim=dim)

In [10]:
%%time
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # load data (lazy)
    ds = grab_model(model, ['pr'])
    
    # Compute
    # Number of dry days
    ds_tmp_0 = (ds == 0.).resample(time='1Y').sum() # 0mm
    ds_tmp_1 = (ds < 1.).resample(time='1Y').sum() # less than 1mm
    # Longest sonsecutive dry day streak
    ds_tmp_0c = (ds == 0.).resample(time='1Y').apply(n_longest_consecutive) # 0mm longest consecutive
    ds_tmp_1c = (ds < 1.).resample(time='1Y').apply(n_longest_consecutive) # less than 1mm longest consecutive
    # Merge
    ds_final = xr.merge([ds_tmp_0.rename({'pr':'count_eq_0'}),
                       ds_tmp_0c.rename({'pr':'streak_eq_0'}),
                       ds_tmp_1.rename({'pr':'count_lt_1'}),
                       ds_tmp_1c.rename({'pr':'streak_lt_1'})])
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':10, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'cil-gdpcir/dry/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)

ACCESS-ESM1-5
BCC-CSM2-MR
CanESM5
CMCC-ESM2




EC-Earth3
EC-Earth3-Veg-LR
GFDL-ESM4
HadGEM3-GC31-LL
INM-CM4-8
INM-CM5-0
MIROC-ES2L
MIROC6
MPI-ESM1-2-LR
NESM3
NorESM2-LM
NorESM2-MM
UKESM1-0-LL
CPU times: user 12min 50s, sys: 13.7 s, total: 13min 3s
Wall time: 3h 6min 54s


# Less simple metrics

In [6]:
# Function for longest consecutive spell if needed
def n_longest_consecutive(ds, dim='time'):
    ds = ds.cumsum(dim=dim) - ds.cumsum(dim=dim).where(ds == 0).ffill(dim=dim).fillna(0)
    return ds.max(dim=dim)

## Wet days

In [27]:
%%time
    
# Load quantiles
ds_q_era5 = xr.open_dataset('../data/era5_precip_quantiles_nex-cil-deepsd.nc')
ds_q_era5['lon'] = np.where(ds_q_era5['lon'] > 180, ds_q_era5['lon'] - 360, ds_q_era5['lon'])
ds_q_era5 = ds_q_era5.sortby('lon')

ds_q_gmfd = xr.open_dataset('../data/gmfd_precip_quantiles_nex-cil-deepsd.nc')
ds_q_gmfd['lon'] = np.where(ds_q_gmfd['lon'] > 180, ds_q_gmfd['lon'] - 360, ds_q_gmfd['lon'])
ds_q_gmfd = ds_q_gmfd.sortby('lon')
    
# Loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 40 DASK WORKERS
for model in models:
    # Load data (lazy)
    ds = grab_model(model, ['pr'])
    
    ## Calculate metrics
    var_id = 'pr'
    ds_tmp_out = []
    for rp in ['rp5', 'rp10', 'rp20']:
        # Get above/below binary
        ds_tmp_q_era5 = ds > ds_q_era5[var_id + '_' + rp]
        ds_tmp_q_gmfd = ds > ds_q_gmfd[var_id + '_' + rp]
        
        # Count of hot/wet days
        ds_tmp_q_era5_count = ds_tmp_q_era5.resample(time='1Y').sum()
        ds_tmp_out.append(ds_tmp_q_era5_count.rename({var_id : var_id + '_' + rp + 'era5_count'}))
        ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
        ds_tmp_out.append(ds_tmp_q_gmfd_count.rename({var_id : var_id + '_' + rp + 'gmfd_count'}))
        
        # Longest consecutive hot/wet day streak
        ds_tmp_q_era5_streak = ds_tmp_q_era5.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(ds_tmp_q_era5_streak.rename({var_id : var_id + '_' + rp + 'era5_streak'}))
        ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
        ds_tmp_out.append(ds_tmp_q_gmfd_streak.rename({var_id : var_id + '_' + rp + 'gmfd_streak'}))
        
    # Merge metrics and append
    ds_final = xr.merge(ds_tmp_out)
        
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':10, 'lat':720, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'cil-gdpcir/wet/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)

ACCESS-ESM1-5
BCC-CSM2-MR
CanESM5
CMCC-ESM2
EC-Earth3
EC-Earth3-Veg-LR
GFDL-ESM4
HadGEM3-GC31-LL
INM-CM4-8
INM-CM5-0
MIROC-ES2L
MIROC6
MPI-ESM1-2-LR
NESM3
NorESM2-LM
NorESM2-MM
UKESM1-0-LL
CPU times: user 1h 3min 51s, sys: 32.4 s, total: 1h 4min 24s
Wall time: 9h 9min 23s


## Hot days

In [7]:
# In order to speed up the calculation, we calculate hot days for tasmin, tasmax only
# and only use the GMFD quantiles

In [None]:
%%time
    
# Load quantiles
ds_q_gmfd = xr.open_dataset('../data/gmfd_temperature_quantiles_nex-cil-deepsd.nc')
ds_q_gmfd['lon'] = np.where(ds_q_gmfd['lon'] > 180, ds_q_gmfd['lon'] - 360, ds_q_gmfd['lon'])
ds_q_gmfd = ds_q_gmfd.sortby('lon')
    
# Loop through models
for model in models:
    # Load data (lazy)
    ds = grab_model(model, ['tasmin','tasmax'])
    ds -= 273.15 # K -> C
    
    # Calculate metrics
    ds_tmp_final = []
    for var_id in ['tasmin','tasmax']:
        ds_tmp_out = []
        for rp in ['rp5', 'rp10', 'rp20']:
            # Get above/below binary
            ds_tmp_q_gmfd = ds[var_id] > ds_q_gmfd[var_id + '_' + rp]
            
            # Count of hot days
            ds_tmp_q_gmfd_count = ds_tmp_q_gmfd.resample(time='1Y').sum()
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_count': ds_tmp_q_gmfd_count}))
            
            # Longest consecutive hot day streak
            ds_tmp_q_gmfd_streak = ds_tmp_q_gmfd.resample(time='1Y').apply(n_longest_consecutive)
            ds_tmp_out.append(xr.Dataset({var_id + '_' + rp + 'gmfd_streak': ds_tmp_q_gmfd_streak}))
        
        # Merge RPs and append
        ds_out = xr.merge(ds_tmp_out)
        ds_tmp_final.append(ds_out)
    
    # Merge variables
    ds_final = xr.merge(ds_tmp_final)
    
    # storage options    
    ds_final = ds_final.chunk({'ssp':1, 'time':20, 'lat':600, 'lon':1440})
    
    compressor = zarr.Blosc(cname='zstd', clevel=3)
    encoding = {vname: {'compressor': compressor} for vname in ds_final.data_vars}
    
    azure_prefix = 'cil-gdpcir/hot/' + model
    store = zarr.ABSStore(client=container_client, prefix=azure_prefix)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode='w')
    print(model)

ACCESS-ESM1-5
