In [1]:
###############################################
### TO RUN ON MICROSOFT PLANETARY COMPUTER ####
###############################################

In [1]:
import collections
import getpass
import io

import azure.storage.blob
import fsspec
import numpy as np
import pandas as pd
import planetary_computer
import pystac
import pystac_client
import requests
import xarray as xr
import zarr

# import regionmask

In [4]:
#################
# Data access
#################

# Complete catalog
catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")


# Function to grab the requested variables for all SSPs of a single model
def grab_model(model_id, vars_to_grab, subset_US):
    """Open every non-historical CIL-GDPCIR run of one CMIP6 model as one dataset.

    Searches the three CIL-GDPCIR collections of the module-level ``catalog``
    for items whose ``cmip6:source_id`` equals ``model_id`` (the ``historical``
    experiment is excluded), opens the requested assets of each item, and
    concatenates the per-item datasets along a new ``ssp`` dimension.

    Parameters
    ----------
    model_id : str
        Value matched against the ``cmip6:source_id`` item property.
    vars_to_grab : list of str
        Asset keys to open for every item, e.g. ``["tasmin", "tasmax", "pr"]``.
        Every matched item must provide all of them.
    subset_US : bool
        If true, keep only ``lon`` in [-130, -50] and ``lat`` in [20, 60].

    Returns
    -------
    xarray.Dataset
        The requested variables with an ``ssp`` dimension labelled by each
        item's ``experiment_id`` attribute.

    Raises
    ------
    ValueError
        If the search returns no items for ``model_id``.
    KeyError
        If a matched item lacks one of the requested assets.
    """
    # Search across all licences in CIL-GDPCIR
    search = catalog.search(
        collections=["cil-gdpcir-cc0", "cil-gdpcir-cc-by", "cil-gdpcir-cc-by-sa"],
        query={"cmip6:source_id": {"eq": model_id}, "cmip6:experiment_id": {"neq": "historical"}},  # omit historical
    )
    ensemble = search.item_collection()

    # One merged dataset per item (i.e. per experiment)
    ds_ssp = []
    for item in ensemble:
        # Asset hrefs must be signed before they can be read
        signed = planetary_computer.sign(item)
        ds_vars = []
        for variable_id in vars_to_grab:
            asset = signed.assets[variable_id]
            ds_tmp = xr.open_dataset(asset.href, **asset.extra_fields["xarray:open_kwargs"])
            # Tag with the experiment so the later concat has labels for "ssp"
            ds_vars.append(ds_tmp.assign_coords(ssp=ds_tmp.attrs["experiment_id"]))
        ds_ssp.append(xr.merge(ds_vars))

    # Without this guard an unknown model id surfaces as a generic
    # "must supply at least one object to concatenate" error from xarray.
    if not ds_ssp:
        raise ValueError(f"No non-historical CIL-GDPCIR items found for model {model_id!r}")

    ds_out = xr.concat(ds_ssp, dim="ssp")

    # Subset US if desired
    if subset_US:
        ds_out = ds_out.sel(lon=slice(-130, -50), lat=slice(20, 60))

    return ds_out

In [5]:
# Collect the model (source_id) names advertised by each CIL-GDPCIR licence
# collection and flatten them into a single 1-D array.
license_collections = ["cil-gdpcir-cc0", "cil-gdpcir-cc-by", "cil-gdpcir-cc-by-sa"]

models = np.hstack(
    [
        catalog.get_collection(collection_id).summaries.to_dict()["cmip6:source_id"]
        for collection_id in license_collections
    ]
)

In [6]:
#########
# Dask
#########
import dask_gateway

# Tunables: per-worker resources requested from the gateway and cluster size
WORKER_OPTIONS = {"worker_memory": 30, "worker_cores": 1}
N_WORKERS = 30

gateway = dask_gateway.Gateway()

# Take the options object offered by the gateway and override worker resources
cluster_options = gateway.cluster_options()
for option_name, option_value in WORKER_OPTIONS.items():
    cluster_options[option_name] = option_value

# Launch the cluster, attach a client to it, then request the workers
cluster = gateway.new_cluster(cluster_options)
client = cluster.get_client()
cluster.scale(N_WORKERS)

# Link to the Dask dashboard for monitoring
print(cluster.dashboard_link)

https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.82f79b92a1d648a788af1d4037732aa0/status


In [7]:
%%time
#########################
### Calculate metrics ###
#########################
# loop through models: RUNTIME IS AROUND 10 MINS PER MODEL WITH 30 DASK WORKERS
for model in models:
    # FGOALS-g3 missing pr
    if model == 'FGOALS-g3':
        # load data (lazy)
        ds = grab_model(model, ["tasmin", "tasmax"], True)
    else:
        ds = grab_model(model, ["tasmin", "tasmax", "pr"], True)
    
    # unit conversions
    ds["tasmax"] = ds["tasmax"] - 273.15  # K -> C
    ds["tasmin"] = ds["tasmin"] - 273.15  # K -> C

    # compute
    ds["tas"] = (ds["tasmax"] + ds["tasmin"]) / 2.0
    
    ds_tas_avg = ds["tas"].resample(time="1Y").mean()
    ds_tasmax_max = ds["tasmax"].resample(time="1Y").max()

    if model != 'FGOALS-g3':
        ds_pr_sum = ds["pr"].resample(time="1Y").sum()
        ds_pr_max = ds["pr"].resample(time="1Y").max()
    
    # merge
    if model == 'FGOALS-g3':
        ds_final = xr.Dataset({"tas_avg": ds_tas_avg,
                               "tasmax_max": ds_tasmax_max})
    else:
        ds_final = xr.Dataset({"tas_avg": ds_tas_avg,
                               "pr_sum": ds_pr_sum,
                               "tasmax_max": ds_tasmax_max,
                               "pr_max": ds_pr_max})

    # storage options
    ds_final = ds_final.chunk({"ssp": 1, "time": 10, "lat": 720, "lon": 1440})

    compressor = zarr.Blosc(cname="zstd", clevel=3)
    encoding = {vname: {"compressor": compressor} for vname in ds_final.data_vars}

    store = zarr.ABSStore(client=container_client, prefix=model)

    # store
    ds_final.to_zarr(store=store, encoding=encoding, consolidated=True, mode="w")
    print(model)

FGOALS-g3
INM-CM4-8
INM-CM5-0
BCC-CSM2-MR
ACCESS-ESM1-5
ACCESS-CM2
MIROC-ES2L
MIROC6
NorESM2-LM
NorESM2-MM
GFDL-CM4
GFDL-ESM4
NESM3
MPI-ESM1-2-HR
HadGEM3-GC31-LL
UKESM1-0-LL
MPI-ESM1-2-LR
EC-Earth3
EC-Earth3-AerChem
EC-Earth3-CC
EC-Earth3-Veg
EC-Earth3-Veg-LR
CMCC-CM2-SR5
CMCC-ESM2
CanESM5
CPU times: user 3min 54s, sys: 14.3 s, total: 4min 8s
Wall time: 1h 41min 38s


2024-01-02 19:15:12,166 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
