# Preprocess OMIP2 Model Output

## Import packages

In [1]:
%matplotlib inline
import numpy as np
import xarray as xr
import s3fs
import zarr
import util
import warnings
warnings.filterwarnings('ignore')

## Create a new Dask cluster with the Dask Gateway

In [2]:
# Create the Dask Gateway handle used by the cells below to list, create and
# connect to clusters. No address or auth is passed here, so dask-gateway falls
# back to whatever it is configured with in this environment.
from dask_gateway import Gateway
gateway = Gateway()

In [19]:
# Shut down any clusters left over from earlier sessions so the computation
# starts from a clean slate.
# Clusters are stopped through the gateway by name: there is no need to connect
# a Client to a stale cluster just to close it again, and using a dedicated loop
# variable keeps the notebook-global `cluster` / `client` names from being
# rebound to objects that have already been shut down.
stale_clusters = gateway.list_clusters()
print(stale_clusters)
for stale in stale_clusters:
    print(f"Stopping {stale.name}")
    gateway.stop_cluster(stale.name)

[ClusterReport<name=daskhub.53e2151608a4442d9ded7e0d0d754f5b, status=RUNNING>]
GatewayCluster<daskhub.53e2151608a4442d9ded7e0d0d754f5b, status=running>


In [4]:
# Request a new cluster from the gateway with the given per-worker options
# (worker_memory=2, worker_cores=1; memory unit is defined by the gateway's
# cluster options -- presumably GiB, TODO confirm).
cluster = gateway.new_cluster(worker_memory=2, worker_cores=1)

# Ask for 8 workers, then show the cluster widget as the cell output.
cluster.scale(8)
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

## Get a client from the Dask Gateway Cluster

As stated above, creating a Dask `Client` is mandatory in order to perform the following Dask computations on your Dask cluster.

In [5]:
from distributed import Client

# NOTE(review): `cluster` must already exist (created in the cell above),
# otherwise this test raises NameError; the else-branch only runs if `cluster`
# is falsy.
if cluster:
    client = Client(cluster)  # connect a client to the Dask Gateway cluster
else:
    client = Client()   # no cluster passed: fall back to a local dask cluster on this machine
# Display the client summary as the cell output.
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: /jupyterhub/services/dask-gateway/clusters/daskhub.53e2151608a4442d9ded7e0d0d754f5b/status,


## Open dictionary of OMIP2 simulations
The file lists were taken from the ESGF Search Catalog using the following search criteria:
1. On the native grid
2. Have the variables: `umo`, `vmo`, `so`, `thetao`, `zos`, `mlotst`, `siconc`, `deptho`, `areacello`
3. At monthly time steps
4. The last 61 years of the simulation

In [6]:
# Load the saved object holding model names and their file names; `.item()`
# unwraps the 0-d object array that np.load returns into the stored object.
# NOTE(review): allow_pickle=True unpickles the file, which can execute
# arbitrary code -- only load a ../models.npy that comes from a trusted source.
model_fnames_dict = np.load("../models.npy", allow_pickle=True).item()

In [7]:
# Names of all models in the dictionary (iterating a mapping yields its keys).
list(model_fnames_dict)

['CMCC-CM2-SR5',
 'NorESM2-LM',
 'MRI-ESM2-0',
 'CNRM-CM6-1',
 'FGOALS-f3-L',
 'CNRM-CM6-1-HR',
 'TaiESM1-TIMCOM',
 'CMCC-CM2-HR4',
 'TaiESM1-TIMCOM2',
 'ACCESS-OM2',
 'EC-Earth3',
 'ACCESS-OM2-025',
 'MIROC6']

## Function for writing zarr file to bucket

In [9]:
def _aws_config_value(name):
    """Return the value stored under `name` in the AWS CLI configuration.

    Runs ``aws configure get <name>`` and returns the first token it prints.

    Parameters
    ----------
    name : str
        Configuration key, e.g. ``"aws_access_key_id"``.

    Raises
    ------
    RuntimeError
        If the ``aws`` command cannot be run, exits with an error, or prints
        no value for `name`.
    """
    import subprocess

    try:
        result = subprocess.run(
            ["aws", "configure", "get", name],
            capture_output=True, text=True, check=True,
        )
    except (OSError, subprocess.CalledProcessError) as err:
        raise RuntimeError(
            f"Could not read '{name}' from the AWS CLI configuration"
        ) from err

    tokens = result.stdout.split()
    if not tokens:
        raise RuntimeError(f"AWS CLI configuration has no value for '{name}'")
    return tokens[0]


def write_subset_to_zarr(ds, sim_name, v_name,
                         bucket_prefix='WAFFLES/OMIP2',
                         endpoint_url='https://object-store.cloud.muni.cz'):
    """Write one variable of `ds` to its own zarr store on the object store.

    The store is written to ``s3://<bucket_prefix>/<sim_name>/<v_name>_<sim_name>``
    with ``mode='w'``, so an existing store at that location is overwritten.

    Parameters
    ----------
    ds : mapping of variable name -> array with a ``to_dataset()`` method
        Source data (presumably an ``xarray.Dataset`` -- TODO confirm).
    sim_name : str
        Simulation name; used as the sub-folder and as the store-name suffix.
    v_name : str
        Name of the variable in `ds` to write.
    bucket_prefix : str, optional
        Bucket (and optional key prefix) to write under.
    endpoint_url : str, optional
        URL of the S3-compatible endpoint.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the S3 credentials cannot be read from the AWS CLI configuration.
    """
    import time

    # set path on bucket
    s3_prefix = f"s3://{bucket_prefix}/{sim_name}"
    print(s3_prefix)

    # get storage keys (plain subprocess calls instead of IPython `!` magics,
    # so the function also works when imported from a module)
    access_key = _aws_config_value("aws_access_key_id")
    secret_key = _aws_config_value("aws_secret_access_key")

    # make the store location for this variable of this simulation
    uri = f"{s3_prefix}/{v_name}_{sim_name}"

    # get store argument for zarr
    store = zarr.storage.FSStore(uri, client_kwargs={'endpoint_url': endpoint_url},
                                 key=access_key, secret=secret_key)

    # write variable to zarr and report how long it took
    start = time.perf_counter()
    ds[v_name].to_dataset().to_zarr(store=store, mode='w', consolidated=True)
    print(f"Wall time: {time.perf_counter() - start:.1f} s")

## Preprocess and write subsets of all model data
Open a model and save each variable in the dataset as a `zarr` file in the `waffles` bucket

In [10]:
# Work on the first model in the dictionary.
sim_name = list(model_fnames_dict)[0]

In [11]:
# Show which simulation was selected.
sim_name

'CMCC-CM2-SR5'

In [12]:
# Load the dataset for the selected simulation through the `util` helper
# (presumably opens the ESGF files listed for `sim_name` in model_fnames_dict
# -- TODO confirm against util).
ds = util.load_ds_from_esgf_file_in_model_fnames_dict(sim_name, model_fnames_dict)

In [13]:
# Display the dataset summary as the cell output.
ds

Unnamed: 0,Array,Chunk
Bytes,11.44 kiB,11.44 kiB
Shape,"(732, 2)","(732, 2)"
Count,1 Graph Layer,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 11.44 kiB 11.44 kiB Shape (732, 2) (732, 2) Count 1 Graph Layer 1 Chunks Type object numpy.ndarray",2  732,

Unnamed: 0,Array,Chunk
Bytes,11.44 kiB,11.44 kiB
Shape,"(732, 2)","(732, 2)"
Count,1 Graph Layer,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,800 B,800 B
Shape,"(50, 2)","(50, 2)"
Count,1 Graph Layer,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800 B 800 B Shape (50, 2) (50, 2) Count 1 Graph Layer 1 Chunks Type float64 numpy.ndarray",2  50,

Unnamed: 0,Array,Chunk
Bytes,800 B,800 B
Shape,"(50, 2)","(50, 2)"
Count,1 Graph Layer,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,810.00 kiB,78.12 kiB
Shape,"(72, 360, 4)","(50, 50, 4)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 810.00 kiB 78.12 kiB Shape (72, 360, 4) (50, 50, 4) Count 1 Graph Layer 16 Chunks Type float64 numpy.ndarray",4  360  72,

Unnamed: 0,Array,Chunk
Bytes,810.00 kiB,78.12 kiB
Shape,"(72, 360, 4)","(50, 50, 4)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,810.00 kiB,78.12 kiB
Shape,"(72, 360, 4)","(50, 50, 4)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 810.00 kiB 78.12 kiB Shape (72, 360, 4) (50, 50, 4) Count 1 Graph Layer 16 Chunks Type float64 numpy.ndarray",4  360  72,

Unnamed: 0,Array,Chunk
Bytes,810.00 kiB,78.12 kiB
Shape,"(72, 360, 4)","(50, 50, 4)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,202.50 kiB,19.53 kiB
Shape,"(72, 360)","(50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 202.50 kiB 19.53 kiB Shape (72, 360) (50, 50) Count 1 Graph Layer 16 Chunks Type float64 numpy.ndarray",360  72,

Unnamed: 0,Array,Chunk
Bytes,202.50 kiB,19.53 kiB
Shape,"(72, 360)","(50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,202.50 kiB,19.53 kiB
Shape,"(72, 360)","(50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 202.50 kiB 19.53 kiB Shape (72, 360) (50, 50) Count 1 Graph Layer 16 Chunks Type float64 numpy.ndarray",360  72,

Unnamed: 0,Array,Chunk
Bytes,202.50 kiB,19.53 kiB
Shape,"(72, 360)","(50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,405.00 kiB,19.53 kiB
Shape,"(2, 72, 360)","(1, 50, 50)"
Count,1 Graph Layer,32 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 405.00 kiB 19.53 kiB Shape (2, 72, 360) (1, 50, 50) Count 1 Graph Layer 32 Chunks Type float64 numpy.ndarray",360  72  2,

Unnamed: 0,Array,Chunk
Bytes,405.00 kiB,19.53 kiB
Shape,"(2, 72, 360)","(1, 50, 50)"
Count,1 Graph Layer,32 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,405.00 kiB,19.53 kiB
Shape,"(2, 72, 360)","(1, 50, 50)"
Count,1 Graph Layer,32 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 405.00 kiB 19.53 kiB Shape (2, 72, 360) (1, 50, 50) Count 1 Graph Layer 32 Chunks Type float64 numpy.ndarray",360  72  2,

Unnamed: 0,Array,Chunk
Bytes,405.00 kiB,19.53 kiB
Shape,"(2, 72, 360)","(1, 50, 50)"
Count,1 Graph Layer,32 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 3.53 GiB 349.04 MiB Shape (732, 50, 72, 360) (732, 50, 50, 50) Count 1 Graph Layer 16 Chunks Type float32 numpy.ndarray",732  1  360  72  50,

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 3.53 GiB 349.04 MiB Shape (732, 50, 72, 360) (732, 50, 50, 50) Count 1 Graph Layer 16 Chunks Type float32 numpy.ndarray",732  1  360  72  50,

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 3.53 GiB 349.04 MiB Shape (732, 50, 72, 360) (732, 50, 50, 50) Count 1 Graph Layer 16 Chunks Type float32 numpy.ndarray",732  1  360  72  50,

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 3.53 GiB 349.04 MiB Shape (732, 50, 72, 360) (732, 50, 50, 50) Count 1 Graph Layer 16 Chunks Type float32 numpy.ndarray",732  1  360  72  50,

Unnamed: 0,Array,Chunk
Bytes,3.53 GiB,349.04 MiB
Shape,"(732, 50, 72, 360)","(732, 50, 50, 50)"
Count,1 Graph Layer,16 Chunks
Type,float32,numpy.ndarray


In [16]:
# Take the first key of the dataset (presumably its first data variable -- TODO confirm).
v_name = list(ds.keys())[0]

In [17]:
# Show which variable was selected.
v_name

'so'

In [18]:
# Write the selected variable of the selected simulation to the bucket.
# NOTE(review): the saved output of this cell ends with
# "RuntimeError: NetCDF: Authorization failure" -- presumably raised while the
# source data was being read during the write; confirm data-access credentials
# before re-running.
write_subset_to_zarr(ds,sim_name,v_name)

s3://WAFFLES/OMIP2/CMCC-CM2-SR5


RuntimeError: NetCDF: Authorization failure

## Some useful, spare code