# Preprocess OMIP2 Model Output

In [None]:
pip install xmip

## Import packages

In [1]:
%matplotlib inline
import numpy as np
import xarray as xr
import s3fs
import zarr
import dask
import xmip.preprocessing as xmip
import warnings
warnings.filterwarnings('ignore')

## Create a new Dask cluster with the Dask Gateway

In [2]:
from dask_gateway import Gateway
# Handle to the Dask Gateway server. No address or auth is passed here, so the
# connection settings must come from the environment's dask-gateway configuration.
gateway = Gateway()

In [23]:
# Housekeeping: shut down every Dask Gateway cluster that is still running
# (e.g. left over from an earlier session) before starting a new computation.
from dask.distributed import Client

clusters = gateway.list_clusters()
print(clusters)
# Dedicated loop names are used on purpose: the notebook-level `cluster` and
# `client` variables must never end up bound to objects that were just shut down.
for cluster_report in clusters:
    stale_cluster = gateway.connect(cluster_report.name)
    print(stale_cluster)
    # Attach a client and close it again before stopping the cluster itself.
    stale_client = Client(stale_cluster)
    stale_client.close()
    stale_cluster.shutdown()

[]


In [4]:
# Start a new cluster through the gateway. `worker_memory` and `worker_cores` are
# cluster options; their units are defined by the Gateway deployment
# (presumably GiB and CPU cores — TODO confirm).
cluster = gateway.new_cluster(worker_memory=2, worker_cores=1)

# Request 8 workers, then display the cluster widget as the cell output.
cluster.scale(8)
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

## Get a client from the Dask Gateway Cluster

As stated above, creating a Dask `Client` is mandatory in order to perform the following Dask computations on your Dask cluster.

In [5]:
from distributed import Client

# Attach a client to the Gateway cluster created above.
# NOTE(review): if `cluster` was never defined this raises NameError instead of
# reaching the else branch; the fallback only triggers for a falsy `cluster` (e.g. None).
if cluster:
    client = Client(cluster) # connect a client to the existing Dask Gateway cluster
else:
    client = Client()   # no cluster given: Client() sets up a local cluster on this machine
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: /jupyterhub/services/dask-gateway/clusters/daskhub.b1b7b935f3214a169bd90e122436c57d/status,


## Open dictionary of OMIP2 simulations
With file lists taken from the ESGF Search Catalog with the following search requirements:
1. On the native grid
2. Have the variables: `umo`, `vmo`, `so`, `thetao`, `zos`, `mlotst`, `siconc`, `deptho`, `areacello`
3. At monthly time steps
4. The last 61 years of the simulation

In [6]:
# Load the dictionary mapping each model name to its list of file names.
# The dict is stored as a pickled object inside the .npy file, hence
# allow_pickle=True and .item() to unwrap it.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted "models.npy".
model_fnames_dict = np.load("models.npy", allow_pickle=True).item()

## Function for writing zarr file to bucket

In [7]:
def write_dataset_to_zarr(ds,sim_name):
    # set path on bucket
    path='WAFFLES/OMIP2/'
    s3_prefix = "s3://" + path
    print(s3_prefix)
    
    # get storage keys
    access_key = !aws configure get aws_access_key_id
    access_key = access_key[0]
    secret_key = !aws configure get aws_secret_access_key
    secret_key = secret_key[0]
    
    # set storage target
    client_kwargs = {'endpoint_url': 'https://object-store.cloud.muni.cz'}
    target = s3fs.S3FileSystem(anon=False,client_kwargs=client_kwargs)
    
    # make file name for variable in simulation
    zarr_file_name = sim_name
    uri = f"{s3_prefix}/{zarr_file_name}"
    
    # get store argument for zarr
    store = zarr.storage.FSStore(uri,client_kwargs=client_kwargs,
                                 key=access_key, secret=secret_key)
    
    # write variable to zarr
    %time ds.to_zarr(store=store,mode='w',consolidated=True)
    
    return

## Function for reading from zarr bucket

In [8]:
def read_dataset_from_zarr(sim_name):
    """Open a simulation's zarr store from the OMIP2 area of the object store.

    Parameters
    ----------
    sim_name : str
        Simulation name; the name of the zarr store below ``WAFFLES/OMIP2/``.

    Returns
    -------
    ds
        The dataset returned by ``xr.open_zarr`` for that store.

    Raises
    ------
    FileNotFoundError
        If the ``aws`` command line tool is not available.
    RuntimeError
        If ``aws configure get`` returns no value for one of the credentials.
    """
    import subprocess

    # set path on bucket
    path='WAFFLES/OMIP2/'
    s3_prefix = "s3://" + path
    print(s3_prefix)

    # get storage keys from the AWS CLI configuration; only stdout is read, so
    # an error message from the CLI can never be mistaken for a credential
    credentials = {}
    for option in ('aws_access_key_id', 'aws_secret_access_key'):
        result = subprocess.run(['aws', 'configure', 'get', option],
                                capture_output=True, text=True)
        output_lines = result.stdout.splitlines()
        if not output_lines or not output_lines[0].strip():
            raise RuntimeError(
                f"'aws configure get {option}' returned no value; "
                "configure the AWS CLI credentials first."
            )
        credentials[option] = output_lines[0].strip()

    # endpoint of the object store
    client_kwargs={'endpoint_url': 'https://object-store.cloud.muni.cz'}

    # file name for simulation
    # NOTE(review): `path` already ends with '/', so the URI contains a double
    # slash ('.../OMIP2//<sim_name>'). Left unchanged so that reads and writes
    # keep addressing the same location.
    zarr_file_name = sim_name
    uri = f"{s3_prefix}/{zarr_file_name}"

    # get store argument for zarr
    store = zarr.storage.FSStore(uri,client_kwargs=client_kwargs,
                                 key=credentials['aws_access_key_id'],
                                 secret=credentials['aws_secret_access_key'])

    # read variable from zarr
    ds=xr.open_zarr(store=store)

    return ds

## Preprocess and write subsets of all model data
Open a model and save each variable in the dataset as a `zarr` file in the `waffles` bucket

## Testing below this line
---

## Function for loading data from ESGF node

In [10]:
def reindex_lat(ds):
    """Flip the dataset along ``y`` when its first latitude value is positive.

    The value of ``lat`` at ``x=0, y=0`` is used as the indicator: a positive
    value is taken to mean that latitude decreases along ``y``.
    NOTE(review): this is a sign test on one corner value, not a comparison of
    neighbouring latitudes — presumably adequate for global grids; confirm.

    When flipped, the data are reindexed with the ``y`` labels in reverse order
    and the original ``y`` coordinate is then assigned back. Otherwise the
    dataset is returned unchanged.
    """
    corner_lat = ds.lat.isel(x=0,y=0)
    if not corner_lat > 0:
        # first latitude is not positive: nothing to do
        return ds

    flipped_labels = list(reversed(ds.y))
    return ds.reindex(y=flipped_labels).assign_coords(y=ds.y)

def model_preproc(ds):
    """Standardise a single opened dataset with the xmip preprocessing steps.

    Passed as the ``preprocess`` callback to ``xr.open_mfdataset`` in
    ``ds_from_esgf``. The steps are applied strictly in the order listed
    below, each one receiving the result of the previous one, and the
    ``bnds`` / ``vertex`` variables are dropped at the end if present.
    """
    pipeline = (
        xmip.rename_cmip6,                    # fix naming
        reindex_lat,                          # reindex y if lat is decreasing
        xmip.promote_empty_dims,              # promote empty dims to actual coordinates
        xmip.correct_coordinates,             # demote coordinates from data_variables
        xmip.broadcast_lonlat,                # broadcast lon/lat
        xmip.correct_lon,                     # shift all lons to consistent 0-360
        xmip.correct_units,                   # fix the units
        xmip.parse_lon_lat_bounds,            # rename the `bounds` by style (bound or vertex)
        xmip.sort_vertex_order,               # sort vertices in a consistent manner
        xmip.maybe_convert_bounds_to_vertex,  # make both bounds and vertex available ...
        xmip.maybe_convert_vertex_to_bounds,  # ... whichever one the file provided
        xmip.fix_metadata,
    )
    for step in pipeline:
        ds = step(ds)

    return ds.drop_vars(["bnds", "vertex"], errors="ignore")

def ds_from_esgf(model,model_fnames_dict,variables,flg_onefile=False,testing=False):
    """Open, merge, subset and rechunk the files of one model.

    Parameters
    ----------
    model : str
        Key into ``model_fnames_dict``.
    model_fnames_dict : dict
        Maps a model name to its list of file names.
    variables : list of str
        Variable names to load. A file belongs to variable ``v`` when its name
        contains ``v + '_'``.
    flg_onefile : bool, optional
        If True, only the first file of the (possibly reduced) list is used.
    testing : bool, optional
        If True, the file list is reduced to every 6th entry of the first 30
        before anything is opened.

    Returns
    -------
    dsnow
        The merged dataset with points where ``lat < 50`` masked out and
        dropped where possible, chunked 50x50 in ``x``/``y``, with ``time``
        in a single chunk when present and ``lev`` in a single chunk when
        both ``time`` and ``lev`` are present.

    Raises
    ------
    KeyError
        If ``model`` is not in ``model_fnames_dict``.
    ValueError
        If none of the requested variables matches any file name.
    """
    ## Generate filename from model_fnames_dict
    fnames_i = model_fnames_dict[model]

    # testing: Open a diverse, but small dataset
    if testing:
        fnames_i = fnames_i[0:30:6]

    # Only open a single file
    if flg_onefile:
        fnames_i = [fnames_i[0]]

    print(fnames_i)

    dss = {}
    print('Going through the variables...')

    for v in variables:
        print(v)
        ffs = [f for f in fnames_i if v+'_' in f]
        if len(ffs) > 0:
            n_files = str(len(ffs))
            print('Opening '+n_files+' file(s)...')
            # Open filenames
            with dask.config.set(**{'array.slicing.split_large_chunks': True}):
                dss[v] = xr.open_mfdataset(ffs,preprocess=model_preproc)

    # Fail early with a clear message; otherwise merging nothing would only
    # surface later as a KeyError on 'lat'.
    if not dss:
        raise ValueError(
            f"No files matching variables {list(variables)!r} found for model {model!r}."
        )

    # Combine datasets
    ds = xr.merge(list(dss.values()),compat='override')
    # Subset by >50N
    print('Subsetting...')
    cond = (ds['lat']>=50)
    dsnow = ds.where(cond,drop=True) #.persist()

    # rechunk: horizontal tiles of 50x50; time (and lev, when time is present)
    # are kept in a single chunk
    print('Rechunking...')
    chunks = {'x':50,'y':50}
    if 'time' in dsnow.dims:
        chunks['time'] = -1
        if 'lev' in dsnow.dims:
            chunks['lev'] = -1
    dsnow = dsnow.chunk(chunks=chunks)

    print('Done.')
    return dsnow

Models that **do not** load with `util.load_ds_from_esgf_file_in_model_fnames_dict`
1. CMCC-CM2-SR5 : `OSError: [Errno -68] NetCDF: I/O failure: b'http://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/OMIP/CMCC/CMCC-CM2-SR5/omip2/r1i1p1f1/Omon/thetao/gn/v20200226/thetao_Omon_CMCC-CM2-SR5_omip2_r1i1p1f1_gn_195801-201812.nc'`
2. ACCESS-OM2 : Takes prohibitively long time to even load <- allow to run last just to see if it ever loads
3. ACCESS-OM2-025 : Takes prohibitively long time to even load  
``` 
Task exception was never retrieved
future: <Task finished name='Task-8173' coro=<Client._gather.<locals>.wait() done, defined at /srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/client.py:2038> exception=AllExit()>
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/client.py", line 2047, in wait
    raise AllExit()
distributed.client.AllExit
```
4. EC-Earth3 : `OSError: [Errno -68] NetCDF: I/O failure: b'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/OMIP/EC-Earth-Consortium/EC-Earth3/omip2/r1i1p1f1/Omon/mlotst/gn/v20200928/mlotst_Omon_EC-Earth3_omip2_r1i1p1f1_gn_199401-199412.nc'`
5. 

Models that **do not** save to zarr when including 4D variables
1. TaiESM1-TIMCOM2 : `RuntimeError: NetCDF: Authorization failure`
2. NorESM2-LM : `RuntimeError: NetCDF: Authorization failure`
3. MRI-ESM2-0 : `OSError: [Errno -70] NetCDF: DAP server error: b'http://esgf-data2.diasjp.net/thredds/dodsC/esg_dataroot/CMIP6/OMIP/MRI/MRI-ESM2-0/omip1-spunup/r1i1p1f1/Ofx/areacello/gn/v20200406/areacello_Ofx_MRI-ESM2-0_omip1-spunup_r1i1p1f1_gn.nc'`
4. TaiESM1-TIMCOM : `RuntimeError: NetCDF: Authorization failure`
5. FGOALS-f3-L (Failed for 3D vars!) : `RuntimeError: NetCDF: Authorization failure`
6. MIROC6 (Failed for 3D vars!) : `OSError: [Errno -70] NetCDF: DAP server error: b'http://esgf-data2.diasjp.net/thredds/dodsC/esg_dataroot/CMIP6/OMIP/MIROC/MIROC6/omip2/r2i1p1f1/Omon/zos/gn/v20200612/zos_Omon_MIROC6_omip2_r2i1p1f1_gn_189801-199712.nc'`
7. CNRM-CM6-1 : (Failed for 3D vars!) : `RuntimeError: NetCDF: Authorization failure`
8. CMCC-CM2-HR4 : `RuntimeError: NetCDF: Authorization failure`
9. CNRM-CM6-1-HR : `RuntimeError: NetCDF: Authorization failure`

In [9]:
# Names of all models that have a file list
list(model_fnames_dict)

['EC-Earth3',
 'NorESM2-LM',
 'MRI-ESM2-0',
 'CMCC-CM2-SR5',
 'CNRM-CM6-1',
 'FGOALS-f3-L',
 'CMCC-CM2-HR4',
 'MIROC6',
 'TaiESM1-TIMCOM2',
 'CNRM-CM6-1-HR',
 'ACCESS-OM2-025',
 'ACCESS-OM2',
 'TaiESM1-TIMCOM']

In [16]:
# Select the simulation to process by its position (index 1) in the dict's key order.
sim_name = list(model_fnames_dict.keys())[1]

In [17]:
sim_name

'NorESM2-LM'

In [None]:
# Variables to load for the selected simulation (passed to ds_from_esgf).
# NOTE(review): `siconc` is named in the search requirements but is not loaded here — confirm intentional.
variables = ['vmo','thetao','so','umo','zos','mlotst','areacello','deptho']

In [18]:
%%time
# load model from sim_name
ds = ds_from_esgf(sim_name1,model_fnames_dict,variables)

OSError: [Errno -68] NetCDF: I/O failure: b'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/OMIP/NCC/NorESM2-LM/omip2/r1i1p1f1/Omon/vmo/gn/v20190920/vmo_Omon_NorESM2-LM_omip2_r1i1p1f1_gn_201001-201812.nc'

In [19]:
ds

NameError: name 'ds' is not defined

In [None]:
# Keys available in the loaded dataset
list(ds)

In [None]:
# Keep only the variable(s) to write in this pass; indexing with a list selects a subset of `ds`.
ds_sub = ds[['so']]
# Alternative subsets used while testing:
# ds_sub = ds[['areacello','deptho','mlotst','zos','so']]
# ds_sub = ds[['areacello','deptho','mlotst','zos']]

In [None]:
ds_sub

In [17]:
# See what's already in the bucket
# Authenticated (anon=False) filesystem on the same object-store endpoint that the zarr helpers use.
target = s3fs.S3FileSystem(anon=False,client_kwargs={'endpoint_url': 'https://object-store.cloud.muni.cz'})
# List the contents of the OMIP2 prefix.
path='WAFFLES/OMIP2/'
target.ls(path)

['WAFFLES/OMIP2/']

In [None]:
# Trigger computation of the subset on the Dask cluster and keep the result in memory before writing.
ds_sub = ds_sub.persist()

In [None]:
# Write the subset to the bucket, using the simulation name as the zarr store name.
write_dataset_to_zarr(ds_sub,sim_name)

In [None]:
# Read the store back from the bucket to check the round trip.
test = read_dataset_from_zarr(sim_name)

In [None]:
test

In [None]:
# Quick-look plot of `so` at the first time step and first level of the re-read data.
test.so.isel(time=0,lev=0).plot(robust=True)

## Some useful, spare code