# ACCESS-S2 tercile probability forecasts

In [1]:
from dask_jobqueue import PBSCluster
from dask.distributed import Client

In [2]:
# One node on Gadi has 48 cores - try and use up a full node before going to multiple nodes (jobs)

walltime = '01:00:00'
cores = 20
memory = str(4 * cores)
memory = memory + 'GB'

cluster = PBSCluster(walltime=str(walltime), cores=cores, memory=str(memory), processes=cores,
                     job_extra_directives=['-q normal',
                                           '-P w42',
                                           '-l ncpus='+str(cores),
                                           '-l mem='+str(memory),
                                           '-l storage=gdata/w42+gdata/rt52+gdata/ux62'],
                     local_directory='$TMPDIR',
                     job_directives_skip=["select"])

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38937 instead


In [3]:
cluster.scale(jobs=1)
client = Client(cluster)

In [4]:
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: /proxy/38937/status,

0,1
Dashboard: /proxy/38937/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.121.56:37557,Workers: 0
Dashboard: /proxy/38937/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
# client.close()
# cluster.close()

In [12]:
import os
import xarray as xr
import numpy as np

# Calculate hindcast climatology

Using a climatological period of 1981-2018 if possible

In [7]:
hc_path = '/g/data/ux62/access-s2/hindcast/raw_model/atmos/pr/daily/'

This is quite a bit of data, so we'll preprocess it to select Aus region and 62 days lead time only.

In [8]:
def preprocess(da):
    """
    Select desired region, give new dims and coords, \\
    then select desired leads
    """
    da = da.sel(
        lon=slice(110, 160),
        lat=slice(-45, -10)
    )
    
    da = da.expand_dims({'init_date': [da['time'].values[0]]}) # New dimension for init date
    da = da.rename({'time': 'lead_time'}) # Rename time
    da = da.assign_coords({'lead_time': range(len(da['lead_time']))})
    
    # Some hindcasts don't have all leads available, so we take what we can
    lead_max = len(da['lead_time'])
    sel_min = np.min([62, lead_max])
    # print(sel_min, da)
    da = da.sel(lead_time=range(sel_min))

    return da

In [9]:
members = sorted(os.listdir(hc_path)) # list members

Open all files, rearrange them to have init_date and lead_time dims, then concat into one dataset.

This takes a bit of time - around 5 minutes per member so ~45 minutes.

In [17]:
%%time

ds_list = []
for member in members:
    print(member) # print to see where we're up to
    init_dates = sorted(os.listdir(hc_path + member + '/'))
    
    hcast_ens = xr.open_mfdataset(hc_path + 'e01/*.nc', preprocess=preprocess)
        
    # hcast_ens = xr.concat(files, dim='init_date') # concatenate over all init dates
    hcast_ens = hcast_ens.groupby('init_date.dayofyear').mean()
    hcast_ens = hcast_ens.expand_dims({'member': [int(member[1:])]}) # give this dataset the ensemble dimension
    
    ds_list.append(hcast_ens)
    
hcast_ds = xr.concat(ds_list, dim='member') # concat over all members

e01
e02
e03
e04
e05
e06
e07
e08
e09
CPU times: user 22min 40s, sys: 10min 16s, total: 32min 56s
Wall time: 43min 4s


In [61]:
# Average over ensemble members to obtain climatology
hcast_clim = hcast_ds.mean('member')
hcast_clim

Unnamed: 0,Array,Chunk
Bytes,205.62 MiB,915.47 kiB
Shape,"(230, 62, 63, 60)","(1, 62, 63, 60)"
Dask graph,230 chunks in 36315 graph layers,230 chunks in 36315 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 205.62 MiB 915.47 kiB Shape (230, 62, 63, 60) (1, 62, 63, 60) Dask graph 230 chunks in 36315 graph layers Data type float32 numpy.ndarray",230  1  60  63  62,

Unnamed: 0,Array,Chunk
Bytes,205.62 MiB,915.47 kiB
Shape,"(230, 62, 63, 60)","(1, 62, 63, 60)"
Dask graph,230 chunks in 36315 graph layers,230 chunks in 36315 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [49]:
# Rechunk to ~200 MB chunks
hcast_clim = hcast_clim['pr'].chunk({
    'lat': -1,
    'lon': -1,
    'lead_time': -1,
    'dayofyear': -1
})

In [50]:
# To ensure desired chunks get saved, need to delete encoding.
# Bug. See https://stackoverflow.com/questions/67476513/zarr-not-respecting-chunk-size-from-xarray-and-reverting-to-original-chunk-size
del hcast_clim.encoding['chunks']

In [51]:
# Climatology as a function of grid cell, lead time and initialisation date (dayofyear)
hcast_clim.to_dataset(name='pr').to_zarr(
    '/g/data/w42/dr6273/work/data/access-s2/pr_hindcast_dayofyear_clim_1981-2018.zarr',
    mode='w',
    consolidated=True
)

<xarray.backends.zarr.ZarrStore at 0x15443f3cc970>

# Dayofyear climatology is problematic as hindcasts are initialised twice a week, so dayofyear changes each year.
# Try monthly climatology

In [14]:
%%time

ds_list = []
for member in members:
    print(member) # print to see where we're up to
    init_dates = sorted(os.listdir(hc_path + member + '/'))
    
    hcast_ens = xr.open_mfdataset(hc_path + 'e01/*.nc', preprocess=preprocess)
        
    # hcast_ens = xr.concat(files, dim='init_date') # concatenate over all init dates
    hcast_ens = hcast_ens.groupby('init_date.month').mean()
    hcast_ens = hcast_ens.expand_dims({'member': [int(member[1:])]}) # give this dataset the ensemble dimension
    
    ds_list.append(hcast_ens)
    
hcast_ds = xr.concat(ds_list, dim='member') # concat over all members

e01
e02
e03
e04
e05
e06
e07
e08
e09
CPU times: user 16min 29s, sys: 10min 32s, total: 27min 1s
Wall time: 35min 16s


In [69]:
# Average over ensemble members to obtain climatology
hcast_clim = hcast_ds.mean('member')
hcast_clim

Unnamed: 0,Array,Chunk
Bytes,10.73 MiB,915.47 kiB
Shape,"(12, 62, 63, 60)","(1, 62, 63, 60)"
Dask graph,12 chunks in 35082 graph layers,12 chunks in 35082 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.73 MiB 915.47 kiB Shape (12, 62, 63, 60) (1, 62, 63, 60) Dask graph 12 chunks in 35082 graph layers Data type float32 numpy.ndarray",12  1  60  63  62,

Unnamed: 0,Array,Chunk
Bytes,10.73 MiB,915.47 kiB
Shape,"(12, 62, 63, 60)","(1, 62, 63, 60)"
Dask graph,12 chunks in 35082 graph layers,12 chunks in 35082 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [70]:
# Rechunk to ~200 MB chunks
hcast_clim = hcast_clim['pr'].chunk({
    'lat': -1,
    'lon': -1,
    'lead_time': -1,
    'month': -1
})

In [71]:
# To ensure desired chunks get saved, need to delete encoding.
# Bug. See https://stackoverflow.com/questions/67476513/zarr-not-respecting-chunk-size-from-xarray-and-reverting-to-original-chunk-size
del hcast_clim.encoding['chunks']

KeyError: 'chunks'

In [79]:
# Climatology as a function of grid cell, lead time and initialisation date (dayofyear)
hcast_clim.to_dataset(name='pr').to_zarr(
    '/g/data/w42/dr6273/work/data/access-s2/pr_hindcast_month_clim_1981-2018.zarr',
    mode='w',
    consolidated=True
)

<xarray.backends.zarr.ZarrStore at 0x154420c75620>

# Close cluster

In [54]:
client.close()
cluster.close()