# Subset surface climate data for eastern Australia

In [1]:
from dask.distributed import Client,LocalCluster
from dask_jobqueue import PBSCluster

In [2]:
# A Gadi node has 48 cores: size each PBS job to fill one whole node before
# scaling out to several nodes (jobs).
walltime = '00:30:00'
cores = 48
memory = f'{4 * cores}GB'  # 4 GB per core

# Extra resource directives handed to dask_jobqueue for each PBS job.
pbs_directives = [
    '-q normal',
    '-P w42',
    f'-l ncpus={cores}',
    f'-l mem={memory}',
    '-l storage=gdata/w42+gdata/rt52',
]

cluster = PBSCluster(
    walltime=walltime,
    cores=cores,
    memory=memory,
    processes=cores,  # as many worker processes as cores
    job_extra_directives=pbs_directives,
    local_directory='$TMPDIR',
    job_directives_skip=["select"],
    # python=os.environ["DASK_PYTHON"],  # disabled; would need `import os`
)

In [3]:
# Ask the cluster for three PBS jobs, then connect a client to it.
cluster.scale(jobs=3)
client = Client(cluster)

In [4]:
# Display the client summary (dashboard link, worker/thread/memory totals).
# NOTE(review): the captured output reports 0 workers - presumably the PBS jobs
# were still queued when this cell ran.
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: http://10.6.65.42:8787/status,

0,1
Dashboard: http://10.6.65.42:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.65.42:34623,Workers: 0
Dashboard: http://10.6.65.42:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
# Uncomment to close the client and the cluster when finished.
# client.close()
# cluster.close()

In [6]:
# %load_ext autoreload
# %autoreload 2

In [7]:
import xarray as xr
import numpy as np

In [8]:
import functions as fn

In [9]:
# Regions used for subsetting. 'name' is the tag used in output file names;
# 'boundary' is what gets passed to fn.open_era_data as `subset_region`.
# NOTE(review): the AUS box looks like [lon_min, lon_max, lat_max, lat_min] -
# confirm the expected order against fn.open_era_data.
rez_region = dict(name='REZ', boundary=fn.get_REZ_boundary())
aus_region = dict(name='AUS', boundary=[112, 155, -10, -45])

# ERA5

In [10]:
# Years to process, 1959 through 2021 inclusive (the range end is exclusive).
first_year, last_year = 1959, 2021
years = range(first_year, last_year + 1)

In [11]:
# Root of the ERA5 single-levels collection; the product sub-directory
# ('reanalysis/' or 'monthly-averaged/') is appended where the data are opened.
root_path = '/g/data/rt52/era5/single-levels/'

# Solar radiation

Ideally we would use `ssrd` [J m$^{-2}$]. However, NCI does not store this variable.

Instead, (I think) we can use the mean surface downward short-wave radiation flux, `msdwswrf` [W m$^{-2}$]. This is the same quantity as `ssrd` but expressed as a temporal average: in this case, the average radiation per second over the hour.

Links:

- https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-Table4
- https://apps.ecmwf.int/codes/grib/param-db?id=169
- https://apps.ecmwf.int/codes/grib/param-db?id=235035
- https://www.ecmwf.int/sites/default/files/elibrary/2015/18490-radiation-quantities-ecmwf-model-and-mars.pdf

In [9]:
# Open ERA5 reanalysis msdwswrf for all of `years`, subset to the AUS box,
# with the coordinates renamed to lon/lat.
mssrd = fn.open_era_data(
    root_path=f'{root_path}reanalysis/',
    variable='msdwswrf',
    years=years,
    subset_region=aus_region['boundary'],
    rename_lon_lat=['lon', 'lat'],
)

Check size of dataset (in GB)

In [20]:
# Dataset size: bytes divided by 2**30 (= 1024**3).
mssrd.nbytes / 2 ** 30

50.18762895464897

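Re-chunk into 90-day blocks along time (24 × 90 hourly steps), keeping the full spatial domain in each chunk. For this ~50 GB dataset that is roughly 200 MB per chunk.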

In [11]:
# 24 * 90 time steps per chunk; -1 keeps each spatial dimension in one chunk.
mssrd = mssrd.chunk(dict(time=24 * 90, lon=-1, lat=-1))

Save to zarr

In [12]:
# On-disk encoding for the zarr stores: write msdwswrf as float32.
mssrd_encoding = dict(msdwswrf=dict(dtype='float32'))

In [19]:
# Write the re-chunked data to a consolidated zarr store; mode='w' replaces
# any existing store at that path.
mssrd_hourly_store = (
    '/g/data/w42/dr6273/work/data/era5/msdwswrf/msdwswrf_era5_reanalysis_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
mssrd.to_zarr(
    mssrd_hourly_store,
    mode='w',
    consolidated=True,
    encoding=mssrd_encoding,
)

<xarray.backends.zarr.ZarrStore at 0x14ee81f4aff0>

### Compute daily averages

Do this using a 24-hour rolling mean, then select all 14:00 UTC time steps (equivalent to 00:00 Australian Eastern Standard Time, UTC+10).

In [21]:
# Re-open the hourly msdwswrf zarr store (same path it was written to above).
mssrd_hourly_store = (
    '/g/data/w42/dr6273/work/data/era5/msdwswrf/msdwswrf_era5_reanalysis_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
mssrd = xr.open_zarr(mssrd_hourly_store, consolidated=True)

In [None]:
# Daily means from the project helper. NOTE(review): presumably a 24-hour
# rolling mean sampled at 1400 UTC, as the section text describes - the
# implementation of fn.daily_mean_1400 is not visible here.
mssrd_1400 = fn.daily_mean_1400(mssrd)

In [22]:
# mssrd_24hr = mssrd.rolling(time=24).mean()

In [23]:
# mssrd_1400 = mssrd_24hr.isel(time=mssrd_24hr.time.dt.hour == 14)

In [24]:
# 365 * 6 daily steps (about six years) per chunk along time.
mssrd_1400 = mssrd_1400.chunk(dict(time=365 * 6))

In [25]:
# Write the daily (1400 UTC) means to their own consolidated zarr store;
# mode='w' replaces any existing store at that path.
mssrd_daily_store = (
    '/g/data/w42/dr6273/work/data/era5/msdwswrf/msdwswrf_era5_daily_1400UTC_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
mssrd_1400.to_zarr(
    mssrd_daily_store,
    mode='w',
    consolidated=True,
    encoding=mssrd_encoding,
)

<xarray.backends.zarr.ZarrStore at 0x14ee7b904a50>

# Wind speed 100m

Calculate using $w = \sqrt{u^2 + v^2}$

In [12]:
# Open the ERA5 reanalysis variable '100u' for all of `years`, subset to the
# AUS box, with the coordinates renamed to lon/lat.
u = fn.open_era_data(
    root_path=f'{root_path}reanalysis/',
    variable='100u',
    years=years,
    subset_region=aus_region['boundary'],
    rename_lon_lat=['lon', 'lat'],
)

In [13]:
# Open the ERA5 reanalysis variable '100v' for all of `years`, subset to the
# AUS box, with the coordinates renamed to lon/lat.
v = fn.open_era_data(
    root_path=f'{root_path}reanalysis/',
    variable='100v',
    years=years,
    subset_region=aus_region['boundary'],
    rename_lon_lat=['lon', 'lat'],
)

In [28]:
# Wind speed from its components. Both components are renamed to the shared
# variable name 'w100' so that the two datasets add variable-by-variable.
u_squared = u.rename({'u100': 'w100'}) ** 2
v_squared = v.rename({'v100': 'w100'}) ** 2
w = np.sqrt(u_squared + v_squared)

Check size of dataset (in GB)

In [29]:
# Dataset size: bytes divided by 2**30 (= 1024**3).
w.nbytes / 2 ** 30

50.188901253044605

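Re-chunk into 90-day blocks along time (24 × 90 hourly steps), keeping the full spatial domain in each chunk. For these ~50 GB datasets that is roughly 200 MB per chunk.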

In [14]:
# Same layout for both components: 24 * 90 time steps per chunk, with each
# spatial dimension kept in a single chunk (-1).
component_chunks = {'time': 24 * 90, 'lon': -1, 'lat': -1}
u = u.chunk(component_chunks)
v = v.chunk(component_chunks)

In [30]:
# 24 * 90 time steps per chunk; -1 keeps each spatial dimension in one chunk.
w = w.chunk(dict(time=24 * 90, lon=-1, lat=-1))

Save to zarr

In [15]:
# On-disk encodings for the zarr stores: each wind variable is written as
# float32. Every encoding gets its own independent dict.
u_encoding, v_encoding, w_encoding = (
    {variable: {'dtype': 'float32'}}
    for variable in ('u100', 'v100', 'w100')
)

In [17]:
# Write each component to its own consolidated zarr store; mode='w' replaces
# any existing store at that path.
store_suffix = f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
u_hourly_store = (
    '/g/data/w42/dr6273/work/data/era5/100u/100u_era5_reanalysis_sfc_'
    + store_suffix
)
v_hourly_store = (
    '/g/data/w42/dr6273/work/data/era5/100v/100v_era5_reanalysis_sfc_'
    + store_suffix
)

u.to_zarr(u_hourly_store, mode='w', consolidated=True, encoding=u_encoding)
v.to_zarr(v_hourly_store, mode='w', consolidated=True, encoding=v_encoding)

<xarray.backends.zarr.ZarrStore at 0x14e3c0e49cb0>

In [32]:
# Write the wind speed to a consolidated zarr store; mode='w' replaces any
# existing store at that path.
w_hourly_store = (
    '/g/data/w42/dr6273/work/data/era5/100w/100w_era5_reanalysis_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
w.to_zarr(
    w_hourly_store,
    mode='w',
    consolidated=True,
    encoding=w_encoding,
)

<xarray.backends.zarr.ZarrStore at 0x14ee7a3b0900>

### Compute daily averages

Do this using a 24-hour rolling mean, then select all 14:00 UTC time steps (equivalent to 00:00 Australian Eastern Standard Time, UTC+10).

In [12]:
# Re-open the u and v component zarr stores.
store_suffix = f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
u = xr.open_zarr(
    '/g/data/w42/dr6273/work/data/era5/100u/100u_era5_reanalysis_sfc_' + store_suffix,
    consolidated=True,
)
v = xr.open_zarr(
    '/g/data/w42/dr6273/work/data/era5/100v/100v_era5_reanalysis_sfc_' + store_suffix,
    consolidated=True,
)

In [24]:
# Re-open the wind-speed zarr store.
w_hourly_store = (
    '/g/data/w42/dr6273/work/data/era5/100w/100w_era5_reanalysis_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
w = xr.open_zarr(w_hourly_store, consolidated=True)

In [13]:
# Daily means of each component via the project helper, re-chunked to
# 365 * 6 daily steps per chunk along time.
daily_chunks = {'time': 365 * 6}
u_1400 = fn.daily_mean_1400(u).chunk(daily_chunks)
v_1400 = fn.daily_mean_1400(v).chunk(daily_chunks)

In [25]:
# Daily means of wind speed via the project helper, re-chunked to 365 * 6
# daily steps per chunk along time.
w_1400 = fn.daily_mean_1400(w).chunk(dict(time=365 * 6))

In [17]:
# Write the daily u means to a consolidated zarr store; mode='w' replaces any
# existing store at that path.
u_daily_store = (
    '/g/data/w42/dr6273/work/data/era5/100u/100u_era5_daily_1400UTC_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
u_1400.to_zarr(
    u_daily_store,
    mode='w',
    consolidated=True,
    encoding=u_encoding,
)

<xarray.backends.zarr.ZarrStore at 0x14d841f7f680>

In [18]:
# Write the daily v means to a consolidated zarr store; mode='w' replaces any
# existing store at that path.
v_daily_store = (
    '/g/data/w42/dr6273/work/data/era5/100v/100v_era5_daily_1400UTC_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
v_1400.to_zarr(
    v_daily_store,
    mode='w',
    consolidated=True,
    encoding=v_encoding,
)

<xarray.backends.zarr.ZarrStore at 0x14d84c1530d0>

In [28]:
# Write the daily wind-speed means to a consolidated zarr store; mode='w'
# replaces any existing store at that path.
w_daily_store = (
    '/g/data/w42/dr6273/work/data/era5/100w/100w_era5_daily_1400UTC_sfc_'
    f"{years[0]}-{years[-1]}_{aus_region['name']}_region.zarr"
)
w_1400.to_zarr(
    w_daily_store,
    mode='w',
    consolidated=True,
    encoding=w_encoding,
)

<xarray.backends.zarr.ZarrStore at 0x146d0e5dbca0>

# Temperature 2m

In [12]:
# Open the ERA5 reanalysis variable '2t' for all of `years`, subset to the REZ
# boundary, with the coordinates renamed to lon/lat.
# The boundary comes from `rez_region`; a bare `rez_boundary` name is not
# defined anywhere in this notebook and raises NameError on a fresh kernel.
t = fn.open_era_data(
    root_path=root_path+'reanalysis/',
    variable='2t',
    years=years,
    subset_region=rez_region['boundary'],
    rename_lon_lat=['lon', 'lat']
)

Check size of dataset (in GB)

In [22]:
# Dataset size: bytes divided by 2**30 (= 1024**3).
t.nbytes / 2 ** 30

25.412138305604458

Re-chunk to ~100 MB chunks (24 × 90 hourly steps along time, full spatial domain in each chunk).

In [28]:
# 24 * 90 time steps per chunk; -1 keeps each spatial dimension in one chunk.
t = t.chunk(dict(time=24 * 90, lon=-1, lat=-1))

Save to zarr

In [18]:
# On-disk encoding for the zarr stores: write t2m as float32.
t_encoding = dict(t2m=dict(dtype='float32'))

In [33]:
# Write the re-chunked 2t data to a consolidated zarr store; mode='w' replaces
# any existing store at that path.
# The region tag is taken from rez_region['name'] (currently 'REZ', so the
# path is unchanged) rather than hard-coded, matching the other stores.
t.to_zarr(
    '/g/data/w42/dr6273/work/data/era5/2t/2t_era5_reanalysis_sfc_'+str(years[0])+'-'+str(years[-1])+'_'+rez_region['name']+'_region.zarr',
    mode='w',
    consolidated=True,
    encoding=t_encoding
)

<xarray.backends.zarr.ZarrStore at 0x147310823b50>

### Compute daily averages

Do this using a 24-hour rolling mean, then select all 14:00 UTC time steps (equivalent to 00:00 Australian Eastern Standard Time, UTC+10).

In [13]:
# Re-open the 2t zarr store.
# The region tag is taken from rez_region['name'] (currently 'REZ', so the
# path is unchanged) rather than hard-coded, matching the other stores.
t = xr.open_zarr(
    '/g/data/w42/dr6273/work/data/era5/2t/2t_era5_reanalysis_sfc_'+str(years[0])+'-'+str(years[-1])+'_'+rez_region['name']+'_region.zarr',
    consolidated=True
)

In [15]:
# Daily means from the project helper. NOTE(review): presumably a 24-hour
# rolling mean sampled at 1400 UTC, as the section text describes - the
# implementation of fn.daily_mean_1400 is not visible here.
t_1400 = fn.daily_mean_1400(t)

In [16]:
# 365 * 6 daily steps (about six years) per chunk along time.
t_1400 = t_1400.chunk(dict(time=365 * 6))

In [19]:
# Write the daily (1400 UTC) 2t means to a consolidated zarr store; mode='w'
# replaces any existing store at that path.
# The region tag is taken from rez_region['name'] (currently 'REZ', so the
# path is unchanged) rather than hard-coded, matching the other stores.
t_1400.to_zarr(
    '/g/data/w42/dr6273/work/data/era5/2t/2t_era5_daily_1400UTC_sfc_'+str(years[0])+'-'+str(years[-1])+'_'+rez_region['name']+'_region.zarr',
    mode='w',
    consolidated=True,
    encoding=t_encoding
)

<xarray.backends.zarr.ZarrStore at 0x1508ff744890>

# Monthly runoff

In [30]:
# Open the ERA5 monthly-averaged variable 'ro' for all of `years`, subset to
# the REZ boundary, with the coordinates renamed to lon/lat.
ro = fn.open_era_data(
    root_path=f'{root_path}monthly-averaged/',
    variable='ro',
    years=years,
    subset_region=rez_region['boundary'],
    rename_lon_lat=['lon', 'lat'],
)

Check size of dataset (in GB)

In [31]:
# Dataset size: bytes divided by 2**30 (= 1024**3).
ro.nbytes / 2 ** 30

0.03534848242998123

Re-chunk to single chunk.

In [32]:
# The dataset is small, so keep every dimension in a single chunk (-1).
ro = ro.chunk(dict(time=-1, lon=-1, lat=-1))

Save to zarr

In [33]:
# On-disk encoding for the zarr store: write ro as float32.
ro_encoding = dict(ro=dict(dtype='float32'))

In [35]:
# Write the monthly runoff to a consolidated zarr store; mode='w' replaces any
# existing store at that path.
ro_monthly_store = (
    '/g/data/w42/dr6273/work/data/era5/ro/ro_era5_monthly-averaged_sfc_'
    f"{years[0]}-{years[-1]}_{rez_region['name']}_region.zarr"
)
ro.to_zarr(
    ro_monthly_store,
    mode='w',
    consolidated=True,
    encoding=ro_encoding,
)

<xarray.backends.zarr.ZarrStore at 0x146d0c31b7d0>