# Prepare atmospheric and ocean data

In [1]:
from dask.distributed import Client,LocalCluster
from dask_jobqueue import PBSCluster

In [2]:
# One node on Gadi has 48 cores - try and use up a full node before going to multiple nodes (jobs)

walltime = '00:20:00'
cores = 48
memory = f'{4 * cores}GB'  # 4 GB requested per core

# Each job asks PBS for `cores` CPUs and `memory` of RAM, and runs as many
# worker processes as cores. The extra directives are added to the job script
# header in addition to what dask_jobqueue generates itself.
cluster = PBSCluster(
    walltime=walltime,
    cores=cores,
    memory=memory,
    processes=cores,
    job_extra_directives=[
        '-q normal',
        '-P w42',
        f'-l ncpus={cores}',
        f'-l mem={memory}',
        '-l storage=gdata/w42+gdata/rt52+gdata/xv83',
    ],
    local_directory='$TMPDIR',
    job_directives_skip=["select"],
    # python=os.environ["DASK_PYTHON"],
)

In [3]:
cluster.scale(jobs=2)
client = Client(cluster)

In [4]:
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: http://10.6.65.35:8787/status,

0,1
Dashboard: http://10.6.65.35:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.65.35:39001,Workers: 0
Dashboard: http://10.6.65.35:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
# %reload_ext autoreload
# %autoreload 2

In [86]:
client.close()
cluster.close()

In [6]:
import xarray as xr
import numpy as np
import pandas as pd

In [7]:
import functions as fn

In [7]:
def anomalise_and_process(
    load,
    dataset,
    filepath,
    data_root_path,
    var_name,
    years,
    subset_region,
    subset_level,
    preprocess_func,
    mfdataset_chunks,
    anom_freq,
    new_var_name,
    long_name,
    units,
    chunks
):
    """
    Load previously written anomalies, or compute them and write them to zarr.

    When processing, the variable is opened for the requested period, the
    latitude/longitude names are shortened to lat/lon, and the data are
    anomalised by removing the anom_freq means computed over the entire
    period. The anomalies are written to filepath as float32, in a variable
    named new_var_name + '_anom'.

    load : Bool. If True, open and return the zarr store at filepath; every
        other argument is then unused.
    dataset : "era5" or "hadisst"
    filepath : zarr store to load from (load=True) or write to (load=False).
    data_root_path : path to raw era5 or hadisst data on Gadi.
    var_name : string. Name of the variable in the opened data. Passed to
        functions.open_era_data for era5, and for both datasets it is the
        variable that gets renamed to new_var_name.
    years : range. Period to process.
    subset_region : list. Region to subset over. See functions.open_era_data
    subset_level : list. Pressure levels to select. See functions.open_era_data
    preprocess_func : Preprocess using xr.open_mfdataset if desired. See functions.open_era_data
    mfdataset_chunks : Chunks to use for preprocessing in xr.open_mfdataset. See functions.open_era_data
    anom_freq : string. Over what frequency to anomalise, e.g. month or dayofyear
    new_var_name : desired name of variable.
    long_name : long name for xarray Dataset attributes.
    units : units for xarray Dataset attributes.
    chunks : dict. Specify chunks before writing to zarr.

    Returns an xarray Dataset: the opened store when load is True, otherwise
    the Dataset of anomalies that was just written.
    Raises ValueError if dataset is neither "era5" nor "hadisst".
    """
    if load:
        return xr.open_zarr(filepath, consolidated=True)

    print('Opening files...')
    if dataset == 'era5':
        ds = fn.open_era_data(
            root_path=data_root_path,
            variable=var_name,
            years=years,
            subset_region=subset_region,
            subset_level=subset_level,
            preprocess_func=preprocess_func,
            mfdataset_chunks=mfdataset_chunks
        )
    elif dataset == 'hadisst':
        ds = xr.open_zarr(data_root_path, consolidated=True)
        ds = ds.sel(time=slice(str(years[0]), str(years[-1])))
        # Replace the time axis with month-start stamps spanning January of
        # the first year to December of the last year.
        ds['time'] = pd.date_range(str(years[0]), str(years[-1])+'-12-01', freq='1MS')
        # Values of -2 or below become NaN (presumably flag/fill values
        # rather than real SSTs - TODO confirm against the HadISST docs).
        ds = xr.where(ds > -2, ds, np.nan)
    else:
        raise ValueError("Incorrect dataset.")

    ds = ds.rename({
        'latitude': 'lat',
        'longitude': 'lon',
        var_name: new_var_name
    })
    da = ds[new_var_name]
    da = da.chunk(chunks)

    print('Anomalising...')
    # GroupBy.map replaces GroupBy.apply, which xarray keeps only as a
    # backward-compatible alias that emits a deprecation warning.
    da_anoms = da.groupby('time.' + anom_freq).map(lambda x: x - x.mean('time'))

    print('Attributes and writing...')
    da_anoms = da_anoms.assign_attrs({
        'long_name': long_name,
        'short_name': new_var_name,
        'units': units
    })

    ds_anoms = da_anoms.to_dataset(name=new_var_name + '_anom')
    encoding = {new_var_name + '_anom': {'dtype': 'float32'}}
    ds_anoms.to_zarr(
        filepath,
        mode='w',
        consolidated=True,
        encoding=encoding
    )

    return ds_anoms

In [8]:
data_fp = '/g/data/w42/dr6273/work/data/'
years = range(1959, 2021)

# Daily data on extended Australia region

In [9]:
extended_aus = [100, 180, 10, -60]

### Note:

I couldn't process hourly data to daily averages using the function above (`anomalise_and_process`).
Instead, load the hourly data beforehand, using `preprocess` in `open_mfdataset` to select the subregion and level, and then calculate the daily averages.
Write these daily data to file, then load them with `anomalise_and_process` to anomalise as before.

In [10]:
import os

In [11]:
def get_files(file_path, var, years):
    """
    Get list of files found in file_path/var/<year>/ for each year.

    file_path : string. Root directory containing one sub-directory per
        variable. A trailing slash is optional.
    var : string. Name of the variable sub-directory.
    years : iterable. Years to collect; each must be a sub-directory of
        file_path/var.

    Returns a list of file paths, in the order the years were given and
    sorted by file name within each year.
    Raises FileNotFoundError (from os.listdir) if a year directory is missing.
    """
    fp_list = []
    for year in years:
        # os.path.join only inserts a separator where one is needed, so the
        # result is the same whether or not file_path ends with a slash.
        fp_dir = os.path.join(file_path, var, str(year))
        fp_list.extend(
            os.path.join(fp_dir, fp) for fp in sorted(os.listdir(fp_dir))
        )
    return fp_list

In [12]:
def _daily_mean_1400(ds):
    """
    Daily average of ds, sampled at 1400UTC (so an eastern Aus day, roughly).

    A rolling mean over 24 consecutive time steps is taken, and only the
    values whose timestamp has hour == 14 are kept. This gives one 24-hour
    mean per day if ds is hourly (assumed - TODO confirm).

    NOTE(review): rolling() is called without center=..., so each retained
    value looks like the mean of the 24 steps *ending* at the 14:00 label
    rather than starting there - confirm this is the intended window.

    The window length and hour are currently hard-coded, as that is easier
    to use with the open_mfdataset preprocess argument.
    """
    running_mean = ds.rolling(time=24).mean()
    at_1400 = running_mean.time.dt.hour == 14
    return running_mean.isel(time=at_1400)

In [13]:
dataset = 'era5'
levels = [500]

In [14]:
files = get_files('/g/data/rt52/era5/pressure-levels/reanalysis/', 'z', years)

In [19]:
# Using preprocess in open_mfdataset to select desired levels improves performance
#  versus doing a .sel() afterwards
def preprocess(ds):
    """
    Shorten the coordinate names and keep only the desired levels and subregion.

    The levels come from the module-level `levels`, and the region from the
    module-level `extended_aus`, read as
    [lon_start, lon_end, lat_start, lat_end].
    """
    renamed = ds.rename({'longitude': 'lon', 'latitude': 'lat'})
    lon_range = slice(extended_aus[0], extended_aus[1])
    lat_range = slice(extended_aus[2], extended_aus[3])
    return renamed.sel(level=levels, lon=lon_range, lat=lat_range)

In [20]:
%%time
# Open all of the hourly files as a single dataset. preprocess() is passed to
# open_mfdataset so the level/region subsetting is done per file, before the
# files are combined.
z = xr.open_mfdataset(
    files,
    chunks={'time': 24, 'level': 1},  # dask chunks: 24 time steps, one level
    preprocess=preprocess,
    compat='override',
    coords='minimal',
    engine='netcdf4'
)

CPU times: user 11.4 s, sys: 708 ms, total: 12.1 s
Wall time: 12 s


In [21]:
z = _daily_mean_1400(z)

In [22]:
z = z.chunk({'time': 365})

In [24]:
%%time
# Store z as float32 on disk
encoding = {'z': {'dtype': 'float32'}}

# Overwrite any existing store at this path (mode='w')
z.to_zarr(
    f'{data_fp}{dataset}/z/z_{dataset}_daily_average_pl_{years[0]}-{years[-1]}.zarr',
    mode='w',
    consolidated=True,
    encoding=encoding
)

KeyboardInterrupt: 

In [82]:
# year = [1959]

# z = fn.open_era_data(
#     root_path='/g/data/rt52/era5/pressure-levels/reanalysis/',
#     variable='z',
#     years=year,
#     subset_region=extended_aus,
#     subset_level=[500],
#     preprocess_func=_daily_mean_1400,
#     mfdataset_chunks={'time': 24}
# )

# z = z.chunk({'time': -1})

# z.to_zarr(
#     data_fp + dataset + '/z/z_' + dataset + '_daily_average_pl_' + str(years[0]) + '-' + str(years[-1]) + '.zarr',
#     mode='w',
#     consolidated=True
# )

## 500 hPa geopotential height

In [90]:
# years = range(2010, 2011)

In [83]:
# dataset = 'era5'
# z_fp = data_fp + dataset + '/z/z_anom_' + dataset + '_daily_average_pl_' + str(years[0]) + '-' + str(years[-1]) + '.zarr'
# print(z_fp)

In [84]:
# z_anoms = anomalise_and_process(
#     load=False,
#     dataset='era5',
#     filepath=z_fp,
#     data_root_path='/g/data/rt52/era5/pressure-levels/reanalysis/',
#     var_name='z',
#     years=years,
#     subset_region=extended_aus,
#     subset_level=[500],
#     preprocess_func=_daily_mean_1400,
#     mfdataset_chunks={'time': 24},
#     anom_freq='dayofyear',
#     new_var_name='z',
#     long_name='Geopotential height',
#     units='m',
#     chunks={'time': 12, 'lat': -1, 'lon': -1}
# )

In [85]:
# z = fn.open_era_data(root_path='/g/data/rt52/era5/pressure-levels/reanalysis/',
#                 variable='z',
#                 years=years,
#                     subset_region=extended_aus,
#                     subset_level=[500],
#                     preprocess_func=_daily_mean_1400,
#                     mfdataset_chunks={'time': 24})

# Monthly data, global

## SST data

In [9]:
dataset = 'hadisst'
sst_fp = data_fp + dataset + '/sst/sst_anom_'+dataset+'_moda_sfc_'+str(years[0])+'-'+str(years[-1])+'.zarr'
print(sst_fp)

/g/data/w42/dr6273/work/data/hadisst/sst/sst_anom_hadisst_moda_sfc_1959-2020.zarr


In [46]:
sst_anoms = anomalise_and_process(
    load=True,
    dataset='hadisst',
    filepath=sst_fp,
    data_root_path='/g/data/xv83/reanalyses/HadISST/ocean_month.zarr',
    var_name='sst',
    years=years,
    subset_region=None,
    subset_level=None,
    preprocess_func=None,
    mfdataset_chunks=None,
    anom_freq='month',
    new_var_name='sst',
    long_name='Sea surface temperature',
    units='K',
    chunks={'time': 12, 'lat': -1, 'lon': -1}
)

### Nino 3.4

In [47]:
# Nino 3.4 index: mean SST anomaly over the box 5N-5S, 170W-120W.
# The lat/lon mean is a plain (unweighted) mean over the selected grid cells.
# NOTE(review): the descending lat slice and negative lon bounds assume lat is
# stored north-to-south and lon runs -180..180 - confirm against the store.
nino34 = sst_anoms.sst_anom.sel(
    lat=slice(5, -5),
    lon=slice(-170, -120)
).mean(['lat','lon']).to_dataset(name='nino34')

In [48]:
nino34_dt = fn.detrend_dim(nino34.nino34, 'time').to_dataset(name='nino34_detrended')

In [49]:
nino34 = nino34.merge(nino34_dt)

In [50]:
nino34_fp = data_fp + dataset + '/climate_modes/'+dataset+'_nino34_'+str(years[0])+'-'+str(years[-1])+'.zarr'
nino34.to_zarr(nino34_fp, mode='w', consolidated=True)

<xarray.backends.zarr.ZarrStore at 0x150d125c49e0>

### Dipole mode index

In [51]:
def calc_dmi(da):
    """
    Calculate Dipole Mode Index

    The index is the mean of da over a western box (lat 10 to -10,
    lon 50 to 70) minus the mean over an eastern box (lat 0 to -10,
    lon 90 to 110). Box means are plain (unweighted) means over lat and lon.
    """
    def _box_mean(lat_bounds, lon_bounds):
        # Mean of da over one lat/lon box
        box = da.sel(lat=slice(*lat_bounds), lon=slice(*lon_bounds))
        return box.mean(['lat', 'lon'])

    west = _box_mean((10, -10), (50, 70))
    east = _box_mean((0, -10), (90, 110))
    return west - east

In [52]:
dmi = calc_dmi(sst_anoms.sst_anom).to_dataset(name='dmi')

In [53]:
dmi_dt = fn.detrend_dim(dmi.dmi, 'time').to_dataset(name='dmi_detrended')

In [54]:
dmi = dmi.merge(dmi_dt)

In [55]:
dmi_fp = data_fp + dataset + '/climate_modes/' + dataset + '_dmi_' + str(years[0]) + '-' + str(years[-1]) + '.zarr'

In [56]:
dmi.to_zarr(dmi_fp, mode='w', consolidated=True)

<xarray.backends.zarr.ZarrStore at 0x150d0f843920>

## MSLP data

In [8]:
dataset = 'era5'
mslp_fp = data_fp + dataset + '/mslp/mslp_anom_'+dataset+'_moda_sfc_'+str(years[0])+'-'+str(years[-1])+'.zarr'
print(mslp_fp)

/g/data/w42/dr6273/work/data/era5/mslp/mslp_anom_era5_moda_sfc_1959-2020.zarr


In [94]:
mslp_anoms = anomalise_and_process(
    load=False,
    dataset='era5',
    filepath=mslp_fp,
    data_root_path='/g/data/rt52/era5/single-levels/monthly-averaged/',
    var_name='msl',
    years=years,
    subset_region=None,
    subset_level=None,
    preprocess_func=None,
    mfdataset_chunks=None,
    anom_freq='month',
    new_var_name='mslp',
    long_name='Mean sea level pressure',
    units='Pa',
    chunks={'time': 12, 'lat': -1, 'lon': -1}
)

### Southern Annular Mode index

In [26]:
def calc_sam(mslp_anoms, lat_name='lat', lon_name='lon', time_name='time'):
    """
    Calculate Southern Annular Mode index from MSLP anomalies. The SAM index is
    defined as the difference in normalised zonal-mean MSLP anomalies between
    40 and 65 degrees South.

    At each of the two latitudes the anomalies are interpolated to that
    latitude, averaged over longitude, and then normalised by dividing by
    their standard deviation (calculated as a function of calendar month).

    mslp_anoms : xarray object of MSLP anomalies.
    lat_name : name of the latitude coordinate.
    lon_name : name of the longitude dimension.
    time_name : name of the time dimension.

    Returns the normalised value at 40S minus the normalised value at 65S.
    """
    def _normalised_zonal_mean(latitude):
        # Zonal mean at one latitude, scaled by its calendar-month std
        zonal_mean = mslp_anoms.interp({lat_name: latitude}).mean(lon_name)
        # GroupBy.map replaces GroupBy.apply, which xarray keeps only as a
        # backward-compatible alias that emits a deprecation warning.
        return zonal_mean.groupby(time_name + '.month').map(
            lambda x: x / x.std(time_name)
        )

    return _normalised_zonal_mean(-40) - _normalised_zonal_mean(-65)

In [27]:
sam = calc_sam(mslp_anoms)

In [31]:
sam = sam.rename({'mslp_anom': 'sam'})

In [32]:
sam_dt = fn.detrend_dim(sam.sam, 'time').to_dataset(name='sam_detrended')

In [33]:
sam = sam.merge(sam_dt)

In [36]:
sam_fp = data_fp + dataset + '/climate_modes/'+dataset+'_sam_'+str(years[0])+'-'+str(years[-1])+'.zarr'
sam.to_zarr(sam_fp, mode='w', consolidated=True)

<xarray.backends.zarr.ZarrStore at 0x1524973aad50>

# Close cluster

In [59]:
client.close()
cluster.close()