# Prep variables for HydroTas 2020-2021 workplan:
- Skill assessment
  - rainfall, surface temp and surface wind over Australia region and Tasmania region
  - Assess skill as function of start month and ensemble size
- UNSEEN
  - Tasmanian rainfall and Melbourne surface temperature

In [1]:
import glob
import cftime
import geopandas
import regionmask
import numpy as np
import xarray as xr
import myfuncs as my
from dask.distributed import Client
from dask_jobqueue import PBSCluster

In [2]:
# Resources requested for each PBS job
walltime = '01:00:00'
cores = 24
memory = '48GB'

# One worker process per job; the extra directives repeat the core and
# memory request and name the project, job filesystem and storage mounts
cluster = PBSCluster(
    processes=1,
    cores=cores,
    memory=memory,
    walltime=walltime,
    job_extra=[
        f'-l ncpus={cores}',
        f'-l mem={memory}',
        '-P ux06',
        '-l jobfs=100GB',
        '-l storage=gdata/xv83+gdata/v14+scratch/v14',
    ],
    local_directory='$PBS_JOBFS',
    header_skip=['select'],
)

In [3]:
# Ask the cluster for two PBS jobs, then connect a client to it
cluster.scale(jobs=2)
client = Client(cluster)
# Bare expression so the notebook displays the client summary
client

0,1
Client  Scheduler: tcp://10.6.55.45:45537  Dashboard: http://10.6.55.45:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


# Create Australia land mask on CAFE grid

In [4]:
def preprocess_f6(ds):
    """Preprocess steps for the CAFE-f6 forecasts.

    Drops the coordinates ``average_DT``, ``average_T1``, ``average_T2``,
    ``zsurf`` and ``area`` when they are present, then rounds every
    dimension coordinate whose name contains ``'lat'`` to 10 decimal places.

    Parameters
    ----------
    ds : object exposing ``coords``, ``dims``, ``drop_vars`` and
        ``assign_coords`` (presumably an xarray Dataset or DataArray)

    Returns
    -------
    The cleaned object. ``ds`` itself is returned when there is nothing to
    drop and no latitude dimension to round.
    """
    # Only drop names that really are coordinates of this object
    unwanted = [coord
                for coord in ('average_DT', 'average_T1', 'average_T2',
                              'zsurf', 'area')
                if coord in ds.coords]
    if unwanted:
        # drop_vars is the non-deprecated replacement for drop(name)
        ds = ds.drop_vars(unwanted)
    # Round latitudes to 10dp
    # NOTE(review): presumably so latitudes from different sources compare
    # equal despite floating-point noise - confirm
    for dim in [d for d in ds.dims if 'lat' in d]:
        ds = ds.assign_coords({dim: ds[dim].round(decimals=10)})
    return ds

In [5]:
# Grid-cell areas taken from one f6 forecast and passed through the same
# preprocessing as the forecast data
CAFE_area = my.open_zarr(
    '/g/data/v14/vxk563/CAFE/forecasts/f6/WIP/c5-d60-pX-f6-20201101/ZARR/atmos_isobaric_daily.zarr.zip',
)['area']
CAFE_area = preprocess_f6(CAFE_area)

# Build regionmask regions from the NRM cluster shapefile
NRM = geopandas.read_file('data/NRM_clusters/NRM_clusters.shp')
regions = regionmask.Regions_cls(
    outlines=list(NRM.geometry),
    numbers=list(NRM.index),
    names=list(NRM.label),
    abbrevs=list(NRM.code),
    name='NRM_regions')

# Region mask evaluated on the lon/lat coordinates of CAFE_area
regions_mask = regions.mask(CAFE_area, lon_name='lon', lat_name='lat')

# True wherever the region mask is not null, False elsewhere
Australia_mask = xr.where(regions_mask.notnull(), True, False)

# Parameters

In [12]:
# Regions to extract, keyed by the short name used in output file names
REGIONS = {
    # Boolean mask built from the NRM regions
    'AUS': Australia_mask,
    # Single points - presumably (lat, lon); confirm against my.get_region
    'TAS': (-42, 146.5),
    'MEL': (-37.81, 144.96),
}

# Get f6 atmospheric monthly variables

In [86]:
# Source variable name -> output name (unchanged for the CAFE forecasts)
VARIABLES = {
    variable: variable
    for variable in ('precip', 't_ref', 'u_ref', 'v_ref')
}

In [87]:
def _forecast_dir(zarr_path):
    """Return the third-from-last path component (the forecast directory)."""
    return zarr_path.split('/')[-3]


# The f6 forecasts are spread over two project directories
paths_xv83 = glob.glob(
    '/g/data/xv83/ds0092/CAFE/forecasts/f6/WIP/c5-d60-pX-f6-??????01/ZARR/atmos_isobaric_month.zarr.zip'
)
paths_v14 = glob.glob(
    '/g/data/v14/vxk563/CAFE/forecasts/f6/WIP/c5-d60-pX-f6-??????01/ZARR/atmos_isobaric_month.zarr.zip'
)

# Merge both lists and order them by forecast directory name
paths = sorted([*paths_xv83, *paths_v14], key=_forecast_dir)

In [88]:
%%time
for name, region in REGIONS.items():
    # Open the f6 forecasts and apply the output variable names
    ds = my.open_zarr_forecasts(
        paths,
        variables=VARIABLES.keys(),
        preprocess=preprocess_f6)
    ds = ds.rename(VARIABLES)

    # Area-weighted mean over the region
    ds = my.get_region(ds, region)
    ds = ds.weighted(CAFE_area).mean(['lat','lon'])

    # Replace missing times with a dummy date so that time operations
    # work nicely
    dummy_time = cftime.DatetimeJulian(1800, 1, 1)
    ds = ds.assign_coords({'time': ds.time.fillna(dummy_time)})

    # A single chunk along both init_date and lead_time
    ds = ds.chunk({'init_date': -1, 'lead_time': -1})

    # Save each variable separately
    for var in VARIABLES.values():
        my.to_zarr(ds[[var]], f'./data/f6_{var}_{name}_raw.zarr')

CPU times: user 1min 12s, sys: 7.28 s, total: 1min 19s
Wall time: 2min 53s


# Get f5 atmospheric monthly variables

In [13]:
# Source variable name -> output name (unchanged for the CAFE forecasts)
VARIABLES = {
    variable: variable
    for variable in ('precip', 't_ref', 'u_ref', 'v_ref')
}

In [14]:
# Zarr store holding the f5 monthly atmospheric output
path = '/g/data/v14/vxk563/CAFE/forecasts/f5/ZARR/atmos_isobaric_month.zarr.zip'

In [15]:
def preprocess_f5(ds):
    """Preprocess steps for the CAFE-f5 forecasts.

    Drops the coordinates ``average_DT``, ``average_T1``, ``average_T2``,
    ``zsurf`` and ``area`` when they are present, then rounds every
    dimension coordinate whose name contains ``'lat'`` to 10 decimal places.

    Parameters
    ----------
    ds : object exposing ``coords``, ``dims``, ``drop_vars`` and
        ``assign_coords`` (presumably an xarray Dataset or DataArray)

    Returns
    -------
    The cleaned object. ``ds`` itself is returned when there is nothing to
    drop and no latitude dimension to round.
    """
    # Only drop names that really are coordinates of this object
    unwanted = [coord
                for coord in ('average_DT', 'average_T1', 'average_T2',
                              'zsurf', 'area')
                if coord in ds.coords]
    if unwanted:
        # drop_vars is the non-deprecated replacement for drop(name)
        ds = ds.drop_vars(unwanted)
    # Round latitudes to 10dp
    # NOTE(review): presumably so latitudes from different sources compare
    # equal despite floating-point noise - confirm
    for dim in [d for d in ds.dims if 'lat' in d]:
        ds = ds.assign_coords({dim: ds[dim].round(decimals=10)})
    return ds

In [16]:
%%time
for name, region in REGIONS.items():
    # Open the f5 forecasts and apply the output variable names
    ds = my.open_zarr(
        path,
        variables=VARIABLES.keys(),
        preprocess=preprocess_f5)
    ds = ds.rename(VARIABLES)

    # Area-weighted mean over the region
    ds = my.get_region(ds, region)
    ds = ds.weighted(CAFE_area).mean(['lat','lon'])

    # Tag the calendar, then replace missing times with a dummy date so
    # that time operations work nicely
    ds.time.attrs['calendar_type'] = 'JULIAN'
    dummy_time = cftime.DatetimeJulian(1800, 1, 1)
    ds = ds.assign_coords({'time': ds.time.fillna(dummy_time)})

    # A single chunk along both init_date and lead_time
    ds = ds.chunk({'init_date': -1, 'lead_time': -1})

    # Save each variable separately
    for var in VARIABLES.values():
        my.to_zarr(ds[[var]], f'./data/f5_{var}_{name}_raw.zarr')

CPU times: user 15.3 s, sys: 1.38 s, total: 16.6 s
Wall time: 1min 51s


In [19]:
# `ds` is the variable left over from the last region loop that ran (the f5
# loop in file order). Its lead_time and init_date coordinates are passed to
# my.stack_by_init_date in the JRA-55 and AWAP sections.
obsv_lead_times = ds.lead_time
obsv_init_dates = ds.init_date

# JRA-55 data

In [20]:
# JRA-55 variable name -> name used for the matching forecast variable
VARIABLES = {
    'TPRAT_GDS0_SFC': 'precip',
    'TMP_GDS0_HTGL': 't_ref',
    'UGRD_GDS0_HTGL': 'u_ref',
    'VGRD_GDS0_HTGL': 'v_ref',
}

In [21]:
# Zarr store of JRA-55 monthly surface fields (per the file name, on the CAFE grid)
path = '/g/data/v14/ds0092/data/ZARR/csiro-dcfp-jra55/surface_month_cafe-grid.zarr'

In [22]:
def preprocess_jra(ds):
    """Preprocess steps for the JRA data.

    Renames the coordinates ``initial_time0_hours`` -> ``time`` and
    ``lv_ISBL1`` -> ``level`` when present, removes the ``filename``
    attribute if there is one, and rounds every dimension coordinate whose
    name contains ``'lat'`` to 10 decimal places.

    Parameters
    ----------
    ds : object exposing ``coords``, ``dims``, ``attrs``, ``rename`` and
        ``assign_coords`` (presumably an xarray Dataset)

    Returns
    -------
    The cleaned object.

    Notes
    -----
    ``filename`` is removed from ``ds.attrs`` in place, so when nothing is
    renamed the caller's object loses that attribute too.
    """
    # Rename time and level, but only the coordinates that exist
    wanted = {'initial_time0_hours': 'time', 'lv_ISBL1': 'level'}
    renames = {old: new for old, new in wanted.items() if old in ds.coords}
    if renames:
        ds = ds.rename(renames)
    # Drop filename attribute; tolerate data that never had one
    ds.attrs.pop('filename', None)
    # Round latitudes to 10dp
    for dim in [d for d in ds.dims if 'lat' in d]:
        ds = ds.assign_coords({dim: ds[dim].round(decimals=10)})
    return ds

In [23]:
%%time
for name, region in REGIONS.items():
    # Open the JRA-55 data and rename its variables to the forecast names
    ds = my.open_zarr(
        path,
        variables=VARIABLES.keys(),
        preprocess=preprocess_jra)
    ds = ds.rename(VARIABLES)

    # Area-weighted mean over the region
    ds = my.get_region(ds, region)
    ds = ds.weighted(CAFE_area).mean(['lat','lon'])

    # A single chunk along time
    ds = ds.chunk({'time': -1})

    # Save the plain time series, one variable at a time
    for var in VARIABLES.values():
        my.to_zarr(ds[[var]], f'./data/jra55_{var}_{name}_ts.zarr')

    # Stack by initial date, using the init dates and number of lead times
    # captured earlier
    ds = my.stack_by_init_date(ds, obsv_init_dates, len(obsv_lead_times))
    ds = ds.chunk({'init_date': -1, 'lead_time': -1})

    # Tag the calendar, then replace missing times with a dummy date so
    # that time operations work nicely
    ds.time.attrs['calendar_type'] = 'Proleptic_Gregorian'
    dummy_time = cftime.DatetimeProlepticGregorian(1800, 1, 1)
    ds = ds.assign_coords({'time': ds.time.fillna(dummy_time)})

    # Save the stacked version, one variable at a time
    for var in VARIABLES.values():
        my.to_zarr(ds[[var]], f'./data/jra55_{var}_{name}.zarr')

CPU times: user 6.81 s, sys: 692 ms, total: 7.5 s
Wall time: 21.6 s


# AWAP monthly data

In [24]:
# AWAP variable name -> output name (only precip is used, name unchanged)
VARIABLES = {'precip': 'precip'}

In [25]:
# Zarr store of AWAP rainfall (per the file name: daily, 1900-01-01 to 2020-12-02, on the CAFE grid)
path = '/g/data/v14/ds0092/data/ZARR/csiro-dcfp-csiro-awap/rain_day_19000101-20201202_cafe-grid.zarr'

In [26]:
def preprocess_awap(ds):
    """Preprocess steps for the AWAP data.

    Rounds every dimension coordinate whose name contains ``'lat'`` to 10
    decimal places and returns the result; ``ds`` is returned as is when it
    has no such dimension.
    """
    lat_dims = [name for name in ds.dims if 'lat' in name]
    for name in lat_dims:
        rounded = ds[name].round(decimals=10)
        ds = ds.assign_coords({name: rounded})
    return ds

In [27]:
%%time
def sum_min_samples(group, dim, min_samples):
    """ Return the sum along dim, or all-NaN when there are fewer than
        min_samples samples (i.e. the sum needs at least min_samples) """
    total = group.sum(dim, skipna=False)
    if len(group[dim]) >= min_samples:
        return total
    return np.nan*total


for name, region in REGIONS.items():
    # Open the AWAP data and apply the output variable names
    ds = my.open_zarr(
        path,
        variables=VARIABLES.keys(),
        preprocess=preprocess_awap)
    ds = ds.rename(VARIABLES)

    # Sum to monthly values; months with fewer than 28 samples become NaN
    monthly = ds.resample(time='MS')
    ds = monthly.map(sum_min_samples, dim='time', min_samples=28)

    # Area-weighted mean over the region
    ds = my.get_region(ds, region)
    ds = ds.weighted(CAFE_area).mean(['lat','lon'])

    # A single chunk along time
    ds = ds.chunk({'time': -1})

    # Save the plain time series, one variable at a time
    for var in VARIABLES.values():
        my.to_zarr(ds[[var]], f'./data/awap_{var}_{name}_ts.zarr')

    # Stack by initial date, using the init dates and number of lead times
    # captured earlier
    ds = my.stack_by_init_date(ds, obsv_init_dates, len(obsv_lead_times))
    ds = ds.chunk({'init_date': -1, 'lead_time': -1})

    # Tag the calendar, then replace missing times with a dummy date so
    # that time operations work nicely
    ds.time.attrs['calendar_type'] = 'Proleptic_Gregorian'
    dummy_time = cftime.DatetimeProlepticGregorian(1800, 1, 1)
    ds = ds.assign_coords({'time': ds.time.fillna(dummy_time)})

    # Save the stacked version, one variable at a time
    for var in VARIABLES.values():
        my.to_zarr(ds[[var]], f'./data/awap_{var}_{name}.zarr')

CPU times: user 59.8 s, sys: 1.27 s, total: 1min 1s
Wall time: 1min 14s


# End

In [28]:
# Disconnect the client first, while the cluster it is connected to is
# still up, and only then shut the cluster down
client.close()
cluster.close()