# Load, manipulate and save desired climate information

In [None]:
from dask_jobqueue import PBSCluster
from dask.distributed import Client

In [None]:
# A single Gadi node has 48 cores, so fill one whole node before spreading over several nodes (jobs)

walltime = '00:30:00'
cores = 48
memory = f'{4 * cores}GB'  # 4 GB for every requested core

cluster = PBSCluster(
    walltime=walltime,
    cores=cores,
    memory=memory,
    # Extra PBS directives (cpus, memory, project, storage) passed with each job
    job_extra=[
        f'-l ncpus={cores}',
        f'-l mem={memory}',
        '-P xv83',
        '-l storage=gdata/xv83+gdata/rt52+scratch/xv83',
    ],
    header_skip=["select"],
)

In [None]:
# Ask the cluster for one job, then attach a distributed client to it
cluster.scale(jobs=1)
client = Client(cluster)

In [None]:
# Show the client as the cell output
client

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import os

import matplotlib
import matplotlib.pyplot as plt

In [None]:
import functions as fn

# ERA5

In [None]:
# Analysis period: 1979 to 2020 inclusive (the end of a range is exclusive)
years = range(1979, 2021)

In [None]:
# Directory passed to fn.open_era_data as the root of the ERA5 monthly-averaged single-level data
root_path = '/g/data/rt52/era5/single-levels/monthly-averaged/'

### Calculate vapour pressure deficit (VPD)

- Use formula from https://www.nature.com/articles/s41598-019-51857-8#Sec12

$$ \mathrm{VPD} = c_1 \left( \exp \left[ \frac{c_2 \cdot T}{c_3 + T} \right] - \exp \left[ \frac{c_2 \cdot T_d}{c_3 + T_d} \right] \right),$$

where $c_1 = 0.611$ kPa, $c_2 = 17.5$, $c_3 = 240.978$ $^\circ$C, $T$ is temperature ($^\circ$C) and $T_d$ is dew-point temperature ($^\circ$C). The units of VPD are kilopascals (kPa). The first term represents saturated vapour pressure, and the second term represents actual vapour pressure.

In [None]:
def calculate_VPD(T, Td):
    """
    Calculate vapour pressure deficit (VPD) from temperature and dew-point temperature.

    VPD = c1 * (exp(c2 * T / (c3 + T)) - exp(c2 * Td / (c3 + Td)))

    The first exponential term is the saturated vapour pressure and the
    second is the actual vapour pressure.
    Ref:  https://www.nature.com/articles/s41598-019-51857-8

    Parameters
    ----------
    T : scalar, numpy array or labelled array (e.g. xarray object)
        Temperature in degrees Celsius.
    Td : scalar, numpy array or labelled array (e.g. xarray object)
        Dew-point temperature in degrees Celsius.

    Returns
    -------
    VPD in kilopascals (kPa), of whatever type the arithmetic on `T` and
    `Td` produces. If that result has an `attrs` mapping, the `long_name`,
    `short_name` and `units` entries are set on it.
    """

    c1 = 0.611    # kPa
    c2 = 17.5     # dimensionless
    c3 = 240.978  # degrees Celsius

    saturated_vp = np.exp(c2 * T / (c3 + T))
    actual_vp = np.exp(c2 * Td / (c3 + Td))
    vpd = c1 * (saturated_vp - actual_vp)

    # Plain numpy arrays and scalars carry no metadata; only label results
    # that provide an `attrs` mapping, instead of raising AttributeError.
    if hasattr(vpd, 'attrs'):
        vpd.attrs['long_name'] = '2 metre vapour pressure deficit'
        vpd.attrs['short_name'] = '2m VPD'
        vpd.attrs['units'] = 'kPa'

    return vpd

In [None]:
# 2m temperature, with the coordinates renamed to lat/lon
era5_2t = fn.open_era_data(root_path, '2t', years).rename(
    {'latitude': 'lat', 'longitude': 'lon'}
)

# 2m dew point temperature, with the coordinates renamed to lat/lon
era5_2d = fn.open_era_data(root_path, '2d', years).rename(
    {'latitude': 'lat', 'longitude': 'lon'}
)

In [None]:
# calculate_VPD expects degrees Celsius, so 273.15 is subtracted from both
# fields first (assumes t2m and d2m are stored in Kelvin - TODO confirm)
era5_vpd = calculate_VPD(era5_2t['t2m'] - 273.15,
                         era5_2d['d2m'] - 273.15)

Rechunk to single time chunk

In [None]:
# One chunk along the whole time axis (-1), 250 x 250 blocks in space
era5_vpd = era5_vpd.chunk({'time': -1, 'lat': 250, 'lon': 250})

Save to zarr

In [None]:
# Wrap the array in a dataset whose only data variable is named 'vpd'
era5_vpd = era5_vpd.to_dataset(name='vpd')

In [None]:
# Write the 'vpd' variable to disk as float32
vpd_encoding = {'vpd': {'dtype': 'float32'}}

In [None]:
# Write the VPD store, named after the first and last year of the period;
# mode='w' replaces any store already at that path
era5_vpd.to_zarr(
    f'/g/data/xv83/dr6273/work/data/vpd/vpd_era5_moda_sfc_{years[0]}-{years[-1]}.zarr',
    mode='w',
    consolidated=True,
    encoding=vpd_encoding,
)

# GPCC

#### Merge different GPCC data sets to get latest data, if we need 2021
- As of 24/02/22, still waiting on ERA5 Nov and Dec 2021.

In [None]:
# Directory holding the GPCC precipitation file opened below
gpcc_path = '/g/data/xv83/dr6273/work/data/gpcc/'

In [None]:
# Analysis period: 1979 to 2020 inclusive (the end of a range is exclusive)
years = range(1979, 2021)

In [None]:
# Open the merged GPCC file; per its name it joins the full-data monthly v2020
# product (1891-2019) with the monitoring v6 product (2020-01 to 2020-12) at 1 degree
gpcc = xr.open_mfdataset(gpcc_path + 'precip.full.data.monthly.v2020.1891-2019.concat.monitoring.v6.202001-202012.1deg.nc')

In [None]:
# Keep only the time steps from the first to the last year of the analysis period
gpcc = gpcc.sel(time=slice(str(years[0]), str(years[-1])))

In [None]:
# Pull out the precipitation variable and tag it with a short name
gpcc = gpcc['precip'].assign_attrs(short_name='precip')

In [None]:
# Rechunk so each dimension is one chunk (-1 spans the whole dimension)
gpcc = gpcc.chunk({'time': -1, 'lat': -1, 'lon': -1})

In [None]:
# Wrap the array in a dataset whose only data variable is named 'precip'
gpcc = gpcc.to_dataset(name='precip')

In [None]:
# Write the 'precip' variable to disk as float32
gpcc_encoding = {'precip': {'dtype': 'float32'}}

In [None]:
# Write the precipitation store, named after the first and last year of the
# period; mode='w' replaces any store already at that path
gpcc.to_zarr(
    f'/g/data/xv83/dr6273/work/data/gpcc/precip_gpcc_sfc_{years[0]}-{years[-1]}.zarr',
    mode='w',
    consolidated=True,
    encoding=gpcc_encoding,
)

# Berkeley temperature
- Provides temperature anomalies and the climatology. Use these to reconstruct temperature

In [None]:
# Analysis period: 1979 to 2020 inclusive (the end of a range is exclusive)
years = range(1979, 2021)

In [None]:
# Open the Berkeley average-temperature (TAVG) file; the cells below read its
# 'temperature' and 'climatology' variables
berk = xr.open_mfdataset('/g/data/xv83/dr6273/work/data/berkeley/Complete_TAVG_LatLong1.nc')

In [None]:
# Replace the file's time coordinate with month-start timestamps beginning in
# January 1750. The number of timestamps is taken from the data itself, so the
# cell keeps working when the source file is extended with newer months.
# NOTE(review): assumes the record is monthly with no gaps and starts at 1750-01 - confirm
berk['time'] = pd.date_range('1750-01-01', periods=len(berk['time']), freq='1MS')

# Keep only the time steps from the first to the last year of the analysis period
berk = berk.sel(time=slice(str(years[0]), str(years[-1])))

In [None]:
# Order the data by descending latitude value
berk = berk.sortby('latitude', ascending=False)

In [None]:
# The 'temperature' variable is used as the anomaly field
anoms = berk['temperature']

In [None]:
# Climatology indexed by 'month_number'
clim = berk['climatology']

In [None]:
# Change month number from 0-11 to 1-12, the values that 'time.month' takes
clim = clim.assign_coords({'month_number': range(1,13)})

In [None]:
# Rename to 'month' so the climatology lines up with the 'time.month' grouping used next
clim = clim.rename({'month_number': 'month'})

In [None]:
# Reconstruct temperature: add each calendar month's climatology to that month's anomalies
temp = anoms.groupby('time.month') + clim

In [None]:
# Remove the 'month' coordinate now that the climatology has been added.
# drop_vars is the current replacement for the deprecated DataArray.drop.
temp = temp.drop_vars('month')

In [None]:
# Use the short coordinate names lat/lon; the reconstructed field replaces the raw dataset in `berk`
berk = temp.rename({'latitude': 'lat', 'longitude': 'lon'})

In [None]:
# Tag the reconstructed field with a short name
berk = berk.assign_attrs({'short_name': 'temp'})

In [None]:
# Rechunk so each dimension is one chunk (-1 spans the whole dimension)
berk = berk.chunk({'time': -1, 'lat': -1, 'lon': -1})

In [None]:
# Wrap the array in a dataset whose only data variable is named 'temperature'
berk = berk.to_dataset(name='temperature')

In [None]:
# Write the 'temperature' variable to disk as float32
berk_encoding = {'temperature': {'dtype': 'float32'}}

In [None]:
# Write the average-temperature store, named after the first and last year of
# the period; mode='w' replaces any store already at that path
berk.to_zarr(
    f'/g/data/xv83/dr6273/work/data/berkeley/tavg_berkeley_sfc_{years[0]}-{years[-1]}.zarr',
    mode='w',
    consolidated=True,
    encoding=berk_encoding,
)

### Tmax and Tmin

In [None]:
# Open the Berkeley TMAX and TMIN files; they are processed in parallel below
tmax = xr.open_mfdataset('/g/data/xv83/dr6273/work/data/berkeley/Complete_TMAX_LatLong1.nc')
tmin = xr.open_mfdataset('/g/data/xv83/dr6273/work/data/berkeley/Complete_TMIN_LatLong1.nc')

In [None]:
# Replace each file's time coordinate with month-start timestamps beginning in
# January 1850. The number of timestamps is taken from the data itself, so the
# cell keeps working when the source files are extended with newer months.
# NOTE(review): assumes both records are monthly with no gaps and start at 1850-01 - confirm
tmax['time'] = pd.date_range('1850-01-01', periods=len(tmax['time']), freq='1MS')
tmax = tmax.sel(time=slice(str(years[0]), str(years[-1])))

tmin['time'] = pd.date_range('1850-01-01', periods=len(tmin['time']), freq='1MS')
tmin = tmin.sel(time=slice(str(years[0]), str(years[-1])))

In [None]:
# Order both datasets by descending latitude value
tmax, tmin = (
    ds.sortby('latitude', ascending=False)
    for ds in (tmax, tmin)
)

In [None]:
# The 'temperature' variable of each file is used as the anomaly field
tmax_anoms, tmin_anoms = tmax['temperature'], tmin['temperature']

In [None]:
# Climatology of each file, indexed by 'month_number'
tmax_clim, tmin_clim = tmax['climatology'], tmin['climatology']

In [None]:
# Relabel the climatology months as 1-12 instead of 0-11
tmax_clim, tmin_clim = (
    da.assign_coords({'month_number': range(1, 13)})
    for da in (tmax_clim, tmin_clim)
)

In [None]:
# Rename to 'month' so each climatology lines up with the 'time.month' grouping used next
tmax_clim, tmin_clim = (
    da.rename({'month_number': 'month'})
    for da in (tmax_clim, tmin_clim)
)

In [None]:
# Reconstruct Tmax and Tmin: add each calendar month's climatology to that month's anomalies
tmax = tmax_anoms.groupby('time.month') + tmax_clim
tmin = tmin_anoms.groupby('time.month') + tmin_clim

In [None]:
# Remove the 'month' coordinate now that the climatology has been added.
# drop_vars is the current replacement for the deprecated DataArray.drop.
tmax = tmax.drop_vars('month')
tmin = tmin.drop_vars('month')

In [None]:
# Use the short coordinate names lat/lon on both fields
tmax, tmin = (
    da.rename({'latitude': 'lat', 'longitude': 'lon'})
    for da in (tmax, tmin)
)

In [None]:
# Tag each field with its short name
tmax = tmax.assign_attrs(short_name='tmax')
tmin = tmin.assign_attrs(short_name='tmin')

In [None]:
# Rechunk both fields so each dimension is one chunk (-1 spans the whole dimension)
tmax, tmin = (
    da.chunk({'time': -1, 'lat': -1, 'lon': -1})
    for da in (tmax, tmin)
)

In [None]:
# Wrap each array in a dataset whose only data variable is named 'tmax' / 'tmin'
tmax = tmax.to_dataset(name='tmax')
tmin = tmin.to_dataset(name='tmin')

In [None]:
# Write the 'tmax' and 'tmin' variables to disk as float32
tmax_encoding = {'tmax': {'dtype': 'float32'}}
tmin_encoding = {'tmin': {'dtype': 'float32'}}

In [None]:
# Write one store per variable, named after the first and last year of the
# period; mode='w' replaces any store already at that path
tmax.to_zarr(
    f'/g/data/xv83/dr6273/work/data/berkeley/tmax_berkeley_sfc_{years[0]}-{years[-1]}.zarr',
    mode='w',
    consolidated=True,
    encoding=tmax_encoding,
)
tmin.to_zarr(
    f'/g/data/xv83/dr6273/work/data/berkeley/tmin_berkeley_sfc_{years[0]}-{years[-1]}.zarr',
    mode='w',
    consolidated=True,
    encoding=tmin_encoding,
)

# Close cluster

In [None]:
# Close the client first, then the cluster it was connected to
client.close()
cluster.close()