In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Prepare ML Dataset

Prepare a dataset that is preprocessed and ready to use for ML applications.
A dataset is ML ready if there is little processing required and it is trivial to extract a single example.

In [None]:
import dask
import dask.array as da
import dask.distributed
import dask_jobqueue
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import scipy.stats
import xarray as xr
import xskillscore as xs

In [None]:
from crims2s.util import fix_dataset_dims

In [None]:
# Data locations used throughout the notebook.
INPUT_TRAIN = '***BASEDIR***training-input/0.3.0/netcdf'
INPUT_PROCESSED = '***BASEDIR***processed/training-input/'
OBSERVATIONS = '***BASEDIR***training-output-reference/'
BENCHMARK = '***BASEDIR***training-output-benchmark/'
# Backward-compatible alias: the constant was originally misspelled.
BENCHNMARK = BENCHMARK
# Presumably the identifier of the forecasting center — TODO confirm where it is used.
CENTER = 'ecmwf'

In [None]:
# Wrap the configured directory strings in Path objects in one pass.
input_path, obs_path, processed_path = map(
    pathlib.Path,
    (INPUT_TRAIN, OBSERVATIONS, INPUT_PROCESSED),
)

## Boot dask cluster

In [None]:
# SLURM-backed dask cluster. The env_extra commands are passed to each job so
# that the profile is sourced and the `s2s` conda environment is activated.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=[
        'source ***HOME***.bash_profile',
        'conda activate s2s',
    ],
)

In [None]:
cluster.scale(jobs=8)  # Request 8 SLURM jobs. NOTE(review): an earlier note said "two working nodes" — confirm the intended size.

In [None]:
# Connect this notebook session to the cluster created above.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client summary.
client

## Bi-weekly aggregated observations

For training we will use these aggregates as targets.

For temperature, the target should be a Gaussian distribution.
For precipitation, it should be a censored gamma distribution.

In [None]:
# Every entry in the observations directory, in directory-listing order.
obs_files = list(obs_path.iterdir())

In [None]:
# Open all observation files as a single dataset; fix_dataset_dims is applied
# to each file before the files are combined.
obs = xr.open_mfdataset(obs_files, preprocess=fix_dataset_dims)

In [None]:
# Inspect the combined observation dataset.
obs

In [None]:
# Split the observations into three consecutive lead-time windows
# (up to 13 days, 14-27 days, 28-41 days); the slice bounds are labels.
obs_w12, obs_w34, obs_w56 = (
    obs.sel(lead_time=slice(first_lead, last_lead))
    for first_lead, last_lead in [(None, '13D'), ('14D', '27D'), ('28D', '41D')]
)

In [None]:
# Show the first window's lead times converted to whole days.
obs_w12.lead_time.data.astype('timedelta64[D]')

In [None]:
# Spot check: valid times of the last window for a single forecast
# (day-of-year index 0, year index 19).
obs_w56.isel(forecast_dayofyear=0, forecast_year=19).valid_time.compute()

In [None]:
# Mean and standard deviation of t2m over the lead times of the first window.
obs_w12_t2m_mean = obs_w12['t2m'].mean(dim='lead_time')
obs_w12_t2m_std = obs_w12['t2m'].std(dim='lead_time')

In [None]:
# Inspect the first window's mean t2m.
obs_w12_t2m_mean

In [None]:
# Bundle the first window's t2m statistics into one Dataset. The original cell
# had no right-hand side (a SyntaxError); this uses the same layout as the
# per-window datasets built in the loop further down.
obs_w12_t2m_dataset = xr.Dataset({'t2m_mean': obs_w12_t2m_mean, 't2m_std': obs_w12_t2m_std})

In [None]:
# Build one Dataset of t2m statistics per lead-time window.
datasets = []
for week in [obs_w12, obs_w34, obs_w56]:
    # Aggregate over the lead times of the current window. Previously every
    # iteration used obs_w12, so all three windows held identical data.
    t2m_mean = week.t2m.mean(dim='lead_time')
    t2m_std = week.t2m.std(dim='lead_time')

    t2m_dataset = xr.Dataset({'t2m_mean': t2m_mean, 't2m_std': t2m_std})
    datasets.append(t2m_dataset)

In [None]:
# Label each window by its starting lead time (0, 14 and 28 days), then stack
# the per-window datasets along a new lead_time dimension.
leads = [datetime.timedelta(days=offset) for offset in [0, 14, 28]]
aggregate_t2m = xr.concat(datasets, dim='lead_time')
aggregate_t2m = aggregate_t2m.assign_coords(
    lead_time=xr.DataArray(data=leads, dims='lead_time'),
)

In [None]:
# Inspect the stacked aggregates.
aggregate_t2m

In [None]:
# Visual check: t2m standard deviation for one forecast
# (day-of-year index 12, year index 10) in the second window (lead_time index 1).
aggregate_t2m.isel(forecast_dayofyear=12, forecast_year=10, lead_time=1).t2m_std.plot()

In [None]:
# Attach forecast_time from the source observations as a coordinate.
aggregate_t2m = aggregate_t2m.assign_coords(forecast_time=obs.forecast_time)

Reuse the coords from the original dataset by selecting only the three dates of interest.

In [None]:
# Select the observations at exactly the lead times listed in `leads`.
subset_for_coords = obs.sel(lead_time=xr.DataArray(leads, dims='lead_time'))

In [None]:
# Inspect the subset whose coordinates are reused below.
subset_for_coords

In [None]:
# Copy valid_time from the subset onto the aggregates; presumably this is the
# first valid date of each window — TODO confirm.
aggregate_t2m = aggregate_t2m.assign_coords(valid_time=subset_for_coords.valid_time)

In [None]:
# Inspect the aggregates with their final coordinates.
aggregate_t2m

In [None]:
# Destination file for the bi-weekly t2m aggregates.
output_path = pathlib.Path('***BASEDIR***processed/training-output-reference/t2m_biweekly.nc')

In [None]:
# Write the aggregates to disk; compute=True performs the write immediately
# instead of returning a delayed object.
aggregate_t2m.to_netcdf(output_path, compute=True)

In [None]:
# List the output directory to confirm the file was written.
!ls '***BASEDIR***processed/training-output-reference'

In [None]:
# Read the written file back as a round-trip check.
aggregate_obs = xr.open_dataset(output_path)

In [None]:
# Inspect the dataset loaded from disk.
aggregate_obs