In [None]:
%load_ext autoreload
%autoreload 2

# Parametric Distribution

Allow the use of any parametric distribution, rather than only the normal distribution, when making a forecast probabilistic.
The difficult part is fitting more exotic distributions, such as a gamma, at every latitude/longitude point of the xarray dataset.

There seems to be a good reference in xclim: https://github.com/Ouranosinc/xclim/blob/f9d53c4cccb51174495860905c766f184796fc51/xclim/indices/stats.py

In [None]:
import dask
import dask.array as da
import dask.distributed
import dask_jobqueue
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pathlib
import scipy.stats
import xarray as xr
import xskillscore as xs

In [None]:
from crims2s.util import fix_dataset_dims

In [None]:
# Directories holding the training inputs, the reference observations and the
# benchmark outputs.
INPUT_TRAIN = '***BASEDIR***training-input/0.3.0/netcdf'
OBSERVATIONS = '***BASEDIR***training-output-reference/'
# NOTE(review): `BENCHNMARK` looks like a misspelling of "BENCHMARK"; it is left
# unchanged here because code elsewhere may reference this name.
BENCHNMARK = '***BASEDIR***training-output-benchmark/'

## Boot dask cluster

In [None]:
# Create the SLURM-backed dask cluster. The `env_extra` shell lines source the
# user's bash profile and activate the `s2s` conda environment for the jobs.
# NOTE(review): recent dask_jobqueue releases deprecate `env_extra` in favour of
# `job_script_prologue` — confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ***HOME***.bash_profile','conda activate s2s'],
)

In [None]:
cluster.scale(jobs=3)  # Request three SLURM jobs' worth of workers.
client = dask.distributed.Client(cluster)  # Connect this notebook session to the cluster.

In [None]:
client

## Read data

### ECMWF

In [None]:
CENTER = 'ecmwf'
FIELD = 't2m'

In [None]:
input_path = pathlib.Path(INPUT_TRAIN)

In [None]:
# Keep only the entries whose stem mentions both the centre and the field,
# ordered by path.
input_files = sorted(
    path
    for path in input_path.iterdir()
    if CENTER in path.stem and FIELD in path.stem
)

In [None]:
input_files[:10]

In [None]:
ecmwf = xr.open_mfdataset(input_files, preprocess=fix_dataset_dims)

In [None]:
# Keep the lead times labelled from 14 to 27 days ("w34" presumably stands for
# forecast weeks 3-4 — confirm).
ecmwf_w34 = ecmwf.sel(lead_time=slice('14D', '27D'))
# Hold-out split on forecast year: everything up to 2018 is used for training,
# 2019 onwards for validation.
ecmwf_w34_train = ecmwf_w34.sel(forecast_year=slice(None, 2018))
ecmwf_w34_val = ecmwf_w34.sel(forecast_year=slice(2019, None))

In [None]:
ecmwf_w34_val

### Observations

In [None]:
obs_path = pathlib.Path(OBSERVATIONS)
# Pick the observation files for the same field as the forecasts (FIELD), in a
# deterministic name-sorted order, mirroring how `input_files` is built.
obs_files = sorted(f for f in obs_path.iterdir() if FIELD in f.stem)

In [None]:
# Open every observation file as a single dataset, then discard the first
# entry along `lead_time`.
# NOTE(review): the reason for dropping index 0 is not visible here — confirm.
obs = xr.open_mfdataset(obs_files, preprocess=fix_dataset_dims).isel(lead_time=slice(1, None))
# Same 14-27 day lead-time window as the one applied to the ECMWF forecasts.
obs_w34 = obs.sel(lead_time=slice('14D', '27D'))

In [None]:
# Same year-based split as for the forecasts: up to 2018 for training,
# 2019 onwards for validation.
obs_w34_train = obs_w34.sel(forecast_year=slice(None, 2018))
obs_w34_val = obs_w34.sel(forecast_year=slice(2019, None))

In [None]:
obs_w34_val

## Non-parametric

The first thing I want to try is a fully non-parametric version, since I think it will be the easiest.

In [None]:
# Tercile edges of the training observations, pooled over lead time and year.
# Exact thirds are used (rather than 0.33 / 0.66) so that the below / near /
# above normal categories are equally likely under the training climatology.
obs_train_thresholds = obs_w34_train.quantile([1 / 3, 2 / 3], dim=['lead_time', 'forecast_year'])

In [None]:
obs_train_thresholds

In [None]:
ecmwf_w34_val.dims

In [None]:
# The category fractions are counted over the lead times of `obs_w34_val`, so
# take the denominator from that same dataset. `.sizes` is the
# name -> length mapping; mapping-style access on `Dataset.dims` is deprecated
# in recent xarray releases.
n_lead_time = obs_w34_val.sizes['lead_time']
n_lead_time

In [None]:
# Count of lead times where the validation observation is below the lower
# training threshold (quantile index 0), divided by `n_lead_time`.
below = (obs_w34_val < obs_train_thresholds.isel(quantile=0)).sum(dim=['lead_time']).drop_vars('quantile') / n_lead_time

In [None]:
# Count of lead times where the validation observation lies in
# [lower threshold, upper threshold), divided by `n_lead_time`.
# NOTE(review): `whithin` is a misspelling of "within"; kept because other cells use it.
whithin = ((obs_w34_val < obs_train_thresholds.isel(quantile=1)) & (obs_w34_val >= obs_train_thresholds.isel(quantile=0))).sum(dim='lead_time') / n_lead_time

In [None]:
whithin

In [None]:
# Count of lead times where the validation observation is at or above the upper
# training threshold (quantile index 1), divided by `n_lead_time`.
above = (obs_w34_val >= obs_train_thresholds.isel(quantile=1)).sum(dim=['lead_time']).drop_vars('quantile') / n_lead_time

In [None]:
whithin.isel(forecast_dayofyear=0).t2m.plot()

In [None]:
(whithin + above + below).t2m.plot()

In [None]:
below.t2m.plot()

In [None]:
above.t2m.plot()

In [None]:
# Stack the three fractions along a new, labelled `category` dimension.
forecast = xr.concat([below, whithin, above], 'category').assign_coords(category=['below normal', 'near normal', 'above normal'])

In [None]:
# True wherever the first validation sample of observed t2m is not NaN.
# NOTE(review): the name suggests observations are NaN away from land — confirm.
land_mask = ~obs_w34_val.isel(forecast_dayofyear=0, lead_time=0, forecast_year=0).t2m.isnull()

In [None]:
land_mask.plot()

In [None]:
# Keep the forecast where the mask is True and put NaN everywhere else.
masked_forecast = xr.where(land_mask, forecast, np.nan)

In [None]:
masked_forecast.isel(forecast_dayofyear=0, category=0).t2m.plot()

In [None]:
forecast.isel(category=2, forecast_dayofyear=0).t2m.plot()

In [None]:
forecast.sum(dim='category').isel(forecast_dayofyear=0).t2m.plot()

In [None]:
def make_probabilistic_non_parametric(model, thresholds, dim=None):
    """Turn samples into empirical three-category probabilities.

    For each point left after reducing over `dim`, compute the fraction of
    samples of `model` that fall below the lower threshold, between the two
    thresholds (lower inclusive, upper exclusive), and at or above the upper
    threshold.

    Parameters
    ----------
    model : xr.Dataset or xr.DataArray
        Samples to classify.
    thresholds : xr.Dataset or xr.DataArray
        Category edges carrying a `quantile` dimension. Position 0 along that
        dimension is the lower edge and position 1 the upper edge.
    dim : str or iterable of str, optional
        Dimension(s) of `model` treated as the sample axis. When omitted, all
        dimensions of `model` are used.

    Returns
    -------
    Same kind of object as `model`, reduced over `dim`, with a new `category`
    dimension labelled 'below normal', 'near normal' and 'above normal'.

    Notes
    -----
    Comparisons against NaN evaluate to False, so a NaN sample is counted in
    no category while still contributing to the sample size. Points whose
    samples are all NaN therefore get 0 in every category rather than NaN.
    """
    if dim is None:
        dim = list(model.dims)
    elif isinstance(dim, str):
        dim = [dim]
    else:
        dim = list(dim)

    # `.sizes` is a name -> length mapping on both Dataset and DataArray, whereas
    # mapping-style access on `.dims` only works on Dataset and is deprecated.
    sample_size = np.prod([model.sizes[d] for d in dim])

    lower = thresholds.isel(quantile=0)
    upper = thresholds.isel(quantile=1)

    below = (model < lower).sum(dim=dim).drop_vars('quantile') / sample_size
    within = ((model < upper) & (model >= lower)).sum(dim=dim) / sample_size
    above = (model >= upper).sum(dim=dim).drop_vars('quantile') / sample_size

    return xr.concat([below, within, above], 'category').assign_coords(category=['below normal', 'near normal', 'above normal'])

In [None]:
p = make_probabilistic_non_parametric(obs_w34_val, obs_train_thresholds, dim='lead_time')

In [None]:
p.isel(forecast_dayofyear=0, category=2).t2m.plot()

In [None]:
p.isel(forecast_dayofyear=0).sum(dim='category').t2m.plot()

## Normal parametric distribution

This is a little bit of an easier case because we can estimate the distribution parameters using mean and std.

## Arbitrary parametric distribution

This is the most complicated case because we have to apply scipy's distribution-fitting functions along some of the dimensions.