In [None]:
%load_ext autoreload
%autoreload 2

# Unbiased ECMWF

Here we propose a small model: an ECMWF forecast that is debiased using the data we have.
The plan is:

* Compute the bias between the ECMWF model and the observations
* Make a debiased model
* Turn this model into a probabilistic forecast
* Score the forecast

For now we do it only for temperature and weeks 3-4.

In [None]:
import dask
import dask.array as da
import dask.distributed
import dask_jobqueue
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pathlib
import scipy.stats
import xarray as xr
import xskillscore as xs

In [None]:
from crims2s.util import fix_dataset_dims

In [None]:
# Directory of the training input netCDF files; scanned further down for the ECMWF files.
INPUT_TRAIN = '***BASEDIR***training-input/0.3.0/netcdf'
# Directory of the reference observations; scanned further down for the observation files.
OBSERVATIONS = '***BASEDIR***training-output-reference/'
# Directory of the benchmark outputs, judging by its path.
# NOTE(review): not referenced in the visible part of this notebook, and the name
# looks like a misspelling of BENCHMARK — confirm before renaming.
BENCHNMARK = '***BASEDIR***training-output-benchmark/'

## Boot dask cluster

In [None]:
# Create a Dask cluster backed by SLURM jobs.
# env_extra carries two shell lines (source the user's bash profile, activate the
# `s2s` conda environment); presumably they run in each job before the worker
# starts — see the dask_jobqueue documentation to confirm.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ***HOME***.bash_profile','conda activate s2s'],
)

In [None]:
cluster.scale(jobs=2)  # Scale to two working nodes as configured.
# Connect a client to the cluster; it is bound to a name so the next cell can display it.
client = dask.distributed.Client(cluster)

In [None]:
client

## Read data

### ECMWF

In [None]:
# Substrings used below to pick the input files: forecasting center and field name.
CENTER = 'ecmwf'
FIELD = 't2m'

In [None]:
input_path = pathlib.Path(INPUT_TRAIN)

In [None]:
# Keep only the entries whose stem mentions both the forecasting center and the
# field, in sorted order.
input_files = sorted(
    path
    for path in input_path.iterdir()
    if CENTER in path.stem and FIELD in path.stem
)

In [None]:
input_files[:10]

In [None]:
# Open every selected file as one dataset; fix_dataset_dims is applied to each
# file's dataset before they are combined.
ecmwf = xr.open_mfdataset(input_files, preprocess=fix_dataset_dims)

In [None]:
# Keep lead times from 14 to 27 days (the weeks 3-4 window this notebook targets).
ecmwf_w34 = ecmwf.sel(lead_time=slice('14D', '27D'))
# Split on forecast year: up to 2018 for training, 2019 onwards for validation.
ecmwf_w34_train = ecmwf_w34.sel(forecast_year=slice(None, 2018))
ecmwf_w34_val = ecmwf_w34.sel(forecast_year=slice(2019, None))

In [None]:
ecmwf_w34_val

### Observations

In [None]:
# Collect the observation files for the same field as the forecast.
# FIELD is reused rather than a second hard-coded 't2m', and the list is sorted
# (as is done for the ECMWF input files) because iterdir() yields entries in
# arbitrary order.
obs_path = pathlib.Path(OBSERVATIONS)
obs_files = sorted(f for f in obs_path.iterdir() if FIELD in f.stem)

In [None]:
# Open the observation files the same way as the forecasts, then drop the first
# lead_time entry.
# NOTE(review): the reason for skipping index 0 is not visible here — confirm.
obs = xr.open_mfdataset(obs_files, preprocess=fix_dataset_dims).isel(lead_time=slice(1, None))
# Same 14 to 27 day (weeks 3-4) window as the ECMWF selection.
obs_w34 = obs.sel(lead_time=slice('14D', '27D'))

In [None]:
# Same split by forecast year as for the ECMWF data: up to 2018 for training,
# 2019 onwards for validation.
obs_w34_train = obs_w34.sel(forecast_year=slice(None, 2018))
obs_w34_val = obs_w34.sel(forecast_year=slice(2019, None))

In [None]:
obs_w34_val

## Debiasing

### Compute bias using training data

In [None]:
# Mean of (observation - forecast) over the training lead times and forecast years.
# With this sign convention the "bias" is the correction to ADD to a forecast.
ecmwf_w34_bias = (obs_w34_train - ecmwf_w34_train).mean(dim=['lead_time', 'forecast_year'])

In [None]:
ecmwf_w34_bias

### Bias correct ECMWF

In [None]:
# Apply the (observation - forecast) correction learned on the training years
# to the validation forecasts.
ecmwf_w34_val_corrected = ecmwf_w34_val + ecmwf_w34_bias

In [None]:
ecmwf_w34_val_corrected

## Turn into probabilistic forecast

### Get thresholds from train observations

In [None]:
# 0.33 and 0.67 quantiles of the training observations, taken over lead time and
# forecast year; they are the category boundaries used further down.
obs_w34_train_thresholds = obs_w34_train.quantile([0.33, 0.67], dim=['lead_time', 'forecast_year'])

In [None]:
obs_w34_train_thresholds

### Compute p of thresholds according to the model

There are two ways to do this.
We can either count the number of members that fall within each category,
or fit a distribution to all the members of the model and then compute the value of the CDF at each threshold.

Here we do it using the distribution method.

#### Compute a distribution of the members of the model

In [None]:
# Mean and standard deviation of the corrected validation forecast, pooling
# ensemble members (realization) and lead times together.
ecmwf_w34_val_corrected_mean = ecmwf_w34_val_corrected.mean(dim=['realization', 'lead_time'])
ecmwf_w34_val_corrected_std = ecmwf_w34_val_corrected.std(dim=['realization', 'lead_time'])

#### Compute the value of the CDF for each threshold

In [None]:
ecmwf_w34_val_corrected_mean

In [None]:
def make_probabilistic(forecast, obs, *, field='t2m', quantiles=(0.33, 0.67)):
    """Turn an ensemble forecast into below/near/above normal probabilities.

    The category boundaries are quantiles of `obs`. The forecast is summarised
    by a normal distribution whose mean and standard deviation are taken over
    ensemble members and lead times; the probability of each category is read
    off that distribution's CDF at the two boundaries.

    Args:
        forecast: Dataset with `realization` and `lead_time` dimensions that
            contains the variable `field`.
        obs: Dataset with `lead_time` and `forecast_year` dimensions that
            contains the variable `field`.
        field: Name of the variable to process. The default, 't2m', is the
            variable this function was originally hard-coded for.
        quantiles: The two quantile levels (lower, upper) of `obs` that
            separate the three categories. The default reproduces the
            original hard-coded 0.33 / 0.67 levels.

    Returns:
        A DataArray with a new leading `category` dimension labelled
        'below normal', 'near normal' and 'above normal'.

    Raises:
        ValueError: If `quantiles` does not contain exactly two levels.
    """
    levels = list(quantiles)
    if len(levels) != 2:
        raise ValueError(
            f'quantiles must contain exactly two levels, got {len(levels)}'
        )

    thresholds = obs.quantile(levels, dim=['lead_time', 'forecast_year'])[field]

    # Parameters of the normal distribution, pooled over members and lead times.
    pooled_dims = ['realization', 'lead_time']
    loc = forecast.mean(dim=pooled_dims).compute()[field]
    scale = forecast.std(dim=pooled_dims).compute()[field]

    # loc and scale travel through `kwargs`, so apply_ufunc hands them to scipy
    # untouched: they broadcast against the thresholds by position, not by
    # dimension name.
    # NOTE(review): confirm the dimension order of loc/scale lines up with the
    # thresholds for the datasets this is called with.
    cdfs = xr.apply_ufunc(
        scipy.stats.norm.cdf,
        thresholds,
        dask='allowed',
        kwargs={'loc': loc, 'scale': scale},
    )

    lower_cdf = cdfs.isel(quantile=0)
    upper_cdf = cdfs.isel(quantile=1)

    # P(below) is the CDF at the lower boundary, P(near) the mass between the
    # two boundaries, and P(above) the complement of the CDF at the upper one.
    below = lower_cdf.drop_vars('quantile')
    normal = upper_cdf - lower_cdf
    above = xr.ones_like(normal) - upper_cdf.drop_vars('quantile')

    stacked = xr.concat([below, normal, above], 'category')
    return stacked.assign_coords(
        category=['below normal', 'near normal', 'above normal']
    )

In [None]:
val_probabilistic_forecast = make_probabilistic(ecmwf_w34_val_corrected, obs_w34_train)

In [None]:
val_probabilistic_forecast.isel(category=2, forecast_dayofyear=40).plot()

In [None]:
val_probabilistic_forecast.isel(category=1, forecast_dayofyear=40).plot()

In [None]:
# Materialise the forecast mean and standard deviation and keep the t2m variable;
# they are used as the loc/scale of the normal distribution in the next cell.
loc = ecmwf_w34_val_corrected_mean.compute().t2m
scale = ecmwf_w34_val_corrected_std.compute().t2m

In [None]:
# Evaluate the normal CDF at each training-observation threshold.
# loc and scale go through `kwargs`, so they reach scipy as-is and are not
# aligned by dimension name with the thresholds.
# NOTE(review): confirm their dimension order broadcasts as intended.
cdfs = xr.apply_ufunc(scipy.stats.norm.cdf, obs_w34_train_thresholds.t2m, dask='allowed', kwargs={'loc': loc, 'scale': scale})

In [None]:
cdfs.isel(quantile=1).plot()

In [None]:
# P(below normal): CDF at the lower (first) threshold.
below = cdfs.isel(quantile=0).drop_vars('quantile')
# P(near normal): probability mass between the two thresholds.
normal = (cdfs.isel(quantile=1) - cdfs.isel(quantile=0))
# P(above normal): complement of the CDF at the upper (second) threshold.
above = xr.ones_like(normal) - cdfs.isel(quantile=1).drop_vars('quantile')

In [None]:
# Stack the three probabilities along a new, labelled `category` dimension.
val_probabilistic_forecast = xr.concat([below, normal, above], 'category', coords='minimal').assign_coords(category=['below normal', 'near normal', 'above normal'])

### Sanity check

In [None]:
val_probabilistic_forecast.sum(dim='category').isel(forecast_dayofyear=0).plot()

## Score it

### Make a probabilistic version of the val obs

In [None]:
obs_w34_val

In [None]:
# Mean and standard deviation of the validation observations over lead time,
# materialised, keeping the t2m variable. These rebind the loc/scale names
# used earlier for the forecast.
loc = obs_w34_val.mean(dim=['lead_time']).compute().t2m
scale = obs_w34_val.std(dim=['lead_time']).compute().t2m

In [None]:
# Normal CDF at the training thresholds, this time with the observation-based
# loc/scale. As above, loc and scale are passed via `kwargs` and therefore are
# not aligned by dimension name.
# NOTE(review): confirm the broadcast is the intended one.
cdfs = xr.apply_ufunc(scipy.stats.norm.cdf, obs_w34_train_thresholds.t2m, dask='allowed', kwargs={'loc': loc, 'scale': scale})

In [None]:
# P(below normal): CDF at the lower (first) threshold.
below = cdfs.isel(quantile=0).drop_vars('quantile')
# P(near normal): probability mass between the two thresholds.
normal = (cdfs.isel(quantile=1) - cdfs.isel(quantile=0))
# P(above normal): complement of the CDF at the upper (second) threshold.
above = xr.ones_like(normal) - cdfs.isel(quantile=1).drop_vars('quantile')

In [None]:
# Stack the three observation-based probabilities along a labelled `category` dimension.
val_probabilistic_obs = xr.concat([below, normal, above], 'category').assign_coords(category=['below normal', 'near normal', 'above normal'])

In [None]:
val_probabilistic_obs.isel(category=2, forecast_dayofyear=0).plot()

### Compare both probabilities using RPS

In [None]:
val_probabilistic_obs

In [None]:
# Ranked probability score between the observation-based and forecast category
# probabilities, reduced over forecast_dayofyear and plotted.
# input_distributions='p' declares both inputs as per-category probabilities,
# which is why no category_edges are given.
xs.rps(val_probabilistic_obs, val_probabilistic_forecast, category_edges=None, input_distributions='p', dim=['forecast_dayofyear']).plot()

In [None]:
val_probabilistic_obs.isel(forecast_dayofyear=0, category=2).plot()

In [None]:
val_probabilistic_forecast.isel(forecast_dayofyear=0, category=2).plot()