In [None]:
%load_ext autoreload
%autoreload 2

# Distributions of parameters

For every field that we want to use as input for deep learning, plot its distribution, so that we know what kind of normalization to apply.
I expect that we will mostly have normal distributions (or use normal distributions even though we shouldn't).

## Imports and configuration

In [None]:
import dask
import dask.array as da
import dask.distributed
import dask_jobqueue
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import scipy.stats
import xarray as xr
import xskillscore as xs

In [None]:
from crims2s.util import fix_dataset_dims

In [None]:
# Data locations and the forecasting center analysed in this notebook.
INPUT_TRAIN = '***BASEDIR***training-input/0.3.0/netcdf'
INPUT_PROCESSED = '***BASEDIR***processed/training-input/'
OBSERVATIONS = '***BASEDIR***training-output-reference/'
BENCHMARK = '***BASEDIR***training-output-benchmark/'
# Misspelled original name, kept as an alias so existing references keep working.
BENCHNMARK = BENCHMARK
# Substring looked up in file stems to select this center's files.
CENTER = 'ecmwf'

In [None]:
# Wrap the configured locations in pathlib.Path so their contents can be listed with iterdir().
input_path, obs_path, processed_path = (
    pathlib.Path(location)
    for location in (INPUT_TRAIN, OBSERVATIONS, INPUT_PROCESSED)
)

## Boot dask cluster

In [None]:
# Describe the SLURM jobs that will host the dask workers. Each job first runs
# the two shell commands below (source the bash profile, activate the `s2s`
# conda environment). No resources are passed here, so they presumably come
# from the dask-jobqueue configuration file — TODO confirm.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ***HOME***.bash_profile','conda activate s2s'],
)

In [None]:
cluster.scale(jobs=6)  # Request 6 SLURM jobs (original note: this corresponds to two working nodes as configured — TODO confirm).

In [None]:
# Connect this notebook session to the cluster; `client` is also used later to restart the workers.
client = dask.distributed.Client(cluster)

In [None]:
client

## Surface temperature

In [None]:
# Substring of the file stem used to select the files of this section.
FIELD = 't2m'

In [None]:
# Collect the files whose stem mentions both the center and the current field, in sorted order.
field_files = sorted(
    path for path in input_path.iterdir()
    if CENTER in path.stem and FIELD in path.stem
)
# Combine the files lazily (fixing dimensions per file), then persist the result on the cluster.
ecmwf = xr.open_mfdataset(field_files, preprocess=fix_dataset_dims).persist()

In [None]:
### Overall mean

In [None]:
# Mean of t2m over every dimension.
t2m_mean = ecmwf['t2m'].mean().compute()

In [None]:
# Standard deviation of t2m over every dimension.
t2m_std = ecmwf['t2m'].std().compute()

In [None]:
t2m_mean

In [None]:
t2m_std

### Check shape of distribution

In [None]:
# Histogram of all t2m values for the forecast year at positional index 10.
t2m_single_year = ecmwf.t2m.isel(forecast_year=10)
t2m_single_year.plot.hist()

This is not as Gaussian as I'd like, but for now a normal distribution will do.

### Check yearly progression

Should we adjust for the climate change trend?
Ideally we would do this on a tile-by-tile basis (and look for a correlation between year and average temperature).
If there is a strong correlation, one way or the other, we should adjust the temperature for it.

In [None]:
ecmwf.dims

In [None]:
# Average t2m over the listed dimensions; what remains is presumably one value per forecast_year.
averaged_dims = ['realization', 'forecast_dayofyear', 'lead_time', 'latitude', 'longitude']
yearly_means = ecmwf.t2m.mean(dim=averaged_dims).compute()

In [None]:
yearly_means.plot()

The trend is pretty clear! But it's about 1 degree over our training dataset. Given that temperature had a std of about 20, I'm not sure it matters enough to be a priority.

Also, our models will be boosting models, so if the model incorporates this trend there could be double accounting.

In [None]:
# Observation files for the current field. Sorted so the file order is
# deterministic, as is done for the forecast file listings in this notebook.
obs_files = sorted(f for f in obs_path.iterdir() if FIELD in f.stem)
# Combine the files lazily, applying fix_dataset_dims to each one.
obs = xr.open_mfdataset(obs_files, preprocess=fix_dataset_dims)

In [None]:
# Same reduction as for the forecasts, minus 'realization', which is not reduced for the observations.
obs_averaged_dims = ['forecast_dayofyear', 'lead_time', 'latitude', 'longitude']
obs_yearly_means = obs.t2m.mean(dim=obs_averaged_dims).compute()

In [None]:
obs_yearly_means.plot()

In [None]:
# Drop the notebook's reference to the persisted t2m forecasts before loading the next field.
del ecmwf

In [None]:
# Drop the reference to the t2m observations; the following sections do not use it.
del obs

## Other temperatures

In [None]:
# The dashes are part of the pattern: a bare 't' would also match stems such as those containing 't2m'.
FIELD = '-t-'
field_files = sorted(
    candidate for candidate in input_path.iterdir()
    if CENTER in candidate.stem and FIELD in candidate.stem
)

In [None]:
field_files[:10]

In [None]:
# Open only the first file, chunked with one `plev` entry per dask chunk, to inspect its layout.
first_file = xr.open_dataset(field_files[0], chunks={'plev': 1})
ecmwf = fix_dataset_dims(first_file)

In [None]:
ecmwf

In [None]:
# Lazily combine every selected file, one `plev` entry per dask chunk.
ecmwf = xr.open_mfdataset(
    field_files,
    chunks={'plev': 1},
    preprocess=fix_dataset_dims,
)


In [None]:
ecmwf

In [None]:
# Average over the horizontal grid and keep the result on the cluster.
horizontal_dims = ['latitude', 'longitude']
by_plev = ecmwf.mean(dim=horizontal_dims).persist()

In [None]:
by_plev

In [None]:
del by_plev

In [None]:
c = by_plev.compute()
c

In [None]:
# Persist the lazily opened dataset on the cluster so the reductions below can reuse it.
ecmwf = ecmwf.persist()

In [None]:
ecmwf

In [None]:
dask.config.get('array.chunk-size')

In [None]:
# Mean of every variable in the dataset, over all dimensions.
t_mean_lazy = ecmwf.mean()
t_mean = t_mean_lazy.compute()
t_mean

In [None]:
# Standard deviation of every variable in the dataset, over all dimensions.
t_std = ecmwf.std().compute()
t_std

## Precipitation

For precipitation we will do our linear rescaling over 2 weeks since the provided data is accumulated precipitation.
So the mean will be the mean total precipitation over 2 weeks, etc.

In [None]:
# Field name for this section; the file filter below wraps it in dashes before matching.
FIELD = 'tp'

In [None]:
# Match '-tp-' (with dashes) in the stem so that only the exact field name is selected.
field_files = sorted(
    path for path in input_path.iterdir()
    if CENTER in path.stem and f'-{FIELD}-' in path.stem
)

In [None]:
field_files[:10]

In [None]:
# Combine the precipitation files lazily and persist the result on the cluster in one step.
ecmwf = xr.open_mfdataset(field_files, preprocess=fix_dataset_dims).persist()

In [None]:
ecmwf

In [None]:
# The provided precipitation is accumulated, so the total for a two-week window
# is the accumulation at the end of the window minus the accumulation at its start.
# Positional indices 14/28/42 are assumed to be days 14/28/42 of lead time — TODO confirm.
# `drop_vars` replaces the deprecated `Dataset.drop` for removing a variable/coordinate.
w12 = ecmwf.isel(lead_time=14).drop_vars('valid_time')
w34 = ecmwf.isel(lead_time=28) - w12
w56 = ecmwf.isel(lead_time=42) - ecmwf.isel(lead_time=28)

In [None]:
w12

In [None]:
w34

In [None]:
# Lead-time labels for the three biweekly totals: 2, 4 and 6 weeks (14, 28 and 42 days).
leads = [datetime.timedelta(weeks=n_weeks) for n_weeks in (2, 4, 6)]

In [None]:
# Stack the three biweekly totals along lead_time and label them with `leads`.
lead_coord = xr.DataArray(data=leads, dims='lead_time')
biweekly_totals = [w12, w34, w56]
tp = xr.concat(biweekly_totals, dim='lead_time').assign_coords(lead_time=lead_coord)

In [None]:
# Mean over every listed dimension, leaving lead_time so each biweekly window gets its own value.
all_but_lead = ['latitude', 'longitude', 'realization', 'forecast_dayofyear', 'forecast_year']
tp.mean(dim=all_but_lead).compute()

In [None]:
# Mean of the biweekly totals over all dimensions.
tp_mean_lazy = tp.mean()
tp_mean = tp_mean_lazy.compute()
tp_mean

In [None]:
# Standard deviation of the biweekly totals over all dimensions.
tp_std_lazy = tp.std()
tp_std = tp_std_lazy.compute()
tp_std

In [None]:
### Overall mean

In [None]:
# Mean of the raw accumulated precipitation. Stored under its own name so the
# biweekly `tp_mean` computed earlier in this section is not overwritten.
tp_accumulated_mean = ecmwf.tp.mean().compute()
tp_accumulated_mean

In [None]:
# Standard deviation of the raw accumulated precipitation. Stored under its own
# name so the biweekly `tp_std` computed earlier in this section is not overwritten.
tp_accumulated_std = ecmwf.tp.std().compute()
tp_accumulated_std

In [None]:
ecmwf

In [None]:
# Realization-averaged precipitation at a single forecast and grid point, plotted over the remaining dimension.
single_point = ecmwf.isel(forecast_dayofyear=0, forecast_year=0, latitude=40, longitude=10)
single_point.mean(dim=['realization']).tp.plot()

## Geopotential Height

We read this field from the processed dir instead of the input dir.
Consequently there is no need for `fix_dataset_dims`: we already did that when processing the files.

In [None]:
# Levels to analyse; each value h is matched against file stems as '-gh{h}-'.
HEIGHTS = [100]

In [None]:
# For each entry of HEIGHTS, load the matching processed files and print the
# mean and standard deviation of `gh` over all dimensions.
for h in HEIGHTS:
    FIELD = f'-gh{h}-'
    field_files = sorted(
        path for path in processed_path.iterdir()
        if CENTER in path.stem and FIELD in path.stem
    )
    print(f'{len(field_files)} files.')
    ecmwf = xr.open_mfdataset(field_files).persist()
    mean = ecmwf.mean().compute().gh.item()
    std = ecmwf.std().compute().gh.item()
    # Drop the reference to the persisted dataset before moving on to the next level.
    del ecmwf
    print(f'GH{h}. Mean: {mean}. STD: {std}.')

In [None]:
# Re-open the files lazily for plotting. FIELD still holds the pattern set by the last loop iteration above.
field_files = sorted(
    path for path in processed_path.iterdir()
    if CENTER in path.stem and FIELD in path.stem
)
ecmwf = xr.open_mfdataset(field_files)

In [None]:
ecmwf.isel(lead_time=0).gh.plot()

In [None]:
# Restart the workers before moving on to the next field (presumably to clear persisted data — TODO confirm).
client.restart()

## Sea Surface Temperature

In [None]:
client.restart()

In [None]:
# Substring of the file stem used to select the files of this section.
FIELD = 'sst'

In [None]:
# Select the files whose stem mentions both the center and the current field, in sorted order.
field_files = sorted(
    path for path in input_path.iterdir()
    if CENTER in path.stem and FIELD in path.stem
)
# Combine them lazily (fixing dimensions per file) and persist the result on the cluster.
ecmwf = xr.open_mfdataset(field_files, preprocess=fix_dataset_dims).persist()

In [None]:
### Overall mean

In [None]:
# Mean of sst over every dimension.
sst_mean = ecmwf['sst'].mean().compute()
sst_mean

In [None]:
# Standard deviation of sst over every dimension.
sst_std = ecmwf['sst'].std().compute()
sst_std