# ML Dataset

First attempt at building an ML dataset.
Each example consists of one forecast date and its corresponding observations.

We will use Arlan's top ten as features: https://docs.google.com/spreadsheets/d/1CzNvLExxJYhFAS_bWs97DON2xpG3zmDt-HHwQhqjpkE/edit#gid=0.

- *Surface Air Temperature (at 2 meter)*
- *Total precipitation*
- Soil moisture top 20 cm
- Snow Depth Water equivalent
- Sea Surface Temperature
- Sea Ice Cover
- Mean Sea Level Pressure
- Geopotential height @ 1000 hPa
- Geopotential height @ 200 hPa
- U-velocity (aka zonal wind) @ 200 hPa
- U-velocity (aka zonal wind) @ 850 hPa
- V-velocity (meridional wind) @ 200 hPa
- V-velocity (meridional wind) @ 850 hPa





In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to library code (presumably
# the crims2s package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import dask.distributed
import logging
import pathlib
import xarray as xr

from crims2s.data import normalize_dataset
from crims2s.dask import create_dask_cluster
from crims2s.mldataset import datestrings_from_input_dir, read_flat_fields, read_raw_obs
from crims2s.util import fix_dataset_dims

In [None]:
# Logger for messages emitted from this notebook.
_logger = logging.getLogger(__name__)

In [None]:
# Configure the root logger so that INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
# Display the logger's repr as a quick check of its name and level.
_logger

## Initialize dask cluster

In [None]:
# Filesystem locations used throughout the notebook. Every path is rooted at
# the same base directory placeholder followed by a '/' separator.

# Directory holding the per-field hindcast files
# (`{CENTER}-hindcast-{field}-{date}.nc`).
INPUT_DATA = '***BASEDIR***/training-input/'
# Fixed: the '/' between the base directory and 'processed' was missing, which
# produced a path outside the base directory, unlike every other constant here.
INPUT_DATA_SPLIT_PLEV = '***BASEDIR***/processed/training-input'
OBS_DATA = '***BASEDIR***/processed/training-output-reference/'
OBS_FILE_TEST = '***BASEDIR***/renku/forecast-like-observations_2020_biweekly_terciled.nc'
# Terciled biweekly observations used as training targets.
TRAIN_OBS = '***BASEDIR***/renku/hindcast-like-observations_2000-2019_biweekly_terciled.nc'
# Directory that receives the exported training examples.
OUTPUT_DIR = '***BASEDIR***/training/2021-07-24-first'

# Raw observation files handed to `read_raw_obs`.
RAW_OBS_T2M = '***BASEDIR***/raw-obs/obs_t2m_raw.nc'
RAW_OBS_PR = '***BASEDIR***/raw-obs/obs_pr_raw.nc'


# Forecasting center; used as the file-name prefix of the input files.
CENTER = 'ecmwf'

In [None]:
# Make sure the export directory exists: missing parents are created and an
# already-existing directory is not an error.
output_path = pathlib.Path(OUTPUT_DIR)
output_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Names of the fields passed to `read_flat_fields`.
FLAT_FIELDS = [
    't2m',
    'tp',
    'sm20',
    'sst',
    'ci',
    'msl',
]
# (field, pressure levels) pairs: each listed level of a field is meant to be
# extracted as its own variable.
MULTILEVEL_FIELDS = [
    ('u', [200, 850]),
    ('v', [200, 850]),
    ('gh', [1000, 200]),
]

In [None]:
# Open the terciled training observations file.
train_obs_terciled = xr.open_dataset(TRAIN_OBS)

In [None]:
# Load the raw t2m and precipitation observations through the crims2s helper.
raw_obs = read_raw_obs(RAW_OBS_T2M, RAW_OBS_PR)

In [None]:
raw_obs

In [None]:
# Forecast date strings found in the input directory for CENTER
# (presumably parsed from the file names by the helper; confirm in crims2s.mldataset).
input_path = pathlib.Path(INPUT_DATA)
forecast_times = datestrings_from_input_dir(input_path, CENTER)

In [None]:
# Peek at the first ten entries.
forecast_times[:10]

In [None]:
# Read the flat fields for the first forecast date as a sample to explore.
sample = read_flat_fields(input_path, CENTER, FLAT_FIELDS, forecast_times[0])

In [None]:
sample

In [None]:
# Select the raw observations at the sample's valid times.
raw_obs.sel(time=sample.valid_time.compute())

In [None]:
# Count missing values over forecast_year and realization for the lead time at
# index 1, then plot the count for the `siconc` variable.
# NOTE(review): FLAT_FIELDS requests 'ci' while this reads `.siconc`; presumably
# that is the variable name stored inside the file. Confirm.
sample.isel(lead_time=[1]).isnull().sum(dim=['forecast_year', 'realization']).siconc.compute().plot()

In [None]:
# Build the examples for the first forecast date.
# NOTE(review): `process_one_forecast_week` is neither imported nor defined in
# the visible part of this notebook; on a fresh kernel this cell would raise
# NameError. TODO confirm where it lives and import it.
examples = process_one_forecast_week(forecast_times[0], train_obs_terciled)

In [None]:
# Inspect the second element (the `y` of the export loop) of the example at index 10.
examples[10][1]

In [None]:
# Write each (x, y) example to its own NetCDF file, named after the forecast
# date carried by y. The names x, y and output_file are kept because later
# cells read them after the loop finishes.
for x, y in examples:
    forecast_time = y.forecast_time
    year, month, day = (
        int(getattr(forecast_time.dt, part)) for part in ('year', 'month', 'day')
    )
    filename = f'train_example_{year:04}{month:02}{day:02}.nc'
    output_file = output_path / filename

    # mode='w' (re)creates the file with the /x group; mode='a' then adds /y
    # to that same file.
    x.to_netcdf(output_file, mode='w', group='/x')
    y.to_netcdf(output_file, mode='a', group='/y')

In [None]:
# Plot one slice of the last y left over from the export loop:
# first variable, first lead time, first category.
y.isel(variable=0, lead_time=0, category=0).plot()

In [None]:
# List the files in the output directory with their sizes.
!ls -lh {OUTPUT_DIR}

In [None]:
# Re-open the /y group of the last written file to check the round trip.
tstt = xr.open_dataset(output_file, group='/y')

In [None]:
tstt

In [None]:
# Show the in-memory y for comparison with the re-opened dataset above.
y

In [None]:
forecast_times[:5]

In [None]:
# Open every flat field of the first forecast date as one combined dataset.
# Fixed: this iterated over `flat_fields`, a name that is not defined in the
# visible notebook (NameError); the field list is the FLAT_FIELDS constant.
filenames = [f'{INPUT_DATA}/{CENTER}-hindcast-{f}-{forecast_times[0]}.nc' for f in FLAT_FIELDS]
flat_dataset = (
    xr.open_mfdataset(filenames, preprocess=fix_dataset_dims)
    # Take index 0 along these two dimensions ...
    .isel(depth_below_and_layer=0, meanSea=0)
    # ... then remove the leftover scalar coordinates. `drop_vars` replaces
    # the deprecated `Dataset.drop`.
    .drop_vars(['depth_below_and_layer', 'meanSea'])
)

In [None]:
# Turn each (field, levels) entry into one single-level variable per level,
# named `<field><level>` (e.g. u200, u850).
flattened_fields = []

# Fixed: this iterated over `multilevel_fields`, a name that is not defined in
# the visible notebook (NameError); the list is the MULTILEVEL_FIELDS constant.
for field, levels in MULTILEVEL_FIELDS:
    filename = f'{INPUT_DATA}/{CENTER}-hindcast-{field}-{forecast_times[0]}.nc'
    ds = fix_dataset_dims(xr.open_dataset(filename))
    print(ds)
    for level in levels:
        # Select the level, remove the now-scalar `plev` coordinate so the
        # pieces can be merged, and rename the variable to carry its level.
        # `drop_vars` replaces the deprecated `Dataset.drop`.
        flattened_fields.append(
            ds.sel(plev=level).drop_vars('plev').rename({field: f'{field}{level}'})
        )

In [None]:
# Combine the per-level variables into a single dataset.
flattened_dataset = xr.merge(flattened_fields)

In [None]:
# Merge per-level and flat variables, then normalize with the crims2s helper
# (the exact normalization is not visible from this notebook).
ds = normalize_dataset(xr.merge([flattened_dataset, flat_dataset]))

In [None]:
ds

In [None]:
# Per-variable mean over all dimensions; presumably a sanity check of the normalization.
ds.mean().compute()

In [None]:
# Per-variable standard deviation over all dimensions.
ds.std().compute()

## Make examples from bigger dataset

In [None]:
# Build one feature array: take the first forecast year, stack the data
# variables along a new 'variable' dimension, take the first forecast
# day-of-year, and order the axes for export.
first_year = ds.isel(forecast_year=0)
stacked = first_year.to_array().isel(forecast_dayofyear=0)
to_export_x = stacked.transpose('lead_time', 'latitude', 'longitude', 'realization', 'variable')

In [None]:
to_export_x

In [None]:
# Write the feature array to NetCDF.
# NOTE(review): the target ends with '/' and has no file name, so it looks like
# a directory, not a file; this call presumably fails as written. Confirm the
# intended output file name.
to_export_x.to_netcdf('***BASEDIR***/mldataset/')

## Already terciled observations

In [None]:
# Re-declare the terciled observations path and open the file again; both
# statements repeat, with identical values, what the notebook already does
# earlier, so this section can be run on its own.
TRAIN_OBS = '***BASEDIR***/renku/hindcast-like-observations_2000-2019_biweekly_terciled.nc'
train_obs_terciled = xr.open_dataset(TRAIN_OBS)

In [None]:
train_obs_terciled