In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project .py files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Export a dataset suitable for machine learning to the HDF5 format.
The exported dataset should contain the following features for both the training and validation splits:

* Numerical station ID, to be expanded to one-hot on training.
* Temporal features
* Station features
* Target value (in this case the 2t error)

In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import numpy as np
import os
import pathlib
import pandas as pd
import seaborn as sns

In [None]:
# Root directory of the project data, taken from the DATA_DIR environment
# variable. os.environ is indexed directly so that a missing variable fails
# with "KeyError: 'DATA_DIR'" rather than the opaque TypeError that
# pathlib.Path(None) raises when os.getenv finds nothing.
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])

# Boot cluster

In [None]:
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ~/.bash_profile','conda activate smc01'],
    name='smc01-dask',
)

In [None]:
cluster.scale(jobs=2)

In [None]:
client = dask.distributed.Client(cluster)

In [None]:
client

# Read dataset

In [None]:
INPUT_DIR = DATA_DIR / '2021-03-17-ppdataset/'

In [None]:
df = dd.read_parquet(DATA_DIR / '2021-03-17-ppdataset/*.parquet')

In [None]:
# Show the lazy dask dataframe.
df

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Disabled: uncomment to persist the full dataframe on the cluster.
#df = df.persist()

Adding pertinent fields

In [None]:
df['error_2t'] = df['gdps_2t'] - df['obs_2t']
df['squared_error_2t'] = (df['gdps_2t'] - df['obs_2t']) ** 2
df['step_hour'] = df['step'] / 3600
df['step_td'] = dd.to_timedelta(df['step'], unit='S')
df['valid'] = df['date'] + df['step_td']
df['forecast_hour'] = df['date'].dt.hour

In [None]:
df['abs_error_2t'] = np.abs(df['error_2t'])

Select the stations that have more than 30 observations in total.

In [None]:
obs_counts = df['station'].value_counts().compute()

In [None]:
obs_counts

In [None]:
stations_with_obs = obs_counts.index[obs_counts > 30].tolist()

In [None]:
subset = df[df['station'].isin(stations_with_obs)]

In [None]:
subset

In [None]:
subset = subset.persist()

In [None]:
subset.groupby('step')['abs_error_2t'].mean().compute().plot()

In [None]:
grouped = subset[subset['step_hour'] < 48].groupby(['step', 'forecast_hour']).mean().compute()

In [None]:
sns.set_theme()

In [None]:
sns.scatterplot(x='step_hour', y='squared_error_2t', hue='forecast_hour', markers=True, data=grouped)

In [None]:
counts = subset[subset['step_hour'] < 48].groupby(['step_hour', 'forecast_hour']).count().compute()

In [None]:
counts

In [None]:
sns.scatterplot(x='step_hour', y='squared_error_2t', hue='forecast_hour', data=counts)

## Generating station ids

In [None]:
# Pull the station column of every retained row into a local pandas Series.
stations = subset['station'].compute()

In [None]:
# A categorical dtype assigns each distinct station an integer code.
stations = stations.astype('category')

In [None]:
# Integer code of each row's station.
station_ids = stations.cat.codes

In [None]:
# Plain NumPy copy of the codes.
station_ids = np.array(station_ids)

## Generating features

In [None]:
subset.columns

In [None]:
feature_columns = [c for c in subset.columns if c.startswith('gdps')]

In [None]:
feature_columns = ['latitude', 'longitude', 'elevation', *feature_columns]

In [None]:
feature_columns

In [None]:
features = subset[feature_columns]

In [None]:
features_array = features.to_dask_array()

Apply a very crude rescaling of the values.
Some columns might warrant a dedicated rescaling (wdirs which are angles, prate which have an exponential distribution)

In [None]:
features_array = (features_array - features_array.mean(axis=0)) / features_array.std(axis=0)

In [None]:
features_array

In [None]:
# Mean 2t error per lead time. Group on the dataframe and then select the
# column: grouping the bare `error_2t` Series by the label 'step' raises
# KeyError because the Series carries no such key. The result is computed so
# the cell displays values rather than a lazy graph.
subset.groupby('step')['error_2t'].mean().compute()

## Timestamps for future reference

In [None]:
subset['timestamp'] = subset['date'].astype('int64') // 10**9

In [None]:
subset['step'].head()

In [None]:
time_reference = subset[['timestamp', 'step']].to_dask_array()

In [None]:
time_reference

## Temporal features

In [None]:
# Seasonal signal: sine of the valid time's day of year mapped onto one full
# period of 366 days.
# NOTE(review): a sine alone maps two different days of the year to the same
# value; a matching cosine column would disambiguate — confirm this is intended.
subset['yearly_component'] = da.sin((subset['valid'].dt.dayofyear / 366) * 2*np.pi )

In [None]:
# Diurnal signal: sine of the valid time's hour of day over a 24-hour period.
subset['daily_component'] = da.sin((subset['valid'].dt.hour / 24) * 2*np.pi)

In [None]:
# Lead time in seconds divided by 237 hours expressed in seconds.
# NOTE(review): 237 h is presumably the longest lead time in the dataset,
# which would put this feature in [0, 1] — confirm.
subset['step_component'] = (subset['step'] / (237 * 60 * 60))

In [None]:
# Lazy 2-D array holding the three temporal columns.
temporal_features = subset[['yearly_component', 'daily_component', 'step_component']].to_dask_array()

## Target value

In this case 2t error.

In [None]:
# NOTE(review): the surrounding text describes the target as the 2t error,
# but the column exported here is the raw observation `obs_2t`, not
# `error_2t` — confirm which one is intended.
target_features = subset[['obs_2t']].to_dask_array()

In [None]:
target_features

In [None]:
# Mean of the target over all rows.
target_features.mean().compute()

In [None]:
# Standard deviation of the target over all rows.
target_features.std().compute()

In [None]:
# Rescale the target by a fixed constant.
# NOTE(review): 15.0 is presumably a rounded value of the standard deviation
# computed above — confirm.
target_features = target_features / 15.0

## Get sizes of arrays

In [None]:
# Compute the chunk sizes of each lazy array derived from the dataframe
# before the arrays are concatenated, rechunked and masked further down.
temporal_features.compute_chunk_sizes()

In [None]:
features_array.compute_chunk_sizes()

In [None]:
target_features.compute_chunk_sizes()

In [None]:
time_reference.compute_chunk_sizes()

In [None]:
# Alias of the station id array, which is already a NumPy array.
# NOTE(review): this alias is not referenced again in the visible notebook.
station_ids_compute = station_ids

## Merging everything together

In [None]:
features_array = da.concatenate([temporal_features, features_array], axis=1)

In [None]:
features_array = features_array.rechunk((100000, 26))

Train/validation split: forecasts dated 2019 are used for training and those dated 2020 for validation.

In [None]:
# Boolean mask (lazy array) selecting the rows whose forecast date is in 2019.
train_mask = (subset['date'].dt.year == 2019).values

In [None]:
# Boolean mask (lazy array) selecting the rows whose forecast date is in 2020.
val_mask = (subset['date'].dt.year == 2020).values

In [None]:
train_mask.compute_chunk_sizes()

In [None]:
val_mask.compute_chunk_sizes()

In [None]:
# Training split: the same row mask applied to every array.
# NOTE(review): `station_ids` is a NumPy array while the mask is a dask array;
# this indexing presumably forces the mask to be computed locally — confirm.
train_features = features_array[train_mask]
train_targets = target_features[train_mask]
train_stations = station_ids[train_mask]
train_time = time_reference[train_mask]

In [None]:
# Validation split, built the same way from the 2020 mask.
val_features = features_array[val_mask]
val_targets = target_features[val_mask]
val_stations = station_ids[val_mask]
val_time = time_reference[val_mask]

# Export to HDF5

In [None]:
import h5py

In [None]:
# (dataset path, data, HDF5 chunk shape) for every array to export.
# Chunk shapes are 10000 rows by the full width of each array.
exports = [
    ('/train/x', train_features, (10000, 26)),
    ('/train/stations', train_stations, (10000,)),
    ('/train/y', train_targets, (10000, 1)),
    ('/train/time', train_time, (10000, 2)),
    ('/val/x', val_features, (10000, 26)),
    ('/val/stations', val_stations, (10000,)),
    ('/val/y', val_targets, (10000, 1)),
    ('/val/time', val_time, (10000, 2)),
]

# Write everything into a single HDF5 file. The context manager closes the
# file even if one of the writes raises, so a failed export never leaves an
# open handle behind.
with h5py.File(DATA_DIR / '2021-03-17-ppdataset/full.hdf', 'w') as f:
    for dataset_path, data, chunk_shape in exports:
        f.create_dataset(dataset_path, data=data, compression='lzf', chunks=chunk_shape)

In [None]:
# Largest integer station code among the retained rows. Categorical codes
# start at 0, so the number of distinct stations is this value plus one
# (assuming no missing station values — confirm).
station_ids.max()