In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Analyze the exported Parquet data using a Dask cluster.

In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import numpy as np
import os
import pathlib
import pandas as pd

In [None]:
# Root directory holding the exported datasets. It is configured through the
# DATA_DIR environment variable so that no absolute path is hard-coded here.
_data_dir_env = os.getenv('DATA_DIR')
if _data_dir_env is None:
    # pathlib.Path(None) would otherwise fail with an unhelpful TypeError.
    raise RuntimeError('The DATA_DIR environment variable must be set to the dataset root directory.')
DATA_DIR = pathlib.Path(_data_dir_env)

# Boot cluster

In [None]:
# Describe the SLURM jobs that will host the Dask workers. The `env_extra`
# shell lines are added to each job script before the worker is launched, so
# the worker starts inside the `smc01` conda environment.
# NOTE(review): no cores/memory/queue are passed here — presumably they come
# from the dask-jobqueue configuration file; confirm before running elsewhere.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ~/.bash_profile','conda activate smc01'],
    name='smc01-dask',
)

In [None]:
# Request two SLURM jobs' worth of workers for this cluster.
cluster.scale(jobs=2)

In [None]:
# Connect this notebook session to the cluster's scheduler; dask computations
# started below are submitted through this client.
client = dask.distributed.Client(cluster)

In [None]:
client

# Read dataset

In [None]:
INPUT_DIR = DATA_DIR / '2021-03-17-ppdataset/'

In [None]:
# Lazily open every parquet file of the exported dataset as one dask dataframe.
# Reuses INPUT_DIR (defined in the previous cell) instead of repeating the path.
df = dd.read_parquet(INPUT_DIR / '*.parquet')

In [None]:
df

In [None]:
# Start loading the dataframe on the cluster and keep the resulting partitions
# in worker memory, so the cells below do not re-read the parquet files.
df = df.persist()

In [None]:
# Count the rows whose `date` is 2019-06-01.
is_first_of_june = df['date'] == '2019-06-01'
is_first_of_june.sum().compute()

In [None]:
# Pull every row whose `date` is 2019-06-01 into a local pandas dataframe.
# NOTE: the name `subset` is rebound to a (lazy) dask dataframe further down,
# when the rows are filtered by station.
subset = df[df['date'] == '2019-06-01'].compute()

In [None]:
# Signed forecast error (GDPS value minus observation) and its square.
df['error_2t'] = df['gdps_2t'] - df['obs_2t']
df['squared_error_2t'] = df['error_2t'] ** 2
# `step` is handled as a duration in seconds: expose it in hours and as a timedelta.
df['step_hour'] = df['step'] / 3600
# Lowercase 's' is the canonical seconds unit; the uppercase 'S' alias is
# deprecated in recent pandas releases.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')
# Valid time of each row = `date` plus the step duration.
df['valid'] = df['date'] + df['step_td']

In [None]:
# Mean squared error per step (in hours). The column is selected *before*
# aggregating so only it is averaged; averaging every column first is wasteful
# and can fail on non-numeric columns such as `station`.
error_by_step = df.groupby('step_hour')['squared_error_2t'].mean().compute()

In [None]:
# Mean squared error against step hour. Note this is the squared error as-is
# (no square root), unlike the per-date plot below.
error_by_step.plot()

In [None]:
# Mean squared error per `date`. The column is selected *before* aggregating
# so only it is averaged; averaging every column first is wasteful and can
# fail on non-numeric columns such as `station`.
error_by_run = df.groupby('date')['squared_error_2t'].mean().compute()

In [None]:
# Square root of the per-date mean squared error, i.e. the RMSE of each date.
np.sqrt(error_by_run).plot()

In [None]:
# Number of rows per station, brought back to the notebook as a pandas object.
obs_counts = df['station'].value_counts().compute()

In [None]:
obs_counts

In [None]:
obs_counts > 30

In [None]:
type(obs_counts)

In [None]:
# Identifiers of the stations that have more than 30 rows in the dataset.
has_enough_obs = obs_counts > 30
stations_with_obs = obs_counts[has_enough_obs].index.tolist()

In [None]:
# Keep only the rows of the stations that passed the row-count filter.
is_kept_station = df['station'].isin(stations_with_obs)
subset = df[is_kept_station]

In [None]:
subset

In [None]:
# Sanity check: the least-represented remaining station should still have
# more than 30 rows.
subset['station'].value_counts().compute().min()

In [None]:
# Turn `station` into a categorical column with known categories; the
# `.cat.codes` and `get_dummies` calls below operate on that categorical.
subset = subset.categorize(columns=['station'])

In [None]:
subset = subset.persist()

In [None]:
# Integer category code of each row's station, exposed as a dask array.
# Its chunk sizes are not known yet; they are resolved later with
# `compute_chunk_sizes()` before the array is indexed.
station_ids = subset['station'].cat.codes.values

In [None]:
station_ids

In [None]:
# Predictor columns: every column whose name starts with 'gdps'.
feature_columns = list(filter(lambda column: column.startswith('gdps'), subset.columns))

In [None]:
feature_columns

In [None]:
features = subset[feature_columns]

In [None]:
features.mean().compute()

In [None]:
features.std().compute()

In [None]:
# One-hot encoding of the station category (one indicator column per station).
# NOTE(review): not used by any later cell visible in this notebook.
one_hot_station = dd.get_dummies(subset['station'])

In [None]:
one_hot_station

In [None]:
subset['valid']

In [None]:
# Sine encoding of where the valid time falls within the year (366-day period).
year_fraction = subset['valid'].dt.dayofyear / 366
subset['yearly_component'] = da.sin(year_fraction * 2*np.pi)

In [None]:
# Sine encoding of the valid hour within the day (24-hour period).
day_fraction = subset['valid'].dt.hour / 24
subset['daily_component'] = da.sin(day_fraction * 2*np.pi)

In [None]:
# Step duration divided by 237 hours expressed in seconds.
# NOTE(review): 237 h is presumably the longest step in the dataset, which
# would map this feature into [0, 1] — confirm against the data.
subset['step_component'] = (subset['step'] / (237 * 60 * 60))

In [None]:
temporal_features = subset[['yearly_component', 'daily_component', 'step_component']]

In [None]:
temporal_features.describe().compute()

In [None]:
temporal_features_array = temporal_features.to_dask_array()

In [None]:
temporal_features_array

In [None]:
station_ids

In [None]:
# Stack the temporal encodings and the 'gdps' feature columns side by side
# (axis=1). Both arrays come from dask dataframes, so their row-chunk sizes
# are unknown; `allow_unknown_chunksizes` lets dask concatenate them anyway.
features_array = da.concatenate([temporal_features_array, features.to_dask_array()], axis=1, allow_unknown_chunksizes=True)

In [None]:
# Resolve the unknown chunk sizes (updates the array in place) so the array
# can be indexed with the boolean masks below.
features_array.compute_chunk_sizes()

In [None]:
# Boolean row mask (dask array) selecting the rows dated in 2019.
row_year = subset['date'].dt.year
train_mask = (row_year == 2019).values

In [None]:
# Boolean row mask (dask array) selecting the rows dated in 2020.
row_year = subset['date'].dt.year
val_mask = (row_year == 2020).values

In [None]:
train_mask.compute_chunk_sizes()

In [None]:
val_mask.compute_chunk_sizes()

In [None]:
train_features = features_array[train_mask]

In [None]:
val_features = features_array[val_mask]

In [None]:
# Boolean-mask indexing leaves the chunk sizes unknown again: resolve them
# first, then rechunk the filtered array with dask's default chunking.
train_features.compute_chunk_sizes()
train_features = train_features.rechunk()

In [None]:
train_features

In [None]:
train_features_comp = train_features.compute()

In [None]:
# `train_features_comp` is the NumPy array returned by `.compute()`. NumPy
# arrays have no `to_hdf5` method (that is a dask.array API), so calling it
# here raises AttributeError. The HDF5 export is done with h5py in the cells
# below; this cell only inspects what is about to be written.
train_features_comp.shape, train_features_comp.dtype

In [None]:
station_ids_comp = station_ids.compute()

In [None]:
import h5py

In [None]:
# Open the output HDF5 file for writing; mode 'w' truncates any existing file.
# The handle stays open until the `f.close()` cell at the end of the notebook.
f = h5py.File(DATA_DIR / '2021-03-17-ppdataset/test.hdf', 'w')

In [None]:
# Write the training feature matrix, LZF-compressed. The dataset name carries
# no trailing slash, matching the other dataset names used in this notebook.
f.create_dataset(
    '/train/x',
    shape=train_features_comp.shape,
    dtype=train_features_comp.dtype,
    data=train_features_comp,
    compression='lzf',
)

In [None]:
def array_to_hdf(file, name, array, **dataset_kwargs):
    """Store ``array`` in an open HDF5 file as the dataset ``name``.

    Args:
        file: Object exposing ``create_dataset`` (an open, writable
            ``h5py.File`` in this notebook).
        name: Path of the dataset inside the file, e.g. ``'/train/y'``.
        array: Array-like with ``shape`` and ``dtype`` attributes; its values
            become the content of the dataset.
        **dataset_kwargs: Extra keyword arguments forwarded unchanged to
            ``create_dataset`` (for example ``compression='lzf'``).

    Returns:
        None.
    """
    file.create_dataset(name, shape=array.shape, dtype=array.dtype, data=array, **dataset_kwargs)

In [None]:
f.create_dataset('/train/stations', shape=train_features_comp.shape, dtype=train_features_comp.dtype, data=train_features_comp, compression='lzf')

In [None]:
array_to_hdf(f, '/train/stations', station_ids_comp)

In [None]:
# Signed error column as a dask array; after masking with `train_mask` it is
# written to '/train/y' below.
train_y = subset['error_2t'].to_dask_array()

In [None]:
train_y.compute_chunk_sizes()

In [None]:
train_y_comput = train_y[train_mask].compute()

In [None]:
array_to_hdf(f, '/train/y', train_y_comput)

In [None]:
val_x_compute = val_features.compute()

In [None]:
# Make sure the mask has known chunk sizes before it is used for indexing
# (the method is `compute_chunk_sizes`, plural).
val_mask.compute_chunk_sizes()

In [None]:
station_ids.compute_chunk_sizes()

In [None]:
# Station codes of the validation rows (still a lazy dask array).
# NOTE(review): `val_stations` is neither computed nor written to the HDF5
# file in the cells visible here.
val_stations = station_ids[val_mask]

In [None]:
# Write the validation feature matrix computed above; the array argument was
# missing from this call.
# NOTE(review): validation stations and targets are not written in the cells
# visible here, unlike their '/train/...' counterparts.
array_to_hdf(f, '/val/x', val_x_compute)

In [None]:
f.close()