In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

# Error against step

One very useful graph is the mean error plotted against the step.
We plot it in this notebook.

In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import matplotlib.pyplot as plt
import numpy as np
import os
import pathlib
import pandas as pd
import seaborn as sns

In [None]:
# Apply seaborn's default theme to the figures drawn below.
sns.set_theme()

# Boot cluster

In [None]:
# SLURM-backed Dask cluster named 'smc01-dask'.
# env_extra lists shell commands for the job script: source the user's
# bash profile, then activate the 'smc01' conda environment.
cluster = dask_jobqueue.SLURMCluster(
    name='smc01-dask',
    env_extra=[
        'source ~/.bash_profile',
        'conda activate smc01',
    ],
)

In [None]:
# Ask the cluster to scale to 6 jobs.
cluster.scale(jobs=6)

In [None]:
# Connect a distributed client to the cluster created above.
client = dask.distributed.Client(cluster)

In [None]:
# Show the client as the cell output.
client

# Read dataset

In [None]:
# Root data directory, taken from the DATA_DIR environment variable.
# os.environ[...] raises a KeyError naming the variable when it is unset,
# instead of pathlib.Path(None) failing with an unrelated TypeError.
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
# Glob pattern for the parquet files of the 2021-05-11 dataset.
INPUT = DATA_DIR / 'interpolated/2021-05-11-ppdataset/*.parquet'

In [None]:
# Read the parquet files matched by INPUT into a Dask dataframe.
df = dd.read_parquet(INPUT)

In [None]:
# Derived columns used by the rest of the notebook.

# 'step' is interpreted as seconds (see the to_timedelta unit below), so
# dividing by 3600 expresses it in hours.
df['step_hour'] = df['step'] / 3600

# Signed error: observation minus GDPS value.
df['error_2t'] = df['obs_2t'] - df['gdps_2t']
# Built from error_2t so that a single sign convention is used; the squared
# value is the same whichever way the difference is taken.
df['squared_error_2t'] = df['error_2t'] ** 2

# NOTE: this is the square root of each row's squared error, i.e. the
# per-row absolute error (same values as 'mabs_2t'). Averaging this column
# gives a mean absolute error, not a root-mean-square error.
df['rmse_2t'] = da.sqrt(df['squared_error_2t'])
df['mabs_2t'] = np.abs(df['error_2t'])

# Lowercase 's' is the seconds unit; the uppercase 'S' alias is deprecated
# in recent pandas releases.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')
# Validity time = 'date' plus the step offset.
df['valid'] = df['date'] + df['step_td']

In [None]:
# Materialise the first rows of the dataframe for a quick look.
head = df.head()

In [None]:
# Show the sample rows as the cell output.
head

In [None]:
# List the available columns, including the derived ones.
head.columns

In [None]:
# Per-step mean of the columns, grouped by 'step_hour' (not computed yet).
by_step_mean = df.groupby('step_hour').mean()

In [None]:
# Per-step counts restricted to the rows of station 'CYUL'.
by_step_count = (
    df[df['station'] == 'CYUL']
    .groupby('step_hour')
    .count()
)

In [None]:
# Compute the per-step count of the 'rmse_2t' column; the result is indexed
# by 'step_hour'.
step_counts = by_step_count['rmse_2t'].compute()

In [None]:
# Show the per-step counts as the cell output.
step_counts

In [None]:
# Compute the per-step means. The notebook never defines a `by_step`
# variable, so the lazy groupby result `by_step_mean` is used here;
# otherwise this cell raises NameError on a fresh kernel.
by_step_comp = by_step_mean.compute()

In [None]:
# Show the per-step mean of the 'rmse_2t' column.
by_step_comp['rmse_2t']

In [None]:
# step_counts is a Series named 'rmse_2t' and indexed by 'step_hour'.
# Long-form plotting looks up x/y by column name, so the Series is first
# turned into a two-column DataFrame with reset_index().
# Note: the 'rmse_2t' values plotted here are counts, not errors.
sns.lineplot(x='step_hour', y='rmse_2t', data=step_counts.reset_index())

In [None]:
# Same counts plotted by passing the Series directly as `data`.
sns.lineplot(data=step_counts)

In [None]:
# Per-station counts for every column, computed eagerly.
obs_count_by_station = df.groupby('station').count().compute()

In [None]:
# Show the per-station counts as the cell output.
obs_count_by_station

In [None]:
# Empirical CDF of the per-station 'rmse_2t' counts, drawn along the y axis.
sns.ecdfplot(data=obs_count_by_station, y='rmse_2t')

In [None]:
# Stations whose 'rmse_2t' count exceeds 110000.
good_stations = obs_count_by_station.loc[
    obs_count_by_station['rmse_2t'] > 110000
].index

In [None]:
# Number of stations that passed the count threshold.
len(good_stations)

In [None]:
# Alternative kept for reference: restrict to station 'CYUL' and to rows
# whose 'date' has hour 12 before grouping.
#by_step_mean = df[(df['station'] == 'CYUL') & (df['date'].dt.hour == 12)].groupby('step_hour').mean()
# Active definition: per-step mean over all rows.
by_step_mean = df.groupby('step_hour').mean()

In [None]:
# Run the groupby and bring the per-step means into memory.
by_step_mean_comp = by_step_mean.compute()

In [None]:
# Show the per-step means as the cell output.
by_step_mean_comp

In [None]:
# RMSE per step is the square root of the mean squared error. The 'rmse_2t'
# column holds the per-row sqrt of the squared error (the absolute error),
# so its mean would be a mean absolute error rather than an RMSE.
rmse_by_step = np.sqrt(by_step_mean_comp['squared_error_2t'])

fig, ax = plt.subplots()
# The groupby key 'step_hour' is the index of the aggregated frame.
sns.lineplot(x=rmse_by_step.index, y=rmse_by_step.values, ax=ax)
ax.set_title('GDPS RMSE through steps for 2t')
ax.set_xlabel('Step (hours)')
ax.set_ylabel('RMSE (°C)')
plt.savefig(DATA_DIR / 'rmse.png', dpi=300)

## Time series for one station

In [None]:
# Rows for station 'CYUL' with 'date' strictly between 2020-1-1 and
# 2020-2-1, computed into memory.
cyul = df[
    (df['station'] == 'CYUL')
    & (df['date'] > '2020-1-1')
    & (df['date'] < '2020-2-1')
].compute()

In [None]:
# Show the CYUL subset as the cell output.
cyul

In [None]:
fig, ax = plt.subplots()
# Keep only the 48 h and 228 h steps so the two can be compared.
selected_steps = cyul[(cyul['step_hour'] == 228) | (cyul['step_hour'] == 48.0)]
sns.lineplot(x='valid', y='squared_error_2t', hue='step_hour', data=selected_steps, ax=ax)
ax.set_xlabel('Validity time')
# The plotted column is the squared error, so the labels say so rather than
# presenting it as the raw 2t value.
ax.set_ylabel('Squared error of 2t (°C²)')
ax.set_title('Squared error of 2t at CYUL')
plt.xticks(rotation=45)
plt.savefig(DATA_DIR / 'obs2t.png', dpi=300, bbox_inches='tight')