# Fix wave or ripple in error

Our data has a ripple pattern in the error through time.
This notebook investigates it and tries to come up with a good explanation so that we can move forward.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import numpy as np
import os
import pandas as pd
import pathlib
import plotly.express as px

In [None]:
# Root of the data tree, read from the DATA_DIR environment variable.
# os.environ[...] is used rather than os.getenv(...) so that an unset variable fails
# with a KeyError naming DATA_DIR instead of an opaque TypeError raised by Path(None).
DATA_DIR = pathlib.Path(os.environ['DATA_DIR'])
# Directory whose *.parquet files are analysed in this notebook.
INPUT_DATASET = DATA_DIR / 'interpolated/2021-12-20-test/'

In [None]:
# INPUT_DATASET is already a pathlib.Path (it is built with the / operator), so this
# only creates an equivalent Path under a shorter name.
input_path = pathlib.Path(INPUT_DATASET)

In [None]:
# Take whichever parquet file the directory listing yields first; it is only used to
# inspect the schema. Raises StopIteration if the directory has no parquet file.
sample_path = next(input_path.glob('*.parquet'))

In [None]:
# Load that single file eagerly with pandas; in this notebook it is only used to
# obtain the list of available column names.
sample = pd.read_parquet(sample_path)

In [None]:
# Columns to load from every file: everything found in the sample except 'gdps_hpbl'.
# Built as a list (not a set) so that the column order is deterministic and the value
# matches the list-like `columns` argument documented for dd.read_parquet.
columns = [name for name in sample.columns if name != 'gdps_hpbl']

In [None]:
# Dask cluster whose workers are submitted as SLURM jobs named 'smc01-dask'.
# The env_extra commands source the user's bash profile and activate the 'smc01'
# conda environment for each job.
# No cores/memory/queue are passed here, so they presumably come from the
# dask-jobqueue configuration file -- TODO confirm.
# NOTE(review): recent dask-jobqueue releases rename `env_extra` to
# `job_script_prologue`; check the installed version before upgrading.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ~/.bash_profile','conda activate smc01'],
    name='smc01-dask',
)

In [None]:
# Request 8 SLURM jobs' worth of workers.
cluster.scale(jobs=8)

In [None]:
# Connect a distributed client to the SLURM cluster; it is used explicitly for
# client.persist(...) further down.
client = dask.distributed.Client(cluster)

In [None]:
client

In [None]:
# Lazily open every parquet file of the dataset as one dask dataframe, loading only the
# selected columns. The file list is sorted because glob yields paths in directory
# order, which would let the partition order (and therefore df.head()) vary between runs.
df = dd.read_parquet(sorted(input_path.glob('*.parquet')), columns=columns)


In [None]:
df.head()

In [None]:
# Ask the cluster to load the dataframe and keep it in worker memory, so the many
# aggregations below do not re-read the parquet files each time.
# Note that the derived columns added afterwards are not part of this persisted data.
df = client.persist(df)

In [None]:
# Derive the per-row quantities used by the analyses below.
# reset_index turns the current index into a regular column
# (NOTE(review): what the original index holds is not visible here -- confirm).
df = df.reset_index()
# Lead time in hours; 'step' is treated as seconds (it is converted to a timedelta
# with a seconds unit further down).
df['step_hour'] = df['step'] / 3600
# Signed error: obs_2t minus gdps_2t (presumably observation minus forecast -- confirm).
df['error_2t'] = df['obs_2t'] - df['gdps_2t']
# Squared error; the operand order is flipped relative to error_2t, which does not
# change the squared value.
df['squared_error_2t'] = (df['gdps_2t'] - df['obs_2t']) ** 2
# NOTE(review): despite its name this is the per-row square root of the squared error,
# i.e. |error_2t| -- the same values as 'mabs_2t' below, not an RMSE over a group.
df['rmse_2t'] = da.sqrt(df['squared_error_2t'])
# Per-row absolute error.
df['mabs_2t'] = np.abs(df['error_2t'])
# Month and hour of the 'date' column ('date' is the time that 'step' is later added
# to in order to obtain the valid time).
df['forecast_month'] = df['date'].dt.month
df['forecast_hour'] = df['date'].dt.hour

In [None]:
# Make 'date' the index so the valid time can be computed below as index + lead time.
df = df.set_index('date')

In [None]:
# 'step' (seconds) as a timedelta so it can be added to the datetime index.
# Lower-case 's' is the supported spelling of the seconds unit; the upper-case 'S'
# alias is deprecated in recent pandas releases.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')


In [None]:
# Valid time of each row: the index ('date') plus the lead time.
df['valid'] = df.index + df['step_td']


## Error by valid time

In [None]:
# Hour of day of the valid time.
df['valid_hour'] = df['valid'].dt.hour

In [None]:
# For every (valid hour, forecast hour, lead time) combination: the mean squared error
# and the number of non-null 'station' entries (i.e. the row count of the group).
# .compute() brings the aggregated result back as a pandas dataframe.
by_valid_hour = (
    df.groupby(['valid_hour', 'forecast_hour', 'step_hour'])
    .agg({'squared_error_2t': 'mean', 'station': 'count'})
    .compute()
)

In [None]:
# Turn the three group keys back into ordinary columns so plotly can use them as axes.
by_valid_hour = by_valid_hour.reset_index()


In [None]:
by_valid_hour

In [None]:
# Spread of the per-group mean squared error for each valid hour, one colour per
# forecast hour. Each sample is one (valid_hour, forecast_hour, step_hour) group.
px.box(
    by_valid_hour,
    x='valid_hour',
    y='squared_error_2t',
    color='forecast_hour',
)

In [None]:
# Same mean squared error, now laid out by lead time (step_hour) instead of valid hour,
# still coloured by forecast hour.
px.box(
    by_valid_hour,
    x='step_hour',
    y='squared_error_2t',
    color='forecast_hour',
)

In [None]:
# Number of rows per group by lead time. After the aggregation the 'station' column
# holds the group's row count, not a station name.
px.box(
    by_valid_hour,
    x='step_hour',
    y='station',
    color='forecast_hour',
)

In [None]:
by_valid_hour.head()

In [None]:
# Row counts per group by valid hour ('station' holds the count after the aggregation):
# checks whether some hours of the day have fewer rows than others.
px.box(
    by_valid_hour,
    x='valid_hour',
    y='station',
)

I bet some stations close at night and this is why we have a selection bias.

In [None]:
# Pull every row of the single station 'CYUL' into pandas, to look at the error
# pattern without any variation in which stations contribute.
cyul = df[df['station'] == 'CYUL'].compute()

In [None]:
# Per-row squared error against lead time for the CYUL station only.
px.box(
    cyul,
    x='step_hour',
    y='squared_error_2t',
)

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# Move 'date' from the index back to a regular column; the cells below use it through
# the .dt accessor and as a counted column.
df = df.reset_index()

In [None]:
# Calendar day of 'date' (time of day dropped), used as a per-day grouping key.
df['forecast_date'] = df['date'].dt.date

In [None]:
# Lazy (not yet computed): for each (station, forecast day), the number of non-null
# values in every remaining column.
daily_station_counts = df.groupby(['station', 'forecast_date']).count()

In [None]:
# Preview: average daily count per station. head() triggers a computation for the
# first few stations only.
daily_station_counts.groupby('station').mean().head()

In [None]:
# Materialise the per-(station, forecast day) counts as a pandas dataframe.
counts = daily_station_counts.compute()

In [None]:
# Distribution of the number of rows per (station, forecast day). After count() the
# 'date' column holds the number of non-null dates in the group, used here as the row
# count. The y axis is logarithmic.
px.histogram(
    counts,
    x='date',
    log_y=True,
)

In [None]:
# How many (station, forecast day) pairs have more than 150 rows.
len(counts[counts['date'] > 150])

In [None]:
# Median, over forecast days, of the daily counts of each station.
med = counts.groupby('station').median()

In [None]:
# Distribution across stations of the median daily row count ('date' holds counts
# here, not dates), with a logarithmic y axis.
px.histogram(
    med,
    x='date',
    log_y=True,
)

In [None]:
# Calendar day of the valid time, used as a per-day grouping key.
df['valid_date'] = df['valid'].dt.date

In [None]:
# Number of non-null values per column for each (valid day, station), brought back
# as a pandas dataframe.
valid_counts = df.groupby(['valid_date', 'station']).count().compute()

In [None]:
# Median, over valid days, of the per-day counts of each station.
by_station = valid_counts.groupby('station').median()

In [None]:
# Distribution across stations of the median per-valid-day count. 'latitude' is used
# only as a counted column here: after count()/median() it holds a row count, not a
# geographic latitude.
px.histogram(
    by_station,
    x='latitude',
)

In [None]:
# Day of the week of the valid time (pandas numbering: Monday=0 ... Sunday=6).
df['valid_day_of_week'] = df['valid'].dt.dayofweek

In [None]:
# Counts per (day of week, station). This overwrites the per-valid-day `valid_counts`
# computed earlier.
valid_counts = df.groupby(['valid_day_of_week', 'station']).count().compute()

In [None]:
# For each station, the number of non-null entries across its day-of-week groups.
# NOTE(review): the name suggests the smallest per-day count (.min()), but the code
# counts entries; the result is not used in the cells visible here -- confirm intent.
smallest_day = valid_counts.groupby('station').count()

In [None]:
valid_counts

In [None]:
# Distribution of the row count per (day of week, station); 'date' holds the count of
# non-null dates in each group.
px.histogram(valid_counts, x='date')

In [None]:
# Mean lead time for each valid hour (overwrites the earlier `by_valid_hour`).
# Only 'step_hour' is aggregated: it is the one column the following scatter plot reads,
# and df also carries string, date, datetime and timedelta columns for which a mean is
# either meaningless or rejected outright by pandas >= 2.0.
by_valid_hour = df.groupby('valid_hour')[['step_hour']].mean().compute()

In [None]:
# Mean lead time (y) against valid hour (x, taken from the index of the grouped
# result): shows whether some valid hours are dominated by particular lead times.
px.scatter(by_valid_hour, x=by_valid_hour.index, y='step_hour')

In [None]:
# Number of non-null values per column for each (station, valid hour), as a pandas
# dataframe.
hourly_counts = df.groupby(['station', 'valid_hour']).count().compute()

In [None]:
# Make 'station' and 'valid_hour' ordinary columns so they can be used by pivot().
hourly_counts = hourly_counts.reset_index()

In [None]:
# Station x valid-hour table of row counts, as a plain numpy array for plotting.
# The counted column is 'index' (presumably the column created by the first
# reset_index of df -- confirm); station/hour pairs with no rows become NaN.
nparray = (
    hourly_counts
    .pivot(index='station', columns='valid_hour', values='index')
    .to_numpy()
)

In [None]:
nparray.shape

In [None]:
# Heat map of the counts: one row per station, one column per valid hour. The figure
# is made very tall (4000 px) with a free aspect ratio, presumably so each station
# row stays visible.
px.imshow(nparray, width=600, height=4000, aspect='auto')

In [None]:
# NOTE(review): this only builds a GroupBy object and applies no aggregation, so the
# cell computes nothing -- it looks like an unfinished step of the analysis.
hourly_counts.groupby('station')