# Debias

Perform a basic *monthly debiasing* of the GDPS forecasts and compare the result to the raw model output.

In [None]:
# Load IPython's autoreload extension and enable it in mode 2.
%load_ext autoreload
%autoreload 2

In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import dask.distributed
import dask_jobqueue
import numpy as np
import os
import pandas as pd
import pathlib
import plotly.express as px

In [None]:
# Root data directory, taken from the DATA_DIR environment variable.
# os.environ[...] is used instead of os.getenv(...) so that an unset variable
# fails with a KeyError naming 'DATA_DIR' rather than the opaque TypeError
# raised by pathlib.Path(None).
DATA_DIR = pathlib.Path(os.environ["DATA_DIR"])
# Directory of the dataset used throughout this notebook.
INPUT_DATASET = DATA_DIR / '2021-12-20-gdps-metar/'

In [None]:
# INPUT_DATASET is already a pathlib.Path; this gives it the working name
# used by the rest of the notebook.
input_path = pathlib.Path(INPUT_DATASET)

In [None]:
# Pick one parquet file of the dataset to inspect its schema.
# Path.glob already returns an iterator, so next() can consume it directly;
# the None default lets us report an empty directory explicitly instead of
# surfacing a bare StopIteration.
sample_path = next(input_path.glob('*.parquet'), None)
if sample_path is None:
    raise FileNotFoundError(f"No .parquet files found in {input_path}")

In [None]:
# Load that single file eagerly with pandas; it is only used to list the
# available columns.
sample = pd.read_parquet(sample_path)

In [None]:
# Columns to load from every parquet file: everything found in the sample
# except 'gdps_hpbl', which is only partially available (see the note below
# this cell).
# Built as an ordered list rather than a set: read_parquet documents its
# `columns` argument as a str or list, and a set's iteration order is not
# stable from one run to the next.
columns = [name for name in sample.columns if name != 'gdps_hpbl']

GDPS HPBL is unavailable from 2019010100 to 2019031100. I think it's because the field was thinned out before we asked for it to be dearchived.
The partially available column causes problems in this notebook, so I remove the column.
Eventually we should make sure a `gdps_hpbl` column (filled with NaNs where the field is missing) is available in all the dataframes.

In [None]:
# Describe the SLURM jobs that will host the Dask workers. The two shell
# commands in env_extra (load the bash profile, activate the `smc01` conda
# environment) are handed to dask-jobqueue to be added to each job script.
# NOTE(review): recent dask-jobqueue releases rename `env_extra` to
# `job_script_prologue` — confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(
    env_extra=['source ~/.bash_profile','conda activate smc01'],
    name='smc01-dask',
)

In [None]:
# Ask the cluster to scale to 4 SLURM jobs.
cluster.scale(jobs=4)

In [None]:
# Connect this notebook to the cluster created above.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client (last expression of the cell).
client

In [None]:
# All parquet files of the dataset, in sorted path order.
# sorted() consumes the glob iterator directly and already returns a list.
dataset_files = sorted(input_path.glob('*.parquet'))

In [None]:
# Open all the parquet files as a single Dask dataframe, restricted to the
# columns selected above.
df = dd.read_parquet(dataset_files, columns=columns)

In [None]:
# Peek at the first rows to check that the load worked.
df.head()

In [None]:
# Turn the index into a regular column.
# NOTE(review): the 'date' column used in the next cells presumably comes
# from this index — confirm against the parquet schema.
df = df.reset_index()

In [None]:
# Latest value of 'date' in the dataset (triggers an actual computation).
df['date'].max().compute()

In [None]:
# Derived per-row columns used by the rest of the notebook.

# Lead time in hours; 'step' is in seconds (it is converted with a seconds
# unit further down).
df['step_hour'] = df['step'] / 3600

# Signed error, observation minus forecast.
df['error_2t'] = df['obs_2t'] - df['gdps_2t']

# Squared error; squaring removes the sign, so the signed error computed
# just above can be reused.
df['squared_error_2t'] = df['error_2t'] ** 2

# NOTE: despite its name, this is the square root of each row's own squared
# error, i.e. the absolute error of that row, not an aggregated RMSE.
df['rmse_2t'] = da.sqrt(df['squared_error_2t'])

# Absolute error.
df['mabs_2t'] = df['error_2t'].abs()

# Calendar month of 'date'; one of the keys the bias is grouped by.
df['forecast_month'] = df['date'].dt.month

In [None]:
# Index by date so the train/validation periods can be selected with .loc.
df = df.set_index('date')

In [None]:
# Lead time as a timedelta; 'step' is expressed in seconds.
# The lowercase 's' unit alias is used because pandas has deprecated the
# uppercase 'S' spelling.
df['step_td'] = dd.to_timedelta(df['step'], unit='s')

In [None]:
# Display the new column to check the conversion.
df['step_td']

In [None]:
# Split the data by date into a training and a validation period.
# String slices on a datetime index include both end points, and a bare date
# used as the upper bound covers that whole day. The training period
# therefore stops on 2020-12-31: ending it on '2021-01-01' would put that
# day's rows in both the training and the validation sets.
train = df.loc['2019-01-01':'2020-12-31']
val = df.loc['2021-01-01':'2022-01-01']

In [None]:
# Mean training error (observation minus forecast) for every
# (station, month, lead time) combination, computed and brought back as an
# in-memory result.
monthly_bias = train.groupby(['station', 'forecast_month', 'step_hour']).agg({'error_2t': 'mean'}).compute()

In [None]:
# The mean error of a group is the correction to apply to it: name it a bias.
monthly_bias = monthly_bias.rename(columns={'error_2t': 'bias_2t'})

In [None]:
# Move the group keys from the index back into columns so they can serve as
# merge keys.
monthly_bias = monthly_bias.reset_index()

In [None]:
# Attach to every training row the bias of its (station, month, lead time)
# group.
train = train.merge(monthly_bias, on=['station', 'forecast_month', 'step_hour'])

In [None]:
# bias_2t is a mean of (observation - forecast), so adding it shifts the
# forecast toward the observations.
train['gdps_corrected_2t'] = train['gdps_2t'] + train['bias_2t']

In [None]:
# Remaining error of the corrected forecast (observation minus forecast).
train['gdps_corrected_2t_error'] = train['obs_2t'] - train['gdps_corrected_2t']

In [None]:
# Squared error of the corrected forecast.
train['gdps_corrected_2t_se'] = train['gdps_corrected_2t_error'] ** 2

In [None]:
# Mean squared error per lead time on the training period, for the corrected
# forecast and for the raw forecast.
errors = train.groupby('step_hour').agg({'gdps_corrected_2t_se': 'mean', 'squared_error_2t': 'mean'}).compute()

In [None]:
# Reshape to long format (one row per lead time and metric) for plotting.
# Note that `errors` is rebound to the melted frame here.
errors = errors.reset_index().melt(id_vars=['step_hour'])

In [None]:
# Training-period mean squared error against lead time, one line per metric.
px.line(data_frame=errors, x='step_hour', y='value', color='variable')

In [None]:
# Display the bias table.
monthly_bias

In [None]:
# Apply the biases learned on the training period to the validation rows.
# NOTE(review): merge defaults to an inner join, so validation rows whose
# (station, month, lead time) combination has no training bias are dropped —
# confirm this is intended.
val = val.merge(monthly_bias, on=['station', 'forecast_month', 'step_hour'])

In [None]:
# Same three steps as for the training set: corrected forecast, its remaining
# error (observation minus forecast), and that error squared.
val['gdps_corrected_2t'] = val['gdps_2t'] + val['bias_2t']
val['gdps_corrected_2t_error'] = val['obs_2t'] - val['gdps_corrected_2t']
val['gdps_corrected_2t_se'] = val['gdps_corrected_2t_error'] ** 2

In [None]:
# Mean squared error per lead time on the validation period, for the
# corrected forecast and for the raw forecast.
errors = val.groupby('step_hour').agg({'gdps_corrected_2t_se': 'mean', 'squared_error_2t': 'mean'}).compute()


In [None]:
# Display the validation error table.
errors

In [None]:
# Long-format copy for plotting; `errors` itself stays in wide format because
# the skill score below is computed from its columns.
melt_errors = errors.reset_index().melt(id_vars=['step_hour'])

In [None]:
# Validation-period mean squared error against lead time, one line per metric.
px.line(data_frame=melt_errors, x='step_hour', y='value', color='variable')

In [None]:
# Check the column names before computing the skill score.
errors.columns

In [None]:
# Skill score of the corrected forecast relative to the raw one:
# 1 - MSE_corrected / MSE_raw. Positive values mean the correction lowered
# the mean squared error; 0 means no change.
errors['skill_score'] = 1.0 - (errors['gdps_corrected_2t_se'] / errors['squared_error_2t'])

In [None]:
# Display the table again, now including the skill score.
errors

In [None]:
# Skill score against lead time; the index of `errors` is step_hour (the
# groupby key).
px.line(data_frame=errors, x=errors.index, y='skill_score')

The skill of the monthly debiasing model decreases roughly linearly with forecast lead time (`step_hour`).
I suspect all of our models will exhibit the same behavior.
The next model to try is probably an EMOS model, but this will be a little more involved.
