# Time series model

An experiment in building a simple time-series-based model for the post-processing problem.
The baseline is persistence: predict that the forecast will have the same bias it had recently.

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import hydra
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import tqdm.notebook as tqdm
import xarray as xr

In [None]:
from smc01.postprocessing.dataset import SMCParquetDataset
from smc01.postprocessing.transform import DataframeToXarray

In [None]:
# Compose the 'validate' config from the packaged config module, selecting the
# time-series xarray dataset and overriding its begin/end dates and its
# maximum window size.
with hydra.initialize_config_module(config_module='smc01.postprocessing.conf'):
    cfg = hydra.compose(
        config_name='validate',
        overrides=[
            'experiment/dataset=gdps_metar_timeseries_xarray',
            'experiment.dataset.begin=2019-02-01',
            'experiment.dataset.end=2020-01-01',
            'experiment.dataset.max_window_size=30',
        ],
    )

In [None]:
# Instantiate the dataset object described by the composed config.
d = hydra.utils.instantiate(cfg.experiment.dataset)

In [None]:
# Display the first item of the dataset to see what one sample looks like.
d[0]

In [None]:
# Keep the first item around as a small sample to explore.
small_series = d[0]

In [None]:
# Bias of the sample, computed here as observation minus forecast.
# NOTE(review): the persistence loop later in this notebook rebinds `bias`
# using the opposite sign (gdps_2t - obs_2t).
bias = small_series.obs_2t - small_series.gdps_2t

In [None]:
# Plot the station-averaged bias against the step coordinate; the step
# coordinate is cast to float before plotting.
mean_bias = bias.mean(dim='station')
step_as_float = bias.step.astype(float)
mean_bias.assign_coords(step=step_as_float).plot()

## Simple persistence model

In [None]:
# Scratch cell: the same cutoff/mask computation the persistence loop below
# performs for each batch.
# NOTE(review): `batch` is first bound by that loop, so this cell appears to
# have been run out of order; in a fresh top-to-bottom run it would fail on
# the undefined name.
target_date = batch.date[-1]
obs_exists_mask = batch.obs_valid < target_date

In [None]:
# Scratch: raw forecast error. Relies on `obs` and `forecast`, which are bound
# by the persistence loop cell below.
obs - forecast

In [None]:
# Scratch: inspect `raw_error`, which is bound by the persistence loop cell below.
raw_error

In [None]:
# Scratch: the gdps_2t forecast at the last date of the current `batch`.
batch.gdps_2t.isel(date=-1)

In [None]:
# Scratch: inspect whatever `bias` currently holds (it is bound both by the
# sample cell above and by the persistence loop cell below).
bias

In [None]:
# Scratch: inspect the mask of observations valid strictly before the target date.
obs_exists_mask

In [None]:
# One small error table is collected per dataset item; they are concatenated
# in a later cell.
error_dfs = []

for batch in tqdm.tqdm(d):
    # The last date of the batch is the forecast being scored.
    target_date = batch.date[-1]
    forecast = batch.gdps_2t.isel({'date': -1})
    obs = batch.obs_2t.isel({'date': -1})
    raw_error = obs - forecast

    # Persistence estimate of the bias (forecast minus observation), averaged
    # over the dates of the batch. Observations whose obs_valid is not strictly
    # earlier than the target date are masked out and skipped by the mean.
    obs_exists_mask = batch.obs_valid < target_date
    bias = (batch.gdps_2t - batch.obs_2t.where(obs_exists_mask)).mean(dim='date', skipna=True)

    # Remove the estimated bias from the forecast and score the result.
    corrected_forecast = forecast - bias
    corrected_error = obs - corrected_forecast

    # Station-averaged squared errors for the corrected and the raw forecast,
    # stored side by side in one table.
    error_df = (corrected_error ** 2).mean(dim='station').to_dataframe(name='corrected_squared_error')
    error_df['raw_squared_error'] = (raw_error ** 2).mean(dim='station')

    error_dfs.append(error_df)

In [None]:
# Stack the per-batch error tables into a single dataframe.
# Note: this rebinds `error_df`, which the loop used for each batch's table.
error_df = pd.concat(error_dfs)

In [None]:
# Reshape to long format: 'date' and 'step' stay as identifier columns, the two
# squared-error columns are unpivoted into 'variable' (which error) and
# 'value' (the squared error).
error_df_melt = pd.melt(
    error_df.reset_index(),
    id_vars=['date', 'step'],
    value_vars=['corrected_squared_error', 'raw_squared_error'],
)

In [None]:
# Root-mean-square error per error kind and step: average the squared errors
# within each (variable, step) group, then take the square root.
# Only the 'value' column is aggregated. The melted frame also carries the
# 'date' identifier column, and averaging it along with 'value' would hand a
# non-numeric column to np.sqrt on pandas versions that no longer drop such
# columns from groupby means.
grouped_error_df = np.sqrt(
    error_df_melt.groupby(['variable', 'step'])[['value']].mean()
).reset_index()

In [None]:
# Line chart of the grouped error against step, one line per error kind
# (bias-corrected vs. raw forecast).
px.line(
    data_frame=grouped_error_df,
    x='step',
    y='value',
    color='variable',
    title='Error of persistence model and raw model (2019)',
    labels={'value': 'Average Error (°C)'},
)