# Try to fit real hindcasts

In [1]:
# Change the working directory to this notebook's folder so that the relative
# paths used later in the notebook (e.g. DATA_DIR) resolve.
# NOTE(review): this is an absolute, user-specific path — the notebook will only
# run as-is on a filesystem where this directory exists.
%cd /g/data/xv83/users/ds0092/active_projects/Squire_2022_correlation/notebooks/exploratory

/g/data/xv83/users/ds0092/active_projects/Squire_2022_correlation/notebooks/exploratory


In [2]:
import xarray as xr

import numpy as np

import pandas as pd

from src import utils, data, ar_model

import warnings

from statsmodels.tsa.api import VAR
from statsmodels.tsa.ar_model import AutoReg

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# raised by the fits below are hidden too.
warnings.filterwarnings("ignore")

In [3]:
# autoreload (mode 2): re-import modules before each cell runs, so edits to the
# local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# lab_black: JupyterLab extension that formats code cells with black.
%load_ext lab_black

In [4]:
# Directory holding the processed zarr stores, relative to the working directory
# set at the top of the notebook.
# NOTE(review): the value ends with "/" and the paths below are built as
# f"{DATA_DIR}/...", so they contain a double slash.
DATA_DIR = "../../data/processed/"

# Develop/test code with some observation data

The function developed has since been copied into `src.ar_model`. Here we retain a few tests.

In [5]:
# --- AMV index from HadISST sea surface temperature -------------------------
# [12, 1, 2, 3] is the list of months handed to the index helper.
HadISST = xr.open_zarr(f"{DATA_DIR}/tos_HadISST.zarr", use_cftime=True)
AMV = utils.calculate_period_AMV_index(HadISST["sst"], [12, 1, 2, 3])
AMV = AMV.rename("AMV").compute()
AMV = utils.round_to_start_of_month(AMV, dim="time")

# --- NAO index from HadSLP sea level pressure --------------------------------
# Same months as for the AMV; note that no month-start rounding is applied here.
HadSLP = xr.open_zarr(f"{DATA_DIR}/psl_HadSLP2r.zarr", use_cftime=True)
NAO = utils.calculate_period_NAO_index(HadSLP["slp"], [12, 1, 2, 3])
NAO = NAO.rename("NAO").compute()

# --- Combine into a single observation dataset -------------------------------
# Align the two indices on their coordinates, merge them into one Dataset and
# keep the first 148 time steps: an even count, so the series can later be
# divided into two halves of equal length.
AMV, NAO = xr.align(AMV, NAO)
obsv = xr.merge((AMV, NAO))
obsv = obsv.isel(time=slice(148))

### Check my AR model fit

In [6]:
# Number of lags (model order) passed to the fits in the checks below.
n_lags = 2

In [7]:
# Fit the project's AR model to the AMV series alone (univariate case), using
# n_lags lags along "time"; the result is shown as a DataFrame so it can be
# compared by eye with the statsmodels fit in the next cell.
my_params = ar_model.fit(obsv[["AMV"]], n_lags=n_lags, dim="time")
my_params.to_dataframe()

Unnamed: 0_level_0,AMV,model_order
params,Unnamed: 1_level_1,Unnamed: 2_level_1
AMV.lag2,0.254508,2
AMV.lag1,0.456688,2
AMV.noise_var,0.011765,2


In [8]:
# Reference fit: statsmodels AutoReg on the same AMV series with the same number
# of lags and no trend/constant term (trend="n").
amv_df = obsv[["AMV"]].to_dataframe()
their_fit = AutoReg(amv_df, lags=n_lags, trend="n").fit()

# Append the residual variance to the lag coefficients so that the table has
# the same rows as the one produced by ar_model.fit.
amv_sigma2 = pd.Series(their_fit.sigma2, index=["AMV.noise_var"])
their_params = pd.concat([their_fit.params, amv_sigma2])
their_params.to_frame(name="AMV")

Unnamed: 0,AMV
AMV.L1,0.456688
AMV.L2,0.254508
AMV.noise_var,0.011765


### Check my VAR model fit

In [9]:
# Fit the project's model to both variables (AMV and NAO) at once, i.e. the
# vector-autoregressive case, with n_lags lags along "time".
my_params = ar_model.fit(obsv, n_lags=n_lags, dim="time")
my_params.to_dataframe()

Unnamed: 0_level_0,AMV,NAO,model_order
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AMV.lag2,0.259422,-9.765535,2
AMV.lag1,0.451376,-1.992555,2
NAO.lag2,0.000801,-0.017539,2
NAO.lag1,-0.000845,0.060381,2
AMV.noise_var,0.012051,-0.084668,2
NAO.noise_var,-0.084668,35.364779,2


In [10]:
# Reference fit: statsmodels VAR on the same two-variable dataset, with the
# same number of lags and no trend/constant term (trend="n").
their_fit = VAR(obsv.to_dataframe()).fit(maxlags=n_lags, trend="n")

# Prefix the row labels of the residual covariance matrix with "sigma_u." and
# stack it beneath the lag coefficients for comparison with ar_model.fit.
their_sigma_u = their_fit.sigma_u.rename(index="sigma_u.{}".format)
their_params = pd.concat([their_fit.params, their_sigma_u])
their_params

Unnamed: 0,AMV,NAO
L1.AMV,0.451376,-1.992555
L1.NAO,-0.000845,0.060381
L2.AMV,0.259422,-9.765535
L2.NAO,0.000801,-0.017539
sigma_u.AMV,0.012051,-0.084668
sigma_u.NAO,-0.084668,35.364779


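### Check that you get the same results when you duplicate data to `bystander` and `stack` dimensions
Note: in the saved output below the lag coefficients match the unstacked fit exactly, but the noise (co)variances are about 2.5% smaller (e.g. 0.011753 vs 0.012051). This is consistent with a degrees-of-freedom correction that depends on the (ten times larger) stacked sample size, rather than with a change in the fit itself.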

In [11]:
# Build a test dataset made purely of copies of the observations:
# first 10 identical copies along a new "member" dimension ...
obsv_members = xr.concat([obsv] * 10, dim="member")
# ... then 2 identical copies of that along a new "x" dimension.
obsv_stacked = xr.concat([obsv_members] * 2, dim="x")

In [12]:
# Fit to the duplicated dataset. Every copy holds the same observations, so the
# result is expected to agree with the fit to `obsv` alone.
my_params = ar_model.fit(obsv_stacked, n_lags=n_lags, dim="time")
my_params.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,AMV,NAO,model_order
x,params,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,AMV.lag2,0.259422,-9.765535,2
0,AMV.lag1,0.451376,-1.992555,2
0,NAO.lag2,0.000801,-0.017539,2
0,NAO.lag1,-0.000845,0.060381,2
0,AMV.noise_var,0.011753,-0.082575,2
0,NAO.noise_var,-0.082575,34.490375,2
1,AMV.lag2,0.259422,-9.765535,2
1,AMV.lag1,0.451376,-1.992555,2
1,NAO.lag2,0.000801,-0.017539,2
1,NAO.lag1,-0.000845,0.060381,2


### Check that you get the same results when you split `time` into two members
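Note that a little data is lost by doing this — the first `n_lags` points of the second half can no longer use the end of the first half as lagged predictors — so you won't get exactly the same answer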

In [13]:
# Cut the observed record into two consecutive halves along "time".
half_length = obsv.sizes["time"] // 2
first_half = obsv.isel(time=slice(half_length))
second_half = obsv.isel(time=slice(half_length, None))

# Swap each half's time coordinate for a plain 0..N-1 integer index so both
# halves share the same "time" labels when concatenated.
first_half = first_half.assign_coords(time=range(first_half.sizes["time"]))
second_half = second_half.assign_coords(time=range(second_half.sizes["time"]))

# Treat the two halves as two members of one dataset.
obsv_stacked = xr.concat([first_half, second_half], dim="member")

In [14]:
# Fit to the record split into two members; the parameters should be close to,
# but not identical with, those from the fit to the unsplit record.
my_params = ar_model.fit(obsv_stacked, n_lags=n_lags, dim="time")
my_params.to_dataframe()

Unnamed: 0_level_0,AMV,NAO,model_order
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AMV.lag2,0.256853,-9.726328,2
AMV.lag1,0.451882,-2.171956,2
NAO.lag2,0.000786,-0.018835,2
NAO.lag1,-0.000856,0.059289,2
AMV.noise_var,0.012218,-0.085884,2
NAO.noise_var,-0.085884,35.84808,2


### Check that the new `select_order` function gives consistent results

In [15]:
# Run order selection on the AMV series with select_order's default arguments
# and display the result as a DataFrame (it includes a `model_order` column).
ar_model.select_order(obsv[["AMV"]]).to_dataframe()

[1 2]


Unnamed: 0_level_0,AMV,model_order
params,Unnamed: 1_level_1,Unnamed: 2_level_1
AMV.lag1,0.456688,2
AMV.lag2,0.254508,2
noise_var,0.011765,2


In [16]:
# Fit with the order fixed at 2 — the model_order reported by select_order in
# the saved output above — so that the two parameter tables can be compared.
ar_model.fit(obsv[["AMV"]], n_lags=2).to_dataframe()

Unnamed: 0_level_0,AMV,model_order
params,Unnamed: 1_level_1,Unnamed: 2_level_1
AMV.lag2,0.254508,2
AMV.lag1,0.456688,2
AMV.noise_var,0.011765,2


# Now let's try fitting to some real hindcasts