In [None]:
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from tools.hybrid.misc_funs import natrun
from tools.hybrid.L96_plots import Hov_diagram
from tools.hybrid.L96_plots import timeseries_subplots
from tools.hybrid.miscobs import getHR, genobs
from tools.hybrid.L96_plots import show_observations
from tools.hybrid.misc_funs import getBc
from tools.hybrid.L96_plots import show_covariances
from tools.hybrid.var3dfile import var3d
from tools.hybrid.var4dfile import var4d
from tools.hybrid.L96_plots import add_timeseries, calculate_RMSE, compare_RMSE
from tools.hybrid.etkf16 import getlocmat
from tools.hybrid.etkf16 import etkf_l96
#from tools.rmse_spread import rmse_spread
from tools.hybrid.h4Dkf import etkf4DVar
from tools.hybrid.transmat import transmat_l96
from tools.hybrid.misc_funs import evolcov, covfamrun
from tools.hybrid.L96_plots import compare_covariances
from tools.hybrid.inc4DenV import envar

# Practical: Experiments with hybrid data assimilation methods

You will experiment with hybrid DA methods in the Lorenz 1996 model with 12 variables. This size allows for relatively easy visualisation while requiring localisation of the ensemble covariances. You will explore 2 types of hybrid methods:

 a) Those that use a hybrid background error covariance in a traditional variational minimisation framework. Here you will use 4DVar-LETKF.
 b) Those that avoid computing tangent linear and adjoint models and instead use 4-dimensional (cross-time) ensemble covariances to communicate the impact of observations to the initial time. Here, you will use SC-4DEnsemble Var (SC denotes strong constraint).

===========================================================================
# 1. Review of variational and Kalman-based methods
## 1.1 Nature run
Run the nature trajectory for the experiment with tmax = 14.

In [None]:
# Experiment configuration for the nature run.
Nx = 12  # number of model variables
tmax = 14  # length of the run, passed to natrun
dt = 0.025  # NOTE(review): not passed to natrun; presumably matches its internal time step — confirm
# natrun output: x and t are used as the space and time axes below, ut is the
# trajectory plotted as the truth, ug0 is handed to the DA routines in later cells.
x, t, ut, ug0 = natrun(Nx, tmax)
Nsteps = np.size(t)  # number of entries in the time axis
print("Nsteps", Nsteps)

# Plot time series for each model variable
ncols = 3  # Number of columns in subplots
plt.figure()
timeseries_subplots(Nx, t, ut, ncols, linewidth=1)

# Plot Hovmöller diagram (time against space)
plt.figure()
Hov_diagram(x, t, ut)

## 1.2 Generate synthetic observations


In [None]:
# Observation configuration.
period_obs = 2  # passed to genobs and to every DA routine; presumably time steps between observations — confirm
gridobs = "1010"  # observation layout: 'all', '1010', or 'landsea'
stdobs = 1  # presumably the observation-error standard deviation — confirm against getHR

# Build the observation operator H and the R-related matrices for this layout,
# then generate the synthetic observations from the truth ut (fixed seed).
Nx_obs, loc_obs, H, R, Rsq, invR = getHR(gridobs, Nx, stdobs)
tobs, yobs = genobs(dt, ut, Nsteps, Nx_obs, H, period_obs, Rsq, myseed=1)

# Plot time series for each model variable
_, axs = timeseries_subplots(Nx, t, ut, ncols, linewidth=1)
# Overlay the observations (yobs at times tobs, locations loc_obs) on those axes
show_observations(Nx_obs, tobs, yobs, loc_obs, axs, size=3)

## 1.3 Generate background error covariances

In [None]:
# Climatological background-error covariance Bc, plus the companion matrix
# Bc_sq that is handed to the variational solvers in later cells.
Bc, Bc_sq = getBc(Nx)

# Symmetric colour range centred on zero, sized by the largest-magnitude entry of Bc.
lim = np.max(np.abs(Bc))
clim = [-lim, lim]
show_covariances(
    Bc, title="Climatological B", xlab="Grid points", ylab="Grid points",
    cmap="RdBu", clim=clim, fontsize=10,
)

## 1.4 Perform DA experiments with the 3DVar and 4DVar
What can you say about the performance of the two methods?

In [None]:
# 3DVar
# ub3 / ua3 are plotted below as the 3DVar background and analysis.
ub3, ua3 = var3d(ug0, t, x, H, yobs, period_obs, gridobs, Bc_sq, invR)
# 4DVar
obsperwin = 2  # observations per assimilation window
ub4, ua4 = var4d(ug0, t, x, H, yobs, period_obs, obsperwin, gridobs, Bc_sq, invR)

# Compare backgrounds, analyses and the truth
# NOTE(review): timeseries_subplots returns its own fig; if it opens a new
# figure, this plt.figure() leaves an empty one behind — confirm.
plt.figure()
loc_nobs = np.setdiff1d(x, loc_obs)  # entries of x that are not in loc_obs
locs = [loc_obs, loc_nobs]  # passed to calculate_RMSE; also reused by later cells
lwd = 0.5
# Default matplotlib colour cycle; also reused by later cells.
prop_cycle = plt.rcParams["axes.prop_cycle"]
colors = prop_cycle.by_key()["color"]
fig, axs = timeseries_subplots(Nx, t, ut, ncols, linewidth=lwd)
add_timeseries(Nx, t, ub3, axs, linecolor=colors[1], linewidth=lwd)
add_timeseries(Nx, t, ua3, axs, linecolor=colors[2], linewidth=lwd)
add_timeseries(Nx, t, ub4, axs, linecolor=colors[3], linewidth=lwd)
add_timeseries(Nx, t, ua4, axs, linecolor=colors[4], linewidth=lwd)
# Legend entries follow the order in which the series were drawn above.
labels = ["Truth", "bgd 3DV", "ana 3DV", "bgd 4DV", "ana 4DV"]
fig.legend(labels=labels, loc="upper center", ncol=5)

# Compare RMSE
plt.figure()
datalist = (ub3, ua3, ub4, ua4)
# rmse_var is kept for the LETKF cell, which concatenates it with its own RMSE.
rmse_var = calculate_RMSE(Nsteps, ut, datalist, locs)
labels = ["bgd 3DV", "ana 3DV", "bgd 4DV", "ana 4DV"]  # same order as datalist
compare_RMSE(Nsteps, t, rmse_var, labels, colors, linewidth=lwd, lab_cols=4)

## 1.5 Generate localisation matrix in both the state space and in the mixed state/observation space
Try

<table>
 <thead>
   <tr>
     <th>loctype</th>
     <th>lambda</th>
   </tr>
 </thead>
 <tbody>
   <tr>
     <td>0</td>
     <td>0.1</td>
   </tr>
   <tr>
     <td>0</td>
     <td>2</td>
   </tr>
   <tr>
     <td>0</td>
     <td>10</td>
   </tr>
   <tr>
     <td>1</td>
     <td>0.1</td>
   </tr>
   <tr>
     <td>1</td>
     <td>2</td>
   </tr>
   <tr>
     <td>1</td>
     <td>10</td>
   </tr>
 </tbody>
</table>
Can you interpret these plots? After experimenting, let us settle for: loctype = 1 and lambda = 2.

In [None]:
# Localisation settings: half-width and taper type.
lam = 2
loctype = 1  # Gaspari-Cohn

# Localisation matrices: state/state (identity in place of H) and state/observation.
Lxx = getlocmat(Nx, Nx, np.eye(Nx), lam, loctype)
Lxy = getlocmat(Nx, Nx_obs, H, lam, loctype)

# One figure per matrix; both share the colour map and the [0, 1] colour range.
for loc_mat, loc_title, loc_xlab in (
    (Lxx, "Localization in model space", "Model space"),
    (Lxy, "Localization in model/obs space", "Observation space"),
):
    plt.figure()
    show_covariances(
        loc_mat,
        title=loc_title,
        xlab=loc_xlab,
        ylab="Model space",
        cmap="Blues",
        clim=[0, 1],
        fontsize=10,
    )
del loc_mat, loc_title, loc_xlab

## 1.6 LETKF
Perform data assimilation using the LETKF. The code has adaptive inflation implemented, so you do not have to worry about this parameter at all. What can you say about the RMSE plots?

In [None]:
M = 10  # ensemble size
# Run the LETKF with the localisation half-width lam and the state/obs
# localisation matrix Lxy from the previous cell. Ubkf is indexed later as a
# 3-D array; ubkf / uakf are plotted below as the background / analysis means.
Ubkf, ubkf, Uakf, uakf = etkf_l96(
    ug0, t, x, M, Nx_obs, H, R, yobs, period_obs, lam, Lxy
)

# Compare the ensemble mean of background and analysis and the truth
plt.figure()
lwd = 1
t_range = t[0:100]  # only the first 100 time entries are plotted
fig, axs = timeseries_subplots(
    Nx, t_range, ut[0 : len(t_range), :], ncols, linewidth=lwd
)
add_timeseries(
    Nx, t_range, ubkf[0 : len(t_range), :], axs, linecolor=colors[1], linewidth=lwd
)
add_timeseries(
    Nx, t_range, uakf[0 : len(t_range), :], axs, linecolor=colors[2], linewidth=lwd
)
labels = ["Truth", "bgd mean", "ana mean"]
fig.legend(labels=labels, loc="upper center", ncol=3)

# Compute the RMSE
plt.figure()
datalist = (ubkf, uakf)
rmse_letkf = calculate_RMSE(Nsteps, ut, datalist, locs)
# Stack the 3DVar/4DVar RMSE (from the earlier cell) and the LETKF RMSE along
# axis 0 so that all six series are drawn together; labels follow that order.
rmse_all = np.concatenate((rmse_var, rmse_letkf), axis=0)
labels = ["bgd 3DV", "ana 3DV", "bgd 4DV", "ana 4DV", "bgd LETKF", "ana LETKF"]
compare_RMSE(Nsteps, t, rmse_all, labels, colors, linewidth=lwd, lab_cols=3)

## 1.7 Compare the climatological B with some sample B's
One of the purposes of hybrid DA is to combine covariance information from a static yet full-rank source (the climatological Bc) used in the VAR methods, with the flow-dependent yet low-rank information coming from a sample of trajectories (the Pb obtained by ensemble methods). 
Compare the climatological Bc with that obtained by the LETKF (computed from the background ensemble) at different times. The raw and localised versions are plotted for different time instants. In this case you can modify the variable nsample, which is the number of instants at which you want to display the Pb(t). How would this change if you increase or decrease the number of ensemble members?


In [None]:
nsample = 3
# Time indices period_obs, 2*period_obs, ..., nsample*period_obs.
ind = np.arange(period_obs, (nsample + 1) * period_obs, period_obs)
Pbs_kf = np.empty((Nx, Nx, nsample))
LPbs_kf = np.empty((Nx, Nx, nsample))
for j in range(nsample):
    # Sample covariance of the background ensemble at this time index,
    # stored raw and after the element-wise product with Lxx.
    aux = np.cov(np.squeeze(Ubkf[ind[j], :, :]), ddof=1)
    Pbs_kf[:, :, j] = aux
    LPbs_kf[:, :, j] = Lxx * aux
del j

plt.figure()
lim = 1
my_cmap = matplotlib.colormaps["BrBG_r"]
# Display options shared by every panel of the figure.
cov_style = dict(interpolation="nearest", cmap=my_cmap, vmin=-lim, vmax=lim)

# Left column, first row only: the climatological covariance.
plt.subplot(nsample, 3, 1)
plt.imshow(np.array(Bc), **cov_style)
plt.title("Bc")
plt.colorbar()

# Middle and right columns: raw and localised sample covariances, one row per time.
for j in range(nsample):
    for col, stack, heading in ((2, Pbs_kf, "Pb"), (3, LPbs_kf, "Schur(L,Pb)")):
        plt.subplot(nsample, 3, col + (j * 3))
        plt.imshow(np.array(stack[:, :, j]), **cov_style)
        if j == 0:
            plt.title(heading)
        plt.colorbar()
del j, col, stack, heading, cov_style
plt.subplots_adjust(
    top=0.955, bottom=0.08, left=0.11, right=0.9, hspace=0.465, wspace=0.345
)

# 2. 4DVar-LETKF
You can modify the following parameters:
- loch: 0 if you do not want localisation in the ensemble part of the covariance, 1 if you do.
- M: ensemble size.
- obsperwin: observations in the assimilation window.
- beta: the coefficients for the static (first number) and the dynamic (second number) part of the covariance matrix.

In [None]:
# Rebuild the localisation matrices with a new half-width; this overwrites the
# lam, Lxx and Lxy set in the earlier localisation cell.
lam = 1.5
loctype = 1
Lxx = getlocmat(Nx, Nx, np.eye(Nx), lam, loctype)
Lxy = getlocmat(Nx, Nx_obs, H, lam, loctype)

loch = 1  # 1: localise the ensemble part of the covariance, 0: do not
M = 10  # ensemble size
obsperwin = 2  # observations per assimilation window
beta = [0.8, 0.4]  # coefficients of the static (first) and dynamic (second) covariance parts

# Run the hybrid 4DVar-LETKF; ubh / uah are plotted below as background / analysis.
ubh, uah, Uaenh, uaenh = etkf4DVar(
    ug0,
    t,
    x,
    R,
    invR,
    H,
    yobs,
    period_obs,
    obsperwin,
    gridobs,
    Nx_obs,
    Bc,
    Bc_sq,
    lam,
    Lxx,
    Lxy,
    loch,
    M,
    beta,
)
lwd = 1
t_range = t[0:100]  # only the first 100 time entries are plotted
plt.figure()
fig, axs = timeseries_subplots(
    Nx, t_range, ut[0 : len(t_range), :], ncols, linewidth=lwd
)
add_timeseries(
    Nx, t_range, ubh[0 : len(t_range), :], axs, linecolor=colors[1], linewidth=lwd
)  # 4DVar-LETKF background
add_timeseries(
    Nx, t_range, uah[0 : len(t_range), :], axs, linecolor=colors[2], linewidth=lwd
)  # 4DVar-LETKF analysis
labels = ["Truth", "bgd hybrid", "ana hybrid"]
fig.legend(labels=labels, loc="upper center", ncol=3)

# Compare the RMSE
plt.figure()
# NOTE(review): ubh / uah are sliced as 3-D arrays here but passed to
# add_timeseries above with 2-D style indexing — confirm the rank returned by etkf4DVar.
datalist = (ubh[:, :, 0], uah[:, :, 0])
rmse_hybrid = calculate_RMSE(Nsteps, ut, datalist, locs)
labels = ["bgd 4DVar-LETKF", "ana 4DVar-LETKF"]
compare_RMSE(Nsteps, t, rmse_hybrid, labels, colors, linewidth=lwd, lab_cols=2)

# 3. Exploring 4D background covariances
Now we move into more complicated hybrid DA methods. We will use SC-4DEnVar. Remember that this method avoids using the tangent linear and adjoint models by computing 4-dimensional ensemble covariances. Let us start by comparing the error evolution coming from 2 sources: (a) evolving Bc using tangent linear and adjoint models, and (b) evolving an ensemble run with different initial conditions (sampled from a normal distribution centered on the truth with covariance Bc). You can vary the parameters:
- M: ensemble size.
- lags: number of time steps for which you want to compute the covariance.

This cell will plot three rows of covariances. Can you tell what is being plotted in each row?

In [None]:
uref, tmat, seed = transmat_l96(
    ug0, t, x
)  # compute the TL matrix linearised about the background trajectory
lags = 5  # number of time steps for which the covariances are computed
# Evolve the climatological Bc with the tangent-linear matrices:
# Bt feeds the "3D" comparison below, B0t the "4D" (cross-time) one.
Bt, B0t = evolcov(Bc, tmat, Nx, lags)

M = 10  # number of ensemble members
# Ensemble counterparts of Bt and B0t, obtained from a run of M members.
Ufam, Pbt, Pb0t = covfamrun(ug0, Nx, lags, Bc_sq, M)

lim = 3  # colour-scale limit passed to compare_covariances

# Lxx here is the localisation matrix left in scope by the preceding cell.
plt.figure()
compare_covariances(Bt, Pbt, Lxx, lags, lim, "RdBu", "")  # 3D covariances
plt.figure()
compare_covariances(B0t, Pb0t, Lxx, lags, lim, "RdBu", "0,")  # 4D covariances

# 4. 4DEnVar
The final section runs SC-4DEnVar and computes the analysis RMSE of this method with respect to the truth. In this case you have to generate the localisation matrix (with the same options as before), and you can vary the following variables:
- lam: the localisation half width.
- M: the number of ensemble members.
- locenvar: whether you want localisation or not.

We use a fixed (in time) localisation. Remember this can be problematic when localising cross-time covariances in long assimilation windows. Can you think of a way to test this?


In [None]:
# SC-4DEnVar experiment: configure, run, then compare against the truth.
obsperwin = 2  # observations per assimilation window
lam = 1.5  # localisation half-width
loctype = 1
# Localisation matrices: state/state (identity in place of H) and state/observation.
Lxx = getlocmat(Nx, Nx, np.eye(Nx), lam, loctype)
Lxy = getlocmat(Nx, Nx_obs, H, lam, loctype)

M = 10  # number of ensemble members
locenvar = 1  # whether to localise (1) or not (0)
# NOTE(review): envar is unpacked analysis-first (ua, ub, ...) here, whereas
# etkf4DVar in the 4DVar-LETKF cell is unpacked background-first — confirm
# against envar's actual return order.
ua4Den, ub4Den, Uaen4Den, uaen4Den, UFr4Den = envar(
    ug0,
    t,
    x,
    R,
    invR,
    H,
    yobs,
    period_obs,
    obsperwin,
    gridobs,
    Nx_obs,
    Bc_sq,
    lam,
    Lxx,
    Lxy,
    locenvar,
    M,
)

# Time series of truth, background and analysis for every model variable.
plt.figure()
lwd = 1
fig, axs = timeseries_subplots(Nx, t, ut, ncols, linewidth=lwd)
add_timeseries(Nx, t, ub4Den, axs, linecolor=colors[1], linewidth=lwd)
add_timeseries(Nx, t, ua4Den, axs, linecolor=colors[2], linewidth=lwd)
labels = ["Truth", "bgd 4DEnVar", "ana 4DEnVar"]
fig.legend(labels=labels, loc="upper center", ncol=3)

# Compare the RMSE
plt.figure()
datalist = (ub4Den[:, :, 0], ua4Den[:, :, 0])
rmse_4denvar = calculate_RMSE(Nsteps, ut, datalist, locs)
# Fix: these curves are 4DEnVar results; the labels previously read
# "4DVar-LETKF", copied over from the hybrid cell.
labels = ["bgd 4DEnVar", "ana 4DEnVar"]
compare_RMSE(Nsteps, t, rmse_4denvar, labels, colors, linewidth=lwd, lab_cols=2)