# Sample code for log-likelihood calibration


## About this document


In [None]:
from swift2.doc_helper import pkg_versions_info

# Print the text returned by pkg_versions_info for the given caption.
# NOTE(review): presumably a summary of the package versions in use -- confirm against swift2.doc_helper.
print(pkg_versions_info("This document was generated from a jupyter notebook"))

## Setting up a calibration on daily data

We will use some sample data (identified as `MMH`) included in the package.

In [None]:
import numpy as np
import pandas as pd
import xarray as xr

In [None]:
from cinterop.timeseries import as_timestamp
from swift2.doc_helper import get_free_params, sample_series, set_loglik_param_keys
from swift2.parameteriser import (
    concatenate_parameterisers,
    create_parameter_sampler,
    create_parameteriser,
    create_sce_termination_wila,
    extract_optimisation_log,
    get_default_sce_parameters,
    parameteriser_as_dataframe,
    sort_by_score,
)
from swift2.simulation import create_subarea
from swift2.utils import c, mk_full_data_id, paste0
from swift2.vis import OptimisationPlots

# Time window of interest, shared by every sample series.
s, e = (as_timestamp(day) for day in ('1990-01-01', '2005-12-31'))
period = slice(s, e)

# Load the three 'MMH' sample series and restrict each one to the window.
rain, evap, flow = (
    sample_series('MMH', variable)[period]
    for variable in ('rain', 'evap', 'flow')
)

In [None]:
# Summary statistics of the rainfall series, as a quick sanity check of the input.
rain.describe()

In [None]:
# Summary statistics of the observed flow series, as a quick sanity check of the input.
flow.describe()

We need to adjust the observed flow: the SWIFTv1 legacy missing-value code is `-99`, so negative values are replaced with NaN.

In [None]:
# Mark every negative flow value as missing (NaN). This removes the -99
# missing-value code, and any other negative value, from the observations.
flow[flow < 0] = np.nan

In [None]:
# Display the flow series to inspect it.
flow

## Catchment setup

Let's create a single catchment setup, using daily data. We need to specify the simulation time step to be consistent with the daily input data.

In [None]:
from cinterop.timeseries import xr_ts_end, xr_ts_start

# Single-subarea GR4J model; the second argument is passed through to create_subarea.
ms = create_subarea('GR4J', 1.0)

# Run the simulation over the full extent of the rainfall series, at a daily step.
s, e = xr_ts_start(rain), xr_ts_end(rain)
ms.set_simulation_span(s, e)
ms.set_simulation_time_step('daily')

Assign input time series

In [None]:
# The model has a single subarea; take its name.
sa_name = ms.get_subarea_names()[0]

# Feed rainfall into input "P" and evaporation into input "E" of that subarea.
for series, var_id in ((rain, "P"), (evap, "E")):
    ms.play_subarea_input(series, sa_name, var_id)

Model variable identifiers are hierarchical, with separators '.' and '|' supported. The "dot" notation should now be preferred, as some R functions producing data frames may change the variable names and replace some characters with '.'.

In [None]:
# Identifier of the subarea, and the dotted prefix used to build its variable identifiers.
sa_id = paste0("subarea.", sa_name)
root_id = paste0(sa_id, ".")

# List the variable identifiers available under the subarea.
subarea_variable_ids = ms.get_variable_ids(sa_id)
print(subarea_variable_ids)

In [None]:
# Variables to record during the simulation, prefixed with the subarea root identifier.
state_suffixes = c('runoff', 'S', 'R', 'Perc')
gr4_state_names = paste0(root_id, state_suffixes)

for state_id in gr4_state_names:
    ms.record_state(state_id)

Let's check that one simulation runs fine, before we build a calibration definition.

In [None]:
# Run the model once, then retrieve the third recorded variable
# (the 'R' entry of gr4_state_names) for inspection.
ms.exec_simulation()
sState = ms.get_recorded(gr4_state_names[2])

In [None]:
# Plot the retrieved series on a wide figure.
sState.plot(figsize=(10,4))

Let's build the objective calculator that will guide the calibration process:

In [None]:
# NOTE(review): `w` is later passed to create_objective just before `e`; presumably the
# start of the period over which the objective is evaluated (leaving a warm-up) -- confirm.
w = pd.Timestamp("1992-01-01")

In [None]:
# Identifier of the modelled runoff variable.
# NOTE(review): hard-coded; assumes the subarea is named 'Subarea' (compare with sa_name).
runoff_depth_varname = 'subarea.Subarea.runoff'
mod_runoff = ms.get_recorded(runoff_depth_varname)
# Log-likelihood objective on the runoff variable against the observed flow.
# NOTE(review): `w` and `e` presumably bound the evaluation period -- confirm against create_objective.
objective = ms.create_objective(runoff_depth_varname, flow, 'log-likelihood', w, e)

In [None]:
# Plot the modelled runoff retrieved from the model.
mod_runoff.plot()

## Parameterisation

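Define the feasible parameter space, using a generic parameter set for the model parameters. This is 'wrapped' by a log-likelihood parameter set with the extra parameters used in the log-likelihood calculation, which exposes the four GR4J parameters and the four likelihood parameters (`a`, `b`, `m`, `s`) as 8 independent degrees of freedom to the optimiser. Note that in the setup below `m` has identical lower and upper bounds, and `maxobs`, `ct` and `censopt` are likewise held at fixed values.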

In [None]:
# Start from the free parameters of GR4J, then override the initial values and the
# feasible bounds. Columns are assigned with bracket notation: attribute-style
# assignment (`df.Value = ...`) only updates a column that already exists and
# otherwise silently creates a plain Python attribute instead of a column.
pspec_gr4j = get_free_params('GR4J')
pspec_gr4j["Value"] = c(542.1981111, -0.4127542, 7.7403390, 1.2388548)
pspec_gr4j["Min"] = c(1, -30, 1, 1)
pspec_gr4j["Max"] = c(3000, 30, 1000, 240)
# Qualify the parameter names with the subarea prefix so they address the model's variables.
pspec_gr4j["Name"] = paste0(root_id, pspec_gr4j["Name"])


# Constants used by the log-likelihood: the largest observed flow, a censoring
# threshold set to one hundredth of it, and the censoring option.
maxobs = np.max(flow)
censor_threshold = maxobs / 100  # TBC
censopt = 0.0

# Parameteriser for the GR4J model parameters.
p = create_parameteriser(type='Generic', specs=pspec_gr4j)
set_loglik_param_keys(a='a', b='b', m='m', s='s', ct="ct", censopt='censopt')

# Extra entries for the likelihood. Entries whose Min and Max are equal are
# effectively held fixed at that value.
loglik_specs = pd.DataFrame(
    {
        "Name": c('b', 'm', 's', 'a', 'maxobs', 'ct', 'censopt'),
        "Min": c(-30, 0, -10, -20, maxobs, censor_threshold, censopt),
        "Max": c(5, 0, 10, 0, maxobs, censor_threshold, censopt),
        "Value": c(-7, 0, 0, -10, maxobs, censor_threshold, censopt),
    }
)
loglik = create_parameteriser(type='no apply')
loglik.add_to_hypercube(loglik_specs)

# Combine the model and likelihood parameterisers and show the resulting table.
p = concatenate_parameterisers(p, loglik)
p.as_dataframe()

Check that the objective calculator works, at least with the default values in the feasible parameter space:

In [None]:
# Evaluate the objective for the parameteriser `p` and print the resulting score.
score = objective.get_score(p)
print(score)

In [None]:
# Retrieve the recorded runoff series again from the model.
mod_runoff = ms.get_recorded(runoff_depth_varname)

In [None]:
from swift2.vis import plot_two_series

In [None]:
# Overlay observed flow and modelled runoff between 2000-01-01 and 2002-12-31.
plot_two_series(
    flow,
    mod_runoff,
    ylab="obs/mod runoff",
    start_time="2000-01-01",
    end_time="2002-12-31",
    names=['observed', 'modelled'],
)

## Calibration

Build the optimiser definition and instrument it with a logger.

In [None]:
# Alternative termination criteria, kept for reference:
# term = getMaxRuntimeTermination(max_hours = 0.3/60)  # ~20 second appears enough with SWIFT binaries in Release mode
# term = getMarginalTermination(tolerance = 1e-06, cutoff_no_improvement = 10, max_hours = 0.3/60) 

# The termination arguments are passed as strings.
# NOTE(review): presumably (tolerance, maximum wall-clock hours) -- confirm against create_sce_termination_wila.
sce_term_args = c('0.005', str(1/60))
term = create_sce_termination_wila('relative standard deviation', sce_term_args)

# Default SCE settings and a 'urs' sampler over the parameter space `p`.
sce_params = get_default_sce_parameters()
urs = create_parameter_sampler(0, p, 'urs')

# Assemble the optimiser from the objective and attach a calibration logger.
optimiser = objective.create_sce_optim_swift(term, sce_params, urs)
calib_logger = optimiser.set_calibration_logger('')

In [None]:
%%time 
# Run the calibration; the cell magic above reports how long the cell takes.
calib_results = optimiser.execute_optimisation()

In [None]:
# Pull the optimisation log out of the optimiser, naming the fitness column 'Log-likelihood'.
opt_log = extract_optimisation_log(optimiser, fitness_name='Log-likelihood')

# Keep only the log entries whose message matches one of these patterns.
geom_ops_pattern = 'Initial.*|Reflec.*|Contrac.*|Add.*'
geom_ops = opt_log.subset_by_message(pattern=geom_ops_pattern)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Objective limits used by the parameter plots. The lower limit is the median
# of the logged scores, not their minimum.
loglik_values = geom_ops._data['Log-likelihood'].values
ll_max = max(loglik_values)
ll_min = np.median(loglik_values)

## Parameter plots

In [None]:
# One parameter-evolution plot per entry of the parameteriser, all sharing the
# same objective limits and figure size.
p_var_ids = p.as_dataframe().Name.values
v = OptimisationPlots(geom_ops)
for param_id in p_var_ids:
    g = v.parameter_evolution(param_id, obj_lims=[ll_min, ll_max])
    current_figure = plt.gcf()
    current_figure.set_size_inches(10, 8)

Finally, get a visual of the runoff time series with the best known parameter set (the penultimate entry in the data frame with the log of the calibration process).

In [None]:
# Sort the calibration results by log-likelihood and show the first rows,
# transposed so that each result is a column.
sortedResults = sort_by_score(calib_results, 'Log-likelihood')
first_rows = sortedResults.as_dataframe().head()
first_rows.T

In [None]:
# Take the parameteriser of the best 'Log-likelihood' score, apply it to the
# model, and re-run the simulation to get the corresponding runoff.
best_score = calib_results.get_best_score('Log-likelihood')
best_pset = best_score.parameteriser
best_pset.apply_sys_config(ms)

ms.exec_simulation()
mod_runoff = ms.get_recorded(runoff_depth_varname)

In [None]:
# Display the modelled runoff series to inspect it.
mod_runoff

In [None]:
# Plot the modelled runoff over the final year of the simulation span.
last_year = slice(e - pd.offsets.DateOffset(years=1), e)
mod_runoff.squeeze(drop=True).sel(time=last_year).plot(figsize=(16,9))

In [None]:
# Overlay observed flow and the re-simulated runoff between 2000-01-01 and 2002-12-31.
plot_two_series(
    flow,
    mod_runoff,
    ylab="obs/mod runoff",
    start_time="2000-01-01",
    end_time="2002-12-31",
    names=['observed', 'modelled'],
)