---
title: "Regime changes 2.0"
author: "Francisco Bischoff"
date: "on `r format(Sys.time(), '%B %d, %Y')`"
output:
  bookdown::html_document2:
    base_format: workflowr::wflow_html
    toc: true
    fig_caption: yes
    number_sections: yes
bibliography: [../papers/references.bib]
link-citations: true
csl: ../thesis/csl/ama.csl
css: style.css
editor_options:
  chunk_output_type: console
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = FALSE, fig.align = "center", autodep = TRUE,
fig.height = 5, fig.width = 10,
tidy = "styler",
tidy.opts = list(strict = TRUE)
)
if (knitr::is_latex_output()) {
knitr::opts_chunk$set(dev = "pdf")
} else {
knitr::opts_chunk$set(dev = "svg")
}
rlang::check_installed(c(
"here", "glue", "visNetwork", "tibble", "kableExtra", "gridExtra",
"ggplot2", "dplyr", "dbarts", "vip", "pdp", "patchwork", "fastshap",
"tune"
))
options(dplyr.summarise.inform = FALSE)
library(here)
library(glue)
library(visNetwork)
library(tibble)
library(kableExtra)
library(patchwork)
library(ggplot2)
my_graphics <- function(image_name, base_path = here::here("docs", "figure")) {
file_path <- file.path(base_path, image_name)
if (knitr::is_latex_output()) {
if (file.exists(glue::glue("{file_path}.pdf"))) {
file_path <- glue::glue("{file_path}.pdf")
} else if (file.exists(glue::glue("{file_path}.png"))) {
file_path <- glue::glue("{file_path}.png")
} else {
file_path <- glue::glue("{file_path}.jpg")
}
} else {
if (file.exists(glue::glue("{file_path}.svg"))) {
file_path <- glue::glue("{file_path}.svg")
} else if (file.exists(glue::glue("{file_path}.png"))) {
file_path <- glue::glue("{file_path}.png")
} else {
file_path <- glue::glue("{file_path}.jpg")
}
}
knitr::include_graphics(file_path)
}
my_plot_html <- function(html, options) {
structure(paste0(
"<div class=\"figure\" style=\"text-align: ",
options$fig.align, "\">", html, "<p class=\"caption\">(#",
options$fig.lp, options$label, ")", options$fig.cap, "</p></div>"
),
class = "knit_asis"
)
}
my_kable <- function(title, label, content) {
res <- glue(r"(<br><table class="tg"><caption>)", "(\\#tab:{label}) {title}", r"(</caption>{content}</table>)")
out <- structure(res, format = "html", class = "knitr_kable")
attr(out, "format") <- "html"
out
}
surf_plot <- function() {
# library(rsm)
fit <- lm(mean ~ poly(window_size, mp_threshold, degree = 5), data = tree_data)
persp(fit, mp_threshold ~ window_size, zlab = "mean", zlim = c(0, 30))
}
lst_to_df <- function(lst, keep_attributes = TRUE) {
new_df <- dplyr::bind_rows(lst)
if (keep_attributes) {
nc <- nrow(new_df)
attributes(new_df) <- attributes(lst[[1]])
attr(new_df, "row.names") <- seq.int(1, nc)
}
new_df$tar_group <- NULL
return(new_df)
}
train_models <- function(data, parallel = FALSE, v = 10, rep = 3, grid = 30, train = NULL, test = NULL) {
if (is.null(train) && is.null(test)) {
set.seed(616)
initial_sampling <- rsample::initial_split(data, prop = 3 / 4)
training_split <- rsample::training(initial_sampling)
testing_split <- rsample::testing(initial_sampling)
} else {
training_split <- train
testing_split <- test
}
set.seed(616)
folds <- rsample::vfold_cv(training_split, v = v, rep = rep)
model_spec <- parsnip::bart(trees = parsnip::tune()) %>%
parsnip::set_mode("regression") %>%
parsnip::set_engine("dbarts")
model_set <- hardhat::extract_parameter_set_dials(model_spec)
wflw <- workflows::workflow() %>%
workflows::add_model(model_spec) %>%
workflows::add_formula(mean ~ .)
if (parallel) {
doParallel::registerDoParallel(cores = parallelly::availableCores())
}
set.seed(2022)
tune_search <- wflw %>%
tune::tune_grid(
resamples = folds,
param_info = model_set,
grid = grid,
metrics = yardstick::metric_set(yardstick::rmse, yardstick::rsq),
control = tune::control_grid(
verbose = TRUE,
allow_par = parallel,
save_workflow = FALSE,
save_pred = TRUE,
parallel_over = "resamples"
)
)
# Selects the numerically best model by RMSE. Alternatively, the "one-standard-error rule"
# (Breiman et al., 1984), which picks the simplest model within one standard error of the
# numerically optimal result, could be used via tune::select_by_one_std_err().
tune_best <- tune_search %>% tune::select_best(metric = "rmse") # tune::select_by_one_std_err(trees, metric = "rsq")
final_flow <- wflw %>% tune::finalize_workflow(tune_best)
if (parallel) {
doParallel::stopImplicitCluster()
}
return(list(model = final_flow, training_data = training_split, testing_data = testing_split))
}
check_interactions <- function(model, train_data, features, parallel = FALSE) {
if (parallel) {
doParallel::registerDoParallel(cores = parallelly::availableCores())
}
# Quantify relative interaction strength
set.seed(2022)
interact <- suppressWarnings(vip::vint(model$fit$fit,
type = "regression", parallel = parallel,
feature_names = features,
train = train_data
))
if (parallel) {
doParallel::stopImplicitCluster()
}
return(interact)
}
shap_explain <- function(model, train_data, test_data, features, nsim = 20, parallel = FALSE) {
if (parallel) {
doParallel::registerDoParallel(cores = parallelly::availableCores())
}
set.seed(2022)
shap <- fastshap::explain(model,
feature_names = features,
X = data.matrix(train_data), nsim = nsim,
pred_wrapper = function(object, newdata) {
pred <- predict(object, newdata)
pred$.pred
}, adjust = TRUE,
newdata = data.matrix(test_data),
.parallel = parallel
)
if (parallel) {
doParallel::stopImplicitCluster()
}
return(shap)
}
check_importance <- function(model, train_data, test_data, features, type = c("firm", "permute", "shap"), nsim = 20, parallel = FALSE) {
type <- match.arg(type)
if (parallel) {
doParallel::registerDoParallel(cores = parallelly::availableCores())
}
importances <- NULL
set.seed(2022)
if (type == "firm") {
importances <- vip::vip(
object = model, # fitted model
method = "firm",
feature_names = features, # names of features
pred.fun = function(object, newdata) {
pred <- predict(object, newdata)
return(pred$.pred)
},
type = "regression",
parallel = parallel,
ice = TRUE,
train = train_data,
mapping = aes_string(fill = "Variable"),
aesthetics = list(color = "grey35", size = 0.8)
)
} else if (type == "permute") {
importances <- vip::vip(
object = model, # fitted model
method = "permute",
target = "mean",
feature_names = features, # names of features
type = "ratio",
pred_wrapper = function(object, newdata) {
pred <- predict(object, newdata)
pred$.pred
},
nsim = nsim,
metric = "rmse",
parallel = parallel,
keep = TRUE,
geom = "boxplot",
train = train_data,
mapping = aes_string(fill = "Variable"),
aesthetics = list(color = "grey35", size = 0.5)
)
importances$layers[[1]]$data <- importances$layers[[1]]$data %>%
dplyr::filter(!grepl("int_.*", Variable)) # nolint
} else if (type == "shap") {
importances <- vip::vip(
object = model, # fitted model
method = "shap",
feature_names = features, # names of features
pred_wrapper = function(object, newdata) {
pred <- predict(object, newdata)
pred$.pred
},
nsim = nsim,
train = as.data.frame(train_data),
newdata = as.data.frame(test_data),
.parallel = parallel,
mapping = aes_string(fill = "Variable"),
aesthetics = list(color = "grey35", size = 0.8)
)
}
importances$data <- importances$data %>%
dplyr::filter(!grepl("int_.*", Variable)) # nolint
if (parallel) {
doParallel::stopImplicitCluster()
}
return(importances)
}
tkplot <- function(object, interactive = FALSE, res = 50) {
ecg <- read_ecg_with_atr(here::here("inst/extdata/afib_regimes", object$record), resample_from = 200, resample_to = res)
value <- ecg[[1]]$II
prop <- 250 / res
mask <- seq.int(50, 100)
value[1:5] <- median(value[mask])
value[(length(value) - 5):length(value)] <- median(value[mask])
time <- seq(1, floor(length(value) * prop), length.out = length(value))
data <- tibble::tibble(time = time, value = value)
min_data <- min(data$value)
max_data <- max(data$value)
truth <- clean_truth(attr(ecg[[1]], "regimes"), length(ecg[[1]]$II)) # object$truth[[1]]
preds <- object$pred[[1]]
title <- glue::glue(
"Recording: {object$record} ",
"#truth: {length(truth)}, ",
"#preds: {length(preds)}, ",
"length: {floor(length(value)*prop)} ",
"FLOSS Score: {round(object$score, 3)}"
)
subtitle <- glue::glue(
"Parameters: ",
"MP window: {object$window_size}, ",
"MP threshold: {object$mp_threshold}, ",
"Time constraint: {object$time_constraint}, ",
"Regime threshold: {object$regime_threshold}"
)
plot <- data %>%
timetk::plot_time_series(
time, value,
.title = glue::glue(title, "<br><sup>{subtitle}</sup>"),
.interactive = interactive,
.smooth = FALSE,
.line_alpha = 0.3,
.line_size = 0.2,
.plotly_slider = interactive
)
if (interactive) {
plot <- plot %>%
plotly::add_segments(
x = preds, xend = preds, y = min_data,
yend = max_data * 1.1,
line = list(width = 2.5, color = "#0108c77f"),
name = "Predicted"
) %>%
plotly::add_segments(
x = truth, xend = truth, y = min_data,
yend = max_data,
line = list(width = 2.5, color = "#ff00007f"),
name = "Truth"
)
} else {
plot <- plot +
ggplot2::geom_segment(
data = tibble::tibble(pre = preds),
aes(
x = pre, xend = pre,
y = min_data, yend = max_data * 1.1
), size = 1, color = "#0108c77f"
) +
ggplot2::geom_segment(
data = tibble::tibble(tru = truth),
aes(
x = tru, xend = tru,
y = min_data, yend = max_data
), size = 1, color = "#ff00007f"
) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 10)
) +
ggplot2::labs(title = title, subtitle = subtitle, y = ggplot2::element_blank())
}
plot
}
pbFinished <- function(msg) {
RPushbullet::pbPost("note", "Alert", msg)
}
source(here::here("scripts", "common", "read_ecg.R"))
source(here::here("scripts", "common", "score_floss.R"))
```
```{r cached, echo=FALSE, cache=FALSE}
network_1 <- readRDS(here::here("output", "regime_network.rds"))
outputs_1 <- readRDS(here::here("output", "regime_outputs.rds"))
net_1 <- network_1 %>%
visNetwork::visPhysics(hierarchicalRepulsion = list(
springLength = 1,
avoidOverlap = 0.5,
nodeDistance = 120
))
fitted1_1 <- outputs_1$fitted_models[[1]]
fitted2_1 <- outputs_1$fitted_models[[2]]
predictors_names_1 <- c("time_constraint", "mp_threshold", "window_size", "regime_threshold")
predictors_names <- c("time_constraint", "regime_threshold", "mp_threshold", "window_size", "regime_landmark")
outcome_name <- "mean"
all_fitted_1 <- lst_to_df(outputs_1$fitted_models)
all_scores_1 <- all_fitted_1 %>%
# dplyr::slice_head(n = 10) %>%
tidyr::unnest(.predictions) %>%
## mp_threshold of 1 and time_constraint of 750 are unrealistic, so we filter them out
dplyr::filter(.data[[predictors_names_1[2]]] <= 0.9, .data[[predictors_names_1[1]]] >= 800) %>%
dplyr::select(
id, rep, .sizes, .id,
all_of(predictors_names_1),
.config, .pred, truth
) %>%
dplyr::rename(fold = id, size = .sizes, record = .id, model = .config, pred = .pred) %>%
dplyr::distinct(rep, record, across(all_of(predictors_names_1)), .keep_all = TRUE) %>%
dplyr::mutate(truth = clean_truth(truth, size), pred = clean_pred(pred)) %>%
dplyr::mutate(score = score_regimes(truth, pred, 0))
holdout_scores_1 <- outputs_1$final_evaluation %>%
dplyr::select(
all_of(predictors_names_1),
.estimate
) %>%
dplyr::rename(score = .estimate) %>%
dplyr::arrange(score) %>%
dplyr::mutate(score = round(score, 3)) %>%
dplyr::mutate(`#` = dplyr::row_number(), .before = 1)
# this is a join of the all_scores_1 with the results using the landmark
all_scores <- readRDS(here::here("output/regime_outputs_lmk.rds"))
```
# Regime changes optimization
In this article, we will interchangeably use the words _parameter_, _variable_, and _feature_.
## Current pipeline
```{r thepipeline, out.width="100%", fig.cap="FLOSS pipeline."}
visNetwork::visInteraction(net_1, hover = TRUE, multiselect = TRUE, tooltipDelay = 100)
```
## Tuning process
As we have seen previously, the FLOSS algorithm is built on top of the Matrix Profile (MP). Thus, we have proposed several parameters that may or may not impact the FLOSS prediction performance.
The variables for building the MP are:
- **`mp_threshold`**: the minimum similarity value to be considered for 1-NN.
- **`time_constraint`**: the maximum distance to look for the nearest neighbor.
- **`window_size`**: the parameter that is always required to build an MP.
In addition, the FLOSS algorithm itself has a parameter that needs tuning to optimize the prediction:
- **`regime_threshold`**: the threshold below which a regime change is considered.
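To make the role of each parameter concrete, the sketch below shows where each one enters the pipeline. Note that `compute_mp()` and `extract_regimes()` are purely hypothetical placeholders used for illustration, not the project's actual functions.
```{r params-sketch, eval=FALSE, echo=TRUE}
# Purely illustrative: `compute_mp()` and `extract_regimes()` are hypothetical
# placeholders, not the project's functions.
detect_regimes <- function(ecg, window_size, mp_threshold, time_constraint, regime_threshold) {
  # the first three parameters shape the Matrix Profile itself
  mp <- compute_mp(ecg,
    window_size = window_size, # subsequence length used to build the MP
    mp_threshold = mp_threshold, # minimum similarity accepted for the 1-NN
    time_constraint = time_constraint # maximum temporal distance allowed for the 1-NN
  )
  # the last parameter acts only on the FLOSS output (the corrected arc counts)
  extract_regimes(mp, regime_threshold = regime_threshold)
}
```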
Using the `tidymodels` framework, we performed a basic grid search on all these parameters, followed by a Bayesian search to fine-tune them (a minimal sketch of the resampling setup follows the list below).
The workflow is as follows:
- From a total of 229 records, 171 records were selected for tuning, and 58 records were held out.
- On these 171 records, 5-fold cross-validation was performed twice. This is where the grid search was carried out (Figs. \@ref(fig:marginalplot) and \@ref(fig:performanceplot)).
- The ten best models from the cross-validation (five from each repetition) were then evaluated on the hold-out set (Table \@ref(tab:holdout)).
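The sketch below outlines this resampling setup, assuming a `records` data set and a tuning workflow `wflw` analogous to the one used in this project; the grid size and the number of Bayesian iterations shown are illustrative only.
```{r tuning-sketch, eval=FALSE, echo=TRUE}
# Hedged sketch of the record-level resampling scheme described above;
# `records` and the FLOSS-scoring workflow `wflw` are assumed, not shown here.
set.seed(616)
split <- rsample::initial_split(records, prop = 171 / 229) # 171 tuning / 58 hold-out records
folds <- rsample::vfold_cv(rsample::training(split), v = 5, repeats = 2)
grid_res <- tune::tune_grid(wflw, resamples = folds, grid = 20) # initial grid search
bayes_res <- tune::tune_bayes(wflw, resamples = folds, initial = grid_res, iter = 15) # fine-tuning
```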
Fig. \@ref(fig:marginalplot) shows the performance achieved during the cross-validation for each parameter individually. The plot shows the default performance metric (`floss_error_macro`) and a second version (`floss_error_micro`) in which the error is computed globally, making it less sensitive to errors on individual records.
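As a rough illustration of the difference between the two aggregations, the sketch below uses hypothetical helpers (not the project's implementation) and assumes that the per-record score is the raw error divided by the record length.
```{r floss-metrics-sketch, eval=FALSE, echo=TRUE}
# `errors`: raw per-record FLOSS errors; `lengths`: the corresponding record lengths.
floss_error_macro <- function(errors, lengths) mean(errors / lengths) # average of per-record scores
floss_error_micro <- function(errors, lengths) sum(errors) / sum(lengths) # pooled, normalized by the total length
```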
```{r marginalplot, eval = FALSE, out.width="100%", cache=FALSE}
#| fig.cap="Marginal plot of all parameters searched during the cross-validation.
#| The first line shows the default performance metric (`macro`), which is the average of the scores of every recording
#| in the resamples. The line below shows another metric (`micro`) which does not take into account the length of every
#| recording but is later normalized by the total length of the resample. Lower values are better."
plot_data <- all_fitted %>% # this will collect the macro and micro scores
tidyr::unnest(.metrics) %>%
## mp_threshold of 1 and time_constraint of 750 are unrealistic, so we filter them out
dplyr::filter(.data[[predictors_names[2]]] <= 0.9, .data[[predictors_names[1]]] >= 800) %>%
dplyr::select(all_of(predictors_names), .metric, .estimate) %>%
dplyr::rename(prediction = .estimate, metric = .metric) %>%
tidyr::pivot_longer(names_to = "parameter", values_to = "value", cols = all_of(predictors_names))
aa <- ggplot2::ggplot(plot_data, ggplot2::aes(x = value, y = prediction, group = value)) +
ggplot2::geom_boxplot(outlier.alpha = 0.2) +
ggplot2::facet_grid(metric ~ parameter, scales = "free") +
ggplot2::labs(title = "Marginal plot", x = "Parameter value", y = "Performance") +
ggplot2::ylim(0, 30) +
ggplot2::theme_bw()
hack <- rlang::env_get(aa$layers[[1]]$stat, "compute_group")
body(hack)[[16]][[3]] <- quote(dplyr::if_else(data$x[1] <= 10, 0.02021286,
dplyr::if_else(data$x[1] <= 500, 5, 20)
))
rlang::env_poke(aa$layers[[1]]$stat, "compute_group", hack)
aa
body(hack)[[16]][[3]] <- quote(width) # unhack
rlang::env_poke(aa$layers[[1]]$stat, "compute_group", hack) # unhack
```
Fig. \@ref(fig:performanceplot) shows the performance across all cross-validation folds for every iteration of the Bayesian search.
```{r performanceplot, fig.height = 7.5, out.width="90%", cache=FALSE}
#| fig.cap="Parameters exploration using Bayesian optimization. The plots show the performances across all cross-validation
#| folds on every iteration of the bayes search. The first line shows the results of the first repetition, and
#| the second line during the second repetition. The values on the left are shown in the default metric (`macro`).
#| On the right side, the values are shown in the `micro` metric. The iteration 'Zero' is the initial grid search."
fit1 <- tune::autoplot(fitted1_1, type = "performance", width = 0.8) +
ggplot2::labs(title = "Performances - Repetition 1", x = "Iterations", y = "Performance") +
ggplot2::ylim(0, 45) + ggplot2::theme_bw()
fit1$layers[[1]]$geom$default_aes$alpha <- 0.2
fit1$layers[[2]]$geom$default_aes$alpha <- 0.5
fit2 <- tune::autoplot(fitted2_1, type = "performance", width = 0.8) +
ggplot2::labs(title = "Performances - Repetition 2", x = "Iterations", y = "Performance") +
ggplot2::ylim(0, 45) + ggplot2::theme_bw()
fit1 / fit2
fit1$layers[[1]]$geom$default_aes$alpha <- NA
fit1$layers[[2]]$geom$default_aes$alpha <- NA
```
Table \@ref(tab:holdout) shows the performance of the best ten models on the hold-out set (a set of records that was never used for training).
```{r holdout, eval = FALSE, cache=FALSE}
kableExtra::kbl(holdout_scores_1,
booktabs = TRUE,
col.names = c("#", "Time Constraint", "MP Threshold", "Window Size", "Regime Threshold", "FLOSS Score"),
caption = "Holdout results of the 10 best models from cross-validation (less is better)",
align = "c",
position = "ht"
) %>%
kableExtra::row_spec(0, bold = TRUE) %>%
kableExtra::kable_styling(full_width = TRUE)
```
## Parameter analysis
The process above was an example of parameter tuning that seeks the best model for a given set of parameters. It used a nested cross-validation procedure aimed at finding the best combination of parameters while avoiding overfitting.
While this process is powerful and robust, it does not show us the importance of each parameter. At least one parameter has been introduced by reasoning about the problem (`mp_threshold`), but how important is it (and the other parameters) for predicting regime changes?
For example, the process above took 4 days, 20 hours, and 15 minutes to complete the grid search using an Intel(R) Xeon(R) Silver 4210R @ 2.40 GHz server. Notice that about 133 different parameter combinations were tested when computing the MP (not counting `regime_threshold`, which only affects the FLOSS step), across 5 folds, repeated twice. That sums up to about 35.2 x 10^9^ all-pairs Euclidean distances computed in less than 5 days (on CPU, not GPU). Not bad.
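As a back-of-envelope check of these figures:
```{r grid-arithmetic, eval=FALSE, echo=TRUE}
mp_runs <- 133 * 5 * 2 # parameter combinations x folds x repetitions = 1330 evaluations
total_distances <- 35.2e9 # all-pairs Euclidean distances quoted above
total_distances / mp_runs # roughly 2.6e7 distances per combination per fold, on average
```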
As another side note, the above process did not run in a "release" environment, so a considerable overhead in computation and memory usage has to be factored into these five days of grid search. Hence, much time can be saved if we know which parameters are essential for the problem.
In order to check the effect of the parameters on the model, we need to compute the _importance_ of each parameter.
Wei _et al._ published a comprehensive review on variable importance analysis [@Wei2015].
Our case is not a typical case of variable importance analysis, where a set of _features_ is tested against an _outcome_. Instead, we have to proxy our analysis by using the FLOSS performance score as the _outcome_ and the tuning parameters that lead to that score as the _features_ (or _predictors_).
That is accomplished by fitting a model using the tuning parameters to predict the FLOSS score and then applying the techniques to compute the importance of each parameter.
For this matter, a Bayesian Additive Regression Trees (BART) model was chosen after an experimental trial with a set of regression models (including glmnet, gbm, and mlp) and for its inherent characteristics, which allow it to be used for model-free variable selection [@Chipman2010]. The best BART model was selected using 10-fold cross-validation repeated 3 times, showing strong predictive power with an RMSE of around 0.2 and an R^2^ of around 0.99. With this fitted model, we could evaluate each parameter's importance.
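For reference, had `train_models()` (defined in the setup chunk) also returned the `tune_search` object it creates internally, the cross-validation metrics of the selected BART model could be summarized as sketched below.
```{r bart-cv-sketch, eval=FALSE, echo=TRUE}
# Hedged sketch: summarizing the resampling metrics of the best BART model,
# assuming access to the internal `tune_search` object.
best <- tune::select_best(tune_search, metric = "rmse")
tune_search %>%
  tune::collect_metrics() %>%
  dplyr::filter(.config == best$.config) # mean RMSE and R^2 across the resamples
```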
### Interactions
Before starting the parameter importance analysis, we need to consider the parameter interactions, since interactions are usually the weak spot of these analysis techniques, as will be discussed later.
The first BART model was fitted using the following parameters:
\begin{equation}
\begin{aligned}
E( score ) &= \alpha + time\_constraint\\
&\quad + mp\_threshold + window\_size\\
&\quad + regime\_threshold
\end{aligned}
(\#eq:first)
\end{equation}
After checking the interactions, this is the refitted model:
\begin{equation}
\begin{aligned}
E( score ) &= \alpha + time\_constraint\\
&\quad + mp\_threshold + window\_size\\
&\quad + regime\_threshold + \left(mp\_threshold \times window\_size\right)
\end{aligned}
(\#eq:refitted)
\end{equation}
Fig. \@ref(fig:interaction) shows the interaction strength between pairs of variables, which allows us to verify whether there are any significant interactions. Using the information from the first model fit, equation \@ref(eq:first), we see that `mp_threshold` interacts strongly with `window_size`. After refitting with this interaction taken into account, equation \@ref(eq:refitted), the interaction strength plot improves considerably.
```{r modelbart, message=FALSE, cache=FALSE}
tree_data <- all_scores %>%
dplyr::group_by(across(all_of(predictors_names))) %>%
dplyr::summarize(mean = mean(score)) %>%
dplyr::ungroup()
trained_model <- NULL
# Caching ===========
if (file.exists(here("output", "dbarts_fitted_lmk.rds"))) {
trained_model <- readRDS(here("output", "dbarts_fitted_lmk.rds"))
} else {
trained_model <- train_models(tree_data, parallel = TRUE, v = 5, rep = 1, grid = 30)
saveRDS(trained_model, file = here("output", "dbarts_fitted_lmk.rds"))
}
train_data <- trained_model$training_data
testing_data <- trained_model$testing_data
set.seed(102)
best_fit <- generics::fit(trained_model$model, train_data)
# Caching ===========
if (file.exists(here("output", "importances_lmk.rds"))) {
interactions <- readRDS(here("output", "importances_lmk.rds"))
importance_firm <- interactions$importance_firm
importance_perm <- interactions$importance_perm
importance_shap <- interactions$importance_shap
shap_html_test <- interactions$shap_html_test
shap_fastshap_all_test <- interactions$shap_fastshap_all_test
interactions <- interactions$interactions
} else {
interactions <- check_interactions(best_fit, train_data, predictors_names, parallel = TRUE)
importance_firm <- check_importance(best_fit, testing_data, testing_data, predictors_names,
type = "firm", parallel = TRUE
)
importance_firm <- ggplot2::ggplot_build(importance_firm)$plot$data
importance_perm <- check_importance(best_fit, testing_data, testing_data, predictors_names,
type = "permute", nsim = 100, parallel = TRUE
)
importance_perm <- ggplot2::ggplot_build(importance_perm)$plot$data
importance_perm <- attr(importance_perm, "raw_scores")
importance_perm <- tibble::as_tibble(t(importance_perm)) %>%
dplyr::select(all_of(predictors_names)) %>%
tidyr::pivot_longer(everything(), names_to = "Variable", values_to = "Importance")
importance_shap <- check_importance(best_fit, train_data, testing_data[, predictors_names], predictors_names,
type = "shap", nsim = 400, parallel = TRUE
)
importance_shap <- ggplot2::ggplot_build(importance_shap)$plot$data
shap_fastshap_all_test <- shap_explain(best_fit, train_data[, predictors_names], testing_data[, predictors_names],
predictors_names,
nsim = 400, parallel = TRUE
)
preds_test <- predict(best_fit, testing_data[, predictors_names])
shap_html_test <- fastshap::force_plot(
object = shap_fastshap_all_test, feature_values = testing_data[, predictors_names],
baseline = mean(preds_test$.pred), display = "html"
)
shap_html_test <- stringr::str_remove(shap_html_test, "<meta.+?>")
saveRDS(list(
interactions = interactions,
importance_firm = importance_firm,
importance_perm = importance_perm,
importance_shap = importance_shap,
shap_fastshap_all_test = shap_fastshap_all_test,
shap_html_test = shap_html_test
), file = here("output", "importances_lmk.rds"))
}
# tree_data2 <- tree_data %>%
# dplyr::mutate(
# int_mp_w = mp_threshold * window_size,
# # int_mp_rt = mp_threshold * regime_threshold
# # int_mp_tc = mp_threshold * time_constraint
# .before = mean
# )
# A tibble: 10 × 2
# Variables Interaction
# <chr> <dbl>
# 1 regime_threshold*regime_landmark 1.51
# 2 mp_threshold*window_size 1.20
# 3 mp_threshold*regime_landmark 1.16
# 4 window_size*regime_landmark 0.985
# 5 time_constraint*regime_threshold 0.887
# 6 regime_threshold*window_size 0.850
# 7 regime_threshold*mp_threshold 0.810
# 8 time_constraint*regime_landmark 0.671
# 9 time_constraint*mp_threshold 0.619
# 10 time_constraint*window_size 0.389
# # A tibble: 10 × 2
# Variables Interaction
# <chr> <dbl>
# 1 mp_threshold*regime_landmark 1.46
# 2 regime_threshold*window_size 0.992
# 3 time_constraint*regime_threshold 0.683
# 4 regime_threshold*regime_landmark 0.624
# 5 time_constraint*mp_threshold 0.590
# 6 time_constraint*regime_landmark 0.449
# 7 mp_threshold*window_size 0.329
# 8 regime_threshold*mp_threshold 0.288
# 9 window_size*regime_landmark 0.256
# 10 time_constraint*window_size 0.251
train_data2 <- train_data %>%
dplyr::mutate(
int_rt_rl = regime_threshold * regime_landmark,
int_mp_w = mp_threshold * window_size,
int_mp_rl = mp_threshold * regime_landmark,
# int_mp_rt = mp_threshold * regime_threshold
# int_mp_tc = mp_threshold * time_constraint
.before = mean
)
testing_data2 <- testing_data %>%
dplyr::mutate(
int_rt_rl = regime_threshold * regime_landmark,
int_mp_w = mp_threshold * window_size,
int_mp_rl = mp_threshold * regime_landmark,
# int_mp_rt = mp_threshold * regime_threshold
# int_mp_tc = mp_threshold * time_constraint
.before = mean
)
predictor_names_int <- c(predictors_names, "int_rt_rl", "int_mp_w", "int_mp_rl")
trained_model2 <- NULL
# Caching ==========
if (file.exists(here("output", "dbarts_fitted2_lmk.rds"))) {
trained_model2 <- readRDS(here("output", "dbarts_fitted2_lmk.rds"))
} else {
trained_model2 <- train_models(NULL, parallel = TRUE, v = 5, rep = 1, grid = 30, train_data2, testing_data2)
saveRDS(trained_model2, file = here("output", "dbarts_fitted2_lmk.rds"))
}
train_data2 <- trained_model2$training_data
testing_data2 <- trained_model2$testing_data
set.seed(102)
best_fit2 <- generics::fit(trained_model2$model, train_data2)
# Caching ===========
if (file.exists(here("output", "importances2_lmk.rds"))) {
interactions2 <- readRDS(here("output", "importances2_lmk.rds"))
importance_firm2 <- interactions2$importance_firm2
importance_perm2 <- interactions2$importance_perm2
importance_shap2 <- interactions2$importance_shap2
shap_fastshap_all_test2 <- interactions2$shap_fastshap_all_test2
shap_html_test2 <- interactions2$shap_html_test2
interactions2 <- interactions2$interactions2
} else {
interactions2 <- check_interactions(best_fit2, train_data2, predictors_names, parallel = TRUE)
importance_firm2 <- check_importance(best_fit2, testing_data2, testing_data2, predictors_names, type = "firm", parallel = TRUE)
importance_firm2 <- ggplot2::ggplot_build(importance_firm2)$plot$data
importance_perm2 <- check_importance(best_fit2, testing_data2, testing_data2, predictor_names_int,
type = "permute",
nsim = 100, parallel = TRUE
)
importance_perm2 <- ggplot2::ggplot_build(importance_perm2)$plot$data
importance_perm2 <- attr(importance_perm2, "raw_scores")
importance_perm2 <- tibble::as_tibble(t(importance_perm2)) %>%
dplyr::select(all_of(predictors_names)) %>%
tidyr::pivot_longer(everything(), names_to = "Variable", values_to = "Importance")
importance_shap2 <- check_importance(best_fit2, train_data2[, predictor_names_int], testing_data2[, predictor_names_int], predictor_names_int,
type = "shap", nsim = 400, parallel = TRUE
)
importance_shap2 <- ggplot2::ggplot_build(importance_shap2)$plot$data
shap_fastshap_all_test2 <- shap_explain(best_fit2, train_data2[, predictor_names_int], testing_data2[, predictor_names_int], predictors_names, nsim = 400, parallel = TRUE)
preds_test2 <- predict(best_fit2, testing_data2[, predictor_names_int])
shap_html_test2 <- fastshap::force_plot(object = shap_fastshap_all_test2, feature_values = testing_data2[, predictors_names], baseline = mean(preds_test2$.pred), display = "html")
shap_html_test2 <- stringr::str_remove(shap_html_test2, "<meta.+?>")
saveRDS(list(
interactions2 = interactions2,
importance_firm2 = importance_firm2,
importance_perm2 = importance_perm2,
importance_shap2 = importance_shap2,
shap_fastshap_all_test2 = shap_fastshap_all_test2,
shap_html_test2 = shap_html_test2
), file = here("output", "importances2_lmk.rds"))
}
```
```{r interaction, fig.height = 5, fig.width= 8, out.width="100%", cache=FALSE}
#| fig.cap="Variable interactions strenght using feature importance ranking measure (FIRM) approach [@Greenwell2018].
#| A) Shows strong interaction between `mp_threshold` and `window_size`.
#| B) Refitting the model with this interaction taken into account, the strength is substantially reduced."
interactions_plot <- ggplot2::ggplot(interactions, ggplot2::aes(
x = reorder(Variables, Interaction),
y = Interaction, fill = Variables
)) +
ggplot2::geom_col(color = "grey35", size = 0.2) +
ggplot2::coord_flip() +
ggplot2::labs(
title = "Normal fit",
y = ggplot2::element_blank(),
x = ggplot2::element_blank()
) +
ggplot2::ylim(0, 1.65) +
ggplot2::theme_bw() +
ggplot2::theme(legend.position = "none")
interactions2_plot <- ggplot2::ggplot(interactions2, ggplot2::aes(
x = reorder(Variables, Interaction),
y = Interaction, fill = Variables
)) +
ggplot2::geom_col(color = "grey35", size = 0.2) +
ggplot2::coord_flip() +
ggplot2::labs(
title = "Taking into account the interactions",
y = "Interaction strength",
x = ggplot2::element_blank()
) +
ggplot2::ylim(0, 1.65) +
ggplot2::theme_bw() +
ggplot2::theme(legend.position = "none")
inter <- interactions_plot / interactions2_plot
inter + plot_annotation(
title = "Variable Interaction Strength",
tag_levels = c("A", "1"),
theme = ggplot2::theme_bw()
)
```
### Importance
After evaluating the interactions, we can then analyze the variable importance. The goal is to understand how the FLOSS score behaves when we change the parameters.
Here is a brief overview of the different techniques:
#### Feature Importance Ranking Measure (FIRM)
FIRM is a variance-based method. This implementation uses ICE curves to quantify each feature's effect, which is more robust than partial dependence plots (PDP) [@Greenwell2020].
It is also helpful to inspect the ICE curves to uncover some heterogeneous relationships with the outcome [@Molnar2022].
**Advantages:**
* Has a causal interpretation (for the model, not for the real world)
* ICE curves can uncover heterogeneous relationships
**Disadvantages:**
* The method does not take into account interactions.
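The sketch below illustrates the FIRM idea (the actual analysis uses `vip::vip(method = "firm")`), assuming `best_fit`, `train_data`, and `predictors_names` from the chunks above: a feature's importance is the variability of its partial dependence profile, here obtained by averaging ICE curves and taking the standard deviation.
```{r firm-sketch, eval=FALSE, echo=TRUE}
# Minimal sketch of the FIRM score for a single feature: a flatter profile
# means a lower importance.
firm_sketch <- function(fit, data, feature, grid_size = 20) {
  ice <- pdp::partial(
    fit,
    pred.var = feature,
    train = data,
    grid.resolution = grid_size,
    pred.fun = function(object, newdata) predict(object, newdata)$.pred # one ICE curve per observation
  )
  pd <- stats::aggregate(ice$yhat, by = list(ice[[feature]]), FUN = mean) # average the ICE curves into a PDP
  stats::sd(pd$x) # variability of the profile = importance
}
# e.g. sapply(predictors_names, function(f) firm_sketch(best_fit, train_data, f))
```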
#### Permutation
The permutation method was introduced by Breiman in 2001 [@Breiman2001] for Random Forests, and the implementation used here is a model-agnostic version introduced by Fisher _et al._ in 2019 [@Fisher2018]. A feature is "unimportant" if shuffling its values leaves the model error unchanged, on the assumption that the model has then ignored the feature for the prediction.
**Advantages:**
* Easy interpretation: the importance is the increase in model error when the feature's information is destroyed.
* Interactions are included: permuting a feature also destroys its interaction effects with other features, so these are reflected in the importance.
**Disadvantages:**
* It is linked to the model error: not a disadvantage _per se_, but it may lead to misinterpretation if the goal is to understand how the output varies regardless of the model's performance. For example, if we want to measure the robustness of the model when someone tampers with the features, we want to know the _model variance_ explained by the features. Model variance (explained by the features) and feature importance correlate strongly when the model generalizes well (i.e., it is not overfitting).
* Correlations: if features are correlated, the permutation feature importance can be biased by unrealistic data instances. Thus, we need to be careful when there are strong correlations between features.
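A self-contained sketch of the permutation approach is shown below (the actual analysis uses `vip::vip(method = "permute")` with the RMSE ratio), assuming `best_fit` and `testing_data` from the chunks above.
```{r permutation-sketch, eval=FALSE, echo=TRUE}
# Shuffle one feature, re-predict, and report the ratio of the permuted RMSE
# to the baseline RMSE (values above 1 indicate an important feature).
perm_sketch <- function(fit, data, feature, outcome = "mean", nsim = 10) {
  rmse <- function(truth, estimate) sqrt(mean((truth - estimate)^2))
  baseline <- rmse(data[[outcome]], predict(fit, data)$.pred)
  ratios <- replicate(nsim, {
    shuffled <- data
    shuffled[[feature]] <- sample(shuffled[[feature]]) # destroy the feature's information
    rmse(data[[outcome]], predict(fit, shuffled)$.pred) / baseline
  })
  tibble::tibble(Variable = feature, Importance = ratios)
}
# e.g. dplyr::bind_rows(lapply(predictors_names, function(f) perm_sketch(best_fit, testing_data, f)))
```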
#### SHAP
The SHAP feature importance [@Lundberg2017] is an alternative to permutation feature importance. The difference between the two is that permutation feature importance is based on the decrease in model performance, while SHAP is based on the magnitude of the feature attributions.
**Advantages:**
* It is not linked to the model error: as the underlying concept of SHAP is the Shapley value, the value attributed to each feature is related to its contribution to the output value. If a feature is important, its addition will significantly affect the output.
**Disadvantages:**
* Computation time: the Shapley value is computationally expensive and is usually approximated using Monte Carlo simulations.
* The Shapley value can be misinterpreted: The Shapley value of a feature value **is not** the difference of the predicted value after removing the feature from the model training. The interpretation of the Shapley value is: "Given the current set of feature values, the contribution of a feature value to the difference between the actual prediction and the mean prediction is the estimated Shapley value" [@Molnar2022].
* Correlations: As with other permutation methods, the SHAP feature importance can be biased by unrealistic data instances when features are correlated.
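To illustrate why the estimation is costly, the sketch below gives a rough Monte Carlo estimate of the Shapley value of a single feature for a single instance (the actual analysis uses `fastshap::explain()`), assuming `best_fit` and `train_data` from the chunks above.
```{r shapley-mc-sketch, eval=FALSE, echo=TRUE}
# Monte Carlo Shapley estimate for one feature of one instance: in each draw,
# build two hybrid instances that differ only in the feature of interest and
# average the prediction differences.
shapley_mc <- function(fit, background, instance, feature, nsim = 50) {
  feats <- setdiff(names(background), "mean")
  diffs <- replicate(nsim, {
    z <- background[sample(nrow(background), 1), feats] # random background row
    perm <- sample(feats) # random feature ordering
    before <- perm[seq_len(which(perm == feature) - 1)] # features "entering" before `feature`
    x_with <- z
    x_with[c(before, feature)] <- instance[c(before, feature)]
    x_without <- z
    if (length(before) > 0) x_without[before] <- instance[before]
    predict(fit, x_with)$.pred - predict(fit, x_without)$.pred
  })
  mean(diffs) # estimated contribution of `feature` for this instance
}
# e.g. shapley_mc(best_fit, train_data, train_data[1, ], "regime_threshold")
```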
### Importance analysis
Using the three techniques above simultaneously allows a broad comparison of the model behavior [@Greenwell2020]. All three methods are model-agnostic (they separate interpretation from the model), but as we have seen above, each method has its advantages and disadvantages [@Molnar2022].
Fig. \@ref(fig:importance) then shows the variable importance using three methods: Feature Importance Ranking Measure (FIRM) using Individual Conditional Expectation (ICE) curves, the permutation-based method, and SHapley Additive exPlanations (SHAP). The first line of this figure shows that the interaction between `mp_threshold` and `window_size` obscures the results: except for `time_constraint`, the variables have similar importance. In the second line, `regime_threshold` is the most important feature according to all three methods.
```{r importance, fig.height = 7, fig.width= 15, out.width="100%", cache=FALSE}
#| fig.cap="Variables importances using three different methods. A) Feature Importance Ranking Measure
#| using ICE curves. B) Permutation method. C) SHAP (400 iterations). Line 1 refers to the original
#| fit, and line 2 to the re-fit, taking into account the interactions between variables
#| (Fig. \\@ref(fig:interaction))."
importance_firm_plot <- ggplot2::ggplot(importance_firm, aes(
x = reorder(Variable, Importance),
y = Importance, fill = Variable
)) +
ggplot2::geom_col(colour = "grey35", size = 0.8, show.legend = FALSE) +
ggplot2::coord_flip() +
ggplot2::labs(
title = "Feature Importance Ranking Measure",
subtitle = "Individual Conditional Expectation",
x = "",
y = ggplot2::element_blank()
) +
ggplot2::ylim(0, 3.5) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 10)
)
importance_perm_plot <- ggplot2::ggplot(importance_perm, aes(
x = reorder(Variable, Importance),
y = Importance, fill = Variable
)) +
ggplot2::geom_boxplot(colour = "grey35", size = 0.5, show.legend = FALSE) +
ggplot2::coord_flip() +
ggplot2::labs(
title = "Permutation-based (100x)",
x = "",
y = ggplot2::element_blank()
) +
ggplot2::ylim(2, 15) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 10)
)
importance_shap_plot <- ggplot2::ggplot(importance_shap, aes(
x = reorder(Variable, Importance),
y = Importance, fill = Variable
)) +
ggplot2::geom_col(colour = "grey35", size = 0.8, show.legend = FALSE) +
ggplot2::coord_flip() +
ggplot2::labs(
title = "SHAP (400 iterations)",
x = "",
y = ggplot2::element_blank()
) +
ggplot2::ylim(0, 1.6) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 10)
)
importance_firm2_plot <- ggplot2::ggplot(importance_firm2, aes(
x = reorder(Variable, Importance),
y = Importance, fill = Variable
)) +
ggplot2::geom_col(colour = "grey35", size = 0.8, show.legend = FALSE) +
ggplot2::coord_flip() +
ggplot2::labs(
x = "",
y = "Importance"
) +
ggplot2::ylim(0, 3.5) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 10)
)
importance_perm2_plot <- ggplot2::ggplot(importance_perm2, aes(
x = reorder(Variable, Importance),
y = Importance, fill = Variable
)) +
ggplot2::geom_boxplot(colour = "grey35", size = 0.5, show.legend = FALSE) +
ggplot2::coord_flip() +
ggplot2::labs(
x = "",
y = "Importance"
) +
ggplot2::ylim(2, 15) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 10)
)
importance_shap2_plot <- ggplot2::ggplot(importance_shap2, aes(
x = reorder(Variable, Importance),
y = Importance, fill = Variable
)) +
ggplot2::geom_col(colour = "grey35", size = 0.8, show.legend = FALSE) +
ggplot2::coord_flip() +
ggplot2::labs(
x = "",
y = "Importance"
) +
ggplot2::ylim(0, 1.6) +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 10)
)
all <- (importance_firm_plot / importance_firm2_plot + plot_layout(tag_level = "new")) |
(importance_perm_plot / importance_perm2_plot + plot_layout(tag_level = "new")) |
(importance_shap_plot / importance_shap2_plot + plot_layout(tag_level = "new")) +
plot_layout(guides = "collect")
all + plot_annotation(
title = "Variable importances",
tag_levels = c("A", "1"),
theme = ggplot2::theme_bw() + ggplot2::theme(
plot.title = ggplot2::element_text(size = 20)
)
)
```
Figs. \@ref(fig:importanceshap) and \@ref(fig:importanceshap2) show the effect of each feature on the FLOSS score. The main differences before and after taking the interactions into account are the magnitude of the less important features and the shape of `time_constraint`, which initially had a valley around 1600 but appears to flatten out afterwards.
```{r importanceshap, message=FALSE, fig.height = 6, fig.width= 10, out.width="100%", cache=FALSE}
#| fig.cap="This shows the effect each variable has on FLOSS score. This plot doesn't take into account the
#| variable interactions."
t1 <- autoplot(shap_fastshap_all_test,
type = "dependence",
X = testing_data, feature = predictors_names[2], alpha = 0.2
) + ggplot2::geom_smooth(method = loess) + ggplot2::theme_bw()
t2 <- autoplot(shap_fastshap_all_test,
type = "dependence",