From accdf252c8f3904b031527c28e6f31e8fd7ee55a Mon Sep 17 00:00:00 2001 From: Bernardo Lares Date: Thu, 27 Jun 2024 07:30:02 +0000 Subject: [PATCH] version 3.11.1 --- DESCRIPTION | 7 +- LICENSE | 6 +- MD5 | 90 +- NAMESPACE | 336 ++--- R/allocator.R | 2108 ++++++++++++++-------------- R/auxiliary.R | 192 +-- R/calibration.R | 270 ++-- R/checks.R | 2202 ++++++++++++++--------------- R/clusters.R | 950 ++++++------- R/convergence.R | 440 +++--- R/data.R | 126 +- R/exports.R | 440 +++--- R/imports.R | 102 +- R/json.R | 860 ++++++------ R/model.R | 2714 ++++++++++++++++++------------------ R/outputs.R | 656 ++++----- R/pareto.R | 1240 ++++++++-------- R/plots.R | 3 +- R/refresh.R | 1208 ++++++++-------- R/response.R | 804 +++++------ R/transformation.R | 884 ++++++------ R/zzz.R | 16 +- man/Robyn.Rd | 62 +- man/adstocks.Rd | 204 +-- man/dt_prophet_holidays.Rd | 76 +- man/dt_simulated_weekly.Rd | 74 +- man/fit_spend_exposure.Rd | 58 +- man/hyper_limits.Rd | 36 +- man/hyper_names.Rd | 202 +-- man/mic_men.Rd | 76 +- man/prophet_decomp.Rd | 90 +- man/robyn_allocator.Rd | 312 ++--- man/robyn_clusters.Rd | 168 +-- man/robyn_converge.Rd | 118 +- man/robyn_inputs.Rd | 426 +++--- man/robyn_mmm.Rd | 202 +-- man/robyn_outputs.Rd | 304 ++-- man/robyn_refresh.Rd | 334 ++--- man/robyn_response.Rd | 316 ++--- man/robyn_run.Rd | 260 ++-- man/robyn_save.Rd | 120 +- man/robyn_train.Rd | 196 +-- man/robyn_update.Rd | 48 +- man/robyn_write.Rd | 156 +-- man/saturation_hill.Rd | 86 +- man/set_holidays.Rd | 46 +- 46 files changed, 9812 insertions(+), 9812 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index dd65efa..64ac33d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: Robyn Type: Package Title: Semi-Automated Marketing Mix Modeling (MMM) from Meta Marketing Science -Version: 3.11.0 +Version: 3.11.1 Authors@R: c( person("Gufeng", "Zhou", , "gufeng@meta.com", c("aut")), person("Bernardo", "Lares", , "laresbernardo@gmail.com", c("cre","aut")), @@ -15,7 +15,6 @@ Depends: R (>= 4.0.0) Imports: doParallel, doRNG, dplyr, foreach, ggplot2, ggridges, glmnet, jsonlite, lares, lubridate, minpack.lm, nloptr, patchwork, prophet, reticulate, stringr, tidyr -Suggests: shiny Config/reticulate: list( packages = list( list(package = "nevergrad", pip = TRUE) ) ) URL: https://github.com/facebookexperimental/Robyn, @@ -26,11 +25,11 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true NeedsCompilation: no -Packaged: 2024-06-19 14:26:44 UTC; bl896211 +Packaged: 2024-06-27 07:12:58 UTC; bernardo Author: Gufeng Zhou [aut], Bernardo Lares [cre, aut], Leonel Sentana [aut], Igor Skokan [aut], Meta Platforms, Inc. [cph, fnd] Repository: CRAN -Date/Publication: 2024-06-19 15:30:07 UTC +Date/Publication: 2024-06-27 07:30:02 UTC diff --git a/LICENSE b/LICENSE index 14f60c0..9a44497 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,3 @@ - -YEAR: 2020-present -COPYRIGHT HOLDER: Meta Platforms, Inc. + +YEAR: 2020-present +COPYRIGHT HOLDER: Meta Platforms, Inc. 
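The Config/reticulate field above declares Robyn's single Python dependency, nevergrad, installed via pip. A minimal setup sketch using the reticulate helpers this package already imports in its NAMESPACE; the environment name "r-robyn" is an arbitrary assumption, not something the package mandates:

library(reticulate)
virtualenv_create("r-robyn")               # hypothetical environment name
use_virtualenv("r-robyn", required = TRUE)
py_install("nevergrad", pip = TRUE)        # matches pip = TRUE declared in DESCRIPTION
stopifnot(py_module_available("nevergrad"))  # TRUE once the install succeeded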
diff --git a/MD5 b/MD5 index 31bcd5e..8ef3047 100644 --- a/MD5 +++ b/MD5 @@ -1,51 +1,51 @@ -737843db82b01554f1331972045d29c0 *DESCRIPTION -d264ad4711633bfda83de27ccc6b6cc4 *LICENSE -42211253efb82ec54ed1282d4a68a35c *NAMESPACE -6734429abfe5540a5569f668a9715aec *R/allocator.R -8a8a155a03e80c86a5f06fb9df9e6200 *R/auxiliary.R -a5ead184b70c0cb74248e3ebb89b61ff *R/calibration.R -95ac9e79ae4a61a81961a0f6e6d11226 *R/checks.R -3940162e8bd93f7927bb40c6ea3d5e7c *R/clusters.R -4cbbdd7865af8681d7614026b73bde7f *R/convergence.R -ee3609f4dc78f506ab3ac13c8e44cc86 *R/data.R -b618e84417ccd36da0c174f728e13d75 *R/exports.R -e37d0896cfe9ca89c747189a16d7825c *R/imports.R +dca064d90474703401cd923600241ff0 *DESCRIPTION +752944595c6a3c34b7052a9c991addc0 *LICENSE +523fc7a3af477ac065b53ee9f670fe85 *NAMESPACE +c81672597cbbd2439e8b70388e03d42f *R/allocator.R +ac5c30e795179a92d3077bb4589ba2e7 *R/auxiliary.R +f47047c522afa4b4753d0f7a7dacea3e *R/calibration.R +deb960faf5341693085acac238a72b3b *R/checks.R +d2e652296a38ebc9ddbe56fc505de303 *R/clusters.R +d1caaa21e4fe9b738156c455144ee7f7 *R/convergence.R +f2e6a374e18f9f1079f597d80649825b *R/data.R +0e591a5c30be634133f5d7c21dcdff8e *R/exports.R +de5015813452e5d0d218757a3d3c8fad *R/imports.R 01ecaf8987749c16e61c4e6cd0bcc767 *R/inputs.R -6f00c67e60893b2d3d2d039abe08ca40 *R/json.R -337ee51a2e8adfa787ce186d49a1b348 *R/model.R -c1a3847d65ea3159d7fb074fb85dd39d *R/outputs.R -05d16be67d634c2ee88ca4fac2ded880 *R/pareto.R -dc2c6c76234c4ff2339fb9341592a013 *R/plots.R -42ec0bd4fb41b9b680e6a6cb04e45400 *R/refresh.R -43c5892f3436b59d645414f7eb56a26f *R/response.R -7d61c52ebba0671793aff37086b39211 *R/transformation.R -21f1fe027f2cb8f308b855969faf093c *R/zzz.R +bc1adc37c6d4b252a07c5110d8fff555 *R/json.R +f11bcfcce424ca09d1d5570992d89b80 *R/model.R +3ac6df57fb7658dada8bb15155d192aa *R/outputs.R +603b24878435efbd574284881c6d069f *R/pareto.R +d59e7ef16f02f789db40b5c800ef0267 *R/plots.R +712f5ea5425ede4269ddafc28d4dfaa4 *R/refresh.R +ca55cfc199646a4a69daf79d7c41158c *R/response.R +4831ef07bdc8f6ad2a432b82f7fca8c4 *R/transformation.R +b134a72bc96231603de287aec3577f18 *R/zzz.R 7d698820107058ceba4916e9e31afaa3 *README.md d3b67ad183dd0eca1bd6de8d2da458ad *data/dt_prophet_holidays.RData cd4fe2d7af93f360a6169023445efeac *data/dt_simulated_weekly.RData -fcd730b803138926fcc65dd1bd6acba0 *man/Robyn.Rd -69baa4dcde5c4557d1228c6d026d8b5b *man/adstocks.Rd -e33f7344a259797dfda20dc7b61a361e *man/dt_prophet_holidays.Rd -5ecae3324f6e16e03aa65d8166991585 *man/dt_simulated_weekly.Rd +efc54e76d479b91520b8ab34562e77d2 *man/Robyn.Rd +5523cbd2c0567817c287d49bf1b9d6a9 *man/adstocks.Rd +7eb7c141021737dee990e1b2306e90ca *man/dt_prophet_holidays.Rd +687560b63bc6effd5cc7c830eccc93d0 *man/dt_simulated_weekly.Rd 91e1ce1df1285930941caabfea53c04b *man/figures/logo.png 5d6121eaa51b71372ae97375af745029 *man/figures/logo_old.png -905067f66aaec75444a07f2221718333 *man/fit_spend_exposure.Rd -81fef924e76ccaf9e8e1d09f3ea18d2f *man/hyper_limits.Rd -22f6cdfeb88dd936c5b224b05fc64656 *man/hyper_names.Rd -773d001647a7cca56045be53747ee0f7 *man/mic_men.Rd -f12ddc6acdec0f70213e6fc45ac820e8 *man/prophet_decomp.Rd -d2a2473e645cc4eb82bdb35afca5969a *man/robyn_allocator.Rd -ca863b1e1767fb0f599dd5f57a78a7e4 *man/robyn_clusters.Rd -c3d83ba32c1f49e53f2417f26dffb632 *man/robyn_converge.Rd -4e89989bd56f30e7ee622cbbecf613a5 *man/robyn_inputs.Rd -3dee70eadb621d53117a9c6f05874dd0 *man/robyn_mmm.Rd -dddb92f9b10c19d6be77c79b49896cec *man/robyn_outputs.Rd -295d5f531a747a9a3efa50dde785c6fd *man/robyn_refresh.Rd -37fd4ea63999545c0156bacab1ea2842 
*man/robyn_response.Rd -b6cb09ea120a6a44f3731eb12456f069 *man/robyn_run.Rd -33083e4978e7cb744f625a35e1f1c301 *man/robyn_save.Rd -c7c6580513bb95f7a34c661504858b42 *man/robyn_train.Rd -53e114189619425fa9fa9d1124362e68 *man/robyn_update.Rd -86eb1489a3401af3c7dee734c37425d7 *man/robyn_write.Rd -83b9e1dcf75f650ac050996b3c03121f *man/saturation_hill.Rd -b1c55e07072e7f5ffdca192e0a964809 *man/set_holidays.Rd +f1e76e51465d3ef46b252b2801ed2793 *man/fit_spend_exposure.Rd +0baea02a135437e13ffa07c9304369be *man/hyper_limits.Rd +0b957d1878d5582db0c2e470bed15436 *man/hyper_names.Rd +1221d30edc9434dc506b9079a2fdfede *man/mic_men.Rd +b06aa2bd643cf44b31c59e56c46883dc *man/prophet_decomp.Rd +11f934aacf7e302dd76b6759c6e113f2 *man/robyn_allocator.Rd +93a45807e671b92d2b639971733d2926 *man/robyn_clusters.Rd +d362996022bebcf4339364b7d2fa8c9f *man/robyn_converge.Rd +f8b2457f9c11cdbd1d2511c08f8560ff *man/robyn_inputs.Rd +3f0b39f299145977c21a76f7cb3c2831 *man/robyn_mmm.Rd +1b8553291a5549a5dc378fa31423f555 *man/robyn_outputs.Rd +ce18b7afa0848ba5fd7f04fa64a7e728 *man/robyn_refresh.Rd +1f725dc7734c592b63131543c82fc428 *man/robyn_response.Rd +d1f816ed303b65d4a20c9f6fcf8c17ad *man/robyn_run.Rd +f9252fe8d0c25be4ee46f90a46c241ee *man/robyn_save.Rd +2f30b524e581816955516a782673d7dc *man/robyn_train.Rd +82dd89bb64b23248f39355592118008a *man/robyn_update.Rd +1b33081c446a5e1c1dad43aabcdec54e *man/robyn_write.Rd +99420410d2f126624362b5194415ea6d *man/saturation_hill.Rd +60c5d7973692ee48997b5cfbf75b9cf1 *man/set_holidays.Rd diff --git a/NAMESPACE b/NAMESPACE index 824c570..7bbbfe7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,168 +1,168 @@ -# Generated by roxygen2: do not edit by hand - -S3method(plot,robyn_allocator) -S3method(plot,robyn_refresh) -S3method(plot,robyn_save) -S3method(print,robyn_allocator) -S3method(print,robyn_inputs) -S3method(print,robyn_models) -S3method(print,robyn_outputs) -S3method(print,robyn_read) -S3method(print,robyn_refresh) -S3method(print,robyn_save) -S3method(print,robyn_write) -export(adstock_geometric) -export(adstock_weibull) -export(decomp_plot) -export(hyper_limits) -export(hyper_names) -export(mic_men) -export(plot_adstock) -export(plot_saturation) -export(robyn_allocator) -export(robyn_clusters) -export(robyn_converge) -export(robyn_csv) -export(robyn_inputs) -export(robyn_load) -export(robyn_mmm) -export(robyn_onepagers) -export(robyn_outputs) -export(robyn_plots) -export(robyn_read) -export(robyn_recreate) -export(robyn_refresh) -export(robyn_response) -export(robyn_run) -export(robyn_save) -export(robyn_train) -export(robyn_update) -export(robyn_write) -export(saturation_hill) -export(transform_adstock) -export(ts_validation) -import(ggplot2) -importFrom(doParallel,registerDoParallel) -importFrom(doParallel,stopImplicitCluster) -importFrom(doRNG,"%dorng%") -importFrom(dplyr,across) -importFrom(dplyr,all_of) -importFrom(dplyr,any_of) -importFrom(dplyr,arrange) -importFrom(dplyr,as_tibble) -importFrom(dplyr,bind_cols) -importFrom(dplyr,bind_rows) -importFrom(dplyr,case_when) -importFrom(dplyr,contains) -importFrom(dplyr,desc) -importFrom(dplyr,distinct) -importFrom(dplyr,ends_with) -importFrom(dplyr,everything) -importFrom(dplyr,filter) -importFrom(dplyr,group_by) -importFrom(dplyr,lag) -importFrom(dplyr,left_join) -importFrom(dplyr,mutate) -importFrom(dplyr,mutate_at) -importFrom(dplyr,n) -importFrom(dplyr,n_distinct) -importFrom(dplyr,pull) -importFrom(dplyr,rename) -importFrom(dplyr,row_number) -importFrom(dplyr,select) -importFrom(dplyr,slice) -importFrom(dplyr,starts_with) 
-importFrom(dplyr,summarise) -importFrom(dplyr,summarise_all) -importFrom(dplyr,tally) -importFrom(dplyr,ungroup) -importFrom(foreach,"%dopar%") -importFrom(foreach,foreach) -importFrom(foreach,getDoParWorkers) -importFrom(foreach,registerDoSEQ) -importFrom(ggridges,geom_density_ridges) -importFrom(ggridges,geom_density_ridges_gradient) -importFrom(glmnet,glmnet) -importFrom(jsonlite,fromJSON) -importFrom(jsonlite,read_json) -importFrom(jsonlite,toJSON) -importFrom(jsonlite,write_json) -importFrom(lares,`%>%`) -importFrom(lares,check_opts) -importFrom(lares,clusterKmeans) -importFrom(lares,formatNum) -importFrom(lares,freqs) -importFrom(lares,glued) -importFrom(lares,num_abbr) -importFrom(lares,ohse) -importFrom(lares,removenacols) -importFrom(lares,scale_x_abbr) -importFrom(lares,scale_x_percent) -importFrom(lares,scale_y_abbr) -importFrom(lares,scale_y_percent) -importFrom(lares,theme_lares) -importFrom(lares,try_require) -importFrom(lares,v2t) -importFrom(lubridate,day) -importFrom(lubridate,floor_date) -importFrom(lubridate,is.Date) -importFrom(minpack.lm,nlsLM) -importFrom(nloptr,nloptr) -importFrom(parallel,detectCores) -importFrom(patchwork,guide_area) -importFrom(patchwork,plot_annotation) -importFrom(patchwork,plot_layout) -importFrom(patchwork,wrap_plots) -importFrom(prophet,add_regressor) -importFrom(prophet,add_seasonality) -importFrom(prophet,fit.prophet) -importFrom(prophet,prophet) -importFrom(reticulate,conda_create) -importFrom(reticulate,conda_install) -importFrom(reticulate,import) -importFrom(reticulate,py_install) -importFrom(reticulate,py_module_available) -importFrom(reticulate,tuple) -importFrom(reticulate,use_condaenv) -importFrom(reticulate,use_virtualenv) -importFrom(reticulate,virtualenv_create) -importFrom(stats,AIC) -importFrom(stats,BIC) -importFrom(stats,coef) -importFrom(stats,complete.cases) -importFrom(stats,dgamma) -importFrom(stats,dnorm) -importFrom(stats,dweibull) -importFrom(stats,end) -importFrom(stats,lm) -importFrom(stats,median) -importFrom(stats,model.matrix) -importFrom(stats,na.omit) -importFrom(stats,nls.control) -importFrom(stats,predict) -importFrom(stats,pweibull) -importFrom(stats,qt) -importFrom(stats,quantile) -importFrom(stats,qunif) -importFrom(stats,reorder) -importFrom(stats,rnorm) -importFrom(stats,sd) -importFrom(stats,setNames) -importFrom(stats,start) -importFrom(stringr,str_count) -importFrom(stringr,str_detect) -importFrom(stringr,str_extract) -importFrom(stringr,str_remove) -importFrom(stringr,str_replace) -importFrom(stringr,str_split) -importFrom(stringr,str_to_title) -importFrom(stringr,str_which) -importFrom(tidyr,pivot_longer) -importFrom(tidyr,pivot_wider) -importFrom(utils,askYesNo) -importFrom(utils,flush.console) -importFrom(utils,head) -importFrom(utils,setTxtProgressBar) -importFrom(utils,tail) -importFrom(utils,txtProgressBar) -importFrom(utils,write.csv) +# Generated by roxygen2: do not edit by hand + +S3method(plot,robyn_allocator) +S3method(plot,robyn_refresh) +S3method(plot,robyn_save) +S3method(print,robyn_allocator) +S3method(print,robyn_inputs) +S3method(print,robyn_models) +S3method(print,robyn_outputs) +S3method(print,robyn_read) +S3method(print,robyn_refresh) +S3method(print,robyn_save) +S3method(print,robyn_write) +export(adstock_geometric) +export(adstock_weibull) +export(decomp_plot) +export(hyper_limits) +export(hyper_names) +export(mic_men) +export(plot_adstock) +export(plot_saturation) +export(robyn_allocator) +export(robyn_clusters) +export(robyn_converge) +export(robyn_csv) +export(robyn_inputs) 
+export(robyn_load) +export(robyn_mmm) +export(robyn_onepagers) +export(robyn_outputs) +export(robyn_plots) +export(robyn_read) +export(robyn_recreate) +export(robyn_refresh) +export(robyn_response) +export(robyn_run) +export(robyn_save) +export(robyn_train) +export(robyn_update) +export(robyn_write) +export(saturation_hill) +export(transform_adstock) +export(ts_validation) +import(ggplot2) +importFrom(doParallel,registerDoParallel) +importFrom(doParallel,stopImplicitCluster) +importFrom(doRNG,"%dorng%") +importFrom(dplyr,across) +importFrom(dplyr,all_of) +importFrom(dplyr,any_of) +importFrom(dplyr,arrange) +importFrom(dplyr,as_tibble) +importFrom(dplyr,bind_cols) +importFrom(dplyr,bind_rows) +importFrom(dplyr,case_when) +importFrom(dplyr,contains) +importFrom(dplyr,desc) +importFrom(dplyr,distinct) +importFrom(dplyr,ends_with) +importFrom(dplyr,everything) +importFrom(dplyr,filter) +importFrom(dplyr,group_by) +importFrom(dplyr,lag) +importFrom(dplyr,left_join) +importFrom(dplyr,mutate) +importFrom(dplyr,mutate_at) +importFrom(dplyr,n) +importFrom(dplyr,n_distinct) +importFrom(dplyr,pull) +importFrom(dplyr,rename) +importFrom(dplyr,row_number) +importFrom(dplyr,select) +importFrom(dplyr,slice) +importFrom(dplyr,starts_with) +importFrom(dplyr,summarise) +importFrom(dplyr,summarise_all) +importFrom(dplyr,tally) +importFrom(dplyr,ungroup) +importFrom(foreach,"%dopar%") +importFrom(foreach,foreach) +importFrom(foreach,getDoParWorkers) +importFrom(foreach,registerDoSEQ) +importFrom(ggridges,geom_density_ridges) +importFrom(ggridges,geom_density_ridges_gradient) +importFrom(glmnet,glmnet) +importFrom(jsonlite,fromJSON) +importFrom(jsonlite,read_json) +importFrom(jsonlite,toJSON) +importFrom(jsonlite,write_json) +importFrom(lares,`%>%`) +importFrom(lares,check_opts) +importFrom(lares,clusterKmeans) +importFrom(lares,formatNum) +importFrom(lares,freqs) +importFrom(lares,glued) +importFrom(lares,num_abbr) +importFrom(lares,ohse) +importFrom(lares,removenacols) +importFrom(lares,scale_x_abbr) +importFrom(lares,scale_x_percent) +importFrom(lares,scale_y_abbr) +importFrom(lares,scale_y_percent) +importFrom(lares,theme_lares) +importFrom(lares,try_require) +importFrom(lares,v2t) +importFrom(lubridate,day) +importFrom(lubridate,floor_date) +importFrom(lubridate,is.Date) +importFrom(minpack.lm,nlsLM) +importFrom(nloptr,nloptr) +importFrom(parallel,detectCores) +importFrom(patchwork,guide_area) +importFrom(patchwork,plot_annotation) +importFrom(patchwork,plot_layout) +importFrom(patchwork,wrap_plots) +importFrom(prophet,add_regressor) +importFrom(prophet,add_seasonality) +importFrom(prophet,fit.prophet) +importFrom(prophet,prophet) +importFrom(reticulate,conda_create) +importFrom(reticulate,conda_install) +importFrom(reticulate,import) +importFrom(reticulate,py_install) +importFrom(reticulate,py_module_available) +importFrom(reticulate,tuple) +importFrom(reticulate,use_condaenv) +importFrom(reticulate,use_virtualenv) +importFrom(reticulate,virtualenv_create) +importFrom(stats,AIC) +importFrom(stats,BIC) +importFrom(stats,coef) +importFrom(stats,complete.cases) +importFrom(stats,dgamma) +importFrom(stats,dnorm) +importFrom(stats,dweibull) +importFrom(stats,end) +importFrom(stats,lm) +importFrom(stats,median) +importFrom(stats,model.matrix) +importFrom(stats,na.omit) +importFrom(stats,nls.control) +importFrom(stats,predict) +importFrom(stats,pweibull) +importFrom(stats,qt) +importFrom(stats,quantile) +importFrom(stats,qunif) +importFrom(stats,reorder) +importFrom(stats,rnorm) +importFrom(stats,sd) 
+importFrom(stats,setNames) +importFrom(stats,start) +importFrom(stringr,str_count) +importFrom(stringr,str_detect) +importFrom(stringr,str_extract) +importFrom(stringr,str_remove) +importFrom(stringr,str_replace) +importFrom(stringr,str_split) +importFrom(stringr,str_to_title) +importFrom(stringr,str_which) +importFrom(tidyr,pivot_longer) +importFrom(tidyr,pivot_wider) +importFrom(utils,askYesNo) +importFrom(utils,flush.console) +importFrom(utils,head) +importFrom(utils,setTxtProgressBar) +importFrom(utils,tail) +importFrom(utils,txtProgressBar) +importFrom(utils,write.csv) diff --git a/R/allocator.R b/R/allocator.R index 48672e4..b7a52e6 100644 --- a/R/allocator.R +++ b/R/allocator.R @@ -1,1054 +1,1054 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Budget Allocator -#' -#' \code{robyn_allocator()} function returns a new split of media -#' variable spends that maximizes the total media response. -#' -#' @inheritParams robyn_run -#' @inheritParams robyn_outputs -#' @param robyn_object Character or List. Path of the \code{Robyn.RDS} object -#' that contains all previous modeling information or the imported list. -#' @param select_build Integer. Default to the latest model build. \code{select_build = 0} -#' selects the initial model. \code{select_build = 1} selects the first refresh model. -#' @param InputCollect List. Contains all input parameters for the model. -#' Required when \code{robyn_object} is not provided. -#' @param OutputCollect List. Containing all model result. -#' Required when \code{robyn_object} is not provided. -#' @param select_model Character. A model \code{SolID}. When \code{robyn_object} -#' is provided, \code{select_model} defaults to the already selected \code{SolID}. When -#' \code{robyn_object} is not provided, \code{select_model} must be provided with -#' \code{InputCollect} and \code{OutputCollect}, and must be one of -#' \code{OutputCollect$allSolutions}. -#' @param optim_algo Character. Default to \code{"SLSQP_AUGLAG"}, short for "Sequential Least-Squares -#' Quadratic Programming" and "Augmented Lagrangian". Alternatively, "\code{"MMA_AUGLAG"}, -#' short for "Methods of Moving Asymptotes". More details see the documentation of -#' NLopt \href{https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/}{here}. -#' @param scenario Character. Accepted options are: \code{"max_response"}, \code{"target_efficiency"}. -#' Scenario \code{"max_response"} answers the question: -#' "What's the potential revenue/conversions lift with the same (or custom) spend level -#' in \code{date_range} and what is the allocation and expected response mix?" -#' Scenario \code{"target_efficiency"} optimizes ROAS or CPA and answers the question: -#' "What's the potential revenue/conversions lift and spend levels based on a -#' \code{target_value} for CPA/ROAS and what is the allocation and expected response mix?" -#' Deprecated scenario: \code{"max_response_expected_spend"}. -#' @param total_budget Numeric. Total marketing budget for all paid channels for the -#' period in \code{date_range}. -#' @param target_value Numeric. When using the scenario \code{"target_efficiency"}, -#' target_value is the desired ROAS or CPA with no upper spend limit. Default is set to 80\% of -#' initial ROAS or 120\% of initial CPA, when \code{"target_value = NULL"}. 
-#' @param date_range Character. Date(s) to apply adstocked transformations and pick mean spends -#' per channel. Set one of: "all", "last", or "last_n" (where -#' n is the last N dates available), date (i.e. "2022-03-27"), or date range -#' (i.e. \code{c("2022-01-01", "2022-12-31")}). Default to "all". -#' @param channel_constr_low,channel_constr_up Numeric vectors. The lower and upper bounds -#' for each paid media variable when maximizing total media response. For example, -#' \code{channel_constr_low = 0.7} means minimum spend of the variable is 70% of historical -#' average, using non-zero spend values, within \code{date_min} and \code{date_max} date range. -#' Both constrains must be length 1 (same for all values) OR same length and order as -#' \code{paid_media_spends}. It's not recommended to 'exaggerate' upper bounds, especially -#' if the new level is way higher than historical level. Lower bound must be >=0.01, -#' and upper bound should be < 5. -#' @param channel_constr_multiplier Numeric. Default to 3. For example, if -#' \code{channel_constr_low} and \code{channel_constr_up} are 0.8 to 1.2, the range is 0.4. -#' The allocator will also show the optimum solution for a larger constraint range of -#' 0.4 x 3 = 1.2, or 0.4 to 1.6, to show the optimization potential to support allocation -#' interpretation and decision. -#' @param maxeval Integer. The maximum iteration of the global optimization algorithm. -#' Defaults to 100000. -#' @param constr_mode Character. Options are \code{"eq"} or \code{"ineq"}, -#' indicating constraints with equality or inequality. -#' @param plots Boolean. Generate plots? -#' @return A list object containing allocator result. -#' @examples -#' \dontrun{ -#' # Having InputCollect and OutputCollect results -#' AllocatorCollect <- robyn_allocator( -#' InputCollect = InputCollect, -#' OutputCollect = OutputCollect, -#' select_model = "1_2_3", -#' scenario = "max_response", -#' channel_constr_low = 0.7, -#' channel_constr_up = c(1.2, 1.5, 1.5, 1.5, 1.5), -#' channel_constr_multiplier = 4, -#' date_range = "last_26", -#' export = FALSE -#' ) -#' # Print a summary -#' print(AllocatorCollect) -#' # Plot the allocator one-pager -#' plot(AllocatorCollect) -#' } -#' @return List. Contains optimized allocation results and plots. -#' @export -robyn_allocator <- function(robyn_object = NULL, - select_build = 0, - InputCollect = NULL, - OutputCollect = NULL, - select_model = NULL, - json_file = NULL, - scenario = "max_response", - total_budget = NULL, - target_value = NULL, - date_range = "all", - channel_constr_low = NULL, - channel_constr_up = NULL, - channel_constr_multiplier = 3, - optim_algo = "SLSQP_AUGLAG", - maxeval = 100000, - constr_mode = "eq", - plots = TRUE, - plot_folder = NULL, - plot_folder_sub = NULL, - export = TRUE, - quiet = FALSE, - ui = FALSE, - ...) { - ### Use previously exported model using json_file - if (!is.null(json_file)) { - if (is.null(InputCollect)) { - InputCollect <- robyn_inputs( - json_file = json_file, quiet = TRUE, ... - ) - } - if (is.null(OutputCollect)) { - if (is.null(plot_folder)) { - json <- robyn_read(json_file, step = 2, quiet = TRUE) - plot_folder <- dirname(json$ExportedModel$plot_folder) - if (!is.null(plot_folder_sub)) plot_folder_sub <- NULL - } - OutputCollect <- robyn_run( - json_file = json_file, export = export, plot_folder = plot_folder, plot_folder_sub = plot_folder_sub, ... 
- ) - } - if (is.null(select_model)) select_model <- OutputCollect$selectID - } - - ## Collect inputs - # if (!is.null(robyn_object) && (is.null(InputCollect) && is.null(OutputCollect))) { - # if ("robyn_exported" %in% class(robyn_object)) { - # imported <- robyn_object - # robyn_object <- imported$robyn_object - # } else { - # imported <- robyn_load(robyn_object, select_build, quiet) - # } - # InputCollect <- imported$InputCollect - # OutputCollect <- imported$OutputCollect - # select_model <- imported$select_model - # } else { - if (is.null(select_model) && length(OutputCollect$allSolutions == 1)) { - select_model <- OutputCollect$allSolutions - } - if (any(is.null(InputCollect), is.null(OutputCollect), is.null(select_model))) { - stop("When 'robyn_object' is not provided, then InputCollect, OutputCollect, select_model must be provided") - } - # } - - if (length(InputCollect$paid_media_spends) <= 1) { - stop("Must have a valid model with at least two 'paid_media_spends'") - } - - if (!quiet) message(paste(">>> Running budget allocator for model ID", select_model, "...")) - - ## Set local data & params values - paid_media_spends <- InputCollect$paid_media_spends - media_order <- order(paid_media_spends) - mediaSpendSorted <- paid_media_spends[media_order] - dep_var_type <- InputCollect$dep_var_type - if (is.null(channel_constr_low)) { - channel_constr_low <- case_when( - scenario == "max_response" ~ 0.5, - scenario == "target_efficiency" ~ 0.1 - ) - } - if (is.null(channel_constr_up)) { - channel_constr_up <- case_when( - scenario == "max_response" ~ 2, - scenario == "target_efficiency" ~ 10 - ) - } - if (length(channel_constr_low) == 1) channel_constr_low <- rep(channel_constr_low, length(paid_media_spends)) - if (length(channel_constr_up) == 1) channel_constr_up <- rep(channel_constr_up, length(paid_media_spends)) - check_allocator_constrains(channel_constr_low, channel_constr_up) - names(channel_constr_low) <- paid_media_spends - names(channel_constr_up) <- paid_media_spends - channel_constr_low <- channel_constr_low[media_order] - channel_constr_up <- channel_constr_up[media_order] - dt_hyppar <- filter(OutputCollect$resultHypParam, .data$solID == select_model) - dt_bestCoef <- filter(OutputCollect$xDecompAgg, .data$solID == select_model, .data$rn %in% paid_media_spends) - - ## Check inputs and parameters - scenario <- check_allocator( - OutputCollect, select_model, paid_media_spends, scenario, - channel_constr_low, channel_constr_up, constr_mode - ) - - ## Sort media - dt_coef <- select(dt_bestCoef, .data$rn, .data$coef) - get_rn_order <- order(dt_bestCoef$rn) - dt_coefSorted <- dt_coef[get_rn_order, ] - dt_bestCoef <- dt_bestCoef[get_rn_order, ] - coefSelectorSorted <- dt_coefSorted$coef > 0 - names(coefSelectorSorted) <- dt_coefSorted$rn - - dt_hyppar <- select(dt_hyppar, hyper_names(InputCollect$adstock, mediaSpendSorted)) %>% - select(sort(colnames(.))) - dt_bestCoef <- dt_bestCoef[dt_bestCoef$rn %in% mediaSpendSorted, ] - channelConstrLowSorted <- channel_constr_low[mediaSpendSorted] - channelConstrUpSorted <- channel_constr_up[mediaSpendSorted] - - ## Get hill parameters for each channel - hills <- get_hill_params( - InputCollect, OutputCollect, dt_hyppar, dt_coef, mediaSpendSorted, select_model - ) - alphas <- hills$alphas - inflexions <- hills$inflexions - coefs_sorted <- hills$coefs_sorted - - # Spend values based on date range set - window_loc <- InputCollect$rollingWindowStartWhich:InputCollect$rollingWindowEndWhich - dt_optimCost <- slice(InputCollect$dt_mod, window_loc) 
- new_date_range <- check_metric_dates(date_range, dt_optimCost$ds, InputCollect$dayInterval, quiet = quiet, is_allocator = TRUE) - date_min <- head(new_date_range$date_range_updated, 1) - date_max <- tail(new_date_range$date_range_updated, 1) - check_daterange(date_min, date_max, dt_optimCost$ds) - if (is.null(date_min)) date_min <- min(dt_optimCost$ds) - if (is.null(date_max)) date_max <- max(dt_optimCost$ds) - if (date_min < min(dt_optimCost$ds)) date_min <- min(dt_optimCost$ds) - if (date_max > max(dt_optimCost$ds)) date_max <- max(dt_optimCost$ds) - histFiltered <- filter(dt_optimCost, .data$ds >= date_min & .data$ds <= date_max) - - histSpendAll <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSpendSorted)), sum)) - histSpendAllTotal <- sum(histSpendAll) - histSpendAllUnit <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSpendSorted)), mean)) - histSpendAllUnitTotal <- sum(histSpendAllUnit) - histSpendAllShare <- histSpendAllUnit / histSpendAllUnitTotal - - histSpendWindow <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), sum)) - histSpendWindowTotal <- sum(histSpendWindow) - initSpendUnit <- histSpendWindowUnit <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), mean)) - histSpendWindowUnitTotal <- sum(histSpendWindowUnit) - histSpendWindowShare <- histSpendWindowUnit / histSpendWindowUnitTotal - - simulation_period <- initial_mean_period <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), length)) - nDates <- lapply(mediaSpendSorted, function(x) histFiltered$ds) - names(nDates) <- mediaSpendSorted - if (!quiet) { - message(sprintf( - "Date Window: %s:%s (%s %ss)", - date_min, date_max, unique(initial_mean_period), InputCollect$intervalType - )) - } - zero_spend_channel <- names(histSpendWindow[histSpendWindow == 0]) - - initSpendUnitTotal <- sum(initSpendUnit) - initSpendShare <- initSpendUnit / initSpendUnitTotal - total_budget_unit <- ifelse(is.null(total_budget), initSpendUnitTotal, total_budget / unique(simulation_period)) - total_budget_window <- total_budget_unit * unique(simulation_period) - - ## Get use case based on inputs - usecase <- which_usecase(initSpendUnit[1], date_range) - if (usecase == "all_historical_vec") { - ndates_loc <- which(InputCollect$dt_mod$ds %in% histFiltered$ds) - } else { - ndates_loc <- seq_along(histFiltered$ds) - } - usecase <- paste(usecase, ifelse(!is.null(total_budget), "+ defined_budget", "+ historical_budget")) - - # Response values based on date range -> mean spend - initResponseUnit <- NULL - initResponseMargUnit <- NULL - hist_carryover <- list() - qa_carryover <- list() - for (i in seq_along(mediaSpendSorted)) { - resp <- robyn_response( - json_file = json_file, - # robyn_object = robyn_object, - select_build = select_build, - select_model = select_model, - metric_name = mediaSpendSorted[i], - # metric_value = initSpendUnit[i] * simulation_period[i], - # date_range = date_range, - dt_hyppar = OutputCollect$resultHypParam, - dt_coef = OutputCollect$xDecompAgg, - InputCollect = InputCollect, - OutputCollect = OutputCollect, - quiet = TRUE, - is_allocator = TRUE, - ... 
- ) - # val <- sort(resp$response_total)[round(length(resp$response_total) / 2)] - # histSpendUnit[i] <- resp$input_immediate[which(resp$response_total == val)] - hist_carryover_temp <- resp$input_carryover[window_loc] - qa_carryover[[i]] <- round(resp$input_total[window_loc]) - names(hist_carryover_temp) <- resp$date[window_loc] - hist_carryover[[i]] <- hist_carryover_temp - # get simulated response - # if (resp$input_immediate[1] == initSpendUnit[i]) { - # x_input <- initSpendUnit[i] - # } else { - # x_input <- mean(resp$input_immediate) - # } - x_input <- initSpendUnit[i] - resp_simulate <- fx_objective( - x = x_input, - coeff = coefs_sorted[[mediaSpendSorted[i]]], - alpha = alphas[[paste0(mediaSpendSorted[i], "_alphas")]], - inflexion = inflexions[[paste0(mediaSpendSorted[i], "_gammas")]], - x_hist_carryover = mean(hist_carryover_temp), - get_sum = FALSE - ) - resp_simulate_plus1 <- fx_objective( - x = x_input + 1, - coeff = coefs_sorted[[mediaSpendSorted[i]]], - alpha = alphas[[paste0(mediaSpendSorted[i], "_alphas")]], - inflexion = inflexions[[paste0(mediaSpendSorted[i], "_gammas")]], - x_hist_carryover = mean(hist_carryover_temp), - get_sum = FALSE - ) - initResponseUnit <- c(initResponseUnit, resp_simulate) - initResponseMargUnit <- c(initResponseMargUnit, resp_simulate_plus1 - resp_simulate) - } - qa_carryover <- do.call(cbind, qa_carryover) %>% as.data.frame() - names(initResponseUnit) <- names(hist_carryover) <- names(qa_carryover) <- mediaSpendSorted - # QA adstock: simulated adstock should be identical to model adstock - # qa_carryover_origin <- OutputCollect$mediaVecCollect %>% - # filter(.data$solID == select_model & .data$type == "adstockedMedia") %>% - # select(mediaSpendSorted) %>% slice(window_loc) %>% round %>% as.data.frame() - # identical(qa_carryover, qa_carryover_origin) - if (length(zero_spend_channel) > 0 && !quiet) { - message("Media variables with 0 spending during date range: ", v2t(zero_spend_channel)) - # hist_carryover[zero_spend_channel] <- 0 - } - - ## Set initial values and bounds - channelConstrLowSortedExt <- ifelse( - 1 - (1 - channelConstrLowSorted) * channel_constr_multiplier < 0, - 0, 1 - (1 - channelConstrLowSorted) * channel_constr_multiplier - ) - channelConstrUpSortedExt <- ifelse( - 1 + (channelConstrUpSorted - 1) * channel_constr_multiplier < 0, - channelConstrUpSorted * channel_constr_multiplier, - 1 + (channelConstrUpSorted - 1) * channel_constr_multiplier - ) - - target_value_ext <- target_value - if (scenario == "target_efficiency") { - channelConstrLowSortedExt <- channelConstrLowSorted - channelConstrUpSortedExt <- channelConstrUpSorted - if (dep_var_type == "conversion") { - if (is.null(target_value)) { - target_value <- sum(initSpendUnit) / sum(initResponseUnit) * 1.2 - } - target_value_ext <- target_value * 1.5 - } else { - if (is.null(target_value)) { - target_value <- sum(initResponseUnit) / sum(initSpendUnit) * 0.8 - } - target_value_ext <- 1 - } - } - temp_init <- temp_init_all <- initSpendUnit - # if no spend within window as initial spend, use historical average - if (length(zero_spend_channel) > 0) temp_init_all[zero_spend_channel] <- histSpendAllUnit[zero_spend_channel] - # Exclude channels with 0 coef from optimisation - temp_ub <- temp_ub_all <- channelConstrUpSorted - temp_lb <- temp_lb_all <- channelConstrLowSorted - temp_ub_ext <- temp_ub_ext_all <- channelConstrUpSortedExt - temp_lb_ext <- temp_lb_ext_all <- channelConstrLowSortedExt - - x0 <- x0_all <- lb <- lb_all <- temp_init_all * temp_lb_all - ub <- ub_all <- 
temp_init_all * temp_ub_all - x0_ext <- x0_ext_all <- lb_ext <- lb_ext_all <- temp_init_all * temp_lb_ext_all - ub_ext <- ub_ext_all <- temp_init_all * temp_ub_ext_all - - ## Exclude 0 coef and 0 constraint channels for the optimisation - skip_these <- (channel_constr_low == 0 & channel_constr_up == 0) - zero_constraint_channel <- mediaSpendSorted[skip_these] - if (any(skip_these) && !quiet) { - message( - "Excluded variables (constrained to 0): ", - paste(zero_constraint_channel, collapse = ", ") - ) - } - if (!all(coefSelectorSorted)) { - zero_coef_channel <- setdiff(names(coefSelectorSorted), mediaSpendSorted[coefSelectorSorted]) - if (!quiet) { - message( - "Excluded variables (coefficients are 0): ", - paste(zero_coef_channel, collapse = ", ") - ) - } - } else { - zero_coef_channel <- as.character() - } - channel_to_drop_loc <- mediaSpendSorted %in% c(zero_coef_channel, zero_constraint_channel) - channel_for_allocation <- mediaSpendSorted[!channel_to_drop_loc] - if (any(channel_to_drop_loc)) { - temp_init <- temp_init_all[channel_for_allocation] - temp_ub <- temp_ub_all[channel_for_allocation] - temp_lb <- temp_lb_all[channel_for_allocation] - temp_ub_ext <- temp_ub_ext_all[channel_for_allocation] - temp_lb_ext <- temp_lb_ext_all[channel_for_allocation] - x0 <- x0_all[channel_for_allocation] - lb <- lb_all[channel_for_allocation] - ub <- ub_all[channel_for_allocation] - x0_ext <- x0_ext_all[channel_for_allocation] - lb_ext <- lb_ext_all[channel_for_allocation] - ub_ext <- ub_ext_all[channel_for_allocation] - } - - x0 <- lb <- temp_init * temp_lb - ub <- temp_init * temp_ub - x0_ext <- lb_ext <- temp_init * temp_lb_ext - ub_ext <- temp_init * temp_ub_ext - - # Gather all values that will be used internally on optim (nloptr) - coefs_eval <- coefs_sorted[channel_for_allocation] - alphas_eval <- alphas[paste0(channel_for_allocation, "_alphas")] - inflexions_eval <- inflexions[paste0(channel_for_allocation, "_gammas")] - hist_carryover_eval <- hist_carryover[channel_for_allocation] - - eval_list <- list( - coefs_eval = coefs_eval, - alphas_eval = alphas_eval, - inflexions_eval = inflexions_eval, - # mediaSpendSortedFiltered = mediaSpendSorted, - total_budget = total_budget, - total_budget_unit = total_budget_unit, - hist_carryover_eval = hist_carryover_eval, - target_value = target_value, - target_value_ext = target_value_ext, - dep_var_type = dep_var_type - ) - # So we can implicitly use these values within eval_f() - options("ROBYN_TEMP" = eval_list) - - ## Set optim options - if (optim_algo == "MMA_AUGLAG") { - local_opts <- list( - "algorithm" = "NLOPT_LD_MMA", - "xtol_rel" = 1.0e-10 - ) - } else if (optim_algo == "SLSQP_AUGLAG") { - local_opts <- list( - "algorithm" = "NLOPT_LD_SLSQP", - "xtol_rel" = 1.0e-10 - ) - } - - ## Run optim - x_hist_carryover <- unlist(lapply(hist_carryover_eval, mean)) - if (scenario == "max_response") { - ## bounded optimisation - nlsMod <- nloptr::nloptr( - x0 = x0, - eval_f = eval_f, - eval_g_eq = if (constr_mode == "eq") eval_g_eq else NULL, - eval_g_ineq = if (constr_mode == "ineq") eval_g_ineq else NULL, - lb = lb, ub = ub, - opts = list( - "algorithm" = "NLOPT_LD_AUGLAG", - "xtol_rel" = 1.0e-10, - "maxeval" = maxeval, - "local_opts" = local_opts - ), - target_value = NULL - ) - ## unbounded optimisation - nlsModUnbound <- nloptr::nloptr( - x0 = x0_ext, - eval_f = eval_f, - eval_g_eq = if (constr_mode == "eq") eval_g_eq else NULL, - eval_g_ineq = if (constr_mode == "ineq") eval_g_ineq else NULL, - lb = lb_ext, ub = ub_ext, - opts = list( - "algorithm" 
= "NLOPT_LD_AUGLAG", - "xtol_rel" = 1.0e-10, - "maxeval" = maxeval, - "local_opts" = local_opts - ), - target_value = NULL - ) - } - - if (scenario == "target_efficiency") { - ## bounded optimisation - nlsMod <- nloptr::nloptr( - x0 = x0, - eval_f = eval_f, - eval_g_eq = if (constr_mode == "eq") eval_g_eq_effi else NULL, - eval_g_ineq = if (constr_mode == "ineq") eval_g_eq_effi else NULL, - lb = lb, - ub = x0 * channel_constr_up[1], # Large enough, but not infinite (customizable) - opts = list( - "algorithm" = "NLOPT_LD_AUGLAG", - "xtol_rel" = 1.0e-10, - "maxeval" = maxeval, - "local_opts" = local_opts - ), - target_value = target_value - ) - ## unbounded optimisation - nlsModUnbound <- nloptr::nloptr( - x0 = x0, - eval_f = eval_f, - eval_g_eq = if (constr_mode == "eq") eval_g_eq_effi else NULL, - eval_g_ineq = if (constr_mode == "ineq") eval_g_eq_effi else NULL, - lb = lb, - ub = x0 * channel_constr_up[1], # Large enough, but not infinite (customizable) - opts = list( - "algorithm" = "NLOPT_LD_AUGLAG", - "xtol_rel" = 1.0e-10, - "maxeval" = maxeval, - "local_opts" = local_opts - ), - target_value = target_value_ext - ) - } - - ## get marginal - optmSpendUnit <- nlsMod$solution - optmResponseUnit <- -eval_f(optmSpendUnit)[["objective.channel"]] - optmSpendUnitUnbound <- nlsModUnbound$solution - optmResponseUnitUnbound <- -eval_f(optmSpendUnitUnbound)[["objective.channel"]] - - optmResponseMargUnit <- mapply( - fx_objective, - x = optmSpendUnit + 1, - coeff = coefs_eval, - alpha = alphas_eval, - inflexion = inflexions_eval, - x_hist_carryover = x_hist_carryover, - get_sum = FALSE, - SIMPLIFY = TRUE - ) - optmResponseUnit - optmResponseMargUnitUnbound <- mapply( - fx_objective, - x = optmSpendUnitUnbound + 1, - coeff = coefs_eval, - alpha = alphas_eval, - inflexion = inflexions_eval, - x_hist_carryover = x_hist_carryover, - get_sum = FALSE, - SIMPLIFY = TRUE - ) - optmResponseUnitUnbound - - ## Collect output - names(optmSpendUnit) <- names(optmResponseUnit) <- names(optmResponseMargUnit) <- - names(optmSpendUnitUnbound) <- names(optmResponseUnitUnbound) <- - names(optmResponseMargUnitUnbound) <- channel_for_allocation - mediaSpendSorted %in% names(optmSpendUnit) - optmSpendUnitOut <- optmResponseUnitOut <- optmResponseMargUnitOut <- - optmSpendUnitUnboundOut <- optmResponseUnitUnboundOut <- - optmResponseMargUnitUnboundOut <- initSpendUnit - optmSpendUnitOut[channel_to_drop_loc] <- - optmResponseUnitOut[channel_to_drop_loc] <- - optmResponseMargUnitOut[channel_to_drop_loc] <- - optmSpendUnitUnboundOut[channel_to_drop_loc] <- - optmResponseUnitUnboundOut[channel_to_drop_loc] <- - optmResponseMargUnitUnboundOut[channel_to_drop_loc] <- 0 - optmSpendUnitOut[!channel_to_drop_loc] <- optmSpendUnit - optmResponseUnitOut[!channel_to_drop_loc] <- optmResponseUnit - optmResponseMargUnitOut[!channel_to_drop_loc] <- optmResponseMargUnit - optmSpendUnitUnboundOut[!channel_to_drop_loc] <- optmSpendUnitUnbound - optmResponseUnitUnboundOut[!channel_to_drop_loc] <- optmResponseUnitUnbound - optmResponseMargUnitUnboundOut[!channel_to_drop_loc] <- optmResponseMargUnitUnbound - - dt_optimOut <- data.frame( - solID = select_model, - dep_var_type = dep_var_type, - channels = mediaSpendSorted, - date_min = date_min, - date_max = date_max, - periods = sprintf("%s %ss", initial_mean_period, InputCollect$intervalType), - constr_low = temp_lb_all, - constr_low_abs = lb_all, - constr_up = temp_ub_all, - constr_up_abs = ub_all, - unconstr_mult = channel_constr_multiplier, - constr_low_unb = temp_lb_ext_all, - 
constr_low_unb_abs = lb_ext_all, - constr_up_unb = temp_ub_ext_all, - constr_up_unb_abs = ub_ext_all, - # Historical spends - histSpendAll = histSpendAll, - histSpendAllTotal = histSpendAllTotal, - histSpendAllUnit = histSpendAllUnit, - histSpendAllUnitTotal = histSpendAllUnitTotal, - histSpendAllShare = histSpendAllShare, - histSpendWindow = histSpendWindow, - histSpendWindowTotal = histSpendWindowTotal, - histSpendWindowUnit = histSpendWindowUnit, - histSpendWindowUnitTotal = histSpendWindowUnitTotal, - histSpendWindowShare = histSpendWindowShare, - # Initial spends for allocation - initSpendUnit = initSpendUnit, - initSpendUnitTotal = initSpendUnitTotal, - initSpendShare = initSpendShare, - initSpendTotal = initSpendUnitTotal * unique(simulation_period), - # initSpendUnitRaw = histSpendUnitRaw, - # adstocked = adstocked, - # adstocked_start_date = as.Date(ifelse(adstocked, head(resp$date, 1), NA), origin = "1970-01-01"), - # adstocked_end_date = as.Date(ifelse(adstocked, tail(resp$date, 1), NA), origin = "1970-01-01"), - # adstocked_periods = length(resp$date), - initResponseUnit = initResponseUnit, - initResponseUnitTotal = sum(initResponseUnit), - initResponseMargUnit = initResponseMargUnit, - initResponseTotal = sum(initResponseUnit) * unique(simulation_period), - initResponseUnitShare = initResponseUnit / sum(initResponseUnit), - initRoiUnit = initResponseUnit / initSpendUnit, - initCpaUnit = initSpendUnit / initResponseUnit, - # Budget change - total_budget_unit = total_budget_unit, - total_budget_unit_delta = total_budget_unit / initSpendUnitTotal - 1, - # Optimized - optmSpendUnit = optmSpendUnitOut, - optmSpendUnitDelta = (optmSpendUnitOut / initSpendUnit - 1), - optmSpendUnitTotal = sum(optmSpendUnitOut), - optmSpendUnitTotalDelta = sum(optmSpendUnitOut) / initSpendUnitTotal - 1, - optmSpendShareUnit = optmSpendUnitOut / sum(optmSpendUnitOut), - optmSpendTotal = sum(optmSpendUnitOut) * unique(simulation_period), - optmSpendUnitUnbound = optmSpendUnitUnboundOut, - optmSpendUnitDeltaUnbound = (optmSpendUnitUnboundOut / initSpendUnit - 1), - optmSpendUnitTotalUnbound = sum(optmSpendUnitUnboundOut), - optmSpendUnitTotalDeltaUnbound = sum(optmSpendUnitUnboundOut) / initSpendUnitTotal - 1, - optmSpendShareUnitUnbound = optmSpendUnitUnboundOut / sum(optmSpendUnitUnboundOut), - optmSpendTotalUnbound = sum(optmSpendUnitUnboundOut) * unique(simulation_period), - optmResponseUnit = optmResponseUnitOut, - optmResponseMargUnit = optmResponseMargUnitOut, - optmResponseUnitTotal = sum(optmResponseUnitOut), - optmResponseTotal = sum(optmResponseUnitOut) * unique(simulation_period), - optmResponseUnitShare = optmResponseUnitOut / sum(optmResponseUnitOut), - optmRoiUnit = optmResponseUnitOut / optmSpendUnitOut, - optmCpaUnit = optmSpendUnitOut / optmResponseUnitOut, - optmResponseUnitLift = (optmResponseUnitOut / initResponseUnit) - 1, - optmResponseUnitUnbound = optmResponseUnitUnboundOut, - optmResponseMargUnitUnbound = optmResponseMargUnitUnboundOut, - optmResponseUnitTotalUnbound = sum(optmResponseUnitUnboundOut), - optmResponseTotalUnbound = sum(optmResponseUnitUnboundOut) * unique(simulation_period), - optmResponseUnitShareUnbound = optmResponseUnitUnboundOut / sum(optmResponseUnitUnboundOut), - optmRoiUnitUnbound = optmResponseUnitUnboundOut / optmSpendUnitUnboundOut, - optmCpaUnitUnbound = optmSpendUnitUnboundOut / optmResponseUnitUnboundOut, - optmResponseUnitLiftUnbound = (optmResponseUnitUnboundOut / initResponseUnit) - 1 - ) %>% - mutate( - optmResponseUnitTotalLift = 
(.data$optmResponseUnitTotal / .data$initResponseUnitTotal) - 1, - optmResponseUnitTotalLiftUnbound = (.data$optmResponseUnitTotalUnbound / .data$initResponseUnitTotal) - 1 - ) - .Options$ROBYN_TEMP <- NULL # Clean auxiliary method - - ## Calculate curves and main points for each channel - if (scenario == "max_response") { - levs1 <- c("Initial", "Bounded", paste0("Bounded x", channel_constr_multiplier)) - } else if (scenario == "target_efficiency") { - if (dep_var_type == "revenue") { - levs1 <- c( - "Initial", paste0("Hit ROAS ", round(target_value, 2)), - paste0("Hit ROAS ", target_value_ext) - ) - } else { - levs1 <- c( - "Initial", paste0("Hit CPA ", round(target_value, 2)), - paste0("Hit CPA ", round(target_value_ext, 2)) - ) - } - } - eval_list$levs1 <- levs1 - - dt_optimOutScurve <- rbind( - select(dt_optimOut, .data$channels, .data$initSpendUnit, .data$initResponseUnit) %>% - mutate(x = levs1[1]) %>% as.matrix(), - select(dt_optimOut, .data$channels, .data$optmSpendUnit, .data$optmResponseUnit) %>% - mutate(x = levs1[2]) %>% as.matrix(), - select(dt_optimOut, .data$channels, .data$optmSpendUnitUnbound, .data$optmResponseUnitUnbound) %>% - mutate(x = levs1[3]) %>% as.matrix() - ) %>% - `colnames<-`(c("channels", "spend", "response", "type")) %>% - rbind(data.frame(channels = dt_optimOut$channels, spend = 0, response = 0, type = "Carryover")) %>% - mutate(spend = as.numeric(.data$spend), response = as.numeric(.data$response)) %>% - group_by(.data$channels) - - plotDT_scurve <- list() - for (i in channel_for_allocation) { # i <- channels[i] - carryover_vec <- eval_list$hist_carryover_eval[[i]] - dt_optimOutScurve <- dt_optimOutScurve %>% - mutate(spend = ifelse( - .data$channels == i & .data$type %in% levs1, - .data$spend + mean(carryover_vec), ifelse( - .data$channels == i & .data$type == "Carryover", - mean(carryover_vec), .data$spend - ) - )) - get_max_x <- max(filter(dt_optimOutScurve, .data$channels == i)$spend) * 1.5 - simulate_spend <- seq(0, get_max_x, length.out = 100) - simulate_response <- fx_objective( - x = simulate_spend, - coeff = eval_list$coefs_eval[[i]], - alpha = eval_list$alphas_eval[[paste0(i, "_alphas")]], - inflexion = eval_list$inflexions_eval[[paste0(i, "_gammas")]], - x_hist_carryover = 0, - get_sum = FALSE - ) - simulate_response_carryover <- fx_objective( - x = mean(carryover_vec), - coeff = eval_list$coefs_eval[[i]], - alpha = eval_list$alphas_eval[[paste0(i, "_alphas")]], - inflexion = eval_list$inflexions_eval[[paste0(i, "_gammas")]], - x_hist_carryover = 0, - get_sum = FALSE - ) - plotDT_scurve[[i]] <- data.frame( - channel = i, spend = simulate_spend, - mean_carryover = mean(carryover_vec), - carryover_response = simulate_response_carryover, - total_response = simulate_response - ) - dt_optimOutScurve <- dt_optimOutScurve %>% - mutate(response = ifelse( - .data$channels == i & .data$type == "Carryover", - simulate_response_carryover, .data$response - )) - } - eval_list[["plotDT_scurve"]] <- plotDT_scurve <- as_tibble(bind_rows(plotDT_scurve)) - mainPoints <- dt_optimOutScurve %>% - rename("response_point" = "response", "spend_point" = "spend", "channel" = "channels") - temp_caov <- mainPoints %>% filter(.data$type == "Carryover") - mainPoints$mean_spend <- mainPoints$spend_point - temp_caov$spend_point - mainPoints$mean_spend <- ifelse(mainPoints$type == "Carryover", mainPoints$spend_point, mainPoints$mean_spend) - if (levs1[2] == levs1[3]) levs1[3] <- paste0(levs1[3], ".") - mainPoints$type <- factor(mainPoints$type, levels = c("Carryover", levs1)) - 
mainPoints$roi_mean <- mainPoints$response_point / mainPoints$mean_spend - mresp_caov <- filter(mainPoints, .data$type == "Carryover")$response_point - mresp_init <- filter(mainPoints, .data$type == levels(mainPoints$type)[2])$response_point - mresp_caov - mresp_b <- filter(mainPoints, .data$type == levels(mainPoints$type)[3])$response_point - mresp_caov - mresp_unb <- filter(mainPoints, .data$type == levels(mainPoints$type)[4])$response_point - mresp_caov - mainPoints$marginal_response <- c(mresp_init, mresp_b, mresp_unb, rep(0, length(mresp_init))) - mainPoints$roi_marginal <- mainPoints$marginal_response / mainPoints$mean_spend - mainPoints$cpa_marginal <- mainPoints$mean_spend / mainPoints$marginal_response - eval_list[["mainPoints"]] <- mainPoints - - # Exporting directory - if (export) { - if (is.null(json_file) & !is.null(plot_folder)) { - if (is.null(plot_folder_sub)) plot_folder_sub <- basename(OutputCollect$plot_folder) - plot_folder <- gsub("//+", "/", paste0(plot_folder, "/", plot_folder_sub, "/")) - } else { - plot_folder <- gsub("//+", "/", paste0(OutputCollect$plot_folder, "/")) - } - - # if (!is.null(json_file)) { - # plot_folder <- gsub("//+", "/", paste0(OutputCollect$plot_folder, "/")) - # } else if (is.null(json_file) & is.null(plot_folder) & is.null(plot_folder_sub)) { - # plot_folder <- gsub("//+", "/", paste0(OutputCollect$plot_folder, "/")) - # } else { - # if (is.null(plot_folder_sub)) plot_folder_sub <- basename(OutputCollect$plot_folder) - # plot_folder <- gsub("//+", "/", paste0(plot_folder, "/", plot_folder_sub, "/")) - # } - if (!dir.exists(plot_folder)) { - message("Creating directory for allocator: ", plot_folder) - dir.create(plot_folder) - } - ## Export results into CSV - export_dt_optimOut <- dt_optimOut - if (dep_var_type == "conversion") { - colnames(export_dt_optimOut) <- gsub("Roi", "CPA", colnames(export_dt_optimOut)) - } - write.csv(export_dt_optimOut, paste0(plot_folder, select_model, "_", scenario, "_reallocated.csv")) - } - - ## Plot allocator results - if (plots) { - plots <- allocation_plots( - InputCollect, OutputCollect, - dt_optimOut, - # filter(dt_optimOut, .data$channels %in% channel_for_allocation), - select_model, scenario, eval_list, - export, plot_folder, quiet - ) - } else { - plots <- NULL - } - - output <- list( - dt_optimOut = dt_optimOut, - mainPoints = mainPoints, - nlsMod = nlsMod, - plots = plots, - scenario = scenario, - usecase = usecase, - total_budget = ifelse(is.null(total_budget), total_budget_window, total_budget), - skipped_coef0 = zero_coef_channel, - skipped_constr = zero_constraint_channel, - no_spend = zero_spend_channel, - ui = if (ui) plots else NULL - ) - - class(output) <- c("robyn_allocator", class(output)) - return(output) -} - -#' @rdname robyn_allocator -#' @aliases robyn_allocator -#' @param x \code{robyn_allocator()} output. -#' @export -print.robyn_allocator <- function(x, ...) 
{ - temp <- x$dt_optimOut[!is.nan(x$dt_optimOut$optmRoiUnit), ] - coef0 <- if (length(x$skipped_coef0) > 0) paste("Coefficient 0:", v2t(x$skipped_coef0, quotes = FALSE)) else NULL - constr <- if (length(x$skipped_constr) > 0) paste("Constrained @0:", v2t(x$skipped_constr, quotes = FALSE)) else NULL - nospend <- if (length(x$no_spend) > 0) paste("Spend = 0:", v2t(x$no_spend, quotes = FALSE)) else NULL - media_skipped <- paste(c(coef0, constr, nospend), collapse = " | ") - media_skipped <- ifelse(is.null(media_skipped), "None", media_skipped) - - print(glued( - " -Model ID: {x$dt_optimOut$solID[1]} -Scenario: {x$scenario} -Use case: {x$usecase} -Window: {x$dt_optimOut$date_min[1]}:{x$dt_optimOut$date_max[1]} ({x$dt_optimOut$periods[1]}) - -Dep. Variable Type: {temp$dep_var_type[1]} -Media Skipped: {media_skipped} -Relative Spend Increase: {spend_increase_p}% ({spend_increase}) -Total Response Increase (Optimized): {signif(100 * x$dt_optimOut$optmResponseUnitTotalLift[1], 3)}% - -Allocation Summary: - {summary} -", - spend_increase_p = num_abbr(100 * x$dt_optimOut$optmSpendUnitTotalDelta[1], 3), - spend_increase = formatNum( - sum(x$dt_optimOut$optmSpendUnitTotal) - sum(x$dt_optimOut$initSpendUnitTotal), - abbr = TRUE, sign = TRUE - ), - summary = paste(sprintf( - " -- %s: - Optimizable bound: [%s%%, %s%%], - Initial spend share: %s%% -> Optimized bounded: %s%% - Initial response share: %s%% -> Optimized bounded: %s%% - Initial abs. mean spend: %s -> Optimized: %s [Delta = %s%%]", - temp$channels, - 100 * temp$constr_low - 100, - 100 * temp$constr_up - 100, - signif(100 * temp$initSpendShare, 3), - signif(100 * temp$optmSpendShareUnit, 3), - signif(100 * temp$initResponseUnitShare, 3), - signif(100 * temp$optmResponseUnitShare, 3), - formatNum(temp$initSpendUnit, 3, abbr = TRUE), - formatNum(temp$optmSpendUnit, 3, abbr = TRUE), - formatNum(100 * temp$optmSpendUnitDelta, signif = 2) - ), collapse = "\n ") - )) -} - -#' @rdname robyn_allocator -#' @aliases robyn_allocator -#' @param x \code{robyn_allocator()} output. -#' @export -plot.robyn_allocator <- function(x, ...) plot(x$plots$plots, ...) 
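The robyn_allocator() body above hands nloptr an objective with analytical gradients (eval_f / fx_objective / fx_gradient, defined below) plus a budget constraint with its jacobian (eval_g_eq), running NLOPT_LD_AUGLAG with an SLSQP local optimizer. Below is a self-contained toy sketch of that same pattern; the channel names, coefficients, and Hill hyperparameters are made up for illustration and are not taken from any real model:

library(nloptr)

# Hypothetical per-channel Hill parameters. In the package, inflexions are
# derived from the adstocked media range as (1 - gamma) * min + gamma * max
# (see get_hill_params below); here they are simply invented.
coefs  <- c(tv = 1.5, search = 2.0)
alphas <- c(tv = 2.0, search = 1.5)
inflex <- c(tv = 120, search = 80)
budget <- 200
x0     <- c(100, 100)  # initial unit spends per channel

hill <- function(x) coefs * (1 + inflex^alphas / x^alphas)^-1
hill_grad <- function(x) {
  coefs * alphas * inflex^alphas * x^(alphas - 1) / (x^alphas + inflex^alphas)^2
}

res <- nloptr(
  x0 = x0,
  # nloptr minimizes, so negate response and gradient to maximize
  eval_f = function(X) list(objective = -sum(hill(X)), gradient = -hill_grad(X)),
  # equality constraint: total spend stays at the budget (constr_mode = "eq")
  eval_g_eq = function(X) list(constraints = sum(X) - budget,
                               jacobian = rep(1, length(X))),
  lb = 0.5 * x0, ub = 2 * x0,  # per-channel bounds around initial spend
  opts = list(
    algorithm = "NLOPT_LD_AUGLAG", xtol_rel = 1e-10, maxeval = 1e5,
    local_opts = list(algorithm = "NLOPT_LD_SLSQP", xtol_rel = 1e-10)
  )
)
round(res$solution, 2)  # reallocated spends summing to `budget`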
- -eval_f <- function(X, target_value) { - # eval_list <- get("eval_list", pos = as.environment(-1)) - eval_list <- getOption("ROBYN_TEMP") - coefs_eval <- eval_list[["coefs_eval"]] - alphas_eval <- eval_list[["alphas_eval"]] - inflexions_eval <- eval_list[["inflexions_eval"]] - # mediaSpendSortedFiltered <- eval_list[["mediaSpendSortedFiltered"]] - hist_carryover_eval <- eval_list[["hist_carryover_eval"]] - - objective <- -sum(mapply( - fx_objective, - x = X, - coeff = coefs_eval, - alpha = alphas_eval, - inflexion = inflexions_eval, - x_hist_carryover = hist_carryover_eval, - SIMPLIFY = TRUE - )) - - gradient <- c(mapply( - fx_gradient, - x = X, - coeff = coefs_eval, - alpha = alphas_eval, - inflexion = inflexions_eval, - x_hist_carryover = hist_carryover_eval, - SIMPLIFY = TRUE - )) - - objective.channel <- mapply( - fx_objective.chanel, - x = X, - coeff = coefs_eval, - alpha = alphas_eval, - inflexion = inflexions_eval, - x_hist_carryover = hist_carryover_eval, - SIMPLIFY = TRUE - ) - - optm <- list(objective = objective, gradient = gradient, objective.channel = objective.channel) - return(optm) -} - -fx_objective <- function(x, coeff, alpha, inflexion, x_hist_carryover, get_sum = TRUE) { - # Apply Michaelis Menten model to scale spend to exposure - # if (criteria) { - # xScaled <- mic_men(x = x, Vmax = vmax, Km = km) # vmax * x / (km + x) - # } else if (chnName %in% names(mm_lm_coefs)) { - # xScaled <- x * mm_lm_coefs[chnName] - # } else { - # xScaled <- x - # } - - # Adstock scales - xAdstocked <- x + mean(x_hist_carryover) - # Hill transformation - if (get_sum) { - xOut <- coeff * sum((1 + inflexion**alpha / xAdstocked**alpha)**-1) - } else { - xOut <- coeff * ((1 + inflexion**alpha / xAdstocked**alpha)**-1) - } - return(xOut) -} - -# https://www.derivative-calculator.net/ on the objective function 1/(1+gamma^alpha / x^alpha) -fx_gradient <- function(x, coeff, alpha, inflexion, x_hist_carryover - # , chnName, vmax, km, criteria -) { - # Apply Michaelis Menten model to scale spend to exposure - # if (criteria) { - # xScaled <- mic_men(x = x, Vmax = vmax, Km = km) # vmax * x / (km + x) - # } else if (chnName %in% names(mm_lm_coefs)) { - # xScaled <- x * mm_lm_coefs[chnName] - # } else { - # xScaled <- x - # } - - # Adstock scales - xAdstocked <- x + mean(x_hist_carryover) - xOut <- -coeff * sum((alpha * (inflexion**alpha) * (xAdstocked**(alpha - 1))) / (xAdstocked**alpha + inflexion**alpha)**2) - return(xOut) -} - -fx_objective.chanel <- function(x, coeff, alpha, inflexion, x_hist_carryover - # , chnName, vmax, km, criteria -) { - # Apply Michaelis Menten model to scale spend to exposure - # if (criteria) { - # xScaled <- mic_men(x = x, Vmax = vmax, Km = km) # vmax * x / (km + x) - # } else if (chnName %in% names(mm_lm_coefs)) { - # xScaled <- x * mm_lm_coefs[chnName] - # } else { - # xScaled <- x - # } - - # Adstock scales - xAdstocked <- x + mean(x_hist_carryover) - xOut <- -coeff * sum((1 + inflexion**alpha / xAdstocked**alpha)**-1) - return(xOut) -} - -eval_g_eq <- function(X, target_value) { - eval_list <- getOption("ROBYN_TEMP") - constr <- sum(X) - eval_list$total_budget_unit - grad <- rep(1, length(X)) - return(list( - "constraints" = constr, - "jacobian" = grad - )) -} - -eval_g_ineq <- function(X, target_value) { - eval_list <- getOption("ROBYN_TEMP") - constr <- sum(X) - eval_list$total_budget_unit - grad <- rep(1, length(X)) - return(list( - "constraints" = constr, - "jacobian" = grad - )) -} - -eval_g_eq_effi <- function(X, target_value) { - eval_list <- 
getOption("ROBYN_TEMP") - sum_response <- sum(mapply( - fx_objective, - x = X, - coeff = eval_list$coefs_eval, - alpha = eval_list$alphas_eval, - inflexion = eval_list$inflexions_eval, - x_hist_carryover = eval_list$hist_carryover_eval, - SIMPLIFY = TRUE - )) - - if (is.null(target_value)) { - if (eval_list$dep_var_type == "conversion") { - constr <- sum(X) - sum_response * eval_list$target_value - } else { - constr <- sum(X) - sum_response / eval_list$target_value - } - } else { - if (eval_list$dep_var_type == "conversion") { - constr <- sum(X) - sum_response * target_value - } else { - constr <- sum(X) - sum_response / target_value - } - } - - grad <- rep(1, length(X)) - mapply( - fx_gradient, - x = X, - coeff = eval_list$coefs_eval, - alpha = eval_list$alphas_eval, - inflexion = eval_list$inflexions_eval, - x_hist_carryover = eval_list$hist_carryover_eval, - SIMPLIFY = TRUE - ) - - # constr <- sum(X) - eval_list$total_budget_unit - # grad <- rep(1, length(X)) - return(list( - "constraints" = constr, - "jacobian" = grad - )) -} - - -get_adstock_params <- function(InputCollect, dt_hyppar) { - if (InputCollect$adstock == "geometric") { - getAdstockHypPar <- unlist(select(dt_hyppar, na.omit(str_extract(names(dt_hyppar), ".*_thetas")))) - } else if (InputCollect$adstock %in% c("weibull_cdf", "weibull_pdf")) { - getAdstockHypPar <- unlist(select(dt_hyppar, na.omit(str_extract(names(dt_hyppar), ".*_shapes|.*_scales")))) - } - return(getAdstockHypPar) -} - -get_hill_params <- function(InputCollect, OutputCollect = NULL, dt_hyppar, dt_coef, mediaSpendSorted, select_model, chnAdstocked = NULL) { - hillHypParVec <- unlist(select(dt_hyppar, na.omit(str_extract(names(dt_hyppar), ".*_alphas|.*_gammas")))) - alphas <- hillHypParVec[paste0(mediaSpendSorted, "_alphas")] - gammas <- hillHypParVec[paste0(mediaSpendSorted, "_gammas")] - if (is.null(chnAdstocked)) { - chnAdstocked <- filter( - OutputCollect$mediaVecCollect, - .data$type == "adstockedMedia", - .data$solID == select_model - ) %>% - select(all_of(mediaSpendSorted)) %>% - slice(InputCollect$rollingWindowStartWhich:InputCollect$rollingWindowEndWhich) - } - inflexions <- unlist(lapply(seq(ncol(chnAdstocked)), function(i) { - c(range(chnAdstocked[, i]) %*% c(1 - gammas[i], gammas[i])) - })) - names(inflexions) <- names(gammas) - coefs <- dt_coef$coef - names(coefs) <- dt_coef$rn - coefs_sorted <- coefs[mediaSpendSorted] - return(list( - alphas = alphas, - inflexions = inflexions, - coefs_sorted = coefs_sorted - )) -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#################################################################### +#' Budget Allocator +#' +#' \code{robyn_allocator()} function returns a new split of media +#' variable spends that maximizes the total media response. +#' +#' @inheritParams robyn_run +#' @inheritParams robyn_outputs +#' @param robyn_object Character or List. Path of the \code{Robyn.RDS} object +#' that contains all previous modeling information or the imported list. +#' @param select_build Integer. Default to the latest model build. \code{select_build = 0} +#' selects the initial model. \code{select_build = 1} selects the first refresh model. +#' @param InputCollect List. Contains all input parameters for the model. +#' Required when \code{robyn_object} is not provided. +#' @param OutputCollect List. Containing all model result. 
+#' Required when \code{robyn_object} is not provided.
+#' @param select_model Character. A model \code{SolID}. When \code{robyn_object}
+#' is provided, \code{select_model} defaults to the already selected \code{SolID}. When
+#' \code{robyn_object} is not provided, \code{select_model} must be provided with
+#' \code{InputCollect} and \code{OutputCollect}, and must be one of
+#' \code{OutputCollect$allSolutions}.
+#' @param optim_algo Character. Defaults to \code{"SLSQP_AUGLAG"}, short for "Sequential Least-Squares
+#' Quadratic Programming" and "Augmented Lagrangian". Alternatively, \code{"MMA_AUGLAG"},
+#' short for "Method of Moving Asymptotes". For more details, see the NLopt
+#' documentation \href{https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/}{here}.
+#' @param scenario Character. Accepted options are: \code{"max_response"}, \code{"target_efficiency"}.
+#' Scenario \code{"max_response"} answers the question:
+#' "What's the potential revenue/conversions lift with the same (or custom) spend level
+#' in \code{date_range} and what is the allocation and expected response mix?"
+#' Scenario \code{"target_efficiency"} optimizes ROAS or CPA and answers the question:
+#' "What's the potential revenue/conversions lift and spend levels based on a
+#' \code{target_value} for CPA/ROAS and what is the allocation and expected response mix?"
+#' Deprecated scenario: \code{"max_response_expected_spend"}.
+#' @param total_budget Numeric. Total marketing budget for all paid channels for the
+#' period in \code{date_range}.
+#' @param target_value Numeric. When using the scenario \code{"target_efficiency"},
+#' \code{target_value} is the desired ROAS or CPA with no upper spend limit. When
+#' \code{target_value = NULL}, it defaults to 80\% of the initial ROAS or 120\% of
+#' the initial CPA.
+#' @param date_range Character. Date(s) to apply adstocked transformations and pick mean spends
+#' per channel. Set one of: "all", "last", or "last_n" (where
+#' n is the last N dates available), a date (i.e. "2022-03-27"), or a date range
+#' (i.e. \code{c("2022-01-01", "2022-12-31")}). Defaults to "all".
+#' @param channel_constr_low,channel_constr_up Numeric vectors. The lower and upper bounds
+#' for each paid media variable when maximizing total media response. For example,
+#' \code{channel_constr_low = 0.7} means the minimum spend of the variable is 70\% of its historical
+#' average, using non-zero spend values, within the \code{date_min} and \code{date_max} date range.
+#' Both constraints must be of length 1 (same for all values) OR the same length and order as
+#' \code{paid_media_spends}. It's not recommended to 'exaggerate' upper bounds, especially
+#' if the new level is much higher than the historical level. The lower bound must be >= 0.01,
+#' and the upper bound should be < 5.
+#' @param channel_constr_multiplier Numeric. Defaults to 3. For example, if
+#' \code{channel_constr_low} and \code{channel_constr_up} are 0.8 and 1.2, the range is 0.4.
+#' The allocator will also show the optimum solution for a constraint range enlarged to
+#' 0.4 x 3 = 1.2, i.e. bounds of 0.4 to 1.6, to show the optimization potential and support
+#' allocation interpretation and decisions.
+#' @param maxeval Integer. The maximum number of iterations of the global optimization
+#' algorithm. Defaults to 100000.
+#' @param constr_mode Character. Options are \code{"eq"} or \code{"ineq"},
+#' indicating constraints with equality or inequality.
+#' @param plots Boolean. Generate plots?
+#' @examples +#' \dontrun{ +#' # Having InputCollect and OutputCollect results +#' AllocatorCollect <- robyn_allocator( +#' InputCollect = InputCollect, +#' OutputCollect = OutputCollect, +#' select_model = "1_2_3", +#' scenario = "max_response", +#' channel_constr_low = 0.7, +#' channel_constr_up = c(1.2, 1.5, 1.5, 1.5, 1.5), +#' channel_constr_multiplier = 4, +#' date_range = "last_26", +#' export = FALSE +#' ) +#' # Print a summary +#' print(AllocatorCollect) +#' # Plot the allocator one-pager +#' plot(AllocatorCollect) +#' } +#' @return List. Contains optimized allocation results and plots. +#' @export +robyn_allocator <- function(robyn_object = NULL, + select_build = 0, + InputCollect = NULL, + OutputCollect = NULL, + select_model = NULL, + json_file = NULL, + scenario = "max_response", + total_budget = NULL, + target_value = NULL, + date_range = "all", + channel_constr_low = NULL, + channel_constr_up = NULL, + channel_constr_multiplier = 3, + optim_algo = "SLSQP_AUGLAG", + maxeval = 100000, + constr_mode = "eq", + plots = TRUE, + plot_folder = NULL, + plot_folder_sub = NULL, + export = TRUE, + quiet = FALSE, + ui = FALSE, + ...) { + ### Use previously exported model using json_file + if (!is.null(json_file)) { + if (is.null(InputCollect)) { + InputCollect <- robyn_inputs( + json_file = json_file, quiet = TRUE, ... + ) + } + if (is.null(OutputCollect)) { + if (is.null(plot_folder)) { + json <- robyn_read(json_file, step = 2, quiet = TRUE) + plot_folder <- dirname(json$ExportedModel$plot_folder) + if (!is.null(plot_folder_sub)) plot_folder_sub <- NULL + } + OutputCollect <- robyn_run( + json_file = json_file, export = export, plot_folder = plot_folder, plot_folder_sub = plot_folder_sub, ... + ) + } + if (is.null(select_model)) select_model <- OutputCollect$selectID + } + + ## Collect inputs + # if (!is.null(robyn_object) && (is.null(InputCollect) && is.null(OutputCollect))) { + # if ("robyn_exported" %in% class(robyn_object)) { + # imported <- robyn_object + # robyn_object <- imported$robyn_object + # } else { + # imported <- robyn_load(robyn_object, select_build, quiet) + # } + # InputCollect <- imported$InputCollect + # OutputCollect <- imported$OutputCollect + # select_model <- imported$select_model + # } else { + if (is.null(select_model) && length(OutputCollect$allSolutions == 1)) { + select_model <- OutputCollect$allSolutions + } + if (any(is.null(InputCollect), is.null(OutputCollect), is.null(select_model))) { + stop("When 'robyn_object' is not provided, then InputCollect, OutputCollect, select_model must be provided") + } + # } + + if (length(InputCollect$paid_media_spends) <= 1) { + stop("Must have a valid model with at least two 'paid_media_spends'") + } + + if (!quiet) message(paste(">>> Running budget allocator for model ID", select_model, "...")) + + ## Set local data & params values + paid_media_spends <- InputCollect$paid_media_spends + media_order <- order(paid_media_spends) + mediaSpendSorted <- paid_media_spends[media_order] + dep_var_type <- InputCollect$dep_var_type + if (is.null(channel_constr_low)) { + channel_constr_low <- case_when( + scenario == "max_response" ~ 0.5, + scenario == "target_efficiency" ~ 0.1 + ) + } + if (is.null(channel_constr_up)) { + channel_constr_up <- case_when( + scenario == "max_response" ~ 2, + scenario == "target_efficiency" ~ 10 + ) + } + if (length(channel_constr_low) == 1) channel_constr_low <- rep(channel_constr_low, length(paid_media_spends)) + if (length(channel_constr_up) == 1) channel_constr_up <- rep(channel_constr_up, 
length(paid_media_spends)) + check_allocator_constrains(channel_constr_low, channel_constr_up) + names(channel_constr_low) <- paid_media_spends + names(channel_constr_up) <- paid_media_spends + channel_constr_low <- channel_constr_low[media_order] + channel_constr_up <- channel_constr_up[media_order] + dt_hyppar <- filter(OutputCollect$resultHypParam, .data$solID == select_model) + dt_bestCoef <- filter(OutputCollect$xDecompAgg, .data$solID == select_model, .data$rn %in% paid_media_spends) + + ## Check inputs and parameters + scenario <- check_allocator( + OutputCollect, select_model, paid_media_spends, scenario, + channel_constr_low, channel_constr_up, constr_mode + ) + + ## Sort media + dt_coef <- select(dt_bestCoef, .data$rn, .data$coef) + get_rn_order <- order(dt_bestCoef$rn) + dt_coefSorted <- dt_coef[get_rn_order, ] + dt_bestCoef <- dt_bestCoef[get_rn_order, ] + coefSelectorSorted <- dt_coefSorted$coef > 0 + names(coefSelectorSorted) <- dt_coefSorted$rn + + dt_hyppar <- select(dt_hyppar, hyper_names(InputCollect$adstock, mediaSpendSorted)) %>% + select(sort(colnames(.))) + dt_bestCoef <- dt_bestCoef[dt_bestCoef$rn %in% mediaSpendSorted, ] + channelConstrLowSorted <- channel_constr_low[mediaSpendSorted] + channelConstrUpSorted <- channel_constr_up[mediaSpendSorted] + + ## Get hill parameters for each channel + hills <- get_hill_params( + InputCollect, OutputCollect, dt_hyppar, dt_coef, mediaSpendSorted, select_model + ) + alphas <- hills$alphas + inflexions <- hills$inflexions + coefs_sorted <- hills$coefs_sorted + + # Spend values based on date range set + window_loc <- InputCollect$rollingWindowStartWhich:InputCollect$rollingWindowEndWhich + dt_optimCost <- slice(InputCollect$dt_mod, window_loc) + new_date_range <- check_metric_dates(date_range, dt_optimCost$ds, InputCollect$dayInterval, quiet = quiet, is_allocator = TRUE) + date_min <- head(new_date_range$date_range_updated, 1) + date_max <- tail(new_date_range$date_range_updated, 1) + check_daterange(date_min, date_max, dt_optimCost$ds) + if (is.null(date_min)) date_min <- min(dt_optimCost$ds) + if (is.null(date_max)) date_max <- max(dt_optimCost$ds) + if (date_min < min(dt_optimCost$ds)) date_min <- min(dt_optimCost$ds) + if (date_max > max(dt_optimCost$ds)) date_max <- max(dt_optimCost$ds) + histFiltered <- filter(dt_optimCost, .data$ds >= date_min & .data$ds <= date_max) + + histSpendAll <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSpendSorted)), sum)) + histSpendAllTotal <- sum(histSpendAll) + histSpendAllUnit <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSpendSorted)), mean)) + histSpendAllUnitTotal <- sum(histSpendAllUnit) + histSpendAllShare <- histSpendAllUnit / histSpendAllUnitTotal + + histSpendWindow <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), sum)) + histSpendWindowTotal <- sum(histSpendWindow) + initSpendUnit <- histSpendWindowUnit <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), mean)) + histSpendWindowUnitTotal <- sum(histSpendWindowUnit) + histSpendWindowShare <- histSpendWindowUnit / histSpendWindowUnitTotal + + simulation_period <- initial_mean_period <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), length)) + nDates <- lapply(mediaSpendSorted, function(x) histFiltered$ds) + names(nDates) <- mediaSpendSorted + if (!quiet) { + message(sprintf( + "Date Window: %s:%s (%s %ss)", + date_min, date_max, unique(initial_mean_period), InputCollect$intervalType + )) + } + zero_spend_channel <- 
names(histSpendWindow[histSpendWindow == 0]) + + initSpendUnitTotal <- sum(initSpendUnit) + initSpendShare <- initSpendUnit / initSpendUnitTotal + total_budget_unit <- ifelse(is.null(total_budget), initSpendUnitTotal, total_budget / unique(simulation_period)) + total_budget_window <- total_budget_unit * unique(simulation_period) + + ## Get use case based on inputs + usecase <- which_usecase(initSpendUnit[1], date_range) + if (usecase == "all_historical_vec") { + ndates_loc <- which(InputCollect$dt_mod$ds %in% histFiltered$ds) + } else { + ndates_loc <- seq_along(histFiltered$ds) + } + usecase <- paste(usecase, ifelse(!is.null(total_budget), "+ defined_budget", "+ historical_budget")) + + # Response values based on date range -> mean spend + initResponseUnit <- NULL + initResponseMargUnit <- NULL + hist_carryover <- list() + qa_carryover <- list() + for (i in seq_along(mediaSpendSorted)) { + resp <- robyn_response( + json_file = json_file, + # robyn_object = robyn_object, + select_build = select_build, + select_model = select_model, + metric_name = mediaSpendSorted[i], + # metric_value = initSpendUnit[i] * simulation_period[i], + # date_range = date_range, + dt_hyppar = OutputCollect$resultHypParam, + dt_coef = OutputCollect$xDecompAgg, + InputCollect = InputCollect, + OutputCollect = OutputCollect, + quiet = TRUE, + is_allocator = TRUE, + ... + ) + # val <- sort(resp$response_total)[round(length(resp$response_total) / 2)] + # histSpendUnit[i] <- resp$input_immediate[which(resp$response_total == val)] + hist_carryover_temp <- resp$input_carryover[window_loc] + qa_carryover[[i]] <- round(resp$input_total[window_loc]) + names(hist_carryover_temp) <- resp$date[window_loc] + hist_carryover[[i]] <- hist_carryover_temp + # get simulated response + # if (resp$input_immediate[1] == initSpendUnit[i]) { + # x_input <- initSpendUnit[i] + # } else { + # x_input <- mean(resp$input_immediate) + # } + x_input <- initSpendUnit[i] + resp_simulate <- fx_objective( + x = x_input, + coeff = coefs_sorted[[mediaSpendSorted[i]]], + alpha = alphas[[paste0(mediaSpendSorted[i], "_alphas")]], + inflexion = inflexions[[paste0(mediaSpendSorted[i], "_gammas")]], + x_hist_carryover = mean(hist_carryover_temp), + get_sum = FALSE + ) + resp_simulate_plus1 <- fx_objective( + x = x_input + 1, + coeff = coefs_sorted[[mediaSpendSorted[i]]], + alpha = alphas[[paste0(mediaSpendSorted[i], "_alphas")]], + inflexion = inflexions[[paste0(mediaSpendSorted[i], "_gammas")]], + x_hist_carryover = mean(hist_carryover_temp), + get_sum = FALSE + ) + initResponseUnit <- c(initResponseUnit, resp_simulate) + initResponseMargUnit <- c(initResponseMargUnit, resp_simulate_plus1 - resp_simulate) + } + qa_carryover <- do.call(cbind, qa_carryover) %>% as.data.frame() + names(initResponseUnit) <- names(hist_carryover) <- names(qa_carryover) <- mediaSpendSorted + # QA adstock: simulated adstock should be identical to model adstock + # qa_carryover_origin <- OutputCollect$mediaVecCollect %>% + # filter(.data$solID == select_model & .data$type == "adstockedMedia") %>% + # select(mediaSpendSorted) %>% slice(window_loc) %>% round %>% as.data.frame() + # identical(qa_carryover, qa_carryover_origin) + if (length(zero_spend_channel) > 0 && !quiet) { + message("Media variables with 0 spending during date range: ", v2t(zero_spend_channel)) + # hist_carryover[zero_spend_channel] <- 0 + } + + ## Set initial values and bounds + channelConstrLowSortedExt <- ifelse( + 1 - (1 - channelConstrLowSorted) * channel_constr_multiplier < 0, + 0, 1 - (1 - 
channelConstrLowSorted) * channel_constr_multiplier + ) + channelConstrUpSortedExt <- ifelse( + 1 + (channelConstrUpSorted - 1) * channel_constr_multiplier < 0, + channelConstrUpSorted * channel_constr_multiplier, + 1 + (channelConstrUpSorted - 1) * channel_constr_multiplier + ) + + target_value_ext <- target_value + if (scenario == "target_efficiency") { + channelConstrLowSortedExt <- channelConstrLowSorted + channelConstrUpSortedExt <- channelConstrUpSorted + if (dep_var_type == "conversion") { + if (is.null(target_value)) { + target_value <- sum(initSpendUnit) / sum(initResponseUnit) * 1.2 + } + target_value_ext <- target_value * 1.5 + } else { + if (is.null(target_value)) { + target_value <- sum(initResponseUnit) / sum(initSpendUnit) * 0.8 + } + target_value_ext <- 1 + } + } + temp_init <- temp_init_all <- initSpendUnit + # if no spend within window as initial spend, use historical average + if (length(zero_spend_channel) > 0) temp_init_all[zero_spend_channel] <- histSpendAllUnit[zero_spend_channel] + # Exclude channels with 0 coef from optimisation + temp_ub <- temp_ub_all <- channelConstrUpSorted + temp_lb <- temp_lb_all <- channelConstrLowSorted + temp_ub_ext <- temp_ub_ext_all <- channelConstrUpSortedExt + temp_lb_ext <- temp_lb_ext_all <- channelConstrLowSortedExt + + x0 <- x0_all <- lb <- lb_all <- temp_init_all * temp_lb_all + ub <- ub_all <- temp_init_all * temp_ub_all + x0_ext <- x0_ext_all <- lb_ext <- lb_ext_all <- temp_init_all * temp_lb_ext_all + ub_ext <- ub_ext_all <- temp_init_all * temp_ub_ext_all + + ## Exclude 0 coef and 0 constraint channels for the optimisation + skip_these <- (channel_constr_low == 0 & channel_constr_up == 0) + zero_constraint_channel <- mediaSpendSorted[skip_these] + if (any(skip_these) && !quiet) { + message( + "Excluded variables (constrained to 0): ", + paste(zero_constraint_channel, collapse = ", ") + ) + } + if (!all(coefSelectorSorted)) { + zero_coef_channel <- setdiff(names(coefSelectorSorted), mediaSpendSorted[coefSelectorSorted]) + if (!quiet) { + message( + "Excluded variables (coefficients are 0): ", + paste(zero_coef_channel, collapse = ", ") + ) + } + } else { + zero_coef_channel <- as.character() + } + channel_to_drop_loc <- mediaSpendSorted %in% c(zero_coef_channel, zero_constraint_channel) + channel_for_allocation <- mediaSpendSorted[!channel_to_drop_loc] + if (any(channel_to_drop_loc)) { + temp_init <- temp_init_all[channel_for_allocation] + temp_ub <- temp_ub_all[channel_for_allocation] + temp_lb <- temp_lb_all[channel_for_allocation] + temp_ub_ext <- temp_ub_ext_all[channel_for_allocation] + temp_lb_ext <- temp_lb_ext_all[channel_for_allocation] + x0 <- x0_all[channel_for_allocation] + lb <- lb_all[channel_for_allocation] + ub <- ub_all[channel_for_allocation] + x0_ext <- x0_ext_all[channel_for_allocation] + lb_ext <- lb_ext_all[channel_for_allocation] + ub_ext <- ub_ext_all[channel_for_allocation] + } + + x0 <- lb <- temp_init * temp_lb + ub <- temp_init * temp_ub + x0_ext <- lb_ext <- temp_init * temp_lb_ext + ub_ext <- temp_init * temp_ub_ext + + # Gather all values that will be used internally on optim (nloptr) + coefs_eval <- coefs_sorted[channel_for_allocation] + alphas_eval <- alphas[paste0(channel_for_allocation, "_alphas")] + inflexions_eval <- inflexions[paste0(channel_for_allocation, "_gammas")] + hist_carryover_eval <- hist_carryover[channel_for_allocation] + + eval_list <- list( + coefs_eval = coefs_eval, + alphas_eval = alphas_eval, + inflexions_eval = inflexions_eval, + # mediaSpendSortedFiltered = 
mediaSpendSorted, + total_budget = total_budget, + total_budget_unit = total_budget_unit, + hist_carryover_eval = hist_carryover_eval, + target_value = target_value, + target_value_ext = target_value_ext, + dep_var_type = dep_var_type + ) + # So we can implicitly use these values within eval_f() + options("ROBYN_TEMP" = eval_list) + + ## Set optim options + if (optim_algo == "MMA_AUGLAG") { + local_opts <- list( + "algorithm" = "NLOPT_LD_MMA", + "xtol_rel" = 1.0e-10 + ) + } else if (optim_algo == "SLSQP_AUGLAG") { + local_opts <- list( + "algorithm" = "NLOPT_LD_SLSQP", + "xtol_rel" = 1.0e-10 + ) + } + + ## Run optim + x_hist_carryover <- unlist(lapply(hist_carryover_eval, mean)) + if (scenario == "max_response") { + ## bounded optimisation + nlsMod <- nloptr::nloptr( + x0 = x0, + eval_f = eval_f, + eval_g_eq = if (constr_mode == "eq") eval_g_eq else NULL, + eval_g_ineq = if (constr_mode == "ineq") eval_g_ineq else NULL, + lb = lb, ub = ub, + opts = list( + "algorithm" = "NLOPT_LD_AUGLAG", + "xtol_rel" = 1.0e-10, + "maxeval" = maxeval, + "local_opts" = local_opts + ), + target_value = NULL + ) + ## unbounded optimisation + nlsModUnbound <- nloptr::nloptr( + x0 = x0_ext, + eval_f = eval_f, + eval_g_eq = if (constr_mode == "eq") eval_g_eq else NULL, + eval_g_ineq = if (constr_mode == "ineq") eval_g_ineq else NULL, + lb = lb_ext, ub = ub_ext, + opts = list( + "algorithm" = "NLOPT_LD_AUGLAG", + "xtol_rel" = 1.0e-10, + "maxeval" = maxeval, + "local_opts" = local_opts + ), + target_value = NULL + ) + } + + if (scenario == "target_efficiency") { + ## bounded optimisation + nlsMod <- nloptr::nloptr( + x0 = x0, + eval_f = eval_f, + eval_g_eq = if (constr_mode == "eq") eval_g_eq_effi else NULL, + eval_g_ineq = if (constr_mode == "ineq") eval_g_eq_effi else NULL, + lb = lb, + ub = x0 * channel_constr_up[1], # Large enough, but not infinite (customizable) + opts = list( + "algorithm" = "NLOPT_LD_AUGLAG", + "xtol_rel" = 1.0e-10, + "maxeval" = maxeval, + "local_opts" = local_opts + ), + target_value = target_value + ) + ## unbounded optimisation + nlsModUnbound <- nloptr::nloptr( + x0 = x0, + eval_f = eval_f, + eval_g_eq = if (constr_mode == "eq") eval_g_eq_effi else NULL, + eval_g_ineq = if (constr_mode == "ineq") eval_g_eq_effi else NULL, + lb = lb, + ub = x0 * channel_constr_up[1], # Large enough, but not infinite (customizable) + opts = list( + "algorithm" = "NLOPT_LD_AUGLAG", + "xtol_rel" = 1.0e-10, + "maxeval" = maxeval, + "local_opts" = local_opts + ), + target_value = target_value_ext + ) + } + + ## get marginal + optmSpendUnit <- nlsMod$solution + optmResponseUnit <- -eval_f(optmSpendUnit)[["objective.channel"]] + optmSpendUnitUnbound <- nlsModUnbound$solution + optmResponseUnitUnbound <- -eval_f(optmSpendUnitUnbound)[["objective.channel"]] + + optmResponseMargUnit <- mapply( + fx_objective, + x = optmSpendUnit + 1, + coeff = coefs_eval, + alpha = alphas_eval, + inflexion = inflexions_eval, + x_hist_carryover = x_hist_carryover, + get_sum = FALSE, + SIMPLIFY = TRUE + ) - optmResponseUnit + optmResponseMargUnitUnbound <- mapply( + fx_objective, + x = optmSpendUnitUnbound + 1, + coeff = coefs_eval, + alpha = alphas_eval, + inflexion = inflexions_eval, + x_hist_carryover = x_hist_carryover, + get_sum = FALSE, + SIMPLIFY = TRUE + ) - optmResponseUnitUnbound + + ## Collect output + names(optmSpendUnit) <- names(optmResponseUnit) <- names(optmResponseMargUnit) <- + names(optmSpendUnitUnbound) <- names(optmResponseUnitUnbound) <- + names(optmResponseMargUnitUnbound) <- channel_for_allocation + 
mediaSpendSorted %in% names(optmSpendUnit) + optmSpendUnitOut <- optmResponseUnitOut <- optmResponseMargUnitOut <- + optmSpendUnitUnboundOut <- optmResponseUnitUnboundOut <- + optmResponseMargUnitUnboundOut <- initSpendUnit + optmSpendUnitOut[channel_to_drop_loc] <- + optmResponseUnitOut[channel_to_drop_loc] <- + optmResponseMargUnitOut[channel_to_drop_loc] <- + optmSpendUnitUnboundOut[channel_to_drop_loc] <- + optmResponseUnitUnboundOut[channel_to_drop_loc] <- + optmResponseMargUnitUnboundOut[channel_to_drop_loc] <- 0 + optmSpendUnitOut[!channel_to_drop_loc] <- optmSpendUnit + optmResponseUnitOut[!channel_to_drop_loc] <- optmResponseUnit + optmResponseMargUnitOut[!channel_to_drop_loc] <- optmResponseMargUnit + optmSpendUnitUnboundOut[!channel_to_drop_loc] <- optmSpendUnitUnbound + optmResponseUnitUnboundOut[!channel_to_drop_loc] <- optmResponseUnitUnbound + optmResponseMargUnitUnboundOut[!channel_to_drop_loc] <- optmResponseMargUnitUnbound + + dt_optimOut <- data.frame( + solID = select_model, + dep_var_type = dep_var_type, + channels = mediaSpendSorted, + date_min = date_min, + date_max = date_max, + periods = sprintf("%s %ss", initial_mean_period, InputCollect$intervalType), + constr_low = temp_lb_all, + constr_low_abs = lb_all, + constr_up = temp_ub_all, + constr_up_abs = ub_all, + unconstr_mult = channel_constr_multiplier, + constr_low_unb = temp_lb_ext_all, + constr_low_unb_abs = lb_ext_all, + constr_up_unb = temp_ub_ext_all, + constr_up_unb_abs = ub_ext_all, + # Historical spends + histSpendAll = histSpendAll, + histSpendAllTotal = histSpendAllTotal, + histSpendAllUnit = histSpendAllUnit, + histSpendAllUnitTotal = histSpendAllUnitTotal, + histSpendAllShare = histSpendAllShare, + histSpendWindow = histSpendWindow, + histSpendWindowTotal = histSpendWindowTotal, + histSpendWindowUnit = histSpendWindowUnit, + histSpendWindowUnitTotal = histSpendWindowUnitTotal, + histSpendWindowShare = histSpendWindowShare, + # Initial spends for allocation + initSpendUnit = initSpendUnit, + initSpendUnitTotal = initSpendUnitTotal, + initSpendShare = initSpendShare, + initSpendTotal = initSpendUnitTotal * unique(simulation_period), + # initSpendUnitRaw = histSpendUnitRaw, + # adstocked = adstocked, + # adstocked_start_date = as.Date(ifelse(adstocked, head(resp$date, 1), NA), origin = "1970-01-01"), + # adstocked_end_date = as.Date(ifelse(adstocked, tail(resp$date, 1), NA), origin = "1970-01-01"), + # adstocked_periods = length(resp$date), + initResponseUnit = initResponseUnit, + initResponseUnitTotal = sum(initResponseUnit), + initResponseMargUnit = initResponseMargUnit, + initResponseTotal = sum(initResponseUnit) * unique(simulation_period), + initResponseUnitShare = initResponseUnit / sum(initResponseUnit), + initRoiUnit = initResponseUnit / initSpendUnit, + initCpaUnit = initSpendUnit / initResponseUnit, + # Budget change + total_budget_unit = total_budget_unit, + total_budget_unit_delta = total_budget_unit / initSpendUnitTotal - 1, + # Optimized + optmSpendUnit = optmSpendUnitOut, + optmSpendUnitDelta = (optmSpendUnitOut / initSpendUnit - 1), + optmSpendUnitTotal = sum(optmSpendUnitOut), + optmSpendUnitTotalDelta = sum(optmSpendUnitOut) / initSpendUnitTotal - 1, + optmSpendShareUnit = optmSpendUnitOut / sum(optmSpendUnitOut), + optmSpendTotal = sum(optmSpendUnitOut) * unique(simulation_period), + optmSpendUnitUnbound = optmSpendUnitUnboundOut, + optmSpendUnitDeltaUnbound = (optmSpendUnitUnboundOut / initSpendUnit - 1), + optmSpendUnitTotalUnbound = sum(optmSpendUnitUnboundOut), + 
optmSpendUnitTotalDeltaUnbound = sum(optmSpendUnitUnboundOut) / initSpendUnitTotal - 1, + optmSpendShareUnitUnbound = optmSpendUnitUnboundOut / sum(optmSpendUnitUnboundOut), + optmSpendTotalUnbound = sum(optmSpendUnitUnboundOut) * unique(simulation_period), + optmResponseUnit = optmResponseUnitOut, + optmResponseMargUnit = optmResponseMargUnitOut, + optmResponseUnitTotal = sum(optmResponseUnitOut), + optmResponseTotal = sum(optmResponseUnitOut) * unique(simulation_period), + optmResponseUnitShare = optmResponseUnitOut / sum(optmResponseUnitOut), + optmRoiUnit = optmResponseUnitOut / optmSpendUnitOut, + optmCpaUnit = optmSpendUnitOut / optmResponseUnitOut, + optmResponseUnitLift = (optmResponseUnitOut / initResponseUnit) - 1, + optmResponseUnitUnbound = optmResponseUnitUnboundOut, + optmResponseMargUnitUnbound = optmResponseMargUnitUnboundOut, + optmResponseUnitTotalUnbound = sum(optmResponseUnitUnboundOut), + optmResponseTotalUnbound = sum(optmResponseUnitUnboundOut) * unique(simulation_period), + optmResponseUnitShareUnbound = optmResponseUnitUnboundOut / sum(optmResponseUnitUnboundOut), + optmRoiUnitUnbound = optmResponseUnitUnboundOut / optmSpendUnitUnboundOut, + optmCpaUnitUnbound = optmSpendUnitUnboundOut / optmResponseUnitUnboundOut, + optmResponseUnitLiftUnbound = (optmResponseUnitUnboundOut / initResponseUnit) - 1 + ) %>% + mutate( + optmResponseUnitTotalLift = (.data$optmResponseUnitTotal / .data$initResponseUnitTotal) - 1, + optmResponseUnitTotalLiftUnbound = (.data$optmResponseUnitTotalUnbound / .data$initResponseUnitTotal) - 1 + ) + .Options$ROBYN_TEMP <- NULL # Clean auxiliary method + + ## Calculate curves and main points for each channel + if (scenario == "max_response") { + levs1 <- c("Initial", "Bounded", paste0("Bounded x", channel_constr_multiplier)) + } else if (scenario == "target_efficiency") { + if (dep_var_type == "revenue") { + levs1 <- c( + "Initial", paste0("Hit ROAS ", round(target_value, 2)), + paste0("Hit ROAS ", target_value_ext) + ) + } else { + levs1 <- c( + "Initial", paste0("Hit CPA ", round(target_value, 2)), + paste0("Hit CPA ", round(target_value_ext, 2)) + ) + } + } + eval_list$levs1 <- levs1 + + dt_optimOutScurve <- rbind( + select(dt_optimOut, .data$channels, .data$initSpendUnit, .data$initResponseUnit) %>% + mutate(x = levs1[1]) %>% as.matrix(), + select(dt_optimOut, .data$channels, .data$optmSpendUnit, .data$optmResponseUnit) %>% + mutate(x = levs1[2]) %>% as.matrix(), + select(dt_optimOut, .data$channels, .data$optmSpendUnitUnbound, .data$optmResponseUnitUnbound) %>% + mutate(x = levs1[3]) %>% as.matrix() + ) %>% + `colnames<-`(c("channels", "spend", "response", "type")) %>% + rbind(data.frame(channels = dt_optimOut$channels, spend = 0, response = 0, type = "Carryover")) %>% + mutate(spend = as.numeric(.data$spend), response = as.numeric(.data$response)) %>% + group_by(.data$channels) + + plotDT_scurve <- list() + for (i in channel_for_allocation) { # i <- channels[i] + carryover_vec <- eval_list$hist_carryover_eval[[i]] + dt_optimOutScurve <- dt_optimOutScurve %>% + mutate(spend = ifelse( + .data$channels == i & .data$type %in% levs1, + .data$spend + mean(carryover_vec), ifelse( + .data$channels == i & .data$type == "Carryover", + mean(carryover_vec), .data$spend + ) + )) + get_max_x <- max(filter(dt_optimOutScurve, .data$channels == i)$spend) * 1.5 + simulate_spend <- seq(0, get_max_x, length.out = 100) + simulate_response <- fx_objective( + x = simulate_spend, + coeff = eval_list$coefs_eval[[i]], + alpha = eval_list$alphas_eval[[paste0(i, 
"_alphas")]], + inflexion = eval_list$inflexions_eval[[paste0(i, "_gammas")]], + x_hist_carryover = 0, + get_sum = FALSE + ) + simulate_response_carryover <- fx_objective( + x = mean(carryover_vec), + coeff = eval_list$coefs_eval[[i]], + alpha = eval_list$alphas_eval[[paste0(i, "_alphas")]], + inflexion = eval_list$inflexions_eval[[paste0(i, "_gammas")]], + x_hist_carryover = 0, + get_sum = FALSE + ) + plotDT_scurve[[i]] <- data.frame( + channel = i, spend = simulate_spend, + mean_carryover = mean(carryover_vec), + carryover_response = simulate_response_carryover, + total_response = simulate_response + ) + dt_optimOutScurve <- dt_optimOutScurve %>% + mutate(response = ifelse( + .data$channels == i & .data$type == "Carryover", + simulate_response_carryover, .data$response + )) + } + eval_list[["plotDT_scurve"]] <- plotDT_scurve <- as_tibble(bind_rows(plotDT_scurve)) + mainPoints <- dt_optimOutScurve %>% + rename("response_point" = "response", "spend_point" = "spend", "channel" = "channels") + temp_caov <- mainPoints %>% filter(.data$type == "Carryover") + mainPoints$mean_spend <- mainPoints$spend_point - temp_caov$spend_point + mainPoints$mean_spend <- ifelse(mainPoints$type == "Carryover", mainPoints$spend_point, mainPoints$mean_spend) + if (levs1[2] == levs1[3]) levs1[3] <- paste0(levs1[3], ".") + mainPoints$type <- factor(mainPoints$type, levels = c("Carryover", levs1)) + mainPoints$roi_mean <- mainPoints$response_point / mainPoints$mean_spend + mresp_caov <- filter(mainPoints, .data$type == "Carryover")$response_point + mresp_init <- filter(mainPoints, .data$type == levels(mainPoints$type)[2])$response_point - mresp_caov + mresp_b <- filter(mainPoints, .data$type == levels(mainPoints$type)[3])$response_point - mresp_caov + mresp_unb <- filter(mainPoints, .data$type == levels(mainPoints$type)[4])$response_point - mresp_caov + mainPoints$marginal_response <- c(mresp_init, mresp_b, mresp_unb, rep(0, length(mresp_init))) + mainPoints$roi_marginal <- mainPoints$marginal_response / mainPoints$mean_spend + mainPoints$cpa_marginal <- mainPoints$mean_spend / mainPoints$marginal_response + eval_list[["mainPoints"]] <- mainPoints + + # Exporting directory + if (export) { + if (is.null(json_file) & !is.null(plot_folder)) { + if (is.null(plot_folder_sub)) plot_folder_sub <- basename(OutputCollect$plot_folder) + plot_folder <- gsub("//+", "/", paste0(plot_folder, "/", plot_folder_sub, "/")) + } else { + plot_folder <- gsub("//+", "/", paste0(OutputCollect$plot_folder, "/")) + } + + # if (!is.null(json_file)) { + # plot_folder <- gsub("//+", "/", paste0(OutputCollect$plot_folder, "/")) + # } else if (is.null(json_file) & is.null(plot_folder) & is.null(plot_folder_sub)) { + # plot_folder <- gsub("//+", "/", paste0(OutputCollect$plot_folder, "/")) + # } else { + # if (is.null(plot_folder_sub)) plot_folder_sub <- basename(OutputCollect$plot_folder) + # plot_folder <- gsub("//+", "/", paste0(plot_folder, "/", plot_folder_sub, "/")) + # } + if (!dir.exists(plot_folder)) { + message("Creating directory for allocator: ", plot_folder) + dir.create(plot_folder) + } + ## Export results into CSV + export_dt_optimOut <- dt_optimOut + if (dep_var_type == "conversion") { + colnames(export_dt_optimOut) <- gsub("Roi", "CPA", colnames(export_dt_optimOut)) + } + write.csv(export_dt_optimOut, paste0(plot_folder, select_model, "_", scenario, "_reallocated.csv")) + } + + ## Plot allocator results + if (plots) { + plots <- allocation_plots( + InputCollect, OutputCollect, + dt_optimOut, + # filter(dt_optimOut, 
.data$channels %in% channel_for_allocation), + select_model, scenario, eval_list, + export, plot_folder, quiet + ) + } else { + plots <- NULL + } + + output <- list( + dt_optimOut = dt_optimOut, + mainPoints = mainPoints, + nlsMod = nlsMod, + plots = plots, + scenario = scenario, + usecase = usecase, + total_budget = ifelse(is.null(total_budget), total_budget_window, total_budget), + skipped_coef0 = zero_coef_channel, + skipped_constr = zero_constraint_channel, + no_spend = zero_spend_channel, + ui = if (ui) plots else NULL + ) + + class(output) <- c("robyn_allocator", class(output)) + return(output) +} + +#' @rdname robyn_allocator +#' @aliases robyn_allocator +#' @param x \code{robyn_allocator()} output. +#' @export +print.robyn_allocator <- function(x, ...) { + temp <- x$dt_optimOut[!is.nan(x$dt_optimOut$optmRoiUnit), ] + coef0 <- if (length(x$skipped_coef0) > 0) paste("Coefficient 0:", v2t(x$skipped_coef0, quotes = FALSE)) else NULL + constr <- if (length(x$skipped_constr) > 0) paste("Constrained @0:", v2t(x$skipped_constr, quotes = FALSE)) else NULL + nospend <- if (length(x$no_spend) > 0) paste("Spend = 0:", v2t(x$no_spend, quotes = FALSE)) else NULL + media_skipped <- paste(c(coef0, constr, nospend), collapse = " | ") + media_skipped <- ifelse(is.null(media_skipped), "None", media_skipped) + + print(glued( + " +Model ID: {x$dt_optimOut$solID[1]} +Scenario: {x$scenario} +Use case: {x$usecase} +Window: {x$dt_optimOut$date_min[1]}:{x$dt_optimOut$date_max[1]} ({x$dt_optimOut$periods[1]}) + +Dep. Variable Type: {temp$dep_var_type[1]} +Media Skipped: {media_skipped} +Relative Spend Increase: {spend_increase_p}% ({spend_increase}) +Total Response Increase (Optimized): {signif(100 * x$dt_optimOut$optmResponseUnitTotalLift[1], 3)}% + +Allocation Summary: + {summary} +", + spend_increase_p = num_abbr(100 * x$dt_optimOut$optmSpendUnitTotalDelta[1], 3), + spend_increase = formatNum( + sum(x$dt_optimOut$optmSpendUnitTotal) - sum(x$dt_optimOut$initSpendUnitTotal), + abbr = TRUE, sign = TRUE + ), + summary = paste(sprintf( + " +- %s: + Optimizable bound: [%s%%, %s%%], + Initial spend share: %s%% -> Optimized bounded: %s%% + Initial response share: %s%% -> Optimized bounded: %s%% + Initial abs. mean spend: %s -> Optimized: %s [Delta = %s%%]", + temp$channels, + 100 * temp$constr_low - 100, + 100 * temp$constr_up - 100, + signif(100 * temp$initSpendShare, 3), + signif(100 * temp$optmSpendShareUnit, 3), + signif(100 * temp$initResponseUnitShare, 3), + signif(100 * temp$optmResponseUnitShare, 3), + formatNum(temp$initSpendUnit, 3, abbr = TRUE), + formatNum(temp$optmSpendUnit, 3, abbr = TRUE), + formatNum(100 * temp$optmSpendUnitDelta, signif = 2) + ), collapse = "\n ") + )) +} + +#' @rdname robyn_allocator +#' @aliases robyn_allocator +#' @param x \code{robyn_allocator()} output. +#' @export +plot.robyn_allocator <- function(x, ...) plot(x$plots$plots, ...) 
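Before the internal evaluators below: in scenario "target_efficiency", eval_g_eq_effi() imposes an equality (or inequality) constraint tying total spend to the response implied by the target, while robyn_allocator() defaults the target to 80% of the initial ROAS (revenue models) or 120% of the initial CPA (conversion models). A short sketch of that arithmetic with hypothetical totals (numbers made up for illustration; in the optimizer, sum_response is itself the Hill response of the candidate spend X, not a fixed value):

# Hypothetical initial unit-level totals
init_spend <- 1000
init_response <- 3000 # revenue, or conversions for CPA models

# Revenue model: default target is 80% of the initial ROAS
target_roas <- init_response / init_spend * 0.8 # 3.0 * 0.8 = 2.4
# constraint: sum(X) - sum_response / target_roas == 0
# e.g. a response of 3000 at ROAS 2.4 implies spend 3000 / 2.4 = 1250

# Conversion model: default target is 120% of the initial CPA
target_cpa <- init_spend / init_response * 1.2 # (1000 / 3000) * 1.2 = 0.4
# constraint: sum(X) - sum_response * target_cpa == 0
# e.g. 3000 conversions at CPA 0.4 imply spend 3000 * 0.4 = 1200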
+ +eval_f <- function(X, target_value) { + # eval_list <- get("eval_list", pos = as.environment(-1)) + eval_list <- getOption("ROBYN_TEMP") + coefs_eval <- eval_list[["coefs_eval"]] + alphas_eval <- eval_list[["alphas_eval"]] + inflexions_eval <- eval_list[["inflexions_eval"]] + # mediaSpendSortedFiltered <- eval_list[["mediaSpendSortedFiltered"]] + hist_carryover_eval <- eval_list[["hist_carryover_eval"]] + + objective <- -sum(mapply( + fx_objective, + x = X, + coeff = coefs_eval, + alpha = alphas_eval, + inflexion = inflexions_eval, + x_hist_carryover = hist_carryover_eval, + SIMPLIFY = TRUE + )) + + gradient <- c(mapply( + fx_gradient, + x = X, + coeff = coefs_eval, + alpha = alphas_eval, + inflexion = inflexions_eval, + x_hist_carryover = hist_carryover_eval, + SIMPLIFY = TRUE + )) + + objective.channel <- mapply( + fx_objective.chanel, + x = X, + coeff = coefs_eval, + alpha = alphas_eval, + inflexion = inflexions_eval, + x_hist_carryover = hist_carryover_eval, + SIMPLIFY = TRUE + ) + + optm <- list(objective = objective, gradient = gradient, objective.channel = objective.channel) + return(optm) +} + +fx_objective <- function(x, coeff, alpha, inflexion, x_hist_carryover, get_sum = TRUE) { + # Apply Michaelis Menten model to scale spend to exposure + # if (criteria) { + # xScaled <- mic_men(x = x, Vmax = vmax, Km = km) # vmax * x / (km + x) + # } else if (chnName %in% names(mm_lm_coefs)) { + # xScaled <- x * mm_lm_coefs[chnName] + # } else { + # xScaled <- x + # } + + # Adstock scales + xAdstocked <- x + mean(x_hist_carryover) + # Hill transformation + if (get_sum) { + xOut <- coeff * sum((1 + inflexion**alpha / xAdstocked**alpha)**-1) + } else { + xOut <- coeff * ((1 + inflexion**alpha / xAdstocked**alpha)**-1) + } + return(xOut) +} + +# https://www.derivative-calculator.net/ on the objective function 1/(1+gamma^alpha / x^alpha) +fx_gradient <- function(x, coeff, alpha, inflexion, x_hist_carryover + # , chnName, vmax, km, criteria +) { + # Apply Michaelis Menten model to scale spend to exposure + # if (criteria) { + # xScaled <- mic_men(x = x, Vmax = vmax, Km = km) # vmax * x / (km + x) + # } else if (chnName %in% names(mm_lm_coefs)) { + # xScaled <- x * mm_lm_coefs[chnName] + # } else { + # xScaled <- x + # } + + # Adstock scales + xAdstocked <- x + mean(x_hist_carryover) + xOut <- -coeff * sum((alpha * (inflexion**alpha) * (xAdstocked**(alpha - 1))) / (xAdstocked**alpha + inflexion**alpha)**2) + return(xOut) +} + +fx_objective.chanel <- function(x, coeff, alpha, inflexion, x_hist_carryover + # , chnName, vmax, km, criteria +) { + # Apply Michaelis Menten model to scale spend to exposure + # if (criteria) { + # xScaled <- mic_men(x = x, Vmax = vmax, Km = km) # vmax * x / (km + x) + # } else if (chnName %in% names(mm_lm_coefs)) { + # xScaled <- x * mm_lm_coefs[chnName] + # } else { + # xScaled <- x + # } + + # Adstock scales + xAdstocked <- x + mean(x_hist_carryover) + xOut <- -coeff * sum((1 + inflexion**alpha / xAdstocked**alpha)**-1) + return(xOut) +} + +eval_g_eq <- function(X, target_value) { + eval_list <- getOption("ROBYN_TEMP") + constr <- sum(X) - eval_list$total_budget_unit + grad <- rep(1, length(X)) + return(list( + "constraints" = constr, + "jacobian" = grad + )) +} + +eval_g_ineq <- function(X, target_value) { + eval_list <- getOption("ROBYN_TEMP") + constr <- sum(X) - eval_list$total_budget_unit + grad <- rep(1, length(X)) + return(list( + "constraints" = constr, + "jacobian" = grad + )) +} + +eval_g_eq_effi <- function(X, target_value) { + eval_list <- 
getOption("ROBYN_TEMP") + sum_response <- sum(mapply( + fx_objective, + x = X, + coeff = eval_list$coefs_eval, + alpha = eval_list$alphas_eval, + inflexion = eval_list$inflexions_eval, + x_hist_carryover = eval_list$hist_carryover_eval, + SIMPLIFY = TRUE + )) + + if (is.null(target_value)) { + if (eval_list$dep_var_type == "conversion") { + constr <- sum(X) - sum_response * eval_list$target_value + } else { + constr <- sum(X) - sum_response / eval_list$target_value + } + } else { + if (eval_list$dep_var_type == "conversion") { + constr <- sum(X) - sum_response * target_value + } else { + constr <- sum(X) - sum_response / target_value + } + } + + grad <- rep(1, length(X)) - mapply( + fx_gradient, + x = X, + coeff = eval_list$coefs_eval, + alpha = eval_list$alphas_eval, + inflexion = eval_list$inflexions_eval, + x_hist_carryover = eval_list$hist_carryover_eval, + SIMPLIFY = TRUE + ) + + # constr <- sum(X) - eval_list$total_budget_unit + # grad <- rep(1, length(X)) + return(list( + "constraints" = constr, + "jacobian" = grad + )) +} + + +get_adstock_params <- function(InputCollect, dt_hyppar) { + if (InputCollect$adstock == "geometric") { + getAdstockHypPar <- unlist(select(dt_hyppar, na.omit(str_extract(names(dt_hyppar), ".*_thetas")))) + } else if (InputCollect$adstock %in% c("weibull_cdf", "weibull_pdf")) { + getAdstockHypPar <- unlist(select(dt_hyppar, na.omit(str_extract(names(dt_hyppar), ".*_shapes|.*_scales")))) + } + return(getAdstockHypPar) +} + +get_hill_params <- function(InputCollect, OutputCollect = NULL, dt_hyppar, dt_coef, mediaSpendSorted, select_model, chnAdstocked = NULL) { + hillHypParVec <- unlist(select(dt_hyppar, na.omit(str_extract(names(dt_hyppar), ".*_alphas|.*_gammas")))) + alphas <- hillHypParVec[paste0(mediaSpendSorted, "_alphas")] + gammas <- hillHypParVec[paste0(mediaSpendSorted, "_gammas")] + if (is.null(chnAdstocked)) { + chnAdstocked <- filter( + OutputCollect$mediaVecCollect, + .data$type == "adstockedMedia", + .data$solID == select_model + ) %>% + select(all_of(mediaSpendSorted)) %>% + slice(InputCollect$rollingWindowStartWhich:InputCollect$rollingWindowEndWhich) + } + inflexions <- unlist(lapply(seq(ncol(chnAdstocked)), function(i) { + c(range(chnAdstocked[, i]) %*% c(1 - gammas[i], gammas[i])) + })) + names(inflexions) <- names(gammas) + coefs <- dt_coef$coef + names(coefs) <- dt_coef$rn + coefs_sorted <- coefs[mediaSpendSorted] + return(list( + alphas = alphas, + inflexions = inflexions, + coefs_sorted = coefs_sorted + )) +} diff --git a/R/auxiliary.R b/R/auxiliary.R index 51369ad..ed32018 100644 --- a/R/auxiliary.R +++ b/R/auxiliary.R @@ -1,96 +1,96 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# Calculate R-squared -get_rsq <- function(true, predicted, p = NULL, df.int = NULL, n_train = NULL) { - sse <- sum((predicted - true)^2) - sst <- sum((true - mean(true))^2) - rsq <- 1 - sse / sst # rsq interpreted as variance explained - rsq_out <- rsq - if (!is.null(p) && !is.null(df.int)) { - if (!is.null(n_train)) { - n <- n_train # for oos dataset, use n from train set for adj. 
rsq - } else { - n <- length(true) - } - rdf <- n - p - 1 - rsq_adj <- 1 - (1 - rsq) * ((n - df.int) / rdf) - rsq_out <- rsq_adj - } - return(rsq_out) -} - -# Robyn colors -robyn_palette <- function() { - pal <- c( - "#21130d", "#351904", "#543005", "#8C510A", "#BF812D", "#DFC27D", "#F6E8C3", - "#F5F5F5", "#C7EAE5", "#80CDC1", "#35978F", "#01665E", "#043F43", "#04272D" - ) - repeated <- 4 - list( - fill = rep(pal, repeated), - colour = rep(c(rep("#FFFFFF", 4), rep("#000000", 7), rep("#FFFFFF", 3)), repeated) - ) -} -# lares::plot_palette( -# fill = robyn_palette()$fill, colour = robyn_palette()$colour, -# limit = length(unique(robyn_palette()$fill))) - -flatten_hyps <- function(x) { - if (is.null(x)) { - return(x) - } - temp <- unlist(lapply(x, function(x) { - sprintf("[%s]", paste(if (is.numeric(x)) signif(x, 6) else x, collapse = ", ")) - })) - paste(paste0(" ", names(temp), ":"), temp, collapse = "\n") -} - -#################################################################### -#' Update Robyn Version -#' -#' Update Robyn version from -#' \href{https://github.com/facebookexperimental/Robyn}{Github repository} -#' for latest "dev" version or from -#' \href{https://CRAN.R-project.org/package=Robyn}{CRAN} -#' for latest "stable" version. -#' -#' @param dev Boolean. Dev version? If not, CRAN version. -#' @param ... Parameters to pass to \code{remotes::install_github} -#' or \code{utils::install.packages}, depending on \code{dev} parameter. -#' @return Invisible \code{NULL}. -#' @export -robyn_update <- function(dev = TRUE, ...) { - if (dev) { - try_require("remotes") - # options(timeout = 400) - install_github(repo = "facebookexperimental/Robyn/R", ...) - } else { - utils::install.packages("Robyn", ...) - } -} - -# Merge baseline variables based on baseline_level param input -baseline_vars <- function(InputCollect, baseline_level) { - stopifnot(length(baseline_level) == 1) - stopifnot(baseline_level %in% 0:5) - x <- "" - if (baseline_level >= 1) { - x <- c(x, "(Intercept)", "intercept") - } - if (baseline_level >= 2) { - x <- c(x, "trend") - } - if (baseline_level >= 3) { - x <- unique(c(x, InputCollect$prophet_vars)) - } - if (baseline_level >= 4) { - x <- c(x, InputCollect$context_vars) - } - if (baseline_level >= 5) { - x <- c(x, InputCollect$organic_vars) - } - return(x) -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# Calculate R-squared +get_rsq <- function(true, predicted, p = NULL, df.int = NULL, n_train = NULL) { + sse <- sum((predicted - true)^2) + sst <- sum((true - mean(true))^2) + rsq <- 1 - sse / sst # rsq interpreted as variance explained + rsq_out <- rsq + if (!is.null(p) && !is.null(df.int)) { + if (!is.null(n_train)) { + n <- n_train # for oos dataset, use n from train set for adj. 
rsq + } else { + n <- length(true) + } + rdf <- n - p - 1 + rsq_adj <- 1 - (1 - rsq) * ((n - df.int) / rdf) + rsq_out <- rsq_adj + } + return(rsq_out) +} + +# Robyn colors +robyn_palette <- function() { + pal <- c( + "#21130d", "#351904", "#543005", "#8C510A", "#BF812D", "#DFC27D", "#F6E8C3", + "#F5F5F5", "#C7EAE5", "#80CDC1", "#35978F", "#01665E", "#043F43", "#04272D" + ) + repeated <- 4 + list( + fill = rep(pal, repeated), + colour = rep(c(rep("#FFFFFF", 4), rep("#000000", 7), rep("#FFFFFF", 3)), repeated) + ) +} +# lares::plot_palette( +# fill = robyn_palette()$fill, colour = robyn_palette()$colour, +# limit = length(unique(robyn_palette()$fill))) + +flatten_hyps <- function(x) { + if (is.null(x)) { + return(x) + } + temp <- unlist(lapply(x, function(x) { + sprintf("[%s]", paste(if (is.numeric(x)) signif(x, 6) else x, collapse = ", ")) + })) + paste(paste0(" ", names(temp), ":"), temp, collapse = "\n") +} + +#################################################################### +#' Update Robyn Version +#' +#' Update Robyn version from +#' \href{https://github.com/facebookexperimental/Robyn}{Github repository} +#' for latest "dev" version or from +#' \href{https://CRAN.R-project.org/package=Robyn}{CRAN} +#' for latest "stable" version. +#' +#' @param dev Boolean. Dev version? If not, CRAN version. +#' @param ... Parameters to pass to \code{remotes::install_github} +#' or \code{utils::install.packages}, depending on \code{dev} parameter. +#' @return Invisible \code{NULL}. +#' @export +robyn_update <- function(dev = TRUE, ...) { + if (dev) { + try_require("remotes") + # options(timeout = 400) + install_github(repo = "facebookexperimental/Robyn/R", ...) + } else { + utils::install.packages("Robyn", ...) + } +} + +# Merge baseline variables based on baseline_level param input +baseline_vars <- function(InputCollect, baseline_level) { + stopifnot(length(baseline_level) == 1) + stopifnot(baseline_level %in% 0:5) + x <- "" + if (baseline_level >= 1) { + x <- c(x, "(Intercept)", "intercept") + } + if (baseline_level >= 2) { + x <- c(x, "trend") + } + if (baseline_level >= 3) { + x <- unique(c(x, InputCollect$prophet_vars)) + } + if (baseline_level >= 4) { + x <- c(x, InputCollect$context_vars) + } + if (baseline_level >= 5) { + x <- c(x, InputCollect$organic_vars) + } + return(x) +} diff --git a/R/calibration.R b/R/calibration.R index 16be68f..f0cc4c2 100644 --- a/R/calibration.R +++ b/R/calibration.R @@ -1,135 +1,135 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -robyn_calibrate <- function(calibration_input, - df_raw, # df_raw = InputCollect$dt_mod - dayInterval, # dayInterval = InputCollect$dayInterval - xDecompVec, # xDecompVec = decompCollect$xDecompVec - coefs, # coefs = decompCollect$coefsOutCat - hypParamSam, - wind_start = 1, - wind_end = nrow(df_raw), - adstock) { - ds_wind <- df_raw$ds[wind_start:wind_end] - include_study <- any( - calibration_input$liftStartDate >= min(ds_wind) & - calibration_input$liftEndDate <= (max(ds_wind) + dayInterval - 1) - ) - if (!is.null(calibration_input) & !include_study) { - warning("All calibration_input in outside modelling window. 
Running without calibration") - } else if (!is.null(calibration_input) & include_study) { - calibration_input <- mutate( - calibration_input, - pred = NA, pred_total = NA, decompStart = NA, decompEnd = NA - ) - split_channels <- strsplit(calibration_input$channel, split = "\\+") - - for (l_study in seq_along(split_channels)) { - get_channels <- split_channels[[l_study]] - scope <- calibration_input$calibration_scope[[l_study]] - study_start <- calibration_input$liftStartDate[[l_study]] - study_end <- calibration_input$liftEndDate[[l_study]] - study_pos <- which(df_raw$ds >= study_start & df_raw$ds <= study_end) - if (study_start %in% df_raw$ds) { - calib_pos <- study_pos - } else { - calib_pos <- c(min(study_pos) - 1, study_pos) - } - calibrate_dates <- df_raw[calib_pos, "ds"][[1]] - calib_pos_rw <- which(xDecompVec$ds %in% calibrate_dates) - - l_chn_collect <- list() - l_chn_total_collect <- list() - for (l_chn in seq_along(get_channels)) { # l_chn =1 - if (scope == "immediate") { - m <- df_raw[, get_channels[l_chn]][[1]] - # m_calib <- df_raw[calib_pos, get_channels[l_chn]][[1]] - - ## 1. Adstock - if (adstock == "geometric") { - theta <- hypParamSam[paste0(get_channels[l_chn], "_thetas")][[1]][[1]] - } - if (grepl("weibull", adstock)) { - shape <- hypParamSam[paste0(get_channels[l_chn], "_shapes")][[1]][[1]] - scale <- hypParamSam[paste0(get_channels[l_chn], "_scales")][[1]][[1]] - } - x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale) - if (adstock == "weibull_pdf") { - m_imme <- x_list$x_imme - } else { - m_imme <- m - } - m_total <- x_list$x_decayed - m_caov <- m_total - m_imme - - ## 2. Saturation - m_caov_calib <- m_caov[calib_pos] - m_total_rw <- m_total[wind_start:wind_end] - alpha <- hypParamSam[paste0(get_channels[l_chn], "_alphas")][[1]][[1]] - gamma <- hypParamSam[paste0(get_channels[l_chn], "_gammas")][[1]][[1]] - m_calib_caov_sat <- saturation_hill( - m_total_rw, - alpha = alpha, gamma = gamma, x_marginal = m_caov_calib - ) - m_calib_caov_decomp <- m_calib_caov_sat * coefs$s0[coefs$rn == get_channels[l_chn]] - m_calib_total_decomp <- xDecompVec[calib_pos_rw, get_channels[l_chn]] - m_calib_decomp <- m_calib_total_decomp - m_calib_caov_decomp - } - if (scope == "total") { - m_calib_decomp <- m_calib_total_decomp <- xDecompVec[calib_pos_rw, get_channels[l_chn]] - } - l_chn_collect[[get_channels[l_chn]]] <- m_calib_decomp - l_chn_total_collect[[get_channels[l_chn]]] <- m_calib_total_decomp - } - - if (length(get_channels) > 1) { - l_chn_collect <- rowSums(bind_cols(l_chn_collect)) - l_chn_total_collect <- rowSums(bind_cols(l_chn_total_collect)) - } else { - l_chn_collect <- unlist(l_chn_collect, use.names = FALSE) - l_chn_total_collect <- unlist(l_chn_total_collect, use.names = FALSE) - } - - calibration_input[l_study, ] <- mutate( - calibration_input[l_study, ], - pred = sum(l_chn_collect), - pred_total = sum(l_chn_total_collect), - decompStart = range(calibrate_dates)[1], - decompEnd = range(calibrate_dates)[2] - ) - } - liftCollect <- calibration_input %>% - mutate( - decompStart = as.Date(.data$decompStart, "1970-01-01"), - decompEnd = as.Date(.data$decompEnd, "1970-01-01") - ) %>% - mutate( - liftDays = as.numeric( - difftime(.data$liftEndDate, .data$liftStartDate, units = "days") - ), - decompDays = as.numeric( - difftime(.data$decompEnd, .data$decompStart, units = "days") - ) - ) %>% - mutate( - decompAbsScaled = .data$pred / .data$decompDays * .data$liftDays, - decompAbsTotalScaled = .data$pred_total / .data$decompDays * .data$liftDays 
- ) %>% - mutate( - liftMedia = .data$channel, - liftStart = .data$liftStartDate, - liftEnd = .data$liftEndDate, - mape_lift = abs((.data$decompAbsScaled - .data$liftAbs) / .data$liftAbs), - calibrated_pct = .data$decompAbsScaled / .data$decompAbsTotalScaled - ) %>% - dplyr::select( - .data$liftMedia, .data$liftStart, .data$liftEnd, .data$liftAbs, - .data$decompStart, .data$decompEnd, .data$decompAbsScaled, - .data$decompAbsTotalScaled, .data$calibrated_pct, .data$mape_lift - ) - - return(liftCollect) - } -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +robyn_calibrate <- function(calibration_input, + df_raw, # df_raw = InputCollect$dt_mod + dayInterval, # dayInterval = InputCollect$dayInterval + xDecompVec, # xDecompVec = decompCollect$xDecompVec + coefs, # coefs = decompCollect$coefsOutCat + hypParamSam, + wind_start = 1, + wind_end = nrow(df_raw), + adstock) { + ds_wind <- df_raw$ds[wind_start:wind_end] + include_study <- any( + calibration_input$liftStartDate >= min(ds_wind) & + calibration_input$liftEndDate <= (max(ds_wind) + dayInterval - 1) + ) + if (!is.null(calibration_input) & !include_study) { + warning("All calibration_input in outside modelling window. Running without calibration") + } else if (!is.null(calibration_input) & include_study) { + calibration_input <- mutate( + calibration_input, + pred = NA, pred_total = NA, decompStart = NA, decompEnd = NA + ) + split_channels <- strsplit(calibration_input$channel, split = "\\+") + + for (l_study in seq_along(split_channels)) { + get_channels <- split_channels[[l_study]] + scope <- calibration_input$calibration_scope[[l_study]] + study_start <- calibration_input$liftStartDate[[l_study]] + study_end <- calibration_input$liftEndDate[[l_study]] + study_pos <- which(df_raw$ds >= study_start & df_raw$ds <= study_end) + if (study_start %in% df_raw$ds) { + calib_pos <- study_pos + } else { + calib_pos <- c(min(study_pos) - 1, study_pos) + } + calibrate_dates <- df_raw[calib_pos, "ds"][[1]] + calib_pos_rw <- which(xDecompVec$ds %in% calibrate_dates) + + l_chn_collect <- list() + l_chn_total_collect <- list() + for (l_chn in seq_along(get_channels)) { # l_chn =1 + if (scope == "immediate") { + m <- df_raw[, get_channels[l_chn]][[1]] + # m_calib <- df_raw[calib_pos, get_channels[l_chn]][[1]] + + ## 1. Adstock + if (adstock == "geometric") { + theta <- hypParamSam[paste0(get_channels[l_chn], "_thetas")][[1]][[1]] + } + if (grepl("weibull", adstock)) { + shape <- hypParamSam[paste0(get_channels[l_chn], "_shapes")][[1]][[1]] + scale <- hypParamSam[paste0(get_channels[l_chn], "_scales")][[1]][[1]] + } + x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale) + if (adstock == "weibull_pdf") { + m_imme <- x_list$x_imme + } else { + m_imme <- m + } + m_total <- x_list$x_decayed + m_caov <- m_total - m_imme + + ## 2. 
Saturation + m_caov_calib <- m_caov[calib_pos] + m_total_rw <- m_total[wind_start:wind_end] + alpha <- hypParamSam[paste0(get_channels[l_chn], "_alphas")][[1]][[1]] + gamma <- hypParamSam[paste0(get_channels[l_chn], "_gammas")][[1]][[1]] + m_calib_caov_sat <- saturation_hill( + m_total_rw, + alpha = alpha, gamma = gamma, x_marginal = m_caov_calib + ) + m_calib_caov_decomp <- m_calib_caov_sat * coefs$s0[coefs$rn == get_channels[l_chn]] + m_calib_total_decomp <- xDecompVec[calib_pos_rw, get_channels[l_chn]] + m_calib_decomp <- m_calib_total_decomp - m_calib_caov_decomp + } + if (scope == "total") { + m_calib_decomp <- m_calib_total_decomp <- xDecompVec[calib_pos_rw, get_channels[l_chn]] + } + l_chn_collect[[get_channels[l_chn]]] <- m_calib_decomp + l_chn_total_collect[[get_channels[l_chn]]] <- m_calib_total_decomp + } + + if (length(get_channels) > 1) { + l_chn_collect <- rowSums(bind_cols(l_chn_collect)) + l_chn_total_collect <- rowSums(bind_cols(l_chn_total_collect)) + } else { + l_chn_collect <- unlist(l_chn_collect, use.names = FALSE) + l_chn_total_collect <- unlist(l_chn_total_collect, use.names = FALSE) + } + + calibration_input[l_study, ] <- mutate( + calibration_input[l_study, ], + pred = sum(l_chn_collect), + pred_total = sum(l_chn_total_collect), + decompStart = range(calibrate_dates)[1], + decompEnd = range(calibrate_dates)[2] + ) + } + liftCollect <- calibration_input %>% + mutate( + decompStart = as.Date(.data$decompStart, "1970-01-01"), + decompEnd = as.Date(.data$decompEnd, "1970-01-01") + ) %>% + mutate( + liftDays = as.numeric( + difftime(.data$liftEndDate, .data$liftStartDate, units = "days") + ), + decompDays = as.numeric( + difftime(.data$decompEnd, .data$decompStart, units = "days") + ) + ) %>% + mutate( + decompAbsScaled = .data$pred / .data$decompDays * .data$liftDays, + decompAbsTotalScaled = .data$pred_total / .data$decompDays * .data$liftDays + ) %>% + mutate( + liftMedia = .data$channel, + liftStart = .data$liftStartDate, + liftEnd = .data$liftEndDate, + mape_lift = abs((.data$decompAbsScaled - .data$liftAbs) / .data$liftAbs), + calibrated_pct = .data$decompAbsScaled / .data$decompAbsTotalScaled + ) %>% + dplyr::select( + .data$liftMedia, .data$liftStart, .data$liftEnd, .data$liftAbs, + .data$decompStart, .data$decompEnd, .data$decompAbsScaled, + .data$decompAbsTotalScaled, .data$calibrated_pct, .data$mape_lift + ) + + return(liftCollect) + } +} diff --git a/R/checks.R b/R/checks.R index 34a0c91..c504793 100644 --- a/R/checks.R +++ b/R/checks.R @@ -1,1101 +1,1101 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -############# Auxiliary non-exported functions ############# - -OPTS_PDN <- c("positive", "negative", "default") -HYPS_NAMES <- c("thetas", "shapes", "scales", "alphas", "gammas", "penalty") -HYPS_OTHERS <- c("lambda", "train_size") -LEGACY_PARAMS <- c("cores", "iterations", "trials", "intercept_sign", "nevergrad_algo") - -check_nas <- function(df, channels = NULL) { - if (!is.null(channels)) df <- select(df, all_of(channels)) - name <- deparse(substitute(df)) - if (sum(is.na(df)) > 0) { - naVals <- lares::missingness(df) - strs <- sprintf("%s (%s | %s%%)", naVals$variable, naVals$missing, naVals$missingness) - stop(paste0( - "Dataset ", name, " contains missing (NA) values. 
", - "These values must be removed or fixed for Robyn to properly work.\n Missing values: ", - paste(strs, collapse = ", ") - )) - } - have_inf <- unlist(lapply(df, function(x) sum(is.infinite(x)))) - if (any(have_inf > 0)) { - stop(paste0( - "Dataset ", name, " contains Inf values. ", - "These values must be removed or fixed for Robyn to properly work.\n Check: ", - paste(names(which(have_inf > 0)), collapse = ", ") - )) - } -} - -check_novar <- function(dt_input, InputCollect = NULL) { - novar <- lares::zerovar(dt_input) - if (length(novar) > 0) { - msg <- sprintf( - "There are %s column(s) with no-variance: %s. \nPlease, remove the variable(s) to proceed...", - length(novar), v2t(novar) - ) - if (!is.null(InputCollect)) { - msg <- sprintf( - "%s\n>>> Note: there's no variance on these variables because of the modeling window filter (%s:%s)", - msg, - InputCollect$window_start, - InputCollect$window_end - ) - } - stop(msg) - } -} - -check_allneg <- function(df) { - all_negative <- unlist(lapply(df, function(x) all(x <= 0))) - df <- mutate_at(df, names(which(all_negative)), function(x) abs(x)) - return(df) -} - -check_varnames <- function(dt_input, dt_holidays, - dep_var, date_var, - context_vars, paid_media_spends, - organic_vars) { - dfs <- list(dt_input = dt_input, dt_holidays = dt_holidays) - for (i in seq_along(dfs)) { - # Which names to check by data.frame - table_name <- names(dfs[i]) - if (table_name == "dt_input") { - vars <- c( - dep_var, date_var, context_vars, - paid_media_spends, organic_vars, "auto" - ) - } - if (table_name == "dt_holidays") { - vars <- c("ds", "country") # holiday? - } - df <- dfs[[i]] - vars <- vars[vars != "auto"] - # Duplicate names - if (length(vars) != length(unique(vars))) { - these <- names(table(vars)[table(vars) > 1]) - stop(paste( - "You have duplicated variable names for", table_name, "in different parameters.", - "Check:", paste(these, collapse = ", ") - )) - } - # Names with spaces - with_space <- grepl(" ", vars) - if (sum(with_space) > 0) { - stop(paste( - "You have invalid variable names on", table_name, "with spaces.\n ", - "Please fix columns:", v2t(vars[with_space]) - )) - } - } -} - -check_datevar <- function(dt_input, date_var = "auto") { - if (date_var[1] == "auto") { - is_date <- which(unlist(lapply(dt_input, is.Date))) - if (length(is_date) == 1) { - date_var <- names(is_date) - message(paste("Automatically detected 'date_var':", date_var)) - } else { - stop("Can't automatically find a single date variable to set 'date_var'") - } - } - if (is.null(date_var) || length(date_var) > 1 || !(date_var %in% names(dt_input))) { - stop("You must provide only 1 correct date variable name for 'date_var'") - } - dt_input <- data.frame(arrange(dt_input, as.factor(!!as.symbol(date_var)))) - dt_input[, date_var] <- as.Date(dt_input[[date_var]], origin = "1970-01-01") - date_var_dates <- c( - as.Date(dt_input[, date_var][[1]], origin = "1970-01-01"), - as.Date(dt_input[, date_var][[2]], origin = "1970-01-01") - ) - if (any(table(date_var_dates) > 1)) { - stop("Date variable shouldn't have duplicated dates (panel data)") - } - if (any(is.na(date_var_dates)) || any(is.infinite(date_var_dates))) { - stop("Dates in 'date_var' must have format '2020-12-31' and can't contain NA nor Inf values") - } - dayInterval <- as.integer(difftime( - date_var_dates[2], - date_var_dates[1], - units = "days" - )) - intervalType <- if (dayInterval == 1) { - "day" - } else if (dayInterval == 7) { - "week" - } else if (dayInterval %in% 28:31) { - "month" - } else { - 
stop(paste(date_var, "data has to be daily, weekly or monthly")) - } - output <- list( - date_var = date_var, - dayInterval = dayInterval, - intervalType = intervalType, - dt_input = as_tibble(dt_input) - ) - invisible(return(output)) -} - -check_depvar <- function(dt_input, dep_var, dep_var_type) { - if (is.null(dep_var)) { - stop("Must provide a valid dependent variable name for 'dep_var'") - } - if (!dep_var %in% names(dt_input)) { - stop("Must provide a valid dependent name for 'dep_var'") - } - if (length(dep_var) > 1) { - stop("Must provide only 1 dependent variable name for 'dep_var'") - } - if (!(is.numeric(dt_input[, dep_var][[1]]) || is.integer(dt_input[, dep_var][[1]]))) { - stop("'dep_var' must be a numeric or integer variable") - } - if (is.null(dep_var_type)) { - stop("Must provide a dependent variable type for 'dep_var_type'") - } - if (!dep_var_type %in% c("conversion", "revenue") || length(dep_var_type) != 1) { - stop("'dep_var_type' must be 'conversion' or 'revenue'") - } -} - -check_prophet <- function(dt_holidays, prophet_country, prophet_vars, prophet_signs, dayInterval) { - check_vector(prophet_vars) - check_vector(prophet_signs) - if (is.null(dt_holidays) || is.null(prophet_vars)) { - return(invisible(NULL)) - } else { - prophet_vars <- tolower(prophet_vars) - opts <- c("trend", "season", "monthly", "weekday", "holiday") - if (!"holiday" %in% prophet_vars) { - if (!is.null(prophet_country)) { - warning(paste( - "Input 'prophet_country' is defined as", prophet_country, - "but 'holiday' is not setup within 'prophet_vars' parameter" - )) - } - prophet_country <- NULL - } - if (!all(prophet_vars %in% opts)) { - stop("Allowed values for 'prophet_vars' are: ", paste(opts, collapse = ", ")) - } - if ("weekday" %in% prophet_vars && dayInterval > 7) { - warning("Ignoring prophet_vars = 'weekday' input given your data granularity") - } - if ("holiday" %in% prophet_vars && ( - is.null(prophet_country) || length(prophet_country) > 1 | - isTRUE(!prophet_country %in% unique(dt_holidays$country)))) { - stop(paste( - "You must provide 1 country code in 'prophet_country' input.", - length(unique(dt_holidays$country)), "countries are included:", - paste(unique(dt_holidays$country), collapse = ", "), - "\nIf your country is not available, manually include data to 'dt_holidays'", - "or remove 'holidays' from 'prophet_vars' input." - )) - } - if (is.null(prophet_signs)) { - prophet_signs <- rep("default", length(prophet_vars)) - } - if (length(prophet_signs) == 1) { - prophet_signs <- rep(prophet_signs, length(prophet_vars)) - } - if (!all(prophet_signs %in% OPTS_PDN)) { - stop("Allowed values for 'prophet_signs' are: ", paste(OPTS_PDN, collapse = ", ")) - } - if (length(prophet_signs) != length(prophet_vars)) { - stop("'prophet_signs' must have same length as 'prophet_vars'") - } - return(invisible(prophet_signs)) - } -} - -check_context <- function(dt_input, context_vars, context_signs) { - if (!is.null(context_vars)) { - if (is.null(context_signs)) context_signs <- rep("default", length(context_vars)) - if (!all(context_signs %in% OPTS_PDN)) { - stop("Allowed values for 'context_signs' are: ", paste(OPTS_PDN, collapse = ", ")) - } - if (length(context_signs) != length(context_vars)) { - stop("Input 'context_signs' must have same length as 'context_vars'") - } - temp <- context_vars %in% names(dt_input) - if (!all(temp)) { - stop(paste( - "Input 'context_vars' not included in data. 
Check:", - v2t(context_vars[!temp]) - )) - } - return(invisible(list(context_signs = context_signs))) - } -} - -check_vector <- function(x) { - if (!is.null(names(x)) || is.list(x)) { - stop(sprintf("Input '%s' must be a valid vector", deparse(substitute(x)))) - } -} - -check_paidmedia <- function(dt_input, paid_media_vars, paid_media_signs, paid_media_spends) { - if (is.null(paid_media_spends)) { - stop("Must provide 'paid_media_spends'") - } - check_vector(paid_media_vars) - check_vector(paid_media_signs) - check_vector(paid_media_spends) - expVarCount <- length(paid_media_vars) - spendVarCount <- length(paid_media_spends) - - temp <- paid_media_vars %in% names(dt_input) - if (!all(temp)) { - stop(paste( - "Input 'paid_media_vars' not included in data. Check:", - v2t(paid_media_vars[!temp]) - )) - } - temp <- paid_media_spends %in% names(dt_input) - if (!all(temp)) { - stop(paste( - "Input 'paid_media_spends' not included in data. Check:", - v2t(paid_media_spends[!temp]) - )) - } - if (is.null(paid_media_signs)) { - paid_media_signs <- rep("positive", expVarCount) - } - if (!all(paid_media_signs %in% OPTS_PDN)) { - stop("Allowed values for 'paid_media_signs' are: ", paste(OPTS_PDN, collapse = ", ")) - } - if (length(paid_media_signs) == 1) { - paid_media_signs <- rep(paid_media_signs, length(paid_media_vars)) - } - if (length(paid_media_signs) != length(paid_media_vars)) { - stop("Input 'paid_media_signs' must have same length as 'paid_media_vars'") - } - if (spendVarCount != expVarCount) { - stop("Input 'paid_media_spends' must have same length as 'paid_media_vars'") - } - is_num <- unlist(lapply(dt_input[, paid_media_vars], is.numeric)) - if (!all(is_num)) { - stop("All your 'paid_media_vars' must be numeric. Check: ", v2t(paid_media_vars[!is_num])) - } - is_num <- unlist(lapply(dt_input[, paid_media_spends], is.numeric)) - if (!all(is_num)) { - stop("All your 'paid_media_spends' must be numeric. Check: ", v2t(paid_media_spends[!is_num])) - } - get_cols <- any(dt_input[, unique(c(paid_media_vars, paid_media_spends))] < 0) - if (get_cols) { - check_media_names <- unique(c(paid_media_vars, paid_media_spends)) - df_check <- dt_input[, check_media_names] - check_media_val <- unlist(lapply(df_check, function(x) any(x < 0))) - stop( - paste(names(check_media_val)[check_media_val], collapse = ", "), - " contains negative values. Media must be >=0" - ) - } - return(invisible(list( - paid_media_signs = paid_media_signs, - expVarCount = expVarCount, - paid_media_vars = paid_media_vars - ))) -} - -check_organicvars <- function(dt_input, organic_vars, organic_signs) { - if (is.null(organic_vars)) { - return(invisible(NULL)) - } - check_vector(organic_vars) - check_vector(organic_signs) - temp <- organic_vars %in% names(dt_input) - if (!all(temp)) { - stop(paste( - "Input 'organic_vars' not included in data. Check:", - v2t(organic_vars[!temp]) - )) - } - if (!is.null(organic_vars) && is.null(organic_signs)) { - organic_signs <- rep("positive", length(organic_vars)) - # message("'organic_signs' were not provided. Using 'positive'") - } - if (!all(organic_signs %in% OPTS_PDN)) { - stop("Allowed values for 'organic_signs' are: ", paste(OPTS_PDN, collapse = ", ")) - } - if (length(organic_signs) != length(organic_vars)) { - stop("Input 'organic_signs' must have same length as 'organic_vars'") - } - is_num <- unlist(lapply(dt_input[, organic_vars], is.numeric)) - if (!all(is_num)) { - stop("All your 'organic_vars' must be numeric. 
Check: ", v2t(organic_vars[!is_num])) - } - return(invisible(list(organic_signs = organic_signs))) -} - -check_factorvars <- function(dt_input, factor_vars = NULL, context_vars = NULL) { - check_vector(factor_vars) - check_vector(context_vars) - temp <- select(dt_input, all_of(context_vars)) - are_not_numeric <- !sapply(temp, is.numeric) - if (any(are_not_numeric)) { - these <- are_not_numeric[!names(are_not_numeric) %in% factor_vars] - these <- these[these] - if (length(these) > 0) { - message("Automatically set these variables as 'factor_vars': ", v2t(names(these))) - factor_vars <- c(factor_vars, names(these)) - } - } - if (!is.null(factor_vars)) { - if (!all(factor_vars %in% context_vars)) { - stop("Input 'factor_vars' must be any from 'context_vars' inputs") - } - } - return(factor_vars) -} - -check_allvars <- function(all_ind_vars) { - if (length(all_ind_vars) != length(unique(all_ind_vars))) { - stop("All input variables must have unique names") - } -} - -check_datadim <- function(dt_input, all_ind_vars, rel = 10) { - num_obs <- nrow(dt_input) - if (num_obs < length(all_ind_vars) * rel) { - warning(paste( - "There are", length(all_ind_vars), "independent variables &", - num_obs, "data points.", "We recommend row:column ratio of", rel, "to 1" - )) - } - if (ncol(dt_input) <= 2) { - stop("Provide a valid 'dt_input' input with at least 3 columns or more") - } -} - -check_windows <- function(dt_input, date_var, all_media, window_start, window_end) { - dates_vec <- as.Date(dt_input[, date_var][[1]], origin = "1970-01-01") - - if (is.null(window_start)) { - window_start <- min(dates_vec) - } else { - window_start <- as.Date(as.character(window_start), "%Y-%m-%d", origin = "1970-01-01") - if (is.na(window_start)) { - stop(sprintf("Input 'window_start' must have date format, i.e. '%s'", Sys.Date())) - } else if (window_start < min(dates_vec)) { - window_start <- min(dates_vec) - message(paste( - "Input 'window_start' is smaller than the earliest date in input data.", - "It's automatically set to the earliest date:", window_start - )) - } else if (window_start > max(dates_vec)) { - stop("Input 'window_start' can't be larger than the the latest date in input data: ", max(dates_vec)) - } - } - - rollingWindowStartWhich <- which.min(abs(difftime( - dates_vec, - window_start, - units = "days" - ))) - if (!window_start %in% dates_vec) { - window_start <- dt_input[rollingWindowStartWhich, date_var][[1]] - message("Input 'window_start' is adapted to the closest date contained in input data: ", window_start) - } - refreshAddedStart <- window_start - - if (is.null(window_end)) { - window_end <- max(dates_vec) - } else { - window_end <- as.Date(as.character(window_end), "%Y-%m-%d", origin = "1970-01-01") - if (is.na(window_end)) { - stop(sprintf("Input 'window_end' must have date format, i.e. 
'%s'", Sys.Date())) - } else if (window_end > max(dates_vec)) { - window_end <- max(dates_vec) - message(paste( - "Input 'window_end' is larger than the latest date in input data.", - "It's automatically set to the latest date:", window_end - )) - } else if (window_end < window_start) { - window_end <- max(dates_vec) - message(paste( - "Input 'window_end' must be >= 'window_start.", - "It's automatically set to the latest date:", window_end - )) - } - } - - rollingWindowEndWhich <- which.min(abs(difftime(dates_vec, window_end, units = "days"))) - if (!(window_end %in% dates_vec)) { - window_end <- dt_input[rollingWindowEndWhich, date_var][[1]] - message("Input 'window_end' is adapted to the closest date contained in input data: ", window_end) - } - rollingWindowLength <- rollingWindowEndWhich - rollingWindowStartWhich + 1 - - dt_init <- dt_input[rollingWindowStartWhich:rollingWindowEndWhich, all_media] - - init_all0 <- dplyr::select_if(dt_init, is.numeric) %>% colSums(.) == 0 - if (any(init_all0)) { - stop( - "These media channels contains only 0 within training period ", - dt_input[rollingWindowStartWhich, date_var][[1]], " to ", - dt_input[rollingWindowEndWhich, date_var][[1]], ": ", - paste(names(dt_init)[init_all0], collapse = ", "), - "\nRecommendation: adapt InputCollect$window_start, remove or combine these channels" - ) - } - output <- list( - dt_input = dt_input, - window_start = window_start, - rollingWindowStartWhich = rollingWindowStartWhich, - refreshAddedStart = refreshAddedStart, - window_end = window_end, - rollingWindowEndWhich = rollingWindowEndWhich, - rollingWindowLength = rollingWindowLength - ) - return(invisible(output)) -} - -check_adstock <- function(adstock) { - if (is.null(adstock)) { - stop("Input 'adstock' can't be NULL. Set any of: 'geometric', 'weibull_cdf' or 'weibull_pdf'") - } - if (adstock == "weibull") adstock <- "weibull_cdf" - if (!adstock %in% c("geometric", "weibull_cdf", "weibull_pdf")) { - stop("Input 'adstock' must be 'geometric', 'weibull_cdf' or 'weibull_pdf'") - } - return(adstock) -} - -check_hyperparameters <- function(hyperparameters = NULL, adstock = NULL, - paid_media_spends = NULL, organic_vars = NULL, - exposure_vars = NULL, prophet_vars = NULL, - contextual_vars = NULL) { - if (is.null(hyperparameters)) { - message(paste( - "Input 'hyperparameters' not provided yet. 
To include them, run", - "robyn_inputs(InputCollect = InputCollect, hyperparameters = ...)" - )) - } else { - if (!"train_size" %in% names(hyperparameters)) { - hyperparameters[["train_size"]] <- c(0.5, 0.8) - warning("Automatically added missing hyperparameter range: 'train_size' = c(0.5, 0.8)") - } - # Non-adstock hyperparameters check - check_train_size(hyperparameters) - # Adstock hyperparameters check - hyperparameters_ordered <- hyperparameters[order(names(hyperparameters))] - get_hyp_names <- names(hyperparameters_ordered) - original_order <- sapply(names(hyperparameters), function(x) which(x == get_hyp_names)) - ref_hyp_name_spend <- hyper_names(adstock, all_media = paid_media_spends) - ref_hyp_name_expo <- hyper_names(adstock, all_media = exposure_vars) - ref_hyp_name_org <- hyper_names(adstock, all_media = organic_vars) - ref_hyp_name_other <- get_hyp_names[get_hyp_names %in% HYPS_OTHERS] - # Excluding lambda (first HYPS_OTHERS) given its range is not customizable - ref_all_media <- sort(c(ref_hyp_name_spend, ref_hyp_name_org, HYPS_OTHERS)) - all_ref_names <- c(ref_hyp_name_spend, ref_hyp_name_expo, ref_hyp_name_org, HYPS_OTHERS) - all_ref_names <- all_ref_names[order(all_ref_names)] - # Adding penalty variations to the dictionary - if (any(grepl("_penalty", paste0(get_hyp_names)))) { - ref_hyp_name_penalties <- paste0( - c(paid_media_spends, organic_vars, prophet_vars, contextual_vars), "_penalty" - ) - all_ref_names <- c(all_ref_names, ref_hyp_name_penalties) - } else { - ref_hyp_name_penalties <- NULL - } - if (!all(get_hyp_names %in% all_ref_names)) { - wrong_hyp_names <- get_hyp_names[which(!(get_hyp_names %in% all_ref_names))] - stop( - "Input 'hyperparameters' contains following wrong names: ", - paste(wrong_hyp_names, collapse = ", ") - ) - } - total <- length(get_hyp_names) - total_in <- length(c(ref_hyp_name_spend, ref_hyp_name_org, ref_hyp_name_penalties, ref_hyp_name_other)) - if (total != total_in) { - stop(sprintf( - paste( - "%s hyperparameter values are required, and %s were provided.", - "\n Use hyper_names() function to help you with the correct hyperparameters names." 
- ), - total_in, total - )) - } - # Old workflow: replace exposure with spend hyperparameters - if (any(get_hyp_names %in% ref_hyp_name_expo)) { - get_expo_pos <- which(get_hyp_names %in% ref_hyp_name_expo) - get_hyp_names[get_expo_pos] <- ref_all_media[get_expo_pos] - names(hyperparameters_ordered) <- get_hyp_names - } - check_hyper_limits(hyperparameters_ordered, "thetas") - check_hyper_limits(hyperparameters_ordered, "alphas") - check_hyper_limits(hyperparameters_ordered, "gammas") - check_hyper_limits(hyperparameters_ordered, "shapes") - check_hyper_limits(hyperparameters_ordered, "scales") - hyperparameters_unordered <- hyperparameters_ordered[original_order] - return(hyperparameters_unordered) - } -} - -check_train_size <- function(hyps) { - if ("train_size" %in% names(hyps)) { - if (!length(hyps$train_size) %in% 1:2) { - stop("Hyperparameter 'train_size' must be length 1 (fixed) or 2 (range)") - } - if (any(hyps$train_size <= 0.1) || any(hyps$train_size > 1)) { - stop("Hyperparameter 'train_size' values must be defined between 0.1 and 1") - } - } -} - -check_hyper_limits <- function(hyperparameters, hyper) { - hyper_which <- which(endsWith(names(hyperparameters), hyper)) - if (length(hyper_which) == 0) { - return(invisible(NULL)) - } - limits <- hyper_limits()[[hyper]] - for (i in hyper_which) { - values <- hyperparameters[[i]] - # Lower limit - ineq <- paste(values[1], limits[1], sep = "", collapse = "") - lower_pass <- eval(parse(text = ineq)) - if (!lower_pass) { - stop(sprintf("%s's hyperparameter must have lower bound %s", names(hyperparameters)[i], limits[1])) - } - # Upper limit - ineq <- paste(values[2], limits[2], sep = "", collapse = "") - upper_pass <- eval(parse(text = ineq)) | length(values) == 1 - if (!upper_pass) { - stop(sprintf("%s's hyperparameter must have upper bound %s", names(hyperparameters)[i], limits[2])) - } - # Order of limits - order_pass <- !isFALSE(values[1] <= values[2]) - if (!order_pass) { - stop(sprintf("%s's hyperparameter must have lower bound first and upper bound second", names(hyperparameters)[i])) - } - } -} - -check_calibration <- function(dt_input, date_var, calibration_input, dayInterval, dep_var, - window_start, window_end, paid_media_spends, organic_vars) { - if (!is.null(calibration_input)) { - calibration_input <- as_tibble(as.data.frame(calibration_input)) - these <- c("channel", "liftStartDate", "liftEndDate", "liftAbs", "spend", "confidence", "metric", "calibration_scope") - if (!all(these %in% names(calibration_input))) { - stop("Input 'calibration_input' must contain columns: ", v2t(these), ". Check the demo script for instruction.") - } - if (!is.numeric(calibration_input$liftAbs) || any(is.na(calibration_input$liftAbs))) { - stop("Check 'calibration_input$liftAbs': all lift values must be valid numerical numbers") - } - all_media <- c(paid_media_spends, organic_vars) - cal_media <- str_split(calibration_input$channel, "\\+|,|;|\\s") - if (!all(unlist(cal_media) %in% all_media)) { - these <- unique(unlist(cal_media)[which(!unlist(cal_media) %in% all_media)]) - stop(sprintf( - "All channels from 'calibration_input' must be any of: %s.\n Check: %s", - v2t(all_media), v2t(these) - )) - } - for (i in seq_along(calibration_input$channel)) { - temp <- calibration_input[i, ] - if (temp$liftStartDate < (window_start) || temp$liftEndDate > (window_end)) { - stop(sprintf( - paste( - "Your calibration's date range for %s between %s and %s is not within modeling window (%s to %s).", - "Please, remove this experiment from 'calibration_input'." 
- ), - temp$channel, temp$liftStartDate, temp$liftEndDate, window_start, window_end - )) - } - if (temp$liftStartDate > temp$liftEndDate) { - stop(sprintf( - paste( - "Your calibration's date range for %s between %s and %s should respect liftStartDate <= liftEndDate.", - "Please, correct this experiment from 'calibration_input'." - ), - temp$channel, temp$liftStartDate, temp$liftEndDate - )) - } - } - if ("spend" %in% colnames(calibration_input)) { - for (i in seq_along(calibration_input$channel)) { - temp <- calibration_input[i, ] - temp2 <- cal_media[[i]] - if (all(temp2 %in% organic_vars)) next - dt_input_spend <- filter( - dt_input, get(date_var) >= temp$liftStartDate, - get(date_var) <= temp$liftEndDate - ) %>% - select(all_of(temp2)) %>% - sum(.) %>% - round(., 0) - if (dt_input_spend > temp$spend * 1.1 || dt_input_spend < temp$spend * 0.9) { - warning(sprintf( - paste( - "Your calibration's spend (%s) for %s between %s and %s does not match your dt_input spend (~%s).", - "Please, check again your dates or split your media inputs into separate media channels." - ), - formatNum(temp$spend, 0), temp$channel, temp$liftStartDate, temp$liftEndDate, - formatNum(dt_input_spend, 3, abbr = TRUE) - )) - } - } - } - if ("confidence" %in% colnames(calibration_input)) { - for (i in seq_along(calibration_input$channel)) { - temp <- calibration_input[i, ] - if (temp$confidence < 0.8) { - warning(sprintf( - paste( - "Your calibration's confidence for %s between %s and %s is lower than 80%%, thus low-confidence.", - "Consider getting rid of this experiment and running it again." - ), - temp$channel, temp$liftStartDate, temp$liftEndDate - )) - } - } - } - if ("metric" %in% colnames(calibration_input)) { - for (i in seq_along(calibration_input$channel)) { - temp <- calibration_input[i, ] - if (temp$metric != dep_var) { - stop(sprintf( - paste( - "Your calibration's metric for %s between %s and %s is not '%s'.", - "Please, remove this experiment from 'calibration_input'." - ), - temp$channel, temp$liftStartDate, temp$liftEndDate, dep_var - )) - } - } - } - if ("scope" %in% colnames(calibration_input)) { - these <- c("immediate", "total") - if (!all(calibration_input$scope %in% these)) { - stop("Inputs in 'calibration_input$scope' must be any of: ", v2t(these)) - } - } - } - return(calibration_input) -} - -check_obj_weight <- function(calibration_input, objective_weights, refresh) { - obj_len <- ifelse(is.null(calibration_input), 2, 3) - if (!is.null(objective_weights)) { - if ((length(objective_weights) != obj_len)) { - stop(paste0("objective_weights must have length of ", obj_len)) - } - if (any(objective_weights < 0) | any(objective_weights > 10)) { - stop("objective_weights must be >= 0 & <= 10") - } - } - if (is.null(objective_weights) & refresh) { - if (obj_len == 2) { - objective_weights <- c(0, 1) - } else { - objective_weights <- c(0, 1, 1) - } - } - return(objective_weights) -} - -check_iteration <- function(calibration_input, iterations, trials, hyps_fixed, refresh) { - if (!refresh) { - if (!hyps_fixed) { - if (is.null(calibration_input) && (iterations < 2000 || trials < 5)) { - warning("We recommend to run at least 2000 iterations per trial and 5 trials to build initial model") - } else if (!is.null(calibration_input) && (iterations < 2000 || trials < 10)) { - warning(paste( - "You are calibrating MMM. 
We recommend to run at least 2000 iterations per trial and", - "10 trials to build initial model" - )) - } - } - } -} - -check_InputCollect <- function(list) { - names_list <- c( - "dt_input", "paid_media_vars", "paid_media_spends", "context_vars", - "organic_vars", "all_ind_vars", "date_var", "dep_var", - "rollingWindowStartWhich", "rollingWindowEndWhich", - "factor_vars", "prophet_vars", "prophet_signs", "prophet_country", - "intervalType", "dt_holidays" - ) - if (!all(names_list %in% names(list))) { - not_present <- names_list[!names_list %in% names(list)] - stop(paste( - "Some elements where not provided in your inputs list:", - paste(not_present, collapse = ", ") - )) - } - - if (length(list$dt_input) <= 1) { - stop("Check your 'dt_input' object") - } -} - -check_robyn_name <- function(robyn_object, quiet = FALSE) { - if (!is.null(robyn_object)) { - if (!dir.exists(robyn_object)) { - file_end <- lares::right(robyn_object, 4) - if (file_end != ".RDS") { - stop("Input 'robyn_object' must has format .RDS") - } - } - } else { - if (!quiet) message("Skipping export into RDS file") - } -} - -check_dir <- function(plot_folder) { - file_end <- substr(plot_folder, nchar(plot_folder) - 3, nchar(plot_folder)) - if (file_end == ".RDS") { - plot_folder <- dirname(plot_folder) - message("Using robyn object location: ", plot_folder) - } else { - plot_folder <- file.path(dirname(plot_folder), basename(plot_folder)) - } - if (!dir.exists(plot_folder)) { - plot_folder <- getwd() - message("WARNING: Provided 'plot_folder' doesn't exist. Using current working directory: ", plot_folder) - } - return(plot_folder) -} - -check_calibconstr <- function(calibration_constraint, iterations, trials, calibration_input, refresh) { - if (!is.null(calibration_input) & !refresh) { - total_iters <- iterations * trials - if (calibration_constraint < 0.01 || calibration_constraint > 0.1) { - message("Input 'calibration_constraint' must be >= 0.01 and <= 0.1. Changed to default: 0.1") - calibration_constraint <- 0.1 - } - models_lower <- 500 - if (total_iters * calibration_constraint < models_lower) { - warning(sprintf( - paste( - "Input 'calibration_constraint' set for top %s%% calibrated models.", - "%s models left for pareto-optimal selection. 
Minimum suggested: %s" - ), - calibration_constraint * 100, - round(total_iters * calibration_constraint, 0), - models_lower - )) - } - } - return(calibration_constraint) -} - -check_hyper_fixed <- function(InputCollect, dt_hyper_fixed, add_penalty_factor) { - hyper_fixed <- !is.null(dt_hyper_fixed) - # Adstock hyper-parameters - hypParamSamName <- hyper_names(adstock = InputCollect$adstock, all_media = InputCollect$all_media) - # Add lambda and other hyper-parameters manually - hypParamSamName <- c(hypParamSamName, HYPS_OTHERS) - # Add penalty factor hyper-parameters names - if (add_penalty_factor) { - for_penalty <- names(select(InputCollect$dt_mod, -.data$ds, -.data$dep_var)) - hypParamSamName <- c(hypParamSamName, paste0(for_penalty, "_penalty")) - } - if (hyper_fixed) { - ## Run robyn_mmm if using old model result tables - dt_hyper_fixed <- as_tibble(dt_hyper_fixed) - if (nrow(dt_hyper_fixed) != 1) { - stop(paste( - "Provide only 1 model / 1 row from OutputCollect$resultHypParam or", - "pareto_hyperparameters.csv from previous runs" - )) - } - if (!all(hypParamSamName %in% names(dt_hyper_fixed))) { - these <- hypParamSamName[!hypParamSamName %in% names(dt_hyper_fixed)] - stop(paste( - "Input 'dt_hyper_fixed' is invalid.", - "Please provide 'OutputCollect$resultHypParam' result from previous runs or", - "'pareto_hyperparameters.csv' data with desired model ID. Missing values for:", v2t(these) - )) - } - } - attr(hyper_fixed, "hypParamSamName") <- hypParamSamName - return(hyper_fixed) -} - -# Enable parallelisation of main modelling loop for MacOS and Linux only -check_parallel <- function() "unix" %in% .Platform$OS.type -# ggplot doesn't work with process forking on MacOS; however it works fine on Linux and Windows -check_parallel_plot <- function() !"Darwin" %in% Sys.info()["sysname"] - -check_init_msg <- function(InputCollect, cores) { - opt <- sum(lapply(InputCollect$hyper_updated, length) == 2) - fix <- sum(lapply(InputCollect$hyper_updated, length) == 1) - det <- sprintf("(%s to iterate + %s fixed)", opt, fix) - base <- paste( - "Using", InputCollect$adstock, "adstocking with", - length(InputCollect$hyper_updated), "hyperparameters", det - ) - if (cores == 1) { - message(paste(base, "with no parallel computation")) - } else { - message(paste(base, "on", cores, "cores")) - } -} - -check_class <- function(x, object) { - if (any(!x %in% class(object))) stop(sprintf("Input object must be class %s", x)) -} - -check_allocator_constrains <- function(low, upr) { - if (all(is.na(low)) || all(is.na(upr))) { - stop("You must define lower (channel_constr_low) and upper (channel_constr_up) constraints") - } - max_length <- max(c(length(low), length(upr))) - if (any(low < 0)) { - stop("Inputs 'channel_constr_low' must be >= 0") - } - if (length(upr) != length(low)) { - stop("Inputs 'channel_constr_up' and 'channel_constr_low' must have the same length or length 1") - } - if (any(upr < low)) { - stop("Inputs 'channel_constr_up' must be >= 'channel_constr_low'") - } -} - -check_allocator <- function(OutputCollect, select_model, paid_media_spends, scenario, - channel_constr_low, channel_constr_up, constr_mode) { - if (!(select_model %in% OutputCollect$allSolutions)) { - stop( - "Provided 'select_model' is not within the best results. 
Try any of: ", - paste(OutputCollect$allSolutions, collapse = ", ") - ) - } - if ("max_historical_response" %in% scenario) scenario <- "max_response" - opts <- c("max_response", "target_efficiency") # Deprecated: max_response_expected_spend - if (!(scenario %in% opts)) { - stop("Input 'scenario' must be one of: ", paste(opts, collapse = ", ")) - } - check_allocator_constrains(channel_constr_low, channel_constr_up) - if (!(scenario == "target_efficiency" & is.null(channel_constr_low) & is.null(channel_constr_up))) { - if (length(channel_constr_low) != 1 && length(channel_constr_low) != length(paid_media_spends)) { - stop(paste( - "Input 'channel_constr_low' have to contain either only 1", - "value or have same length as 'InputCollect$paid_media_spends':", length(paid_media_spends) - )) - } - if (length(channel_constr_up) != 1 && length(channel_constr_up) != length(paid_media_spends)) { - stop(paste( - "Input 'channel_constr_up' have to contain either only 1", - "value or have same length as 'InputCollect$paid_media_spends':", length(paid_media_spends) - )) - } - } - opts <- c("eq", "ineq") - if (!(constr_mode %in% opts)) { - stop("Input 'constr_mode' must be one of: ", paste(opts, collapse = ", ")) - } - return(scenario) -} - -check_metric_type <- function(metric_name, paid_media_spends, paid_media_vars, exposure_vars, organic_vars) { - if (metric_name %in% paid_media_spends && length(metric_name) == 1) { - metric_type <- "spend" - } else if (metric_name %in% exposure_vars && length(metric_name) == 1) { - metric_type <- "exposure" - } else if (metric_name %in% organic_vars && length(metric_name) == 1) { - metric_type <- "organic" - } else { - stop(paste( - "Invalid 'metric_name' input:", metric_name, - "\nInput should be any media variable from paid_media_spends (spend),", - "paid_media_vars (exposure), or organic_vars (organic):", - paste("\n- paid_media_spends:", v2t(paid_media_spends, quotes = FALSE)), - paste("\n- paid_media_vars:", v2t(paid_media_vars, quotes = FALSE)), - paste("\n- organic_vars:", v2t(organic_vars, quotes = FALSE)) - )) - } - return(metric_type) -} - -check_metric_dates <- function(date_range = NULL, all_dates, dayInterval = NULL, quiet = FALSE, is_allocator = FALSE, ...) 
{ - ## default using latest 30 days / 4 weeks / 1 month for spend level - if (is.null(date_range)) { - if (is.null(dayInterval)) stop("Input 'date_range' or 'dayInterval' must be defined") - # if (!is_allocator) { - # date_range <- "last_1" - # } else { - # date_range <- paste0("last_", case_when( - # dayInterval == 1 ~ 30, - # dayInterval == 7 ~ 4, - # dayInterval >= 30 & dayInterval <= 31 ~ 1, - # )) - # } - date_range <- "all" - if (!quiet) message(sprintf("Automatically picked date_range = '%s'", date_range)) - } - if (grepl("last|all", date_range[1])) { - ## Using last_n as date_range range - if ("all" %in% date_range) date_range <- paste0("last_", length(all_dates)) - get_n <- ifelse(grepl("_", date_range[1]), as.integer(gsub("last_", "", date_range)), 1) - date_range <- tail(all_dates, get_n) - date_range_loc <- which(all_dates %in% date_range) - date_range_updated <- all_dates[date_range_loc] - rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE) - } else { - ## Using dates as date_range range - if (all(is.Date(as.Date(date_range, origin = "1970-01-01")))) { - date_range <- as.Date(date_range, origin = "1970-01-01") - if (length(date_range) == 1) { - ## Using only 1 date - if (all(date_range %in% all_dates)) { - date_range_updated <- date_range - date_range_loc <- which(all_dates == date_range) - if (!quiet) message("Using ds '", date_range_updated, "' as the response period") - } else { - date_range_loc <- which.min(abs(date_range - all_dates)) - date_range_updated <- all_dates[date_range_loc] - if (!quiet) warning("Input 'date_range' (", date_range, ") has no match. Picking closest date: ", date_range_updated) - } - } else if (length(date_range) == 2) { - ## Using two dates as "from-to" date range - date_range_loc <- unlist(lapply(date_range, function(x) which.min(abs(x - all_dates)))) - date_range_loc <- date_range_loc[1]:date_range_loc[2] - date_range_updated <- all_dates[date_range_loc] - if (!quiet & !all(date_range %in% date_range_updated)) { - warning(paste( - "At least one date in 'date_range' input do not match any date.", - "Picking closest dates for range:", paste(range(date_range_updated), collapse = ":") - )) - } - rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE) - get_n <- length(date_range_loc) - } else { - ## Manually inputting each date - date_range_updated <- date_range - if (all(date_range %in% all_dates)) { - date_range_loc <- which(all_dates %in% date_range_updated) - } else { - date_range_loc <- unlist(lapply(date_range_updated, function(x) which.min(abs(x - all_dates)))) - rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE) - } - if (all(na.omit(date_range_loc - lag(date_range_loc)) == 1)) { - date_range_updated <- all_dates[date_range_loc] - if (!quiet) warning("At least one date in 'date_range' do not match ds. 
Picking closest date: ", date_range_updated) - } else { - stop("Input 'date_range' needs to have sequential dates") - } - } - } else { - stop("Input 'date_range' must have date format '2023-01-01' or use 'last_n'") - } - } - return(list( - date_range_updated = date_range_updated, - metric_loc = date_range_loc - )) -} - -check_metric_value <- function(metric_value, metric_name, all_values, metric_loc) { - get_n <- length(metric_loc) - if (any(is.nan(metric_value))) metric_value <- NULL - if (!is.null(metric_value)) { - if (!is.numeric(metric_value)) { - stop(sprintf( - "Input 'metric_value' for %s (%s) must be a numerical value\n", metric_name, toString(metric_value) - )) - } - if (any(metric_value < 0)) { - stop(sprintf( - "Input 'metric_value' for %s must be positive\n", metric_name - )) - } - if (get_n > 1 & length(metric_value) == 1) { - metric_value_updated <- rep(metric_value / get_n, get_n) - # message(paste0("'metric_value'", metric_value, " splitting into ", get_n, " periods evenly")) - } else { - if (length(metric_value) != get_n) { - stop("robyn_response metric_value & date_range must have same length\n") - } - metric_value_updated <- metric_value - } - } - if (is.null(metric_value)) { - metric_value_updated <- all_values[metric_loc] - } - all_values_updated <- all_values - all_values_updated[metric_loc] <- metric_value_updated - return(list( - metric_value_updated = metric_value_updated, - all_values_updated = all_values_updated - )) -} - -check_legacy_input <- function(InputCollect, - cores = NULL, iterations = NULL, trials = NULL, - intercept_sign = NULL, nevergrad_algo = NULL) { - if (!any(LEGACY_PARAMS %in% names(InputCollect))) { - return(invisible(InputCollect)) - } # Legacy check - # Warn the user these InputCollect params will be (are) deprecated - legacyValues <- InputCollect[LEGACY_PARAMS] - legacyValues <- legacyValues[!unlist(lapply(legacyValues, is.null))] - if (length(legacyValues) > 0) { - warning(sprintf( - "Using legacy InputCollect values. Please set %s within robyn_run() instead", - v2t(names(legacyValues)) - )) - } - # Overwrite InputCollect with robyn_run() inputs - if (!is.null(cores)) InputCollect$cores <- cores - if (!is.null(iterations)) InputCollect$iterations <- iterations - if (!is.null(trials)) InputCollect$trials <- trials - if (!is.null(intercept_sign)) InputCollect$intercept_sign <- intercept_sign - if (!is.null(nevergrad_algo)) InputCollect$nevergrad_algo <- nevergrad_algo - attr(InputCollect, "deprecated_params") <- TRUE - return(invisible(InputCollect)) -} - -check_run_inputs <- function(cores, iterations, trials, intercept_sign, nevergrad_algo) { - if (is.null(iterations)) stop("Must provide 'iterations' in robyn_run()") - if (is.null(trials)) stop("Must provide 'trials' in robyn_run()") - if (is.null(nevergrad_algo)) stop("Must provide 'nevergrad_algo' in robyn_run()") - opts <- c("non_negative", "unconstrained") - if (!intercept_sign %in% opts) { - stop(sprintf("Input 'intercept_sign' must be any of: %s", paste(opts, collapse = ", "))) - } -} - -check_daterange <- function(date_min, date_max, dates) { - if (!is.null(date_min)) { - if (length(date_min) > 1) stop("Set a single date for 'date_min' parameter") - if (date_min < min(dates)) { - warning(sprintf( - "Parameter 'date_min' not in your data's date range. 
Changed to '%s'", min(dates) - )) - } - } - if (!is.null(date_max)) { - if (length(date_max) > 1) stop("Set a single date for 'date_max' parameter") - if (date_max > max(dates)) { - warning(sprintf( - "Parameter 'date_max' not in your data's date range. Changed to '%s'", max(dates) - )) - } - } -} - -check_refresh_data <- function(Robyn, dt_input) { - original_periods <- nrow(Robyn$listInit$InputCollect$dt_modRollWind) - new_periods <- nrow(filter( - dt_input, get(Robyn$listInit$InputCollect$date_var) > Robyn$listInit$InputCollect$window_end - )) - it <- Robyn$listInit$InputCollect$intervalType - if (new_periods > 0.5 * (original_periods + new_periods)) { - warning(sprintf( - paste( - "We recommend re-building a model rather than refreshing this one.", - "More than 50%% of your refresh data (%s %ss) is new data (%s %ss)" - ), - original_periods + new_periods, it, new_periods, it - )) - } -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +############# Auxiliary non-exported functions ############# + +OPTS_PDN <- c("positive", "negative", "default") +HYPS_NAMES <- c("thetas", "shapes", "scales", "alphas", "gammas", "penalty") +HYPS_OTHERS <- c("lambda", "train_size") +LEGACY_PARAMS <- c("cores", "iterations", "trials", "intercept_sign", "nevergrad_algo") + +check_nas <- function(df, channels = NULL) { + if (!is.null(channels)) df <- select(df, all_of(channels)) + name <- deparse(substitute(df)) + if (sum(is.na(df)) > 0) { + naVals <- lares::missingness(df) + strs <- sprintf("%s (%s | %s%%)", naVals$variable, naVals$missing, naVals$missingness) + stop(paste0( + "Dataset ", name, " contains missing (NA) values. ", + "These values must be removed or fixed for Robyn to properly work.\n Missing values: ", + paste(strs, collapse = ", ") + )) + } + have_inf <- unlist(lapply(df, function(x) sum(is.infinite(x)))) + if (any(have_inf > 0)) { + stop(paste0( + "Dataset ", name, " contains Inf values. ", + "These values must be removed or fixed for Robyn to properly work.\n Check: ", + paste(names(which(have_inf > 0)), collapse = ", ") + )) + } +} + +check_novar <- function(dt_input, InputCollect = NULL) { + novar <- lares::zerovar(dt_input) + if (length(novar) > 0) { + msg <- sprintf( + "There are %s column(s) with no-variance: %s. \nPlease, remove the variable(s) to proceed...", + length(novar), v2t(novar) + ) + if (!is.null(InputCollect)) { + msg <- sprintf( + "%s\n>>> Note: there's no variance on these variables because of the modeling window filter (%s:%s)", + msg, + InputCollect$window_start, + InputCollect$window_end + ) + } + stop(msg) + } +} + +check_allneg <- function(df) { + all_negative <- unlist(lapply(df, function(x) all(x <= 0))) + df <- mutate_at(df, names(which(all_negative)), function(x) abs(x)) + return(df) +} + +check_varnames <- function(dt_input, dt_holidays, + dep_var, date_var, + context_vars, paid_media_spends, + organic_vars) { + dfs <- list(dt_input = dt_input, dt_holidays = dt_holidays) + for (i in seq_along(dfs)) { + # Which names to check by data.frame + table_name <- names(dfs[i]) + if (table_name == "dt_input") { + vars <- c( + dep_var, date_var, context_vars, + paid_media_spends, organic_vars, "auto" + ) + } + if (table_name == "dt_holidays") { + vars <- c("ds", "country") # holiday? 
+    }
+    df <- dfs[[i]]
+    vars <- vars[vars != "auto"]
+    # Duplicate names
+    if (length(vars) != length(unique(vars))) {
+      these <- names(table(vars)[table(vars) > 1])
+      stop(paste(
+        "You have duplicated variable names for", table_name, "in different parameters.",
+        "Check:", paste(these, collapse = ", ")
+      ))
+    }
+    # Names with spaces
+    with_space <- grepl(" ", vars)
+    if (sum(with_space) > 0) {
+      stop(paste(
+        "You have invalid variable names on", table_name, "with spaces.\n ",
+        "Please fix columns:", v2t(vars[with_space])
+      ))
+    }
+  }
+}
+
+check_datevar <- function(dt_input, date_var = "auto") {
+  if (date_var[1] == "auto") {
+    is_date <- which(unlist(lapply(dt_input, is.Date)))
+    if (length(is_date) == 1) {
+      date_var <- names(is_date)
+      message(paste("Automatically detected 'date_var':", date_var))
+    } else {
+      stop("Can't automatically find a single date variable to set 'date_var'")
+    }
+  }
+  if (is.null(date_var) || length(date_var) > 1 || !(date_var %in% names(dt_input))) {
+    stop("You must provide only 1 correct date variable name for 'date_var'")
+  }
+  dt_input <- data.frame(arrange(dt_input, as.factor(!!as.symbol(date_var))))
+  dt_input[, date_var] <- as.Date(dt_input[[date_var]], origin = "1970-01-01")
+  date_var_dates <- c(
+    as.Date(dt_input[, date_var][[1]], origin = "1970-01-01"),
+    as.Date(dt_input[, date_var][[2]], origin = "1970-01-01")
+  )
+  if (any(table(date_var_dates) > 1)) {
+    stop("Date variable shouldn't have duplicated dates (panel data)")
+  }
+  if (any(is.na(date_var_dates)) || any(is.infinite(date_var_dates))) {
+    stop("Dates in 'date_var' must have format '2020-12-31' and can't contain NA nor Inf values")
+  }
+  dayInterval <- as.integer(difftime(
+    date_var_dates[2],
+    date_var_dates[1],
+    units = "days"
+  ))
+  intervalType <- if (dayInterval == 1) {
+    "day"
+  } else if (dayInterval == 7) {
+    "week"
+  } else if (dayInterval %in% 28:31) {
+    "month"
+  } else {
+    stop(paste(date_var, "data has to be daily, weekly or monthly"))
+  }
+  output <- list(
+    date_var = date_var,
+    dayInterval = dayInterval,
+    intervalType = intervalType,
+    dt_input = as_tibble(dt_input)
+  )
+  return(invisible(output))
+}
+
+check_depvar <- function(dt_input, dep_var, dep_var_type) {
+  if (is.null(dep_var)) {
+    stop("Must provide a valid dependent variable name for 'dep_var'")
+  }
+  if (!dep_var %in% names(dt_input)) {
+    stop("Must provide a valid dependent variable name for 'dep_var'")
+  }
+  if (length(dep_var) > 1) {
+    stop("Must provide only 1 dependent variable name for 'dep_var'")
+  }
+  if (!(is.numeric(dt_input[, dep_var][[1]]) || is.integer(dt_input[, dep_var][[1]]))) {
+    stop("'dep_var' must be a numeric or integer variable")
+  }
+  if (is.null(dep_var_type)) {
+    stop("Must provide a dependent variable type for 'dep_var_type'")
+  }
+  if (!dep_var_type %in% c("conversion", "revenue") || length(dep_var_type) != 1) {
+    stop("'dep_var_type' must be 'conversion' or 'revenue'")
+  }
+}
+
+check_prophet <- function(dt_holidays, prophet_country, prophet_vars, prophet_signs, dayInterval) {
+  check_vector(prophet_vars)
+  check_vector(prophet_signs)
+  if (is.null(dt_holidays) || is.null(prophet_vars)) {
+    return(invisible(NULL))
+  } else {
+    prophet_vars <- tolower(prophet_vars)
+    opts <- c("trend", "season", "monthly", "weekday", "holiday")
+    if (!"holiday" %in% prophet_vars) {
+      if (!is.null(prophet_country)) {
+        warning(paste(
+          "Input 'prophet_country' is defined as", prophet_country,
+          "but 'holiday' is not set up within 'prophet_vars' parameter"
+        ))
+      }
+      prophet_country <- NULL
+    }
+    if 
(!all(prophet_vars %in% opts)) { + stop("Allowed values for 'prophet_vars' are: ", paste(opts, collapse = ", ")) + } + if ("weekday" %in% prophet_vars && dayInterval > 7) { + warning("Ignoring prophet_vars = 'weekday' input given your data granularity") + } + if ("holiday" %in% prophet_vars && ( + is.null(prophet_country) || length(prophet_country) > 1 | + isTRUE(!prophet_country %in% unique(dt_holidays$country)))) { + stop(paste( + "You must provide 1 country code in 'prophet_country' input.", + length(unique(dt_holidays$country)), "countries are included:", + paste(unique(dt_holidays$country), collapse = ", "), + "\nIf your country is not available, manually include data to 'dt_holidays'", + "or remove 'holidays' from 'prophet_vars' input." + )) + } + if (is.null(prophet_signs)) { + prophet_signs <- rep("default", length(prophet_vars)) + } + if (length(prophet_signs) == 1) { + prophet_signs <- rep(prophet_signs, length(prophet_vars)) + } + if (!all(prophet_signs %in% OPTS_PDN)) { + stop("Allowed values for 'prophet_signs' are: ", paste(OPTS_PDN, collapse = ", ")) + } + if (length(prophet_signs) != length(prophet_vars)) { + stop("'prophet_signs' must have same length as 'prophet_vars'") + } + return(invisible(prophet_signs)) + } +} + +check_context <- function(dt_input, context_vars, context_signs) { + if (!is.null(context_vars)) { + if (is.null(context_signs)) context_signs <- rep("default", length(context_vars)) + if (!all(context_signs %in% OPTS_PDN)) { + stop("Allowed values for 'context_signs' are: ", paste(OPTS_PDN, collapse = ", ")) + } + if (length(context_signs) != length(context_vars)) { + stop("Input 'context_signs' must have same length as 'context_vars'") + } + temp <- context_vars %in% names(dt_input) + if (!all(temp)) { + stop(paste( + "Input 'context_vars' not included in data. Check:", + v2t(context_vars[!temp]) + )) + } + return(invisible(list(context_signs = context_signs))) + } +} + +check_vector <- function(x) { + if (!is.null(names(x)) || is.list(x)) { + stop(sprintf("Input '%s' must be a valid vector", deparse(substitute(x)))) + } +} + +check_paidmedia <- function(dt_input, paid_media_vars, paid_media_signs, paid_media_spends) { + if (is.null(paid_media_spends)) { + stop("Must provide 'paid_media_spends'") + } + check_vector(paid_media_vars) + check_vector(paid_media_signs) + check_vector(paid_media_spends) + expVarCount <- length(paid_media_vars) + spendVarCount <- length(paid_media_spends) + + temp <- paid_media_vars %in% names(dt_input) + if (!all(temp)) { + stop(paste( + "Input 'paid_media_vars' not included in data. Check:", + v2t(paid_media_vars[!temp]) + )) + } + temp <- paid_media_spends %in% names(dt_input) + if (!all(temp)) { + stop(paste( + "Input 'paid_media_spends' not included in data. 
Check:", + v2t(paid_media_spends[!temp]) + )) + } + if (is.null(paid_media_signs)) { + paid_media_signs <- rep("positive", expVarCount) + } + if (!all(paid_media_signs %in% OPTS_PDN)) { + stop("Allowed values for 'paid_media_signs' are: ", paste(OPTS_PDN, collapse = ", ")) + } + if (length(paid_media_signs) == 1) { + paid_media_signs <- rep(paid_media_signs, length(paid_media_vars)) + } + if (length(paid_media_signs) != length(paid_media_vars)) { + stop("Input 'paid_media_signs' must have same length as 'paid_media_vars'") + } + if (spendVarCount != expVarCount) { + stop("Input 'paid_media_spends' must have same length as 'paid_media_vars'") + } + is_num <- unlist(lapply(dt_input[, paid_media_vars], is.numeric)) + if (!all(is_num)) { + stop("All your 'paid_media_vars' must be numeric. Check: ", v2t(paid_media_vars[!is_num])) + } + is_num <- unlist(lapply(dt_input[, paid_media_spends], is.numeric)) + if (!all(is_num)) { + stop("All your 'paid_media_spends' must be numeric. Check: ", v2t(paid_media_spends[!is_num])) + } + get_cols <- any(dt_input[, unique(c(paid_media_vars, paid_media_spends))] < 0) + if (get_cols) { + check_media_names <- unique(c(paid_media_vars, paid_media_spends)) + df_check <- dt_input[, check_media_names] + check_media_val <- unlist(lapply(df_check, function(x) any(x < 0))) + stop( + paste(names(check_media_val)[check_media_val], collapse = ", "), + " contains negative values. Media must be >=0" + ) + } + return(invisible(list( + paid_media_signs = paid_media_signs, + expVarCount = expVarCount, + paid_media_vars = paid_media_vars + ))) +} + +check_organicvars <- function(dt_input, organic_vars, organic_signs) { + if (is.null(organic_vars)) { + return(invisible(NULL)) + } + check_vector(organic_vars) + check_vector(organic_signs) + temp <- organic_vars %in% names(dt_input) + if (!all(temp)) { + stop(paste( + "Input 'organic_vars' not included in data. Check:", + v2t(organic_vars[!temp]) + )) + } + if (!is.null(organic_vars) && is.null(organic_signs)) { + organic_signs <- rep("positive", length(organic_vars)) + # message("'organic_signs' were not provided. Using 'positive'") + } + if (!all(organic_signs %in% OPTS_PDN)) { + stop("Allowed values for 'organic_signs' are: ", paste(OPTS_PDN, collapse = ", ")) + } + if (length(organic_signs) != length(organic_vars)) { + stop("Input 'organic_signs' must have same length as 'organic_vars'") + } + is_num <- unlist(lapply(dt_input[, organic_vars], is.numeric)) + if (!all(is_num)) { + stop("All your 'organic_vars' must be numeric. 
Check: ", v2t(organic_vars[!is_num])) + } + return(invisible(list(organic_signs = organic_signs))) +} + +check_factorvars <- function(dt_input, factor_vars = NULL, context_vars = NULL) { + check_vector(factor_vars) + check_vector(context_vars) + temp <- select(dt_input, all_of(context_vars)) + are_not_numeric <- !sapply(temp, is.numeric) + if (any(are_not_numeric)) { + these <- are_not_numeric[!names(are_not_numeric) %in% factor_vars] + these <- these[these] + if (length(these) > 0) { + message("Automatically set these variables as 'factor_vars': ", v2t(names(these))) + factor_vars <- c(factor_vars, names(these)) + } + } + if (!is.null(factor_vars)) { + if (!all(factor_vars %in% context_vars)) { + stop("Input 'factor_vars' must be any from 'context_vars' inputs") + } + } + return(factor_vars) +} + +check_allvars <- function(all_ind_vars) { + if (length(all_ind_vars) != length(unique(all_ind_vars))) { + stop("All input variables must have unique names") + } +} + +check_datadim <- function(dt_input, all_ind_vars, rel = 10) { + num_obs <- nrow(dt_input) + if (num_obs < length(all_ind_vars) * rel) { + warning(paste( + "There are", length(all_ind_vars), "independent variables &", + num_obs, "data points.", "We recommend row:column ratio of", rel, "to 1" + )) + } + if (ncol(dt_input) <= 2) { + stop("Provide a valid 'dt_input' input with at least 3 columns or more") + } +} + +check_windows <- function(dt_input, date_var, all_media, window_start, window_end) { + dates_vec <- as.Date(dt_input[, date_var][[1]], origin = "1970-01-01") + + if (is.null(window_start)) { + window_start <- min(dates_vec) + } else { + window_start <- as.Date(as.character(window_start), "%Y-%m-%d", origin = "1970-01-01") + if (is.na(window_start)) { + stop(sprintf("Input 'window_start' must have date format, i.e. '%s'", Sys.Date())) + } else if (window_start < min(dates_vec)) { + window_start <- min(dates_vec) + message(paste( + "Input 'window_start' is smaller than the earliest date in input data.", + "It's automatically set to the earliest date:", window_start + )) + } else if (window_start > max(dates_vec)) { + stop("Input 'window_start' can't be larger than the the latest date in input data: ", max(dates_vec)) + } + } + + rollingWindowStartWhich <- which.min(abs(difftime( + dates_vec, + window_start, + units = "days" + ))) + if (!window_start %in% dates_vec) { + window_start <- dt_input[rollingWindowStartWhich, date_var][[1]] + message("Input 'window_start' is adapted to the closest date contained in input data: ", window_start) + } + refreshAddedStart <- window_start + + if (is.null(window_end)) { + window_end <- max(dates_vec) + } else { + window_end <- as.Date(as.character(window_end), "%Y-%m-%d", origin = "1970-01-01") + if (is.na(window_end)) { + stop(sprintf("Input 'window_end' must have date format, i.e. 
'%s'", Sys.Date())) + } else if (window_end > max(dates_vec)) { + window_end <- max(dates_vec) + message(paste( + "Input 'window_end' is larger than the latest date in input data.", + "It's automatically set to the latest date:", window_end + )) + } else if (window_end < window_start) { + window_end <- max(dates_vec) + message(paste( + "Input 'window_end' must be >= 'window_start.", + "It's automatically set to the latest date:", window_end + )) + } + } + + rollingWindowEndWhich <- which.min(abs(difftime(dates_vec, window_end, units = "days"))) + if (!(window_end %in% dates_vec)) { + window_end <- dt_input[rollingWindowEndWhich, date_var][[1]] + message("Input 'window_end' is adapted to the closest date contained in input data: ", window_end) + } + rollingWindowLength <- rollingWindowEndWhich - rollingWindowStartWhich + 1 + + dt_init <- dt_input[rollingWindowStartWhich:rollingWindowEndWhich, all_media] + + init_all0 <- dplyr::select_if(dt_init, is.numeric) %>% colSums(.) == 0 + if (any(init_all0)) { + stop( + "These media channels contains only 0 within training period ", + dt_input[rollingWindowStartWhich, date_var][[1]], " to ", + dt_input[rollingWindowEndWhich, date_var][[1]], ": ", + paste(names(dt_init)[init_all0], collapse = ", "), + "\nRecommendation: adapt InputCollect$window_start, remove or combine these channels" + ) + } + output <- list( + dt_input = dt_input, + window_start = window_start, + rollingWindowStartWhich = rollingWindowStartWhich, + refreshAddedStart = refreshAddedStart, + window_end = window_end, + rollingWindowEndWhich = rollingWindowEndWhich, + rollingWindowLength = rollingWindowLength + ) + return(invisible(output)) +} + +check_adstock <- function(adstock) { + if (is.null(adstock)) { + stop("Input 'adstock' can't be NULL. Set any of: 'geometric', 'weibull_cdf' or 'weibull_pdf'") + } + if (adstock == "weibull") adstock <- "weibull_cdf" + if (!adstock %in% c("geometric", "weibull_cdf", "weibull_pdf")) { + stop("Input 'adstock' must be 'geometric', 'weibull_cdf' or 'weibull_pdf'") + } + return(adstock) +} + +check_hyperparameters <- function(hyperparameters = NULL, adstock = NULL, + paid_media_spends = NULL, organic_vars = NULL, + exposure_vars = NULL, prophet_vars = NULL, + contextual_vars = NULL) { + if (is.null(hyperparameters)) { + message(paste( + "Input 'hyperparameters' not provided yet. 
+check_hyperparameters <- function(hyperparameters = NULL, adstock = NULL,
+                                  paid_media_spends = NULL, organic_vars = NULL,
+                                  exposure_vars = NULL, prophet_vars = NULL,
+                                  contextual_vars = NULL) {
+  if (is.null(hyperparameters)) {
+    message(paste(
+      "Input 'hyperparameters' not provided yet. To include them, run",
+      "robyn_inputs(InputCollect = InputCollect, hyperparameters = ...)"
+    ))
+  } else {
+    if (!"train_size" %in% names(hyperparameters)) {
+      hyperparameters[["train_size"]] <- c(0.5, 0.8)
+      warning("Automatically added missing hyperparameter range: 'train_size' = c(0.5, 0.8)")
+    }
+    # Non-adstock hyperparameters check
+    check_train_size(hyperparameters)
+    # Adstock hyperparameters check
+    hyperparameters_ordered <- hyperparameters[order(names(hyperparameters))]
+    get_hyp_names <- names(hyperparameters_ordered)
+    original_order <- sapply(names(hyperparameters), function(x) which(x == get_hyp_names))
+    ref_hyp_name_spend <- hyper_names(adstock, all_media = paid_media_spends)
+    ref_hyp_name_expo <- hyper_names(adstock, all_media = exposure_vars)
+    ref_hyp_name_org <- hyper_names(adstock, all_media = organic_vars)
+    ref_hyp_name_other <- get_hyp_names[get_hyp_names %in% HYPS_OTHERS]
+    # Excluding lambda (first HYPS_OTHERS) given its range is not customizable
+    ref_all_media <- sort(c(ref_hyp_name_spend, ref_hyp_name_org, HYPS_OTHERS))
+    all_ref_names <- c(ref_hyp_name_spend, ref_hyp_name_expo, ref_hyp_name_org, HYPS_OTHERS)
+    all_ref_names <- all_ref_names[order(all_ref_names)]
+    # Adding penalty variations to the dictionary
+    if (any(grepl("_penalty", get_hyp_names))) {
+      ref_hyp_name_penalties <- paste0(
+        c(paid_media_spends, organic_vars, prophet_vars, contextual_vars), "_penalty"
+      )
+      all_ref_names <- c(all_ref_names, ref_hyp_name_penalties)
+    } else {
+      ref_hyp_name_penalties <- NULL
+    }
+    if (!all(get_hyp_names %in% all_ref_names)) {
+      wrong_hyp_names <- get_hyp_names[which(!(get_hyp_names %in% all_ref_names))]
+      stop(
+        "Input 'hyperparameters' contains the following invalid names: ",
+        paste(wrong_hyp_names, collapse = ", ")
+      )
+    }
+    total <- length(get_hyp_names)
+    total_in <- length(c(ref_hyp_name_spend, ref_hyp_name_org, ref_hyp_name_penalties, ref_hyp_name_other))
+    if (total != total_in) {
+      stop(sprintf(
+        paste(
+          "%s hyperparameter values are required, and %s were provided.",
+          "\n  Use the hyper_names() function to get the correct hyperparameter names."
+        ),
+        total_in, total
+      ))
+    }
+    # Old workflow: replace exposure with spend hyperparameters
+    if (any(get_hyp_names %in% ref_hyp_name_expo)) {
+      get_expo_pos <- which(get_hyp_names %in% ref_hyp_name_expo)
+      get_hyp_names[get_expo_pos] <- ref_all_media[get_expo_pos]
+      names(hyperparameters_ordered) <- get_hyp_names
+    }
+    check_hyper_limits(hyperparameters_ordered, "thetas")
+    check_hyper_limits(hyperparameters_ordered, "alphas")
+    check_hyper_limits(hyperparameters_ordered, "gammas")
+    check_hyper_limits(hyperparameters_ordered, "shapes")
+    check_hyper_limits(hyperparameters_ordered, "scales")
+    hyperparameters_unordered <- hyperparameters_ordered[original_order]
+    return(hyperparameters_unordered)
+  }
+}
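+
+# Hedged illustration (hypothetical channels): with geometric adstock and
+# paid_media_spends = c("tv_S", "ooh_S"), hyper_names() is expected to yield
+# "tv_S_alphas", "tv_S_gammas", "tv_S_thetas" plus the "ooh_S_" equivalents;
+# together with the HYPS_OTHERS names (lambda and train_size in this version)
+# these are exactly the names the count check above requires, no more and no
+# fewer.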
+check_train_size <- function(hyps) {
+  if ("train_size" %in% names(hyps)) {
+    if (!length(hyps$train_size) %in% 1:2) {
+      stop("Hyperparameter 'train_size' must be length 1 (fixed) or 2 (range)")
+    }
+    if (any(hyps$train_size <= 0.1) || any(hyps$train_size > 1)) {
+      stop("Hyperparameter 'train_size' values must be defined between 0.1 and 1")
+    }
+  }
+}
+
+check_hyper_limits <- function(hyperparameters, hyper) {
+  hyper_which <- which(endsWith(names(hyperparameters), hyper))
+  if (length(hyper_which) == 0) {
+    return(invisible(NULL))
+  }
+  limits <- hyper_limits()[[hyper]]
+  for (i in hyper_which) {
+    values <- hyperparameters[[i]]
+    # Lower limit: build the inequality string (e.g. "0>=0") and evaluate it
+    ineq <- paste(values[1], limits[1], sep = "", collapse = "")
+    lower_pass <- eval(parse(text = ineq))
+    if (!lower_pass) {
+      stop(sprintf("%s's hyperparameter must have lower bound %s", names(hyperparameters)[i], limits[1]))
+    }
+    # Upper limit
+    ineq <- paste(values[2], limits[2], sep = "", collapse = "")
+    upper_pass <- eval(parse(text = ineq)) | length(values) == 1
+    if (!upper_pass) {
+      stop(sprintf("%s's hyperparameter must have upper bound %s", names(hyperparameters)[i], limits[2]))
+    }
+    # Order of limits
+    order_pass <- !isFALSE(values[1] <= values[2])
+    if (!order_pass) {
+      stop(sprintf("%s's hyperparameter must have lower bound first and upper bound second", names(hyperparameters)[i]))
+    }
+  }
+}
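+
+# Sketch of the inequality evaluation above (assumed bounds): if
+# hyper_limits() returns c(">=0", "<1") for "thetas", a range c(0, 0.9)
+# builds and evaluates the strings "0>=0" and "0.9<1" (both TRUE), while
+# c(0, 1) fails the upper bound and stops.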
+check_calibration <- function(dt_input, date_var, calibration_input, dayInterval, dep_var,
+                              window_start, window_end, paid_media_spends, organic_vars) {
+  if (!is.null(calibration_input)) {
+    calibration_input <- as_tibble(as.data.frame(calibration_input))
+    these <- c("channel", "liftStartDate", "liftEndDate", "liftAbs", "spend", "confidence", "metric", "calibration_scope")
+    if (!all(these %in% names(calibration_input))) {
+      stop("Input 'calibration_input' must contain columns: ", v2t(these), ". Check the demo script for instructions.")
+    }
+    if (!is.numeric(calibration_input$liftAbs) || any(is.na(calibration_input$liftAbs))) {
+      stop("Check 'calibration_input$liftAbs': all lift values must be valid numeric values")
+    }
+    all_media <- c(paid_media_spends, organic_vars)
+    cal_media <- str_split(calibration_input$channel, "\\+|,|;|\\s")
+    if (!all(unlist(cal_media) %in% all_media)) {
+      these <- unique(unlist(cal_media)[which(!unlist(cal_media) %in% all_media)])
+      stop(sprintf(
+        "All channels from 'calibration_input' must be any of: %s.\n  Check: %s",
+        v2t(all_media), v2t(these)
+      ))
+    }
+    for (i in seq_along(calibration_input$channel)) {
+      temp <- calibration_input[i, ]
+      if (temp$liftStartDate < (window_start) || temp$liftEndDate > (window_end)) {
+        stop(sprintf(
+          paste(
+            "Your calibration's date range for %s between %s and %s is not within the modeling window (%s to %s).",
+            "Please remove this experiment from 'calibration_input'."
+          ),
+          temp$channel, temp$liftStartDate, temp$liftEndDate, window_start, window_end
+        ))
+      }
+      if (temp$liftStartDate > temp$liftEndDate) {
+        stop(sprintf(
+          paste(
+            "Your calibration's date range for %s between %s and %s should respect liftStartDate <= liftEndDate.",
+            "Please correct this experiment in 'calibration_input'."
+          ),
+          temp$channel, temp$liftStartDate, temp$liftEndDate
+        ))
+      }
+    }
+    if ("spend" %in% colnames(calibration_input)) {
+      for (i in seq_along(calibration_input$channel)) {
+        temp <- calibration_input[i, ]
+        temp2 <- cal_media[[i]]
+        if (all(temp2 %in% organic_vars)) next
+        dt_input_spend <- filter(
+          dt_input, get(date_var) >= temp$liftStartDate,
+          get(date_var) <= temp$liftEndDate
+        ) %>%
+          select(all_of(temp2)) %>%
+          sum(.) %>%
+          round(., 0)
+        if (dt_input_spend > temp$spend * 1.1 || dt_input_spend < temp$spend * 0.9) {
+          warning(sprintf(
+            paste(
+              "Your calibration's spend (%s) for %s between %s and %s does not match your dt_input spend (~%s).",
+              "Please double-check your dates or split your media inputs into separate media channels."
+            ),
+            formatNum(temp$spend, 0), temp$channel, temp$liftStartDate, temp$liftEndDate,
+            formatNum(dt_input_spend, 3, abbr = TRUE)
+          ))
+        }
+      }
+    }
+    if ("confidence" %in% colnames(calibration_input)) {
+      for (i in seq_along(calibration_input$channel)) {
+        temp <- calibration_input[i, ]
+        if (temp$confidence < 0.8) {
+          warning(sprintf(
+            paste(
+              "Your calibration's confidence for %s between %s and %s is lower than 80%% and thus low-confidence.",
+              "Consider removing this experiment or running it again."
+            ),
+            temp$channel, temp$liftStartDate, temp$liftEndDate
+          ))
+        }
+      }
+    }
+    if ("metric" %in% colnames(calibration_input)) {
+      for (i in seq_along(calibration_input$channel)) {
+        temp <- calibration_input[i, ]
+        if (temp$metric != dep_var) {
+          stop(sprintf(
+            paste(
+              "Your calibration's metric for %s between %s and %s is not '%s'.",
+              "Please remove this experiment from 'calibration_input'."
+            ),
+            temp$channel, temp$liftStartDate, temp$liftEndDate, dep_var
+          ))
+        }
+      }
+    }
+    if ("calibration_scope" %in% colnames(calibration_input)) {
+      these <- c("immediate", "total")
+      if (!all(calibration_input$calibration_scope %in% these)) {
+        stop("Inputs in 'calibration_input$calibration_scope' must be any of: ", v2t(these))
+      }
+    }
+  }
+  return(calibration_input)
+}
+
+check_obj_weight <- function(calibration_input, objective_weights, refresh) {
+  obj_len <- ifelse(is.null(calibration_input), 2, 3)
+  if (!is.null(objective_weights)) {
+    if (length(objective_weights) != obj_len) {
+      stop(paste0("objective_weights must have length of ", obj_len))
+    }
+    if (any(objective_weights < 0) || any(objective_weights > 10)) {
+      stop("objective_weights must be >= 0 & <= 10")
+    }
+  }
+  if (is.null(objective_weights) && refresh) {
+    if (obj_len == 2) {
+      objective_weights <- c(0, 1)
+    } else {
+      objective_weights <- c(0, 1, 1)
+    }
+  }
+  return(objective_weights)
+}
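+
+# Worked example of the refresh defaults above: without calibration
+# (obj_len = 2) objective_weights becomes c(0, 1), i.e. the first error term
+# is ignored and the second fully weighted; with calibration (obj_len = 3)
+# it becomes c(0, 1, 1), adding the calibration error with equal weight. The
+# weight order is assumed to follow NRMSE, DECOMP.RSSD, MAPE as documented
+# for robyn_run().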
+check_iteration <- function(calibration_input, iterations, trials, hyps_fixed, refresh) {
+  if (!refresh) {
+    if (!hyps_fixed) {
+      if (is.null(calibration_input) && (iterations < 2000 || trials < 5)) {
+        warning("We recommend running at least 2000 iterations per trial and 5 trials to build the initial model")
+      } else if (!is.null(calibration_input) && (iterations < 2000 || trials < 10)) {
+        warning(paste(
+          "You are calibrating MMM. We recommend running at least 2000 iterations per trial and",
+          "10 trials to build the initial model"
+        ))
+      }
+    }
+  }
+}
+
+check_InputCollect <- function(list) {
+  names_list <- c(
+    "dt_input", "paid_media_vars", "paid_media_spends", "context_vars",
+    "organic_vars", "all_ind_vars", "date_var", "dep_var",
+    "rollingWindowStartWhich", "rollingWindowEndWhich",
+    "factor_vars", "prophet_vars", "prophet_signs", "prophet_country",
+    "intervalType", "dt_holidays"
+  )
+  if (!all(names_list %in% names(list))) {
+    not_present <- names_list[!names_list %in% names(list)]
+    stop(paste(
+      "Some elements were not provided in your inputs list:",
+      paste(not_present, collapse = ", ")
+    ))
+  }
+
+  if (length(list$dt_input) <= 1) {
+    stop("Check your 'dt_input' object")
+  }
+}
+
+check_robyn_name <- function(robyn_object, quiet = FALSE) {
+  if (!is.null(robyn_object)) {
+    if (!dir.exists(robyn_object)) {
+      file_end <- lares::right(robyn_object, 4)
+      if (file_end != ".RDS") {
+        stop("Input 'robyn_object' must have the .RDS format")
+      }
+    }
+  } else {
+    if (!quiet) message("Skipping export into RDS file")
+  }
+}
+
+check_dir <- function(plot_folder) {
+  file_end <- substr(plot_folder, nchar(plot_folder) - 3, nchar(plot_folder))
+  if (file_end == ".RDS") {
+    plot_folder <- dirname(plot_folder)
+    message("Using robyn object location: ", plot_folder)
+  } else {
+    plot_folder <- file.path(dirname(plot_folder), basename(plot_folder))
+  }
+  if (!dir.exists(plot_folder)) {
+    plot_folder <- getwd()
+    message("WARNING: Provided 'plot_folder' doesn't exist. Using current working directory: ", plot_folder)
+  }
+  return(plot_folder)
+}
+
+check_calibconstr <- function(calibration_constraint, iterations, trials, calibration_input, refresh) {
+  if (!is.null(calibration_input) && !refresh) {
+    total_iters <- iterations * trials
+    if (calibration_constraint < 0.01 || calibration_constraint > 0.1) {
+      message("Input 'calibration_constraint' must be >= 0.01 and <= 0.1. Changed to default: 0.1")
+      calibration_constraint <- 0.1
+    }
+    models_lower <- 500
+    if (total_iters * calibration_constraint < models_lower) {
+      warning(sprintf(
+        paste(
+          "Input 'calibration_constraint' set for top %s%% calibrated models.",
+          "%s models left for pareto-optimal selection. 
Minimum suggested: %s" + ), + calibration_constraint * 100, + round(total_iters * calibration_constraint, 0), + models_lower + )) + } + } + return(calibration_constraint) +} + +check_hyper_fixed <- function(InputCollect, dt_hyper_fixed, add_penalty_factor) { + hyper_fixed <- !is.null(dt_hyper_fixed) + # Adstock hyper-parameters + hypParamSamName <- hyper_names(adstock = InputCollect$adstock, all_media = InputCollect$all_media) + # Add lambda and other hyper-parameters manually + hypParamSamName <- c(hypParamSamName, HYPS_OTHERS) + # Add penalty factor hyper-parameters names + if (add_penalty_factor) { + for_penalty <- names(select(InputCollect$dt_mod, -.data$ds, -.data$dep_var)) + hypParamSamName <- c(hypParamSamName, paste0(for_penalty, "_penalty")) + } + if (hyper_fixed) { + ## Run robyn_mmm if using old model result tables + dt_hyper_fixed <- as_tibble(dt_hyper_fixed) + if (nrow(dt_hyper_fixed) != 1) { + stop(paste( + "Provide only 1 model / 1 row from OutputCollect$resultHypParam or", + "pareto_hyperparameters.csv from previous runs" + )) + } + if (!all(hypParamSamName %in% names(dt_hyper_fixed))) { + these <- hypParamSamName[!hypParamSamName %in% names(dt_hyper_fixed)] + stop(paste( + "Input 'dt_hyper_fixed' is invalid.", + "Please provide 'OutputCollect$resultHypParam' result from previous runs or", + "'pareto_hyperparameters.csv' data with desired model ID. Missing values for:", v2t(these) + )) + } + } + attr(hyper_fixed, "hypParamSamName") <- hypParamSamName + return(hyper_fixed) +} + +# Enable parallelisation of main modelling loop for MacOS and Linux only +check_parallel <- function() "unix" %in% .Platform$OS.type +# ggplot doesn't work with process forking on MacOS; however it works fine on Linux and Windows +check_parallel_plot <- function() !"Darwin" %in% Sys.info()["sysname"] + +check_init_msg <- function(InputCollect, cores) { + opt <- sum(lapply(InputCollect$hyper_updated, length) == 2) + fix <- sum(lapply(InputCollect$hyper_updated, length) == 1) + det <- sprintf("(%s to iterate + %s fixed)", opt, fix) + base <- paste( + "Using", InputCollect$adstock, "adstocking with", + length(InputCollect$hyper_updated), "hyperparameters", det + ) + if (cores == 1) { + message(paste(base, "with no parallel computation")) + } else { + message(paste(base, "on", cores, "cores")) + } +} + +check_class <- function(x, object) { + if (any(!x %in% class(object))) stop(sprintf("Input object must be class %s", x)) +} + +check_allocator_constrains <- function(low, upr) { + if (all(is.na(low)) || all(is.na(upr))) { + stop("You must define lower (channel_constr_low) and upper (channel_constr_up) constraints") + } + max_length <- max(c(length(low), length(upr))) + if (any(low < 0)) { + stop("Inputs 'channel_constr_low' must be >= 0") + } + if (length(upr) != length(low)) { + stop("Inputs 'channel_constr_up' and 'channel_constr_low' must have the same length or length 1") + } + if (any(upr < low)) { + stop("Inputs 'channel_constr_up' must be >= 'channel_constr_low'") + } +} + +check_allocator <- function(OutputCollect, select_model, paid_media_spends, scenario, + channel_constr_low, channel_constr_up, constr_mode) { + if (!(select_model %in% OutputCollect$allSolutions)) { + stop( + "Provided 'select_model' is not within the best results. 
Try any of: ", + paste(OutputCollect$allSolutions, collapse = ", ") + ) + } + if ("max_historical_response" %in% scenario) scenario <- "max_response" + opts <- c("max_response", "target_efficiency") # Deprecated: max_response_expected_spend + if (!(scenario %in% opts)) { + stop("Input 'scenario' must be one of: ", paste(opts, collapse = ", ")) + } + check_allocator_constrains(channel_constr_low, channel_constr_up) + if (!(scenario == "target_efficiency" & is.null(channel_constr_low) & is.null(channel_constr_up))) { + if (length(channel_constr_low) != 1 && length(channel_constr_low) != length(paid_media_spends)) { + stop(paste( + "Input 'channel_constr_low' have to contain either only 1", + "value or have same length as 'InputCollect$paid_media_spends':", length(paid_media_spends) + )) + } + if (length(channel_constr_up) != 1 && length(channel_constr_up) != length(paid_media_spends)) { + stop(paste( + "Input 'channel_constr_up' have to contain either only 1", + "value or have same length as 'InputCollect$paid_media_spends':", length(paid_media_spends) + )) + } + } + opts <- c("eq", "ineq") + if (!(constr_mode %in% opts)) { + stop("Input 'constr_mode' must be one of: ", paste(opts, collapse = ", ")) + } + return(scenario) +} + +check_metric_type <- function(metric_name, paid_media_spends, paid_media_vars, exposure_vars, organic_vars) { + if (metric_name %in% paid_media_spends && length(metric_name) == 1) { + metric_type <- "spend" + } else if (metric_name %in% exposure_vars && length(metric_name) == 1) { + metric_type <- "exposure" + } else if (metric_name %in% organic_vars && length(metric_name) == 1) { + metric_type <- "organic" + } else { + stop(paste( + "Invalid 'metric_name' input:", metric_name, + "\nInput should be any media variable from paid_media_spends (spend),", + "paid_media_vars (exposure), or organic_vars (organic):", + paste("\n- paid_media_spends:", v2t(paid_media_spends, quotes = FALSE)), + paste("\n- paid_media_vars:", v2t(paid_media_vars, quotes = FALSE)), + paste("\n- organic_vars:", v2t(organic_vars, quotes = FALSE)) + )) + } + return(metric_type) +} + +check_metric_dates <- function(date_range = NULL, all_dates, dayInterval = NULL, quiet = FALSE, is_allocator = FALSE, ...) 
+check_metric_dates <- function(date_range = NULL, all_dates, dayInterval = NULL, quiet = FALSE, is_allocator = FALSE, ...) {
+  ## Default: use all available dates (legacy "last_n" defaults kept below for reference)
+  if (is.null(date_range)) {
+    if (is.null(dayInterval)) stop("Input 'date_range' or 'dayInterval' must be defined")
+    # if (!is_allocator) {
+    #   date_range <- "last_1"
+    # } else {
+    #   date_range <- paste0("last_", case_when(
+    #     dayInterval == 1 ~ 30,
+    #     dayInterval == 7 ~ 4,
+    #     dayInterval >= 30 & dayInterval <= 31 ~ 1,
+    #   ))
+    # }
+    date_range <- "all"
+    if (!quiet) message(sprintf("Automatically picked date_range = '%s'", date_range))
+  }
+  if (grepl("last|all", date_range[1])) {
+    ## Using last_n as date_range range
+    if ("all" %in% date_range) date_range <- paste0("last_", length(all_dates))
+    get_n <- ifelse(grepl("_", date_range[1]), as.integer(gsub("last_", "", date_range)), 1)
+    date_range <- tail(all_dates, get_n)
+    date_range_loc <- which(all_dates %in% date_range)
+    date_range_updated <- all_dates[date_range_loc]
+    rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE)
+  } else {
+    ## Using dates as date_range range
+    if (all(is.Date(as.Date(date_range, origin = "1970-01-01")))) {
+      date_range <- as.Date(date_range, origin = "1970-01-01")
+      if (length(date_range) == 1) {
+        ## Using only 1 date
+        if (all(date_range %in% all_dates)) {
+          date_range_updated <- date_range
+          date_range_loc <- which(all_dates == date_range)
+          if (!quiet) message("Using ds '", date_range_updated, "' as the response period")
+        } else {
+          date_range_loc <- which.min(abs(date_range - all_dates))
+          date_range_updated <- all_dates[date_range_loc]
+          if (!quiet) warning("Input 'date_range' (", date_range, ") has no match. Picking closest date: ", date_range_updated)
+        }
+      } else if (length(date_range) == 2) {
+        ## Using two dates as "from-to" date range
+        date_range_loc <- unlist(lapply(date_range, function(x) which.min(abs(x - all_dates))))
+        date_range_loc <- date_range_loc[1]:date_range_loc[2]
+        date_range_updated <- all_dates[date_range_loc]
+        if (!quiet && !all(date_range %in% date_range_updated)) {
+          warning(paste(
+            "At least one date in 'date_range' input does not match any date.",
+            "Picking closest dates for range:", paste(range(date_range_updated), collapse = ":")
+          ))
+        }
+        rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE)
+        get_n <- length(date_range_loc)
+      } else {
+        ## Manually inputting each date
+        date_range_updated <- date_range
+        if (all(date_range %in% all_dates)) {
+          date_range_loc <- which(all_dates %in% date_range_updated)
+        } else {
+          date_range_loc <- unlist(lapply(date_range_updated, function(x) which.min(abs(x - all_dates))))
+          rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE)
+        }
+        if (all(na.omit(date_range_loc - lag(date_range_loc)) == 1)) {
+          date_range_updated <- all_dates[date_range_loc]
+          if (!quiet) warning("At least one date in 'date_range' does not match ds. Picking closest date: ", date_range_updated)
+        } else {
+          stop("Input 'date_range' needs to have sequential dates")
+        }
+      }
+    } else {
+      stop("Input 'date_range' must have date format '2023-01-01' or use 'last_n'")
+    }
+  }
+  return(list(
+    date_range_updated = date_range_updated,
+    metric_loc = date_range_loc
+  ))
+}
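+
+# Sketch of the date_range shortcuts above (assumed weekly all_dates):
+#   check_metric_dates("last_4", all_dates, 7)  # 4 most recent periods
+#   check_metric_dates("all", all_dates, 7)     # expands to last_<n periods>
+#   check_metric_dates(c("2023-01-02", "2023-03-27"), all_dates, 7)
+#   # snaps both ends to the closest dates in all_dates and returns the
+#   # full from-to window and its index locations (metric_loc)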
+check_metric_value <- function(metric_value, metric_name, all_values, metric_loc) {
+  get_n <- length(metric_loc)
+  if (any(is.nan(metric_value))) metric_value <- NULL
+  if (!is.null(metric_value)) {
+    if (!is.numeric(metric_value)) {
+      stop(sprintf(
+        "Input 'metric_value' for %s (%s) must be a numerical value\n", metric_name, toString(metric_value)
+      ))
+    }
+    if (any(metric_value < 0)) {
+      stop(sprintf(
+        "Input 'metric_value' for %s must be positive\n", metric_name
+      ))
+    }
+    if (get_n > 1 && length(metric_value) == 1) {
+      metric_value_updated <- rep(metric_value / get_n, get_n)
+      # message(paste0("'metric_value'", metric_value, " splitting into ", get_n, " periods evenly"))
+    } else {
+      if (length(metric_value) != get_n) {
+        stop("Inputs 'metric_value' and 'date_range' in robyn_response() must have the same length\n")
+      }
+      metric_value_updated <- metric_value
+    }
+  }
+  if (is.null(metric_value)) {
+    metric_value_updated <- all_values[metric_loc]
+  }
+  all_values_updated <- all_values
+  all_values_updated[metric_loc] <- metric_value_updated
+  return(list(
+    metric_value_updated = metric_value_updated,
+    all_values_updated = all_values_updated
+  ))
+}
+
+check_legacy_input <- function(InputCollect,
+                               cores = NULL, iterations = NULL, trials = NULL,
+                               intercept_sign = NULL, nevergrad_algo = NULL) {
+  # Legacy check: skip when no deprecated parameters are present
+  if (!any(LEGACY_PARAMS %in% names(InputCollect))) {
+    return(invisible(InputCollect))
+  }
+  # Warn the user these InputCollect params will be (are) deprecated
+  legacyValues <- InputCollect[LEGACY_PARAMS]
+  legacyValues <- legacyValues[!unlist(lapply(legacyValues, is.null))]
+  if (length(legacyValues) > 0) {
+    warning(sprintf(
+      "Using legacy InputCollect values. Please set %s within robyn_run() instead",
+      v2t(names(legacyValues))
+    ))
+  }
+  # Overwrite InputCollect with robyn_run() inputs
+  if (!is.null(cores)) InputCollect$cores <- cores
+  if (!is.null(iterations)) InputCollect$iterations <- iterations
+  if (!is.null(trials)) InputCollect$trials <- trials
+  if (!is.null(intercept_sign)) InputCollect$intercept_sign <- intercept_sign
+  if (!is.null(nevergrad_algo)) InputCollect$nevergrad_algo <- nevergrad_algo
+  attr(InputCollect, "deprecated_params") <- TRUE
+  return(invisible(InputCollect))
+}
+
+check_run_inputs <- function(cores, iterations, trials, intercept_sign, nevergrad_algo) {
+  if (is.null(iterations)) stop("Must provide 'iterations' in robyn_run()")
+  if (is.null(trials)) stop("Must provide 'trials' in robyn_run()")
+  if (is.null(nevergrad_algo)) stop("Must provide 'nevergrad_algo' in robyn_run()")
+  opts <- c("non_negative", "unconstrained")
+  if (!intercept_sign %in% opts) {
+    stop(sprintf("Input 'intercept_sign' must be any of: %s", paste(opts, collapse = ", ")))
+  }
+}
+
+check_daterange <- function(date_min, date_max, dates) {
+  if (!is.null(date_min)) {
+    if (length(date_min) > 1) stop("Set a single date for 'date_min' parameter")
+    if (date_min < min(dates)) {
+      warning(sprintf(
+        "Parameter 'date_min' not in your data's date range. 
Changed to '%s'", min(dates) + )) + } + } + if (!is.null(date_max)) { + if (length(date_max) > 1) stop("Set a single date for 'date_max' parameter") + if (date_max > max(dates)) { + warning(sprintf( + "Parameter 'date_max' not in your data's date range. Changed to '%s'", max(dates) + )) + } + } +} + +check_refresh_data <- function(Robyn, dt_input) { + original_periods <- nrow(Robyn$listInit$InputCollect$dt_modRollWind) + new_periods <- nrow(filter( + dt_input, get(Robyn$listInit$InputCollect$date_var) > Robyn$listInit$InputCollect$window_end + )) + it <- Robyn$listInit$InputCollect$intervalType + if (new_periods > 0.5 * (original_periods + new_periods)) { + warning(sprintf( + paste( + "We recommend re-building a model rather than refreshing this one.", + "More than 50%% of your refresh data (%s %ss) is new data (%s %ss)" + ), + original_periods + new_periods, it, new_periods, it + )) + } +} diff --git a/R/clusters.R b/R/clusters.R index 69c3300..621b22a 100644 --- a/R/clusters.R +++ b/R/clusters.R @@ -1,475 +1,475 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Clustering to Reduce Number of Models based on ROI and Errors -#' -#' \code{robyn_clusters()} uses output from \code{robyn_run()}, -#' to reduce the number of models and create bootstrapped confidence -#' interval and help the user pick up the best (lowest combined error) -#' of the most different kinds (clusters) of models. -#' -#' @inheritParams lares::clusterKmeans -#' @inheritParams hyper_names -#' @inheritParams robyn_outputs -#' @param input \code{robyn_export()}'s output or \code{pareto_aggregated.csv} results. -#' @param dep_var_type Character. For dep_var_type 'revenue', ROI is used for clustering. -#' For conversion', CPA is used for clustering. -#' @param cluster_by Character. Any of: "performance" or "hyperparameters". -#' @param max_clusters Integer. Maximum number of clusters. -#' @param limit Integer. Top N results per cluster. If kept in "auto", will select k -#' as the cluster in which the WSS variance was less than 5\%. -#' @param weights Vector, size 3. How much should each error weight? -#' Order: nrmse, decomp.rssd, mape. The highest the value, the closer it will be scaled -#' to origin. Each value will be normalized so they all sum 1. -#' @param export Export plots into local files? -#' @param ... Additional parameters passed to \code{lares::clusterKmeans()}. -#' @author Bernardo Lares (bernardolares@@meta.com) -#' @examples -#' \dontrun{ -#' # Having InputCollect and OutputCollect results -#' cls <- robyn_clusters( -#' input = OutputCollect, -#' all_media = InputCollect$all_media, -#' k = 3, limit = 2, -#' weights = c(1, 1, 1.5) -#' ) -#' } -#' @return List. Clustering results as labeled data.frames and plots. -#' @export -robyn_clusters <- function(input, dep_var_type, - cluster_by = "hyperparameters", - all_media = NULL, - k = "auto", wss_var = 0.06, max_clusters = 10, limit = 1, - weights = rep(1, 3), dim_red = "PCA", - quiet = FALSE, export = FALSE, seed = 123, - ...) 
{ - set.seed(seed) - check_opts(cluster_by, c("performance", "hyperparameters")) - if ("robyn_outputs" %in% class(input)) { - if (is.null(all_media)) { - aux <- colnames(input$mediaVecCollect) - all_media <- aux[-c(1, which(aux == "type"):length(aux))] - path <- input$plot_folder - } else { - path <- paste0(getwd(), "/") - } - # Pareto and ROI data - x <- xDecompAgg <- input$xDecompAgg - if (cluster_by %in% "hyperparameters") x <- input$resultHypParam - df <- .prepare_df(x, all_media, dep_var_type, cluster_by) - } else { - stop(paste( - "You must run robyn_outputs(..., clusters = TRUE) or", - "pass a valid data.frame (sames as pareto_aggregated.csv output)", - "in order to use robyn_clusters()" - )) - } - - ignore <- c("solID", "mape", "decomp.rssd", "nrmse", "nrmse_test", "nrmse_train", "nrmse_val", "pareto") - - # Auto K selected by less than 5% WSS variance (convergence) - min_clusters <- 3 - limit_clusters <- min(nrow(df) - 1, 30) - if ("auto" %in% k) { - cls <- tryCatch( - { - suppressMessages( - clusterKmeans(df, - k = NULL, limit = limit_clusters, ignore = ignore, - dim_red = dim_red, quiet = TRUE, seed = seed - ) - ) - }, - error = function(err) { - message(paste("Couldn't automatically create clusters:", err)) - return(NULL) - } - ) - # if (is.null(cls)) return(NULL) - k <- cls$nclusters %>% - mutate( - pareto = .data$wss / .data$wss[1], - dif = lag(.data$pareto) - .data$pareto - ) %>% - filter(.data$dif > wss_var) %>% - pull(.data$n) %>% - max(., na.rm = TRUE) - if (k < min_clusters) { - warning(sprintf("Too few clusters: %s. Setting to %s", k, min_clusters)) - k <- min_clusters - } - if (!quiet) { - message(sprintf( - ">> Auto selected k = %s (clusters) based on minimum WSS variance of %s%%", - k, wss_var * 100 - )) - } - if (k > max_clusters) { - warning(sprintf("Too many clusters: %s. Lowering to %s (max_clusters)", k, max_clusters)) - k <- max_clusters - } - } - - # Build clusters - stopifnot(k %in% min_clusters:30) - suppressMessages( - cls <- clusterKmeans( - df, - k = k, limit = limit_clusters, ignore = ignore, - dim_red = dim_red, quiet = TRUE, seed = seed - ) - ) - cls$df <- group_by(cls$df, .data$cluster) %>% - mutate(n = n()) %>% - ungroup() - - # Select top models by minimum (weighted) distance to zero - all_paid <- setdiff(names(cls$df), c(ignore, "cluster")) - ts_validation <- ifelse("nrmse_test" %in% colnames(cls$df), TRUE, FALSE) - top_sols <- .clusters_df(df = cls$df, all_paid, balance = weights, limit, ts_validation) - - # Build in-cluster CI with bootstrap - ci_list <- confidence_calcs(xDecompAgg, cls, all_paid, dep_var_type, k, cluster_by, ...) 
- - output <- list( - # Data and parameters - data = mutate(cls$df, top_sol = .data$solID %in% top_sols$solID, cluster = as.integer(.data$cluster)), - df_cluster_ci = ungroup(ci_list$df_ci) %>% dplyr::select(-.data$cluster_title), - n_clusters = k, - boot_n = ci_list$boot_n, - sim_n = ci_list$sim_n, - errors_weights = weights, - # Within Groups Sum of Squares Plot - wss = cls$nclusters_plot + theme_lares(background = "white"), - # Grouped correlations per cluster - corrs = cls$correlations + labs(title = "Top Correlations by Cluster", subtitle = NULL), - # Mean ROI per cluster - clusters_means = cls$means, - # Dim reduction clusters - clusters_PCA = cls[["PCA"]], - clusters_tSNE = cls[["tSNE"]], - # Top Clusters - models = top_sols, - plot_clusters_ci = .plot_clusters_ci(ci_list$sim_collect, ci_list$df_ci, dep_var_type, ci_list$boot_n, ci_list$sim_n), - plot_models_errors = .plot_topsols_errors(df, top_sols, limit, weights), - plot_models_rois = .plot_topsols_rois(df, top_sols, all_media, limit) - ) - - if (export) { - write.csv(output$data, file = paste0(path, "pareto_clusters.csv")) - write.csv(output$df_cluster_ci, file = paste0(path, "pareto_clusters_ci.csv")) - ggsave(paste0(path, "pareto_clusters_wss.png"), plot = output$wss, dpi = 500, width = 5, height = 4) - get_height <- ceiling(k / 2) / 6 - db <- (output$plot_clusters_ci / (output$plot_models_rois + output$plot_models_errors)) + - patchwork::plot_layout(heights = c(get_height, 1), guides = "collect") - # Suppressing "Picking joint bandwidth of x" messages + - # In min(data$x, na.rm = TRUE) : no non-missing arguments to min; returning Inf warnings - # Setting try() to avoid error: One or both dimensions exceed the maximum (50000px). - # Use `options(ragg.max_dim = ...)` to change the max - try(suppressMessages(suppressWarnings(ggsave(paste0(path, "pareto_clusters_detail.png"), - plot = db, dpi = 500, width = 12, height = 4 + length(all_paid) * 2, limitsize = FALSE - )))) - } - - return(output) -} - -confidence_calcs <- function( - xDecompAgg, cls, all_paid, dep_var_type, k, cluster_by, - boot_n = 1000, sim_n = 10000, ...) 
{ - df_clusters_outcome <- xDecompAgg %>% - filter(!is.na(.data$total_spend)) %>% - left_join(y = dplyr::select(cls$df, c("solID", "cluster")), by = "solID") %>% - dplyr::select(c("solID", "cluster", "rn", "roi_total", "cpa_total", "robynPareto")) %>% - group_by(.data$cluster, .data$rn) %>% - mutate(n = n()) %>% - filter(!is.na(.data$cluster)) %>% - arrange(.data$cluster, .data$rn) - - cluster_collect <- list() - chn_collect <- list() - sim_collect <- list() - for (j in 1:k) { - df_outcome <- filter(df_clusters_outcome, .data$cluster == j) - if (length(unique(df_outcome$solID)) < 3) { - warning(paste("Cluster", j, "does not contain enough models to calculate CI")) - } else { - if (cluster_by == "hyperparameters") { - all_paid <- unique(gsub(paste(paste0("_", HYPS_NAMES), collapse = "|"), "", all_paid)) - } - for (i in all_paid) { - # Bootstrap CI - if (dep_var_type == "conversion") { - # Drop CPA == Inf - df_chn <- filter(df_outcome, .data$rn == i & is.finite(.data$cpa_total)) - v_samp <- df_chn$cpa_total - } else { - df_chn <- filter(df_outcome, .data$rn == i) - v_samp <- df_chn$roi_total - } - boot_res <- .bootci(samp = v_samp, boot_n = boot_n) - boot_mean <- mean(boot_res$boot_means, na.rm = TRUE) - boot_se <- boot_res$se - ci_low <- ifelse(boot_res$ci[1] <= 0, 0, boot_res$ci[1]) - ci_up <- boot_res$ci[2] - - # Collect loop results - chn_collect[[i]] <- df_chn %>% - mutate( - ci_low = ci_low, - ci_up = ci_up, - n = length(v_samp), - boot_se = boot_se, - boot_mean = boot_mean, - cluster = j - ) - sim_collect[[i]] <- data.frame( - cluster = j, - rn = i, - n = length(v_samp), - boot_mean = boot_mean, - x_sim = suppressWarnings(rnorm(sim_n, mean = boot_mean, sd = boot_se)) - ) %>% - mutate(y_sim = dnorm(.data$x_sim, mean = boot_mean, sd = boot_se)) - } - } - cluster_collect[[j]] <- list(chn_collect = chn_collect, sim_collect = sim_collect) - } - - sim_collect <- bind_rows(lapply(cluster_collect, function(x) { - bind_rows(lapply(x$sim_collect, function(y) y)) - })) %>% - filter(.data$n > 0) %>% - mutate(cluster_title = sprintf("Cl.%s (n=%s)", .data$cluster, .data$n)) %>% - ungroup() %>% - as_tibble() - - df_ci <- bind_rows(lapply(cluster_collect, function(x) { - bind_rows(lapply(x$chn_collect, function(y) y)) - })) %>% - mutate(cluster_title = sprintf("Cl.%s (n=%s)", .data$cluster, .data$n)) %>% - dplyr::select( - .data$rn, .data$cluster_title, .data$n, .data$cluster, - .data$boot_mean, .data$boot_se, .data$ci_low, .data$ci_up - ) %>% - distinct() %>% - group_by(.data$rn, .data$cluster_title, .data$cluster) %>% - summarise( - n = .data$n, - boot_mean = .data$boot_mean, - boot_se = boot_se, - boot_ci = sprintf("[%s, %s]", round(.data$ci_low, 2), round(.data$ci_up, 2)), - ci_low = .data$ci_low, - ci_up = .data$ci_up, - sd = boot_se * sqrt(.data$n - 1), - dist100 = (.data$ci_up - .data$ci_low + 2 * boot_se * sqrt(.data$n - 1)) / 99, - .groups = "drop" - ) %>% - ungroup() - return(list( - df_ci = df_ci, - sim_collect = sim_collect, - boot_n = boot_n, - sim_n = sim_n - )) -} - -errors_scores <- function(df, balance = rep(1, 3), ts_validation = TRUE, ...) 
{ - stopifnot(length(balance) == 3) - error_cols <- c(ifelse(ts_validation, "nrmse_test", "nrmse_train"), "decomp.rssd", "mape") - stopifnot(all(error_cols %in% colnames(df))) - balance <- balance / sum(balance) - scores <- df %>% - select(all_of(error_cols)) %>% - rename("nrmse" = 1) %>% - mutate( - nrmse = ifelse(is.infinite(.data$nrmse), max(is.finite(.data$nrmse)), .data$nrmse), - decomp.rssd = ifelse(is.infinite(.data$decomp.rssd), max(is.finite(.data$decomp.rssd)), .data$decomp.rssd), - mape = ifelse(is.infinite(.data$mape), max(is.finite(.data$mape)), .data$mape) - ) %>% - # Force normalized values so they can be comparable - mutate( - nrmse_n = .min_max_norm(.data$nrmse), - decomp.rssd_n = .min_max_norm(.data$decomp.rssd), - mape_n = .min_max_norm(.data$mape) - ) %>% - replace(., is.na(.), 0) %>% - # Balance to give more or less importance to each error - mutate( - nrmse_w = balance[1] * .data$nrmse_n, - decomp.rssd_w = balance[2] * .data$decomp.rssd_n, - mape_w = balance[3] * .data$mape_n - ) %>% - # Calculate error score - mutate(error_score = sqrt(.data$nrmse_w^2 + .data$decomp.rssd_w^2 + .data$mape_w^2)) %>% - pull(.data$error_score) - return(scores) -} - -# ROIs data.frame for clustering (from xDecompAgg or pareto_aggregated.csv) -.prepare_df <- function(x, all_media, dep_var_type, cluster_by) { - if (cluster_by == "performance") { - check_opts(all_media, unique(x$rn)) - if (dep_var_type == "revenue") { - outcome <- select(x, .data$solID, .data$rn, .data$roi_total) %>% - tidyr::spread(key = .data$rn, value = .data$roi_total) %>% - removenacols(all = FALSE) %>% - select(any_of(c("solID", all_media))) - } - if (dep_var_type == "conversion") { - outcome <- select(x, .data$solID, .data$rn, .data$cpa_total) %>% - filter(is.finite(.data$cpa_total)) %>% - tidyr::spread(key = .data$rn, value = .data$cpa_total) %>% - removenacols(all = FALSE) %>% - select(any_of(c("solID", all_media))) - } - errors <- distinct( - x, .data$solID, starts_with("nrmse"), .data$decomp.rssd, .data$mape - ) - outcome <- left_join(outcome, errors, "solID") %>% ungroup() - } else { - if (cluster_by == "hyperparameters") { - outcome <- select( - x, .data$solID, contains(HYPS_NAMES), - contains(c("nrmse", "decomp.rssd", "mape")) - ) %>% - removenacols(all = FALSE) - } - } - return(outcome) -} - -.min_max_norm <- function(x, min = 0, max = 1) { - x <- x[is.finite(x)] - x <- x[!is.na(x)] - if (length(x) <= 1) { - return(x) - } - a <- min(x, na.rm = TRUE) - b <- max(x, na.rm = TRUE) - if (b - a != 0) { - return((max - min) * (x - a) / (b - a) + min) - } else { - return(x) - } -} - -.clusters_df <- function(df, all_paid, balance = rep(1, 3), limit = 1, ts_validation = TRUE, ...) 
{ - df %>% - mutate(error_score = errors_scores(., balance, ts_validation = ts_validation, ...)) %>% - replace(., is.na(.), 0) %>% - group_by(.data$cluster) %>% - arrange(.data$cluster, .data$error_score) %>% - slice(1:limit) %>% - mutate(rank = row_number()) %>% - select(.data$cluster, .data$rank, everything()) -} - -.plot_clusters_ci <- function(sim_collect, df_ci, dep_var_type, boot_n, sim_n) { - temp <- ifelse(dep_var_type == "conversion", "CPA", "ROAS") - df_ci <- df_ci[complete.cases(df_ci), ] - p <- ggplot(sim_collect, aes(x = .data$x_sim, y = .data$rn)) + - facet_wrap(~ .data$cluster_title, scales = "free_x") + - xlim(range(sim_collect$x_sim)) + - geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) + - geom_text( - data = df_ci, - aes(x = .data$boot_mean, y = .data$rn, label = .data$boot_ci), - position = position_nudge(x = -0.02, y = 0.1), - colour = "grey30", size = 3.5 - ) + - geom_vline(xintercept = 1, linetype = "dashed", size = .5, colour = "grey75") + - # scale_fill_viridis_c(option = "D") + - labs( - title = paste("In-Cluster", temp, "& bootstrapped 95% CI"), - subtitle = "Sampling distribution of cluster mean", - x = temp, - y = "Density", - fill = temp, - caption = sprintf( - "Based on %s bootstrap results with %s simulations", - formatNum(boot_n, abbr = TRUE), - formatNum(sim_n, abbr = TRUE) - ) - ) + - theme_lares(background = "white", legend = "none") + - theme(axis.line.x = element_line()) - if (temp == "ROAS") { - p <- p + geom_hline(yintercept = 1, alpha = 0.5, colour = "grey50", linetype = "dashed") - } - return(p) -} - -.plot_topsols_errors <- function(df, top_sols, limit = 1, balance = rep(1, 3)) { - balance <- balance / sum(balance) - left_join(df, select(top_sols, 1:3), "solID") %>% - mutate( - alpha = ifelse(is.na(.data$cluster), 0.6, 1), - label = ifelse(!is.na(.data$cluster), sprintf( - "[%s.%s]", .data$cluster, .data$rank - ), NA) - ) %>% - ggplot(aes(x = .data$nrmse, y = .data$decomp.rssd)) + - geom_point(aes(colour = .data$cluster, alpha = .data$alpha)) + - geom_text(aes(label = .data$label), na.rm = TRUE, hjust = -0.3) + - guides(alpha = "none", colour = "none") + - labs( - title = paste("Selecting Top", limit, "Performing Models by Cluster"), - subtitle = "Based on minimum (weighted) distance to origin", - x = "NRMSE", y = "DECOMP.RSSD", - caption = sprintf( - "Weights: NRMSE %s%%, DECOMP.RSSD %s%%, MAPE %s%%", - round(100 * balance[1]), round(100 * balance[2]), round(100 * balance[3]) - ) - ) + - theme_lares(background = "white", ) -} - -.plot_topsols_rois <- function(df, top_sols, all_media, limit = 1) { - real_rois <- as.data.frame(df)[, -c(which(colnames(df) %in% c("mape", "nrmse", "decomp.rssd")))] - colnames(real_rois) <- paste0("real_", colnames(real_rois)) - top_sols %>% - left_join(real_rois, by = c("solID" = "real_solID")) %>% - mutate(label = sprintf("[%s.%s]\n%s", .data$cluster, .data$rank, .data$solID)) %>% - tidyr::gather("media", "perf", contains(all_media)) %>% - filter(grepl("real_", .data$media)) %>% - mutate(media = gsub("real_", "", .data$media)) %>% - ggplot(aes(x = reorder(.data$media, .data$perf), y = .data$perf)) + - facet_grid(.data$label ~ .) + - geom_col() + - coord_flip() + - labs( - title = paste("Top Performing Models"), - x = NULL, y = "Mean metric per media" - ) + - theme_lares(background = "white", ) -} - -.bootci <- function(samp, boot_n, seed = 1, ...) 
{
-  set.seed(seed)
-  if (length(samp[!is.na(samp)]) > 1) {
-    samp_n <- length(samp)
-    samp_mean <- mean(samp, na.rm = TRUE)
-    boot_sample <- matrix(
-      sample(x = samp, size = samp_n * boot_n, replace = TRUE),
-      nrow = boot_n, ncol = samp_n
-    )
-    boot_means <- apply(X = boot_sample, MARGIN = 1, FUN = mean)
-    se <- sd(boot_means)
-    # binwidth <- diff(range(boot_means))/30
-    # plot_boot <- ggplot(data.frame(x = boot_means),aes(x = x)) +
-    #   geom_histogram(aes(y = ..density.. ), binwidth = binwidth) +
-    #   geom_density(color="red")
-    me <- qt(0.975, samp_n - 1) * se
-    # ci <- c(mean(boot_means) - me, mean(boot_means) + me)
-    samp_me <- me * sqrt(samp_n)
-    ci <- c(samp_mean - samp_me, samp_mean + samp_me)
-
-    return(list(boot_means = boot_means, ci = ci, se = se))
-  } else {
-    return(list(boot_means = samp, ci = c(samp, samp), se = 0))
-  }
-}
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+####################################################################
+#' Clustering to Reduce Number of Models based on ROI and Errors
+#'
+#' \code{robyn_clusters()} uses output from \code{robyn_run()}
+#' to reduce the number of models, create bootstrapped confidence
+#' intervals, and help the user pick the best (lowest combined error)
+#' of the most different kinds (clusters) of models.
+#'
+#' @inheritParams lares::clusterKmeans
+#' @inheritParams hyper_names
+#' @inheritParams robyn_outputs
+#' @param input \code{robyn_export()}'s output or \code{pareto_aggregated.csv} results.
+#' @param dep_var_type Character. For dep_var_type 'revenue', ROI is used for clustering.
+#' For 'conversion', CPA is used for clustering.
+#' @param cluster_by Character. Any of: "performance" or "hyperparameters".
+#' @param max_clusters Integer. Maximum number of clusters.
+#' @param limit Integer. Top N results per cluster. Note: when \code{k = "auto"},
+#' k is selected at the point where the WSS variance reduction falls below
+#' \code{wss_var}.
+#' @param weights Vector, size 3. How much should each error weigh?
+#' Order: nrmse, decomp.rssd, mape. The higher the value, the closer it will be scaled
+#' to the origin. Values are normalized so they sum up to 1.
+#' @param export Export plots into local files?
+#' @param ... Additional parameters passed to \code{lares::clusterKmeans()}.
+#' @author Bernardo Lares (bernardolares@@meta.com)
+#' @examples
+#' \dontrun{
+#' # Having InputCollect and OutputCollect results
+#' cls <- robyn_clusters(
+#'   input = OutputCollect,
+#'   all_media = InputCollect$all_media,
+#'   k = 3, limit = 2,
+#'   weights = c(1, 1, 1.5)
+#' )
+#' }
+#' @return List. Clustering results as labeled data.frames and plots.
+#' @export
+robyn_clusters <- function(input, dep_var_type,
+                           cluster_by = "hyperparameters",
+                           all_media = NULL,
+                           k = "auto", wss_var = 0.06, max_clusters = 10, limit = 1,
+                           weights = rep(1, 3), dim_red = "PCA",
+                           quiet = FALSE, export = FALSE, seed = 123,
+                           ...)
{
+  set.seed(seed)
+  check_opts(cluster_by, c("performance", "hyperparameters"))
+  if ("robyn_outputs" %in% class(input)) {
+    if (is.null(all_media)) {
+      aux <- colnames(input$mediaVecCollect)
+      all_media <- aux[-c(1, which(aux == "type"):length(aux))]
+      path <- input$plot_folder
+    } else {
+      path <- paste0(getwd(), "/")
+    }
+    # Pareto and ROI data
+    x <- xDecompAgg <- input$xDecompAgg
+    if (cluster_by %in% "hyperparameters") x <- input$resultHypParam
+    df <- .prepare_df(x, all_media, dep_var_type, cluster_by)
+  } else {
+    stop(paste(
+      "You must run robyn_outputs(..., clusters = TRUE) or",
+      "pass a valid data.frame (same as the pareto_aggregated.csv output)",
+      "in order to use robyn_clusters()"
+    ))
+  }
+
+  ignore <- c("solID", "mape", "decomp.rssd", "nrmse", "nrmse_test", "nrmse_train", "nrmse_val", "pareto")
+
+  # Auto k: selected once the WSS variance reduction drops below wss_var (convergence)
+  min_clusters <- 3
+  limit_clusters <- min(nrow(df) - 1, 30)
+  if ("auto" %in% k) {
+    cls <- tryCatch(
+      {
+        suppressMessages(
+          clusterKmeans(df,
+            k = NULL, limit = limit_clusters, ignore = ignore,
+            dim_red = dim_red, quiet = TRUE, seed = seed
+          )
+        )
+      },
+      error = function(err) {
+        message(paste("Couldn't automatically create clusters:", err))
+        return(NULL)
+      }
+    )
+    # if (is.null(cls)) return(NULL)
+    k <- cls$nclusters %>%
+      mutate(
+        pareto = .data$wss / .data$wss[1],
+        dif = lag(.data$pareto) - .data$pareto
+      ) %>%
+      filter(.data$dif > wss_var) %>%
+      pull(.data$n) %>%
+      max(., na.rm = TRUE)
+    if (k < min_clusters) {
+      warning(sprintf("Too few clusters: %s. Setting to %s", k, min_clusters))
+      k <- min_clusters
+    }
+    if (!quiet) {
+      message(sprintf(
+        ">> Auto selected k = %s (clusters) based on minimum WSS variance of %s%%",
+        k, wss_var * 100
+      ))
+    }
+    if (k > max_clusters) {
+      warning(sprintf("Too many clusters: %s. Lowering to %s (max_clusters)", k, max_clusters))
+      k <- max_clusters
+    }
+  }
+
+  # Build clusters
+  stopifnot(k %in% min_clusters:30)
+  suppressMessages(
+    cls <- clusterKmeans(
+      df,
+      k = k, limit = limit_clusters, ignore = ignore,
+      dim_red = dim_red, quiet = TRUE, seed = seed
+    )
+  )
+  cls$df <- group_by(cls$df, .data$cluster) %>%
+    mutate(n = n()) %>%
+    ungroup()
+
+  # Select top models by minimum (weighted) distance to zero
+  all_paid <- setdiff(names(cls$df), c(ignore, "cluster"))
+  ts_validation <- "nrmse_test" %in% colnames(cls$df)
+  top_sols <- .clusters_df(df = cls$df, all_paid, balance = weights, limit, ts_validation)
+
+  # Build in-cluster CI with bootstrap
+  ci_list <- confidence_calcs(xDecompAgg, cls, all_paid, dep_var_type, k, cluster_by, ...)
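+  # Note: the CI step above bootstraps each cluster/channel ROAS (or CPA)
+  # vector boot_n times and simulates sim_n normal draws around the
+  # bootstrapped mean; see confidence_calcs() and .bootci() below.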
+ + output <- list( + # Data and parameters + data = mutate(cls$df, top_sol = .data$solID %in% top_sols$solID, cluster = as.integer(.data$cluster)), + df_cluster_ci = ungroup(ci_list$df_ci) %>% dplyr::select(-.data$cluster_title), + n_clusters = k, + boot_n = ci_list$boot_n, + sim_n = ci_list$sim_n, + errors_weights = weights, + # Within Groups Sum of Squares Plot + wss = cls$nclusters_plot + theme_lares(background = "white"), + # Grouped correlations per cluster + corrs = cls$correlations + labs(title = "Top Correlations by Cluster", subtitle = NULL), + # Mean ROI per cluster + clusters_means = cls$means, + # Dim reduction clusters + clusters_PCA = cls[["PCA"]], + clusters_tSNE = cls[["tSNE"]], + # Top Clusters + models = top_sols, + plot_clusters_ci = .plot_clusters_ci(ci_list$sim_collect, ci_list$df_ci, dep_var_type, ci_list$boot_n, ci_list$sim_n), + plot_models_errors = .plot_topsols_errors(df, top_sols, limit, weights), + plot_models_rois = .plot_topsols_rois(df, top_sols, all_media, limit) + ) + + if (export) { + write.csv(output$data, file = paste0(path, "pareto_clusters.csv")) + write.csv(output$df_cluster_ci, file = paste0(path, "pareto_clusters_ci.csv")) + ggsave(paste0(path, "pareto_clusters_wss.png"), plot = output$wss, dpi = 500, width = 5, height = 4) + get_height <- ceiling(k / 2) / 6 + db <- (output$plot_clusters_ci / (output$plot_models_rois + output$plot_models_errors)) + + patchwork::plot_layout(heights = c(get_height, 1), guides = "collect") + # Suppressing "Picking joint bandwidth of x" messages + + # In min(data$x, na.rm = TRUE) : no non-missing arguments to min; returning Inf warnings + # Setting try() to avoid error: One or both dimensions exceed the maximum (50000px). + # Use `options(ragg.max_dim = ...)` to change the max + try(suppressMessages(suppressWarnings(ggsave(paste0(path, "pareto_clusters_detail.png"), + plot = db, dpi = 500, width = 12, height = 4 + length(all_paid) * 2, limitsize = FALSE + )))) + } + + return(output) +} + +confidence_calcs <- function( + xDecompAgg, cls, all_paid, dep_var_type, k, cluster_by, + boot_n = 1000, sim_n = 10000, ...) 
{ + df_clusters_outcome <- xDecompAgg %>% + filter(!is.na(.data$total_spend)) %>% + left_join(y = dplyr::select(cls$df, c("solID", "cluster")), by = "solID") %>% + dplyr::select(c("solID", "cluster", "rn", "roi_total", "cpa_total", "robynPareto")) %>% + group_by(.data$cluster, .data$rn) %>% + mutate(n = n()) %>% + filter(!is.na(.data$cluster)) %>% + arrange(.data$cluster, .data$rn) + + cluster_collect <- list() + chn_collect <- list() + sim_collect <- list() + for (j in 1:k) { + df_outcome <- filter(df_clusters_outcome, .data$cluster == j) + if (length(unique(df_outcome$solID)) < 3) { + warning(paste("Cluster", j, "does not contain enough models to calculate CI")) + } else { + if (cluster_by == "hyperparameters") { + all_paid <- unique(gsub(paste(paste0("_", HYPS_NAMES), collapse = "|"), "", all_paid)) + } + for (i in all_paid) { + # Bootstrap CI + if (dep_var_type == "conversion") { + # Drop CPA == Inf + df_chn <- filter(df_outcome, .data$rn == i & is.finite(.data$cpa_total)) + v_samp <- df_chn$cpa_total + } else { + df_chn <- filter(df_outcome, .data$rn == i) + v_samp <- df_chn$roi_total + } + boot_res <- .bootci(samp = v_samp, boot_n = boot_n) + boot_mean <- mean(boot_res$boot_means, na.rm = TRUE) + boot_se <- boot_res$se + ci_low <- ifelse(boot_res$ci[1] <= 0, 0, boot_res$ci[1]) + ci_up <- boot_res$ci[2] + + # Collect loop results + chn_collect[[i]] <- df_chn %>% + mutate( + ci_low = ci_low, + ci_up = ci_up, + n = length(v_samp), + boot_se = boot_se, + boot_mean = boot_mean, + cluster = j + ) + sim_collect[[i]] <- data.frame( + cluster = j, + rn = i, + n = length(v_samp), + boot_mean = boot_mean, + x_sim = suppressWarnings(rnorm(sim_n, mean = boot_mean, sd = boot_se)) + ) %>% + mutate(y_sim = dnorm(.data$x_sim, mean = boot_mean, sd = boot_se)) + } + } + cluster_collect[[j]] <- list(chn_collect = chn_collect, sim_collect = sim_collect) + } + + sim_collect <- bind_rows(lapply(cluster_collect, function(x) { + bind_rows(lapply(x$sim_collect, function(y) y)) + })) %>% + filter(.data$n > 0) %>% + mutate(cluster_title = sprintf("Cl.%s (n=%s)", .data$cluster, .data$n)) %>% + ungroup() %>% + as_tibble() + + df_ci <- bind_rows(lapply(cluster_collect, function(x) { + bind_rows(lapply(x$chn_collect, function(y) y)) + })) %>% + mutate(cluster_title = sprintf("Cl.%s (n=%s)", .data$cluster, .data$n)) %>% + dplyr::select( + .data$rn, .data$cluster_title, .data$n, .data$cluster, + .data$boot_mean, .data$boot_se, .data$ci_low, .data$ci_up + ) %>% + distinct() %>% + group_by(.data$rn, .data$cluster_title, .data$cluster) %>% + summarise( + n = .data$n, + boot_mean = .data$boot_mean, + boot_se = boot_se, + boot_ci = sprintf("[%s, %s]", round(.data$ci_low, 2), round(.data$ci_up, 2)), + ci_low = .data$ci_low, + ci_up = .data$ci_up, + sd = boot_se * sqrt(.data$n - 1), + dist100 = (.data$ci_up - .data$ci_low + 2 * boot_se * sqrt(.data$n - 1)) / 99, + .groups = "drop" + ) %>% + ungroup() + return(list( + df_ci = df_ci, + sim_collect = sim_collect, + boot_n = boot_n, + sim_n = sim_n + )) +} + +errors_scores <- function(df, balance = rep(1, 3), ts_validation = TRUE, ...) 
{
+  stopifnot(length(balance) == 3)
+  error_cols <- c(ifelse(ts_validation, "nrmse_test", "nrmse_train"), "decomp.rssd", "mape")
+  stopifnot(all(error_cols %in% colnames(df)))
+  balance <- balance / sum(balance)
+  scores <- df %>%
+    select(all_of(error_cols)) %>%
+    rename("nrmse" = 1) %>%
+    # Replace infinite errors with the largest finite value observed
+    mutate(
+      nrmse = ifelse(is.infinite(.data$nrmse), max(.data$nrmse[is.finite(.data$nrmse)]), .data$nrmse),
+      decomp.rssd = ifelse(is.infinite(.data$decomp.rssd), max(.data$decomp.rssd[is.finite(.data$decomp.rssd)]), .data$decomp.rssd),
+      mape = ifelse(is.infinite(.data$mape), max(.data$mape[is.finite(.data$mape)]), .data$mape)
+    ) %>%
+    # Force normalized values so they can be comparable
+    mutate(
+      nrmse_n = .min_max_norm(.data$nrmse),
+      decomp.rssd_n = .min_max_norm(.data$decomp.rssd),
+      mape_n = .min_max_norm(.data$mape)
+    ) %>%
+    replace(., is.na(.), 0) %>%
+    # Balance to give more or less importance to each error
+    mutate(
+      nrmse_w = balance[1] * .data$nrmse_n,
+      decomp.rssd_w = balance[2] * .data$decomp.rssd_n,
+      mape_w = balance[3] * .data$mape_n
+    ) %>%
+    # Calculate error score
+    mutate(error_score = sqrt(.data$nrmse_w^2 + .data$decomp.rssd_w^2 + .data$mape_w^2)) %>%
+    pull(.data$error_score)
+  return(scores)
+}
+
+# ROIs data.frame for clustering (from xDecompAgg or pareto_aggregated.csv)
+.prepare_df <- function(x, all_media, dep_var_type, cluster_by) {
+  if (cluster_by == "performance") {
+    check_opts(all_media, unique(x$rn))
+    if (dep_var_type == "revenue") {
+      outcome <- select(x, .data$solID, .data$rn, .data$roi_total) %>%
+        tidyr::spread(key = .data$rn, value = .data$roi_total) %>%
+        removenacols(all = FALSE) %>%
+        select(any_of(c("solID", all_media)))
+    }
+    if (dep_var_type == "conversion") {
+      outcome <- select(x, .data$solID, .data$rn, .data$cpa_total) %>%
+        filter(is.finite(.data$cpa_total)) %>%
+        tidyr::spread(key = .data$rn, value = .data$cpa_total) %>%
+        removenacols(all = FALSE) %>%
+        select(any_of(c("solID", all_media)))
+    }
+    errors <- distinct(
+      x, .data$solID, starts_with("nrmse"), .data$decomp.rssd, .data$mape
+    )
+    outcome <- left_join(outcome, errors, "solID") %>% ungroup()
+  } else {
+    if (cluster_by == "hyperparameters") {
+      outcome <- select(
+        x, .data$solID, contains(HYPS_NAMES),
+        contains(c("nrmse", "decomp.rssd", "mape"))
+      ) %>%
+        removenacols(all = FALSE)
+    }
+  }
+  return(outcome)
+}
+
+.min_max_norm <- function(x, min = 0, max = 1) {
+  x <- x[is.finite(x)] # also drops NA values
+  if (length(x) <= 1) {
+    return(x)
+  }
+  a <- min(x, na.rm = TRUE)
+  b <- max(x, na.rm = TRUE)
+  if (b - a != 0) {
+    return((max - min) * (x - a) / (b - a) + min)
+  } else {
+    return(x)
+  }
+}
+
+.clusters_df <- function(df, all_paid, balance = rep(1, 3), limit = 1, ts_validation = TRUE, ...)
{ + df %>% + mutate(error_score = errors_scores(., balance, ts_validation = ts_validation, ...)) %>% + replace(., is.na(.), 0) %>% + group_by(.data$cluster) %>% + arrange(.data$cluster, .data$error_score) %>% + slice(1:limit) %>% + mutate(rank = row_number()) %>% + select(.data$cluster, .data$rank, everything()) +} + +.plot_clusters_ci <- function(sim_collect, df_ci, dep_var_type, boot_n, sim_n) { + temp <- ifelse(dep_var_type == "conversion", "CPA", "ROAS") + df_ci <- df_ci[complete.cases(df_ci), ] + p <- ggplot(sim_collect, aes(x = .data$x_sim, y = .data$rn)) + + facet_wrap(~ .data$cluster_title, scales = "free_x") + + xlim(range(sim_collect$x_sim)) + + geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) + + geom_text( + data = df_ci, + aes(x = .data$boot_mean, y = .data$rn, label = .data$boot_ci), + position = position_nudge(x = -0.02, y = 0.1), + colour = "grey30", size = 3.5 + ) + + geom_vline(xintercept = 1, linetype = "dashed", size = .5, colour = "grey75") + + # scale_fill_viridis_c(option = "D") + + labs( + title = paste("In-Cluster", temp, "& bootstrapped 95% CI"), + subtitle = "Sampling distribution of cluster mean", + x = temp, + y = "Density", + fill = temp, + caption = sprintf( + "Based on %s bootstrap results with %s simulations", + formatNum(boot_n, abbr = TRUE), + formatNum(sim_n, abbr = TRUE) + ) + ) + + theme_lares(background = "white", legend = "none") + + theme(axis.line.x = element_line()) + if (temp == "ROAS") { + p <- p + geom_hline(yintercept = 1, alpha = 0.5, colour = "grey50", linetype = "dashed") + } + return(p) +} + +.plot_topsols_errors <- function(df, top_sols, limit = 1, balance = rep(1, 3)) { + balance <- balance / sum(balance) + left_join(df, select(top_sols, 1:3), "solID") %>% + mutate( + alpha = ifelse(is.na(.data$cluster), 0.6, 1), + label = ifelse(!is.na(.data$cluster), sprintf( + "[%s.%s]", .data$cluster, .data$rank + ), NA) + ) %>% + ggplot(aes(x = .data$nrmse, y = .data$decomp.rssd)) + + geom_point(aes(colour = .data$cluster, alpha = .data$alpha)) + + geom_text(aes(label = .data$label), na.rm = TRUE, hjust = -0.3) + + guides(alpha = "none", colour = "none") + + labs( + title = paste("Selecting Top", limit, "Performing Models by Cluster"), + subtitle = "Based on minimum (weighted) distance to origin", + x = "NRMSE", y = "DECOMP.RSSD", + caption = sprintf( + "Weights: NRMSE %s%%, DECOMP.RSSD %s%%, MAPE %s%%", + round(100 * balance[1]), round(100 * balance[2]), round(100 * balance[3]) + ) + ) + + theme_lares(background = "white", ) +} + +.plot_topsols_rois <- function(df, top_sols, all_media, limit = 1) { + real_rois <- as.data.frame(df)[, -c(which(colnames(df) %in% c("mape", "nrmse", "decomp.rssd")))] + colnames(real_rois) <- paste0("real_", colnames(real_rois)) + top_sols %>% + left_join(real_rois, by = c("solID" = "real_solID")) %>% + mutate(label = sprintf("[%s.%s]\n%s", .data$cluster, .data$rank, .data$solID)) %>% + tidyr::gather("media", "perf", contains(all_media)) %>% + filter(grepl("real_", .data$media)) %>% + mutate(media = gsub("real_", "", .data$media)) %>% + ggplot(aes(x = reorder(.data$media, .data$perf), y = .data$perf)) + + facet_grid(.data$label ~ .) + + geom_col() + + coord_flip() + + labs( + title = paste("Top Performing Models"), + x = NULL, y = "Mean metric per media" + ) + + theme_lares(background = "white", ) +} + +.bootci <- function(samp, boot_n, seed = 1, ...) 
{ + set.seed(seed) + if (length(samp[!is.na(samp)]) > 1) { + samp_n <- length(samp) + samp_mean <- mean(samp, na.rm = TRUE) + boot_sample <- matrix( + sample(x = samp, size = samp_n * boot_n, replace = TRUE), + nrow = boot_n, ncol = samp_n + ) + boot_means <- apply(X = boot_sample, MARGIN = 1, FUN = mean) + se <- sd(boot_means) + # binwidth <- diff(range(boot_means))/30 + # plot_boot <- ggplot(data.frame(x = boot_means),aes(x = x)) + + # geom_histogram(aes(y = ..density.. ), binwidth = binwidth) + + # geom_density(color="red") + me <- qt(0.975, samp_n - 1) * se + # ci <- c(mean(boot_means) - me, mean(boot_means) + me) + samp_me <- me * sqrt(samp_n) + ci <- c(samp_mean - samp_me, samp_mean + samp_me) + + return(list(boot_means = boot_means, ci = ci, se = se)) + } else { + return(list(boot_means = samp, ci = c(samp, samp), se = 0)) + } +} diff --git a/R/convergence.R b/R/convergence.R index d60b6c6..3f10d7a 100644 --- a/R/convergence.R +++ b/R/convergence.R @@ -1,220 +1,220 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Check Models Convergence -#' -#' \code{robyn_converge()} consumes \code{robyn_run()} outputs -#' and calculate convergence status and builds convergence plots. -#' Convergence is calculated by default using the following criteria -#' (having kept the default parameters: sd_qtref = 3 and med_lowb = 2): -#' \describe{ -#' \item{Criteria #1:}{Last quantile's standard deviation < first 3 -#' quantiles' mean standard deviation} -#' \item{Criteria #2:}{Last quantile's absolute median < absolute first -#' quantile's absolute median - 2 * first 3 quantiles' mean standard -#' deviation} -#' } -#' Both mentioned criteria have to be satisfied to consider MOO convergence. -#' -#' @param OutputModels List. Output from \code{robyn_run()}. -#' @param n_cuts Integer. Default to 20 (5\% cuts each). -#' @param sd_qtref Integer. Reference quantile of the error convergence rule -#' for standard deviation (Criteria #1). Defaults to 3. -#' @param med_lowb Integer. Lower bound distance of the error convergence rule -#' for median. (Criteria #2). Default to 3. -#' @param nrmse_win Numeric vector. Lower and upper quantiles thresholds to -#' winsorize NRMSE. Set values within [0,1]; default: c(0, 0.998) which is 1/500. -#' @param ... Additional parameters -#' @examples -#' \dontrun{ -#' # Having OutputModels results -#' MOO <- robyn_converge( -#' OutputModels, -#' n_cuts = 10, -#' sd_qtref = 3, -#' med_lowb = 3 -#' ) -#' } -#' @return List. Plots and MOO convergence results. -#' @export -robyn_converge <- function(OutputModels, - n_cuts = 20, sd_qtref = 3, med_lowb = 2, - nrmse_win = c(0, 0.998), ...) 
{ - stopifnot(n_cuts > min(c(sd_qtref, med_lowb)) + 1) - - # Gather all trials - get_trials <- which(names(OutputModels) %in% paste0("trial", seq(OutputModels$trials))) - df <- bind_rows(lapply(OutputModels[get_trials], function(x) x$resultCollect$resultHypParam)) - calibrated <- isTRUE(sum(df$mape) > 0) - - # Calculate deciles - dt_objfunc_cvg <- tidyr::gather(df, "error_type", "value", any_of(c("nrmse", "decomp.rssd", "mape"))) %>% - select(.data$ElapsedAccum, .data$trial, .data$error_type, .data$value) %>% - arrange(.data$trial, .data$ElapsedAccum) %>% - filter(.data$value > 0, is.finite(.data$value)) %>% - mutate(error_type = toupper(.data$error_type)) %>% - group_by(.data$error_type, .data$trial) %>% - mutate(iter = row_number()) %>% - ungroup() %>% - mutate(cuts = cut( - .data$iter, - breaks = seq(0, max(.data$iter), length.out = n_cuts + 1), - labels = round(seq(max(.data$iter) / n_cuts, max(.data$iter), length.out = n_cuts)), - include.lowest = TRUE, ordered_result = TRUE, dig.lab = 6 - )) - - # Calculate standard deviations and absolute medians on each cut - errors <- dt_objfunc_cvg %>% - group_by(.data$error_type, .data$cuts) %>% - summarise( - n = n(), - median = median(.data$value), - std = sd(.data$value), - .groups = "drop" - ) %>% - group_by(.data$error_type) %>% - mutate( - med_var_P = abs(round(100 * (.data$median - lag(.data$median)) / .data$median, 2)) - ) %>% - group_by(.data$error_type) %>% - mutate( - first_med = abs(dplyr::first(.data$median)), - first_med_avg = abs(mean(.data$median[1:sd_qtref])), - last_med = abs(dplyr::last(.data$median)), - first_sd = dplyr::first(.data$std), - first_sd_avg = mean(.data$std[1:sd_qtref]), - last_sd = dplyr::last(.data$std) - ) %>% - mutate( - med_thres = abs(.data$first_med - med_lowb * .data$first_sd_avg), - flag_med = abs(.data$median) < .data$med_thres, - flag_sd = .data$std < .data$first_sd_avg - ) - - conv_msg <- NULL - for (obj_fun in unique(errors$error_type)) { - temp.df <- filter(errors, .data$error_type == obj_fun) %>% - mutate(median = signif(median, 2)) - last.qt <- tail(temp.df, 1) - greater <- ">" # intToUtf8(8814) - temp <- glued( - paste( - "{error_type} {did}converged: sd@qt.{quantile} {sd} {symb_sd} {sd_threh} &", - "|med@qt.{quantile}| {qtn_median} {symb_med} {med_threh}" - ), - error_type = last.qt$error_type, - did = ifelse(last.qt$flag_sd & last.qt$flag_med, "", "NOT "), - sd = signif(last.qt$last_sd, 2), - symb_sd = ifelse(last.qt$flag_sd, "<=", greater), - sd_threh = signif(last.qt$first_sd_avg, 2), - quantile = n_cuts, - qtn_median = signif(last.qt$last_med, 2), - symb_med = ifelse(last.qt$flag_med, "<=", greater), - med_threh = signif(last.qt$med_thres, 2) - ) - conv_msg <- c(conv_msg, temp) - } - message(paste(paste("-", conv_msg), collapse = "\n")) - - subtitle <- sprintf( - "%s trial%s with %s iterations%s using %s", - max(df$trial), ifelse(max(df$trial) > 1, "s", ""), max(dt_objfunc_cvg$cuts), - ifelse(max(df$trial) > 1, " each", ""), OutputModels$nevergrad_algo - ) - - moo_distrb_plot <- dt_objfunc_cvg %>% - mutate(id = as.integer(.data$cuts)) %>% - mutate(cuts = factor(.data$cuts, levels = rev(levels(.data$cuts)))) %>% - group_by(.data$error_type) %>% - mutate(value = lares::winsorize(.data$value, nrmse_win), na.rm = TRUE) %>% - ggplot(aes(x = .data$value, y = .data$cuts, fill = -.data$id)) + - ggridges::geom_density_ridges( - scale = 2.5, col = "white", quantile_lines = TRUE, quantiles = 2, alpha = 0.7 - ) + - facet_grid(. 
~ .data$error_type, scales = "free") +
-    scale_fill_distiller(palette = "GnBu") +
-    guides(fill = "none") +
-    theme_lares(background = "white", ) +
-    labs(
-      x = "Objective functions", y = "Iterations [#]",
-      title = "Objective convergence by iterations quantiles",
-      subtitle = subtitle,
-      caption = paste(conv_msg, collapse = "\n")
-    )
-
-  moo_cloud_plot <- df %>%
-    mutate(nrmse = lares::winsorize(.data$nrmse, nrmse_win), na.rm = TRUE) %>%
-    ggplot(aes(
-      x = .data$nrmse, y = .data$decomp.rssd, colour = .data$ElapsedAccum
-    )) +
-    scale_colour_gradient(low = "skyblue", high = "navyblue") +
-    labs(
-      title = ifelse(!calibrated, "Multi-objective evolutionary performance",
-        "Multi-objective evolutionary performance with calibration"
-      ),
-      subtitle = subtitle,
-      x = ifelse(max(nrmse_win) == 1, "NRMSE", sprintf("NRMSE [Winsorized %s]", paste(nrmse_win, collapse = "-"))),
-      y = "DECOMP.RSSD",
-      colour = "Time [s]",
-      size = "MAPE",
-      alpha = NULL,
-      caption = paste(conv_msg, collapse = "\n")
-    ) +
-    theme_lares(background = "white", )
-
-  if (calibrated) {
-    moo_cloud_plot <- moo_cloud_plot +
-      geom_point(data = df, aes(size = .data$mape, alpha = 1 - .data$mape)) +
-      guides(alpha = "none")
-  } else {
-    moo_cloud_plot <- moo_cloud_plot + geom_point()
-  }
-
-  cvg_out <- list(
-    moo_distrb_plot = moo_distrb_plot,
-    moo_cloud_plot = moo_cloud_plot,
-    errors = errors,
-    conv_msg = conv_msg
-  )
-  attr(cvg_out, "sd_qtref") <- sd_qtref
-  attr(cvg_out, "med_lowb") <- med_lowb
-
-  return(invisible(cvg_out))
-}
-
-test_cvg <- function() {
-  # Experiment with gamma distribution fitting
-  gamma_mle <- function(params, x) {
-    gamma_shape <- params[[1]]
-    gamma_scale <- params[[2]]
-    # Negative log-likelihood
-    return(-sum(dgamma(x, shape = gamma_shape, scale = gamma_scale, log = TRUE)))
-  }
-  f_geo <- function(a, r, n) {
-    for (i in 2:n) a[i] <- a[i - 1] * r
-    return(a)
-  }
-  seq_nrmse <- f_geo(5, 0.7, 100)
-  df_nrmse <- data.frame(x = 1:100, y = seq_nrmse, type = "true")
-  mod_gamma <- nloptr(
-    x0 = c(1, 1), eval_f = gamma_mle, lb = c(0, 0),
-    x = seq_nrmse,
-    opts = list(algorithm = "NLOPT_LN_SBPLX", maxeval = 1e5)
-  )
-  gamma_params <- mod_gamma$solution
-  seq_nrmse_gam <- 1 / dgamma(seq_nrmse, shape = gamma_params[[1]], scale = gamma_params[[2]])
-  seq_nrmse_gam <- seq_nrmse_gam / (max(seq_nrmse_gam) - min(seq_nrmse_gam))
-  seq_nrmse_gam <- max(seq_nrmse) * seq_nrmse_gam
-  range(seq_nrmse_gam)
-  range(seq_nrmse)
-  df_nrmse_gam <- data.frame(x = 1:100, y = seq_nrmse_gam, type = "pred")
-  df_nrmse <- bind_rows(df_nrmse, df_nrmse_gam)
-  p <- ggplot(df_nrmse, aes(.data$x, .data$y, color = .data$type)) +
-    geom_line()
-  return(p)
-  # g_low = qgamma(0.025, shape=gamma_params[[1]], scale= gamma_params[[2]])
-  # g_up = qgamma(0.975, shape=gamma_params[[1]], scale= gamma_params[[2]])
-}
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+####################################################################
+#' Check Models Convergence
+#'
+#' \code{robyn_converge()} consumes \code{robyn_run()} outputs,
+#' calculates convergence status, and builds convergence plots.
+#' Convergence is calculated by default using the following criteria
+#' (having kept the default parameters: sd_qtref = 3 and med_lowb = 2):
+#' \describe{
+#'   \item{Criterion #1:}{Last quantile's standard deviation < first 3
+#'   quantiles' mean standard deviation}
+#'   \item{Criterion #2:}{Last quantile's absolute median < first quantile's
+#'   absolute median - 2 * first 3 quantiles' mean standard deviation}
+#' }
+#' Both criteria must be satisfied to consider MOO convergence.
+#'
+#' @param OutputModels List. Output from \code{robyn_run()}.
+#' @param n_cuts Integer. Defaults to 20 (5\% cuts each).
+#' @param sd_qtref Integer. Reference quantile of the error convergence rule
+#' for standard deviation (Criterion #1). Defaults to 3.
+#' @param med_lowb Integer. Lower bound distance of the error convergence rule
+#' for the median (Criterion #2). Defaults to 2.
+#' @param nrmse_win Numeric vector. Lower and upper quantile thresholds to
+#' winsorize NRMSE. Set values within [0,1]; default: c(0, 0.998) which is 1/500.
+#' @param ... Additional parameters
+#' @examples
+#' \dontrun{
+#' # Having OutputModels results
+#' MOO <- robyn_converge(
+#'   OutputModels,
+#'   n_cuts = 10,
+#'   sd_qtref = 3,
+#'   med_lowb = 3
+#' )
+#' }
+#' @return List. Plots and MOO convergence results.
+#' @export
+robyn_converge <- function(OutputModels,
+                           n_cuts = 20, sd_qtref = 3, med_lowb = 2,
+                           nrmse_win = c(0, 0.998), ...) {
+  stopifnot(n_cuts > min(c(sd_qtref, med_lowb)) + 1)
+
+  # Gather all trials
+  get_trials <- which(names(OutputModels) %in% paste0("trial", seq(OutputModels$trials)))
+  df <- bind_rows(lapply(OutputModels[get_trials], function(x) x$resultCollect$resultHypParam))
+  calibrated <- isTRUE(sum(df$mape) > 0)
+
+  # Rank iterations per trial and split them into n_cuts quantile cuts
+  dt_objfunc_cvg <- tidyr::gather(df, "error_type", "value", any_of(c("nrmse", "decomp.rssd", "mape"))) %>%
+    select(.data$ElapsedAccum, .data$trial, .data$error_type, .data$value) %>%
+    arrange(.data$trial, .data$ElapsedAccum) %>%
+    filter(.data$value > 0, is.finite(.data$value)) %>%
+    mutate(error_type = toupper(.data$error_type)) %>%
+    group_by(.data$error_type, .data$trial) %>%
+    mutate(iter = row_number()) %>%
+    ungroup() %>%
+    mutate(cuts = cut(
+      .data$iter,
+      breaks = seq(0, max(.data$iter), length.out = n_cuts + 1),
+      labels = round(seq(max(.data$iter) / n_cuts, max(.data$iter), length.out = n_cuts)),
+      include.lowest = TRUE, ordered_result = TRUE, dig.lab = 6
+    ))
+
+  # Calculate standard deviations and absolute medians on each cut
+  errors <- dt_objfunc_cvg %>%
+    group_by(.data$error_type, .data$cuts) %>%
+    summarise(
+      n = n(),
+      median = median(.data$value),
+      std = sd(.data$value),
+      .groups = "drop"
+    ) %>%
+    group_by(.data$error_type) %>%
+    mutate(
+      med_var_P = abs(round(100 * (.data$median - lag(.data$median)) / .data$median, 2))
+    ) %>%
+    group_by(.data$error_type) %>%
+    mutate(
+      first_med = abs(dplyr::first(.data$median)),
+      first_med_avg = abs(mean(.data$median[1:sd_qtref])),
+      last_med = abs(dplyr::last(.data$median)),
+      first_sd = dplyr::first(.data$std),
+      first_sd_avg = mean(.data$std[1:sd_qtref]),
+      last_sd = dplyr::last(.data$std)
+    ) %>%
+    mutate(
+      med_thres = abs(.data$first_med - med_lowb * .data$first_sd_avg),
+      flag_med = abs(.data$median) < .data$med_thres,
+      flag_sd = .data$std < .data$first_sd_avg
+    )
+
+  conv_msg <- NULL
+  for (obj_fun in unique(errors$error_type)) {
+    temp.df <- filter(errors, .data$error_type == obj_fun) %>%
+      mutate(median = signif(median, 2))
+    last.qt <- tail(temp.df, 1)
+    greater <- ">" # 
intToUtf8(8814) + temp <- glued( + paste( + "{error_type} {did}converged: sd@qt.{quantile} {sd} {symb_sd} {sd_threh} &", + "|med@qt.{quantile}| {qtn_median} {symb_med} {med_threh}" + ), + error_type = last.qt$error_type, + did = ifelse(last.qt$flag_sd & last.qt$flag_med, "", "NOT "), + sd = signif(last.qt$last_sd, 2), + symb_sd = ifelse(last.qt$flag_sd, "<=", greater), + sd_threh = signif(last.qt$first_sd_avg, 2), + quantile = n_cuts, + qtn_median = signif(last.qt$last_med, 2), + symb_med = ifelse(last.qt$flag_med, "<=", greater), + med_threh = signif(last.qt$med_thres, 2) + ) + conv_msg <- c(conv_msg, temp) + } + message(paste(paste("-", conv_msg), collapse = "\n")) + + subtitle <- sprintf( + "%s trial%s with %s iterations%s using %s", + max(df$trial), ifelse(max(df$trial) > 1, "s", ""), max(dt_objfunc_cvg$cuts), + ifelse(max(df$trial) > 1, " each", ""), OutputModels$nevergrad_algo + ) + + moo_distrb_plot <- dt_objfunc_cvg %>% + mutate(id = as.integer(.data$cuts)) %>% + mutate(cuts = factor(.data$cuts, levels = rev(levels(.data$cuts)))) %>% + group_by(.data$error_type) %>% + mutate(value = lares::winsorize(.data$value, nrmse_win), na.rm = TRUE) %>% + ggplot(aes(x = .data$value, y = .data$cuts, fill = -.data$id)) + + ggridges::geom_density_ridges( + scale = 2.5, col = "white", quantile_lines = TRUE, quantiles = 2, alpha = 0.7 + ) + + facet_grid(. ~ .data$error_type, scales = "free") + + scale_fill_distiller(palette = "GnBu") + + guides(fill = "none") + + theme_lares(background = "white", ) + + labs( + x = "Objective functions", y = "Iterations [#]", + title = "Objective convergence by iterations quantiles", + subtitle = subtitle, + caption = paste(conv_msg, collapse = "\n") + ) + + moo_cloud_plot <- df %>% + mutate(nrmse = lares::winsorize(.data$nrmse, nrmse_win), na.rm = TRUE) %>% + ggplot(aes( + x = .data$nrmse, y = .data$decomp.rssd, colour = .data$ElapsedAccum + )) + + scale_colour_gradient(low = "skyblue", high = "navyblue") + + labs( + title = ifelse(!calibrated, "Multi-objective evolutionary performance", + "Multi-objective evolutionary performance with calibration" + ), + subtitle = subtitle, + x = ifelse(max(nrmse_win) == 1, "NRMSE", sprintf("NRMSE [Winsorized %s]", paste(nrmse_win, collapse = "-"))), + y = "DECOMP.RSSD", + colour = "Time [s]", + size = "MAPE", + alpha = NULL, + caption = paste(conv_msg, collapse = "\n") + ) + + theme_lares(background = "white", ) + + if (calibrated) { + moo_cloud_plot <- moo_cloud_plot + + geom_point(data = df, aes(size = .data$mape, alpha = 1 - .data$mape)) + + guides(alpha = "none") + } else { + moo_cloud_plot <- moo_cloud_plot + geom_point() + } + + cvg_out <- list( + moo_distrb_plot = moo_distrb_plot, + moo_cloud_plot = moo_cloud_plot, + errors = errors, + conv_msg = conv_msg + ) + attr(cvg_out, "sd_qtref") <- sd_qtref + attr(cvg_out, "med_lowb") <- med_lowb + + return(invisible(cvg_out)) +} + +test_cvg <- function() { + # Experiment with gamma distribution fitting + gamma_mle <- function(params, x) { + gamma_shape <- params[[1]] + gamma_scale <- params[[2]] + # Negative log-likelihood + return(-sum(dgamma(x, shape = gamma_shape, scale = gamma_scale, log = TRUE))) + } + f_geo <- function(a, r, n) { + for (i in 2:n) a[i] <- a[i - 1] * r + return(a) + } + seq_nrmse <- f_geo(5, 0.7, 100) + df_nrmse <- data.frame(x = 1:100, y = seq_nrmse, type = "true") + mod_gamma <- nloptr( + x0 = c(1, 1), eval_f = gamma_mle, lb = c(0, 0), + x = seq_nrmse, + opts = list(algorithm = "NLOPT_LN_SBPLX", maxeval = 1e5) + ) + gamma_params <- mod_gamma$solution + 
seq_nrmse_gam <- 1 / dgamma(seq_nrmse, shape = gamma_params[[1]], scale = gamma_params[[2]]) + seq_nrmse_gam <- seq_nrmse_gam / (max(seq_nrmse_gam) - min(seq_nrmse_gam)) + seq_nrmse_gam <- max(seq_nrmse) * seq_nrmse_gam + range(seq_nrmse_gam) + range(seq_nrmse) + df_nrmse_gam <- data.frame(x = 1:100, y = seq_nrmse_gam, type = "pred") + df_nrmse <- bind_rows(df_nrmse, df_nrmse_gam) + p <- ggplot(df_nrmse, aes(.data$x, .data$y, color = .data$type)) + + geom_line() + return(p) + # g_low = qgamma(0.025, shape=gamma_params[[1]], scale= gamma_params[[2]]) + # g_up = qgamma(0.975, shape=gamma_params[[1]], scale= gamma_params[[2]]) +} diff --git a/R/data.R b/R/data.R index 2dbb69d..a171943 100644 --- a/R/data.R +++ b/R/data.R @@ -1,63 +1,63 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Robyn Dataset: MMM Demo Data -#' -#' Simulated MMM data. Input time series should be daily, weekly or monthly. -#' -#' @family Dataset -#' @docType data -#' @usage data(dt_simulated_weekly) -#' @return data.frame -#' @format An object of class \code{"data.frame"} -#' \describe{ -#' \item{DATE}{Date} -#' \item{revenue}{Daily total revenue} -#' \item{tv_S}{Television} -#' \item{ooh_S}{Out of home} -#' \item{...}{...} -#' } -#' @examples -#' data(dt_simulated_weekly) -#' head(dt_simulated_weekly) -#' @return Dataframe. Contains simulated dummy dataset to test and run demo. -"dt_simulated_weekly" - -# dt_input <- read.csv('data/de_simulated_data.csv') -# save(dt_input, file = "data/dt_input.RData", version = 2) -# dt_simulated_weekly <- as_tibble(dt_simulated_weekly) -# save(dt_simulated_weekly, file = "data/dt_simulated_weekly.RData", version = 2) - -#################################################################### -#' Robyn Dataset: Holidays by Country -#' -#' Contains \code{prophet}'s "new" default holidays by country. -#' When using own holidays, please keep the header -#' \code{c("ds", "holiday", "country", "year")}. -#' -#' -#' @family Dataset -#' @docType data -#' @usage data(dt_prophet_holidays) -#' @return data.frame -#' @format An object of class \code{"data.frame"} -#' \describe{ -#' \item{ds}{Date} -#' \item{holiday}{Name of celebrated holiday} -#' \item{country}{Code for the country (Alpha-2)} -#' \item{year}{Year of \code{ds}} -#' } -#' @examples -#' data(dt_prophet_holidays) -#' head(dt_prophet_holidays) -#' @return Dataframe. Contains \code{prophet}'s default holidays by country. -"dt_prophet_holidays" - -# dt_prophet_holidays <- read.csv("~/Desktop/generated_holidays.csv") -# dt_prophet_holidays <- as_tibble(dt_prophet_holidays) -# lares::missingness(dt_prophet_holidays) -# dt_prophet_holidays <- dplyr::filter(dt_prophet_holidays, !is.na(country)) -# save(dt_prophet_holidays, file = "data/dt_prophet_holidays.RData", version = 2) +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#################################################################### +#' Robyn Dataset: MMM Demo Data +#' +#' Simulated MMM data. Input time series should be daily, weekly or monthly. 
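+#'
+#' For a quick look at its structure (illustrative snippet; only a subset of
+#' columns is documented below):
+#' \preformatted{
+#' data(dt_simulated_weekly)
+#' str(dt_simulated_weekly[, c("DATE", "revenue", "tv_S", "ooh_S")])
+#' range(dt_simulated_weekly$DATE) # first and last period of the series
+#' }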
+#' +#' @family Dataset +#' @docType data +#' @usage data(dt_simulated_weekly) +#' @return data.frame +#' @format An object of class \code{"data.frame"} +#' \describe{ +#' \item{DATE}{Date} +#' \item{revenue}{Daily total revenue} +#' \item{tv_S}{Television} +#' \item{ooh_S}{Out of home} +#' \item{...}{...} +#' } +#' @examples +#' data(dt_simulated_weekly) +#' head(dt_simulated_weekly) +#' @return Dataframe. Contains simulated dummy dataset to test and run demo. +"dt_simulated_weekly" + +# dt_input <- read.csv('data/de_simulated_data.csv') +# save(dt_input, file = "data/dt_input.RData", version = 2) +# dt_simulated_weekly <- as_tibble(dt_simulated_weekly) +# save(dt_simulated_weekly, file = "data/dt_simulated_weekly.RData", version = 2) + +#################################################################### +#' Robyn Dataset: Holidays by Country +#' +#' Contains \code{prophet}'s "new" default holidays by country. +#' When using own holidays, please keep the header +#' \code{c("ds", "holiday", "country", "year")}. +#' +#' +#' @family Dataset +#' @docType data +#' @usage data(dt_prophet_holidays) +#' @return data.frame +#' @format An object of class \code{"data.frame"} +#' \describe{ +#' \item{ds}{Date} +#' \item{holiday}{Name of celebrated holiday} +#' \item{country}{Code for the country (Alpha-2)} +#' \item{year}{Year of \code{ds}} +#' } +#' @examples +#' data(dt_prophet_holidays) +#' head(dt_prophet_holidays) +#' @return Dataframe. Contains \code{prophet}'s default holidays by country. +"dt_prophet_holidays" + +# dt_prophet_holidays <- read.csv("~/Desktop/generated_holidays.csv") +# dt_prophet_holidays <- as_tibble(dt_prophet_holidays) +# lares::missingness(dt_prophet_holidays) +# dt_prophet_holidays <- dplyr::filter(dt_prophet_holidays, !is.na(country)) +# save(dt_prophet_holidays, file = "data/dt_prophet_holidays.RData", version = 2) diff --git a/R/exports.R b/R/exports.R index fa58c31..e7b16d5 100644 --- a/R/exports.R +++ b/R/exports.R @@ -1,220 +1,220 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Export Robyn Model to Local File [DEPRECATED] -#' -#' Use \code{robyn_save()} to select and save as .RDS file the initial model. -#' -#' @inheritParams robyn_allocator -#' @inheritParams robyn_outputs -#' @inheritParams robyn_write -#' @return (Invisible) list with filename and summary. Class: \code{robyn_save}. -#' @export -robyn_save <- function(InputCollect, - OutputCollect, - robyn_object = NULL, - select_model = NULL, - dir = OutputCollect$plot_folder, - quiet = FALSE, ...) { - warning(paste( - "Function robyn_save() is not supported anymore.", - "Please migrate to robyn_write() and robyn_read()" - )) - check_robyn_name(robyn_object, quiet) - if (is.null(select_model)) select_model <- OutputCollect[["selectID"]] - if (!select_model %in% OutputCollect$allSolutions) { - stop(paste0("Input 'select_model' must be one of these values: ", paste( - OutputCollect$allSolutions, - collapse = ", " - ))) - } - - # Export as JSON file - json <- robyn_write(InputCollect, OutputCollect, select_model, ...) 
- - summary <- filter(OutputCollect$xDecompAgg, .data$solID == select_model) %>% - select( - variable = .data$rn, .data$coef, decomp = .data$xDecompPerc, - .data$total_spend, mean_non0_spend = .data$mean_spend - ) - - # Nice and tidy table format for hyper-parameters - regex <- paste(paste0("_", HYPS_NAMES), collapse = "|") - hyps <- filter(OutputCollect$resultHypParam, .data$solID == select_model) %>% - select(contains(HYPS_NAMES)) %>% - tidyr::gather() %>% - tidyr::separate(.data$key, - into = c("channel", "none"), - sep = regex, remove = FALSE - ) %>% - mutate(hyperparameter = gsub("^.*_", "", .data$key)) %>% - select(.data$channel, .data$hyperparameter, .data$value) %>% - tidyr::spread(key = "hyperparameter", value = "value") - - values <- OutputCollect[!unlist(lapply(OutputCollect, is.list))] - values <- values[!names(values) %in% c("allSolutions", "hyper_fixed", "plot_folder")] - - output <- list( - robyn_object = robyn_object, - select_model = select_model, - summary = summary, - errors = json$ExportedModel$errors, - hyper_df = hyps, - hyper_values = json$ExportedModel$hyper_values, - hyper_updated = OutputCollect$hyper_updated, - window = c(InputCollect$window_start, InputCollect$window_end), - periods = InputCollect$rollingWindowLength, - interval = InputCollect$intervalType, - adstock = InputCollect$adstock, - plot = robyn_onepagers(InputCollect, OutputCollect, - select_model, - quiet = TRUE, - export = FALSE, - ... - ) - ) - output <- append(output, values) - if (InputCollect$dep_var_type == "conversion") { - colnames(output$summary) <- gsub("roi_", "cpa_", colnames(output$summary)) - } - class(output) <- c("robyn_save", class(output)) - - if (!is.null(robyn_object)) { - if (file.exists(robyn_object)) { - if (!quiet) { - answer <- askYesNo(paste0(robyn_object, " already exists. Are you certain to overwrite it?")) - } else { - answer <- TRUE - } - if (answer == FALSE || is.na(answer)) { - message("Stopped export to avoid overwriting") - return(invisible(output)) - } - } - } - - OutputCollect$resultHypParam <- OutputCollect$resultHypParam[ - OutputCollect$resultHypParam$solID == select_model, - ] - OutputCollect$xDecompAgg <- OutputCollect$xDecompAgg[ - OutputCollect$resultHypParam$solID == select_model, - ] - OutputCollect$mediaVecCollect <- OutputCollect$mediaVecCollect[ - OutputCollect$resultHypParam$solID == select_model, - ] - OutputCollect$xDecompVecCollect <- OutputCollect$xDecompVecCollect[ - OutputCollect$resultHypParam$solID == select_model, - ] - OutputCollect$selectID <- select_model - - InputCollect$refreshCounter <- 0 - listInit <- list(InputCollect = InputCollect, OutputCollect = OutputCollect) - Robyn <- list(listInit = listInit) - - class(Robyn) <- c("robyn_exported", class(Robyn)) - if (!is.null(robyn_object)) { - saveRDS(Robyn, file = robyn_object) - if (!quiet) message("Exported results: ", robyn_object) - } - return(invisible(output)) -} - -#' @rdname robyn_save -#' @aliases robyn_save -#' @param x \code{robyn_save()} output. -#' @export -print.robyn_save <- function(x, ...) 
{ - print(glued( - " - Exported file: {x$robyn_object} - Exported model: {x$select_model} - Window: {x$window[1]} to {x$window[2]} ({x$periods} {x$interval}s)" - )) - - print(glued( - "\n\nModel's Performance and Errors:\n {errors}", - errors = paste( - sprintf( - "R2 (%s): %s)", - ifelse(!isTRUE(x$ExportedModel$ts_validation), "train", "test"), - ifelse(!isTRUE(x$ExportedModel$ts_validation), - signif(x$errors$rsq_train, 4), signif(x$errors$rsq_test, 4) - ) - ), - "| NRMSE =", signif(x$errors$nrmse, 4), - "| DECOMP.RSSD =", signif(x$errors$decomp.rssd, 4), - "| MAPE =", signif(x$errors$mape, 4) - ) - )) - - print(glued("\n\nSummary Values on Selected Model:")) - - print(x$summary %>% - mutate(decomp = formatNum(100 * .data$decomp, pos = "%")) %>% - dplyr::mutate_if(is.numeric, function(x) ifelse(!is.infinite(x), x, 0)) %>% - dplyr::mutate_if(is.numeric, function(x) formatNum(x, 4, abbr = TRUE)) %>% - replace(., . == "NA", "-") %>% as.data.frame()) - - print(glued( - "\n\nHyper-parameters:\n Adstock: {x$adstock}" - )) - - print(as.data.frame(x$hyper_df)) -} - -#' @rdname robyn_save -#' @aliases robyn_save -#' @param x \code{robyn_save()} output. -#' @export -plot.robyn_save <- function(x, ...) plot(x$plot[[1]], ...) - -#' @rdname robyn_save -#' @aliases robyn_save -#' @return (Invisible) list with imported results -#' @export -robyn_load <- function(robyn_object, select_build = NULL, quiet = FALSE) { - if ("robyn_exported" %in% class(robyn_object) || is.list(robyn_object)) { - Robyn <- robyn_object - objectPath <- Robyn$listInit$OutputCollect$plot_folder - robyn_object <- paste0(objectPath, "/Robyn_", Robyn$listInit$OutputCollect$selectID, ".RDS") - if (!dir.exists(objectPath)) { - stop("Directory does not exist or is somewhere else. Check: ", objectPath) - } - } else { - if (!"character" %in% class(robyn_object)) { - stop("Input 'robyn_object' must be a character input or 'robyn_exported' object") - } - check_robyn_name(robyn_object, quiet) - Robyn <- readRDS(robyn_object) - objectPath <- dirname(robyn_object) - } - select_build_all <- 0:(length(Robyn) - 1) - if (is.null(select_build)) { - select_build <- max(select_build_all) - if (!quiet) { - message( - ">>> Loaded Model: ", - ifelse(select_build == 0, "Initial model", paste0("Refresh model #", select_build)) - ) - } - } - if (!(select_build %in% select_build_all) || length(select_build) != 1) { - stop("Input 'select_build' must be one value of ", paste(select_build_all, collapse = ", ")) - } - listName <- ifelse(select_build == 0, "listInit", paste0("listRefresh", select_build)) - InputCollect <- Robyn[[listName]][["InputCollect"]] - OutputCollect <- Robyn[[listName]][["OutputCollect"]] - select_model <- OutputCollect$selectID - output <- list( - Robyn = Robyn, - InputCollect = InputCollect, - OutputCollect = OutputCollect, - select_model = select_model, - objectPath = objectPath, - robyn_object = robyn_object - ) - return(invisible(output)) -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#################################################################### +#' Export Robyn Model to Local File [DEPRECATED] +#' +#' Use \code{robyn_save()} to select and save as .RDS file the initial model. +#' +#' @inheritParams robyn_allocator +#' @inheritParams robyn_outputs +#' @inheritParams robyn_write +#' @return (Invisible) list with filename and summary. Class: \code{robyn_save}. 
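+#' @details Minimal migration sketch (model ID and file name are
+#' illustrative; \code{robyn_write()} exports a JSON file instead of an RDS):
+#' \preformatted{
+#' # Deprecated: robyn_save(InputCollect, OutputCollect, select_model = "1_2_3")
+#' robyn_write(InputCollect, OutputCollect, select_model = "1_2_3")
+#' json <- robyn_read("RobynModel-1_2_3.json")
+#' }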
+#' @export +robyn_save <- function(InputCollect, + OutputCollect, + robyn_object = NULL, + select_model = NULL, + dir = OutputCollect$plot_folder, + quiet = FALSE, ...) { + warning(paste( + "Function robyn_save() is not supported anymore.", + "Please migrate to robyn_write() and robyn_read()" + )) + check_robyn_name(robyn_object, quiet) + if (is.null(select_model)) select_model <- OutputCollect[["selectID"]] + if (!select_model %in% OutputCollect$allSolutions) { + stop(paste0("Input 'select_model' must be one of these values: ", paste( + OutputCollect$allSolutions, + collapse = ", " + ))) + } + + # Export as JSON file + json <- robyn_write(InputCollect, OutputCollect, select_model, ...) + + summary <- filter(OutputCollect$xDecompAgg, .data$solID == select_model) %>% + select( + variable = .data$rn, .data$coef, decomp = .data$xDecompPerc, + .data$total_spend, mean_non0_spend = .data$mean_spend + ) + + # Nice and tidy table format for hyper-parameters + regex <- paste(paste0("_", HYPS_NAMES), collapse = "|") + hyps <- filter(OutputCollect$resultHypParam, .data$solID == select_model) %>% + select(contains(HYPS_NAMES)) %>% + tidyr::gather() %>% + tidyr::separate(.data$key, + into = c("channel", "none"), + sep = regex, remove = FALSE + ) %>% + mutate(hyperparameter = gsub("^.*_", "", .data$key)) %>% + select(.data$channel, .data$hyperparameter, .data$value) %>% + tidyr::spread(key = "hyperparameter", value = "value") + + values <- OutputCollect[!unlist(lapply(OutputCollect, is.list))] + values <- values[!names(values) %in% c("allSolutions", "hyper_fixed", "plot_folder")] + + output <- list( + robyn_object = robyn_object, + select_model = select_model, + summary = summary, + errors = json$ExportedModel$errors, + hyper_df = hyps, + hyper_values = json$ExportedModel$hyper_values, + hyper_updated = OutputCollect$hyper_updated, + window = c(InputCollect$window_start, InputCollect$window_end), + periods = InputCollect$rollingWindowLength, + interval = InputCollect$intervalType, + adstock = InputCollect$adstock, + plot = robyn_onepagers(InputCollect, OutputCollect, + select_model, + quiet = TRUE, + export = FALSE, + ... + ) + ) + output <- append(output, values) + if (InputCollect$dep_var_type == "conversion") { + colnames(output$summary) <- gsub("roi_", "cpa_", colnames(output$summary)) + } + class(output) <- c("robyn_save", class(output)) + + if (!is.null(robyn_object)) { + if (file.exists(robyn_object)) { + if (!quiet) { + answer <- askYesNo(paste0(robyn_object, " already exists. 
Are you certain to overwrite it?")) + } else { + answer <- TRUE + } + if (answer == FALSE || is.na(answer)) { + message("Stopped export to avoid overwriting") + return(invisible(output)) + } + } + } + + OutputCollect$resultHypParam <- OutputCollect$resultHypParam[ + OutputCollect$resultHypParam$solID == select_model, + ] + OutputCollect$xDecompAgg <- OutputCollect$xDecompAgg[ + OutputCollect$resultHypParam$solID == select_model, + ] + OutputCollect$mediaVecCollect <- OutputCollect$mediaVecCollect[ + OutputCollect$resultHypParam$solID == select_model, + ] + OutputCollect$xDecompVecCollect <- OutputCollect$xDecompVecCollect[ + OutputCollect$resultHypParam$solID == select_model, + ] + OutputCollect$selectID <- select_model + + InputCollect$refreshCounter <- 0 + listInit <- list(InputCollect = InputCollect, OutputCollect = OutputCollect) + Robyn <- list(listInit = listInit) + + class(Robyn) <- c("robyn_exported", class(Robyn)) + if (!is.null(robyn_object)) { + saveRDS(Robyn, file = robyn_object) + if (!quiet) message("Exported results: ", robyn_object) + } + return(invisible(output)) +} + +#' @rdname robyn_save +#' @aliases robyn_save +#' @param x \code{robyn_save()} output. +#' @export +print.robyn_save <- function(x, ...) { + print(glued( + " + Exported file: {x$robyn_object} + Exported model: {x$select_model} + Window: {x$window[1]} to {x$window[2]} ({x$periods} {x$interval}s)" + )) + + print(glued( + "\n\nModel's Performance and Errors:\n {errors}", + errors = paste( + sprintf( + "R2 (%s): %s)", + ifelse(!isTRUE(x$ExportedModel$ts_validation), "train", "test"), + ifelse(!isTRUE(x$ExportedModel$ts_validation), + signif(x$errors$rsq_train, 4), signif(x$errors$rsq_test, 4) + ) + ), + "| NRMSE =", signif(x$errors$nrmse, 4), + "| DECOMP.RSSD =", signif(x$errors$decomp.rssd, 4), + "| MAPE =", signif(x$errors$mape, 4) + ) + )) + + print(glued("\n\nSummary Values on Selected Model:")) + + print(x$summary %>% + mutate(decomp = formatNum(100 * .data$decomp, pos = "%")) %>% + dplyr::mutate_if(is.numeric, function(x) ifelse(!is.infinite(x), x, 0)) %>% + dplyr::mutate_if(is.numeric, function(x) formatNum(x, 4, abbr = TRUE)) %>% + replace(., . == "NA", "-") %>% as.data.frame()) + + print(glued( + "\n\nHyper-parameters:\n Adstock: {x$adstock}" + )) + + print(as.data.frame(x$hyper_df)) +} + +#' @rdname robyn_save +#' @aliases robyn_save +#' @param x \code{robyn_save()} output. +#' @export +plot.robyn_save <- function(x, ...) plot(x$plot[[1]], ...) + +#' @rdname robyn_save +#' @aliases robyn_save +#' @return (Invisible) list with imported results +#' @export +robyn_load <- function(robyn_object, select_build = NULL, quiet = FALSE) { + if ("robyn_exported" %in% class(robyn_object) || is.list(robyn_object)) { + Robyn <- robyn_object + objectPath <- Robyn$listInit$OutputCollect$plot_folder + robyn_object <- paste0(objectPath, "/Robyn_", Robyn$listInit$OutputCollect$selectID, ".RDS") + if (!dir.exists(objectPath)) { + stop("Directory does not exist or is somewhere else. 
Check: ", objectPath) + } + } else { + if (!"character" %in% class(robyn_object)) { + stop("Input 'robyn_object' must be a character input or 'robyn_exported' object") + } + check_robyn_name(robyn_object, quiet) + Robyn <- readRDS(robyn_object) + objectPath <- dirname(robyn_object) + } + select_build_all <- 0:(length(Robyn) - 1) + if (is.null(select_build)) { + select_build <- max(select_build_all) + if (!quiet) { + message( + ">>> Loaded Model: ", + ifelse(select_build == 0, "Initial model", paste0("Refresh model #", select_build)) + ) + } + } + if (!(select_build %in% select_build_all) || length(select_build) != 1) { + stop("Input 'select_build' must be one value of ", paste(select_build_all, collapse = ", ")) + } + listName <- ifelse(select_build == 0, "listInit", paste0("listRefresh", select_build)) + InputCollect <- Robyn[[listName]][["InputCollect"]] + OutputCollect <- Robyn[[listName]][["OutputCollect"]] + select_model <- OutputCollect$selectID + output <- list( + Robyn = Robyn, + InputCollect = InputCollect, + OutputCollect = OutputCollect, + select_model = select_model, + objectPath = objectPath, + robyn_object = robyn_object + ) + return(invisible(output)) +} diff --git a/R/imports.R b/R/imports.R index 88bb36b..4f51bdb 100644 --- a/R/imports.R +++ b/R/imports.R @@ -1,51 +1,51 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Robyn MMM Project from Meta Marketing Science -#' -#' Robyn is an automated Marketing Mix Modeling (MMM) code. It aims to reduce human -#' bias by means of ridge regression and evolutionary algorithms, enables actionable -#' decision making providing a budget allocator and diminishing returns curves and -#' allows ground-truth calibration to account for causation. 
-#'
-#' @md
-#' @name Robyn
-#' @docType package
-#' @author Gufeng Zhou (gufeng@@meta.com)
-#' @author Leonel Sentana (leonelsentana@@meta.com)
-#' @author Igor Skokan (igorskokan@@meta.com)
-#' @author Bernardo Lares (bernardolares@@meta.com)
-#' @importFrom doRNG %dorng%
-#' @importFrom doParallel registerDoParallel stopImplicitCluster
-#' @importFrom dplyr across any_of arrange as_tibble bind_rows case_when contains desc distinct
-#' everything filter group_by lag left_join mutate n pull rename row_number select slice
-#' summarise summarise_all ungroup all_of bind_cols mutate_at starts_with ends_with tally n_distinct
-#' @importFrom foreach foreach %dopar% getDoParWorkers registerDoSEQ
-#' @import ggplot2
-#' @importFrom ggridges geom_density_ridges geom_density_ridges_gradient
-#' @importFrom glmnet glmnet
-#' @importFrom jsonlite fromJSON toJSON write_json read_json
-#' @importFrom lares check_opts clusterKmeans formatNum freqs glued num_abbr ohse removenacols
-#' theme_lares `%>%` scale_x_abbr scale_x_percent scale_y_percent scale_y_abbr try_require v2t
-#' @importFrom lubridate is.Date day floor_date
-#' @importFrom minpack.lm nlsLM
-#' @importFrom nloptr nloptr
-#' @importFrom parallel detectCores
-#' @importFrom patchwork guide_area plot_layout plot_annotation wrap_plots
-#' @importFrom prophet add_regressor add_seasonality fit.prophet prophet
-#' @importFrom reticulate tuple use_condaenv import conda_create conda_install py_module_available
-#' virtualenv_create py_install use_virtualenv
-#' @importFrom stats AIC BIC coef complete.cases dgamma dnorm end lm model.matrix na.omit
-#' nls.control median qt sd predict pweibull dweibull quantile qunif reorder rnorm start setNames
-#' @importFrom stringr str_count str_detect str_remove str_split str_which str_extract str_replace
-#' str_to_title
-#' @importFrom tidyr pivot_longer pivot_wider
-#' @importFrom utils askYesNo flush.console head setTxtProgressBar tail txtProgressBar write.csv
-"_PACKAGE"
-
-if (getRversion() >= "2.15.1") {
-  globalVariables(c(".", "install_github"))
-}
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+####################################################################
+#' Robyn MMM Project from Meta Marketing Science
+#'
+#' Robyn is an automated Marketing Mix Modeling (MMM) framework. It aims to
+#' reduce human bias by means of ridge regression and evolutionary algorithms,
+#' enables actionable decision making by providing a budget allocator and
+#' diminishing-returns curves, and allows ground-truth calibration to account
+#' for causation.
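+#'
+#' A minimal sketch of the intended workflow (arguments shown are
+#' illustrative; see each function's documentation for the full interface):
+#' \preformatted{
+#' InputCollect <- robyn_inputs(dt_input = dt_simulated_weekly, ...)
+#' OutputModels <- robyn_run(InputCollect = InputCollect, ...)
+#' OutputCollect <- robyn_outputs(InputCollect, OutputModels, ...)
+#' }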
+#' +#' @md +#' @name Robyn +#' @docType package +#' @author Gufeng Zhou (gufeng@@meta.com) +#' @author Leonel Sentana (leonelsentana@@meta.com) +#' @author Igor Skokan (igorskokan@@meta.com) +#' @author Bernardo Lares (bernardolares@@meta.com) +#' @importFrom doRNG %dorng% +#' @importFrom doParallel registerDoParallel stopImplicitCluster +#' @importFrom dplyr across any_of arrange as_tibble bind_rows case_when contains desc distinct +#' everything filter group_by lag left_join mutate n pull rename row_number select slice +#' summarise summarise_all ungroup all_of bind_cols mutate_at starts_with ends_with tally n_distinct +#' @importFrom foreach foreach %dopar% getDoParWorkers registerDoSEQ +#' @import ggplot2 +#' @importFrom ggridges geom_density_ridges geom_density_ridges_gradient +#' @importFrom glmnet glmnet +#' @importFrom jsonlite fromJSON toJSON write_json read_json +#' @importFrom lares check_opts clusterKmeans formatNum freqs glued num_abbr ohse removenacols +#' theme_lares `%>%` scale_x_abbr scale_x_percent scale_y_percent scale_y_abbr try_require v2t +#' @importFrom lubridate is.Date day floor_date +#' @importFrom minpack.lm nlsLM +#' @importFrom nloptr nloptr +#' @importFrom parallel detectCores +#' @importFrom patchwork guide_area plot_layout plot_annotation wrap_plots +#' @importFrom prophet add_regressor add_seasonality fit.prophet prophet +#' @importFrom reticulate tuple use_condaenv import conda_create conda_install py_module_available +#' virtualenv_create py_install use_virtualenv +#' @importFrom stats AIC BIC coef complete.cases dgamma dnorm end lm model.matrix na.omit +#' nls.control median qt sd predict pweibull dweibull quantile qunif reorder rnorm start setNames +#' @importFrom stringr str_count str_detect str_remove str_split str_which str_extract str_replace +#' str_to_title +#' @importFrom tidyr pivot_longer pivot_wider +#' @importFrom utils askYesNo flush.console head setTxtProgressBar tail txtProgressBar write.csv +"_PACKAGE" + +if (getRversion() >= "2.15.1") { + globalVariables(c(".", "install_github")) +} diff --git a/R/json.R b/R/json.R index d437014..a828066 100644 --- a/R/json.R +++ b/R/json.R @@ -1,430 +1,430 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Import and Export Robyn JSON files -#' -#' \code{robyn_write()} generates light JSON files with all the information -#' required to replicate Robyn models. Depending on user inputs, there are -#' 3 use cases: only the inputs data, input data + modeling results data, -#' and input data, modeling results + specifics of a single selected model. -#' To replicate a model, you must provide InputCollect, OutputCollect, and, -#' if OutputCollect contains more than one model, the select_model. -#' -#' @inheritParams robyn_outputs -#' @param InputCollect \code{robyn_inputs()} output. -#' @param select_model Character. Which model ID do you want to export -#' into the JSON file? -#' @param add_data Boolean. Include raw dataset. Useful to recreate models -#' with a single file containing all the required information (no need of CSV). -#' @param dir Character. Existing directory to export JSON file to. -#' @param pareto_df Dataframe. Save all pareto solutions to json file. -#' @param ... Additional parameters to export into a custom Extras element. 
-#' @examples -#' \dontrun{ -#' InputCollectJSON <- robyn_inputs( -#' dt_input = Robyn::dt_simulated_weekly, -#' json_file = "~/Desktop/RobynModel-1_29_12.json" -#' ) -#' print(InputCollectJSON) -#' } -#' @return (invisible) List. Contains all inputs and outputs of exported model. -#' Class: \code{robyn_write}. -#' @export -robyn_write <- function(InputCollect, - OutputCollect = NULL, - select_model = NULL, - dir = OutputCollect$plot_folder, - add_data = TRUE, - export = TRUE, - quiet = FALSE, - pareto_df = NULL, - ...) { - # Checks - stopifnot(inherits(InputCollect, "robyn_inputs")) - if (!is.null(OutputCollect)) { - stopifnot(inherits(OutputCollect, "robyn_outputs")) - if (is.null(select_model) && length(OutputCollect$allSolutions == 1)) { - select_model <- OutputCollect$allSolutions - } - } - if (is.null(dir)) dir <- getwd() - - # InputCollect JSON - ret <- list() - skip <- which(unlist(lapply(InputCollect, function(x) is.list(x) | is.null(x)))) - skip <- skip[!names(skip) %in% c("calibration_input", "hyperparameters", "custom_params")] - ret[["InputCollect"]] <- InputCollect[-skip] - # toJSON(ret$InputCollect, pretty = TRUE) - - # ExportedModel JSON - if (!is.null(OutputCollect)) { - # Modeling associated data - collect <- list() - collect$ts_validation <- OutputCollect$OutputModels$ts_validation - collect$train_timestamp <- OutputCollect$OutputModels$train_timestamp - collect$export_timestamp <- Sys.time() - collect$run_time <- sprintf("%s min", attr(OutputCollect$OutputModels, "runTime")) - collect$outputs_time <- sprintf("%s min", attr(OutputCollect, "runTime")) - collect$total_time <- sprintf( - "%s min", attr(OutputCollect, "runTime") + - attr(OutputCollect$OutputModels, "runTime") - ) - collect$total_iters <- OutputCollect$OutputModels$iterations * - OutputCollect$OutputModels$trials - collect$conv_msg <- gsub("\\:.*", "", OutputCollect$OutputModels$convergence$conv_msg) - if ("clusters" %in% names(OutputCollect)) { - collect$n_clusters <- OutputCollect$clusters$n_clusters - } - - skip <- which(unlist(lapply(OutputCollect, function(x) is.list(x) | is.null(x)))) - skip <- c(skip, which(names(OutputCollect) %in% "allSolutions")) - collect <- append(collect, OutputCollect[-skip]) - ret[["ModelsCollect"]] <- collect - - # Model associated data - if (length(select_model) == 1) { - stopifnot(select_model %in% OutputCollect$allSolutions) - outputs <- list() - outputs$select_model <- select_model - df <- filter(OutputCollect$xDecompAgg, .data$solID == select_model) - perf_metric <- ifelse(InputCollect$dep_var_type == "revenue", "ROAS", "CPA") - outputs$performance <- df %>% - filter(.data$rn %in% InputCollect$paid_media_spends) %>% - group_by(.data$solID) %>% - summarise( - metric = perf_metric, - performance = ifelse( - perf_metric == "ROAS", - sum(.data$xDecompAgg) / sum(.data$total_spend), - sum(.data$total_spend) / sum(.data$xDecompAgg) - ), .groups = "drop" - ) - outputs$summary <- df %>% - mutate( - metric = perf_metric, - performance = ifelse(.data$metric == "ROAS", .data$roi_total, .data$cpa_total) - ) %>% - select( - variable = .data$rn, coef = .data$coef, - decompPer = .data$xDecompPerc, decompAgg = .data$xDecompAggRF, - .data$performance, .data$mean_response, .data$mean_spend, - contains("boot_mean"), contains("ci_") - ) - outputs$errors <- filter(OutputCollect$resultHypParam, .data$solID == select_model) %>% - select(starts_with("rsq_"), starts_with("nrmse"), .data$decomp.rssd, .data$mape) - outputs$hyper_values <- OutputCollect$resultHypParam %>% - filter(.data$solID == 
select_model) %>% - select(contains(HYPS_NAMES), dplyr::ends_with("_penalty"), any_of(HYPS_OTHERS)) %>% - select(order(colnames(.))) %>% - as.list() - outputs$hyper_updated <- OutputCollect$hyper_updated - if ("clusters" %in% names(OutputCollect)) { - outputs$clusters <- list( - data = OutputCollect$clusters$data %>% - group_by(.data$cluster) %>% mutate(n = n()) %>% - filter(.data$solID == select_model) %>% - select(any_of(c("solID", "cluster", "n"))) - ) - } - ret[["ExportedModel"]] <- outputs - } else { - select_model <- "models" - } - } else { - select_model <- "inputs" - } - - extras <- list(...) - if (isTRUE(add_data) & !"raw_data" %in% names(extras)) { - extras[["raw_data"]] <- as_tibble(InputCollect$dt_input) - } - if (length(extras) > 0) { - ret[["Extras"]] <- extras - } - - if (!dir.exists(dir) & export) dir.create(dir, recursive = TRUE) - filename <- sprintf("%s/RobynModel-%s.json", dir, select_model) - filename <- gsub("//", "/", filename) - class(ret) <- c("robyn_write", class(ret)) - attr(ret, "json_file") <- filename - if (export) { - if (!quiet) message(sprintf(">> Exported %s as %s", select_model, filename)) - if (!is.null(pareto_df)) { - if (!all(c("solID", "cluster") %in% names(pareto_df))) { - warning(paste( - "Input 'pareto_df' is not a valid data.frame;", - "must contain 'solID' and 'cluster' columns." - )) - } else { - all_c <- unique(pareto_df$cluster) - pareto_df <- lapply(all_c, function(x) { - (pareto_df %>% filter(.data$cluster == x))$solID - }) - names(pareto_df) <- paste0("cluster", all_c) - ret[["OutputCollect"]][["all_sols"]] <- pareto_df - } - } - write_json(ret, filename, pretty = TRUE, digits = 10) - } - return(invisible(ret)) -} - - -#' @rdname robyn_write -#' @aliases robyn_write -#' @param x \code{robyn_read()} or \code{robyn_write()} output. -#' @export -print.robyn_write <- function(x, ...) { - val <- any(c(x$ExportedModel$ts_validation, x$ModelsCollect$ts_validation)) - print(glued( - " - Exported directory: {x$ExportedModel$plot_folder} - Exported model: {x$ExportedModel$select_model} - Window: {start} to {end} ({periods} {type}s) - Time Series Validation: {val} (train size = {val_detail})", - start = x$InputCollect$window_start, - end = x$InputCollect$window_end, - periods = x$InputCollect$rollingWindowLength, - type = x$InputCollect$intervalType, - val_detail = formatNum(100 * x$ExportedModel$hyper_values$train_size, 2, pos = "%") - )) - errors <- x$ExportedModel$errors - print(glued( - "\n\nModel's Performance and Errors:\n {performance}{errors}", - performance = ifelse("performance" %in% names(x$ExportedModel), sprintf( - "Total Model %s = %s\n ", - x$ExportedModel$performance$metric, signif(x$ExportedModel$performance$performance, 4) - ), ""), - errors = paste( - sprintf( - "Adj.R2 (train): %s", - signif(errors$rsq_train, 4) - ), - "| NRMSE =", signif(errors$nrmse, 4), - "| DECOMP.RSSD =", signif(errors$decomp.rssd, 4), - "| MAPE =", signif(errors$mape, 4) - ) - )) - - if ("ExportedModel" %in% names(x)) { - print(glued("\n\nSummary Values on Selected Model:")) - - print(x$ExportedModel$summary %>% - select(-contains("boot"), -contains("ci_")) %>% - dplyr::rename_at("performance", list(~ ifelse(x$InputCollect$dep_var_type == "revenue", "ROAS", "CPA"))) %>% - mutate(decompPer = formatNum(100 * .data$decompPer, pos = "%")) %>% - dplyr::mutate_if(is.numeric, function(x) ifelse(!is.infinite(x), x, 0)) %>% - dplyr::mutate_if(is.numeric, function(x) formatNum(x, 4, abbr = TRUE)) %>% - replace(., . 
== "NA", "-") %>% as.data.frame()) - - print(glued( - "\n\nHyper-parameters:\n Adstock: {x$InputCollect$adstock}" - )) - - # Nice and tidy table format for hyper-parameters - HYPS_NAMES <- c(HYPS_NAMES, "penalty") - regex <- paste(paste0("_", HYPS_NAMES), collapse = "|") - hyper_df <- as.data.frame(x$ExportedModel$hyper_values) %>% - select(-contains("lambda"), -any_of(HYPS_OTHERS)) %>% - tidyr::gather() %>% - tidyr::separate(.data$key, - into = c("channel", "none"), - sep = regex, remove = FALSE - ) %>% - mutate(hyperparameter = gsub("^.*_", "", .data$key)) %>% - select(.data$channel, .data$hyperparameter, .data$value) %>% - tidyr::spread(key = "hyperparameter", value = "value") - print(hyper_df) - } -} - - -#' @rdname robyn_write -#' @aliases robyn_write -#' @param json_file Character. JSON file name to read and import. -#' @param step Integer. 1 for import only and 2 for import and output. -#' @export -robyn_read <- function(json_file = NULL, step = 1, quiet = FALSE, ...) { - if (!is.null(json_file)) { - if (inherits(json_file, "character")) { - if (lares::right(tolower(json_file), 4) != "json") { - stop("JSON file must be a valid .json file") - } - if (!file.exists(json_file)) { - stop("JSON file can't be imported: ", json_file) - } - json <- read_json(json_file, simplifyVector = TRUE) - json$InputCollect <- json$InputCollect[lapply(json$InputCollect, length) > 0] - json$ExportedModel <- append(json$ModelsCollect, json$ExportedModel) - # Add train_size if not available (<3.9.0) - if (!"train_size" %in% names(json$ExportedModel$hyper_values)) { - json$ExportedModel$hyper_values$train_size <- 1 - } - if (!"InputCollect" %in% names(json) && step == 1) { - stop("JSON file must contain InputCollect element") - } - if (!"ExportedModel" %in% names(json) && step == 2) { - stop("JSON file must contain ExportedModel element") - } - json$ModelsCollect <- NULL - if (!quiet) message("Imported JSON file successfully: ", json_file) - class(json) <- c("robyn_read", class(json)) - return(json) - } - } - return(json_file) -} - -#' @rdname robyn_write -#' @aliases robyn_write -#' @export -print.robyn_read <- function(x, ...) 
{ - a <- x$InputCollect - print(glued( - " -############ InputCollect ############ - -Date: {a$date_var} -Dependent: {a$dep_var} [{a$dep_var_type}] -Paid Media: {paste(a$paid_media_vars, collapse = ', ')} -Paid Media Spend: {paste(a$paid_media_spends, collapse = ', ')} -Context: {paste(a$context_vars, collapse = ', ')} -Organic: {paste(a$organic_vars, collapse = ', ')} -Prophet (Auto-generated): {prophet} -Unused variables: {unused} -Model Window: {windows} ({a$rollingWindowEndWhich - a$rollingWindowStartWhich + 1} {a$intervalType}s) -With Calibration: {!is.null(a$calibration_input)} -Custom parameters: {custom_params} - -Adstock: {a$adstock} -{hyps} -", - windows = paste(a$window_start, a$window_end, sep = ":"), - custom_params = if (length(a$custom_params) > 0) paste("\n", flatten_hyps(a$custom_params)) else "None", - prophet = if (!is.null(a$prophet_vars)) { - sprintf("%s on %s", paste(a$prophet_vars, collapse = ", "), a$prophet_country) - } else { - "\033[0;31mDeactivated\033[0m" - }, - unused = if (length(a$unused_vars) > 0) { - paste(a$unused_vars, collapse = ", ") - } else { - "None" - }, - hyps = glued( - "Hyper-parameters ranges:\n{flatten_hyps(a$hyperparameters)}" - ) - )) - - if (!is.null(x$ExportedModel)) { - temp <- x - class(temp) <- "robyn_write" - print(glued("\n\n############ Exported Model ############\n")) - print(temp) - } - return(invisible(x)) -} - -#' @rdname robyn_write -#' @aliases robyn_write -#' @export -robyn_recreate <- function(json_file, quiet = FALSE, ...) { - json <- robyn_read(json_file, quiet = TRUE) - message(">>> Recreating ", json$ExportedModel$select_model) - args <- list(...) - if (!"InputCollect" %in% names(args)) { - InputCollect <- robyn_inputs( - json_file = json_file, - quiet = quiet, - ... - ) - if (!is.null(json$ExportedModel$select_model)) { - OutputCollect <- robyn_run( - InputCollect = InputCollect, - json_file = json_file, - export = FALSE, - quiet = quiet, - ... - ) - } else { - OutputCollect <- NULL - } - } else { - # Use case: skip feature engineering when InputCollect is provided - InputCollect <- args[["InputCollect"]] - OutputCollect <- robyn_run( - json_file = json_file, - export = FALSE, - quiet = quiet, - ... 
- ) - } - return(invisible(list( - InputCollect = InputCollect, - OutputCollect = OutputCollect, - Extras = json[["Extras"]] - ))) -} - -# Import the whole chain any refresh model to init -robyn_chain <- function(json_file) { - json_data <- robyn_read(json_file, quiet = TRUE) - ids <- c(json_data$InputCollect$refreshChain, json_data$ExportedModel$select_model) - plot_folder <- json_data$ExportedModel$plot_folder - temp <- str_split(plot_folder, "/")[[1]] - chain <- temp[startsWith(temp, "Robyn_") & grepl("_init+$|_rf[0-9]+$", temp)] - if (length(chain) == 0) chain <- tail(temp[temp != ""], 1) - avlb <- NULL - if (length(ids) != length(chain)) { - temp <- list.files(plot_folder) - mods <- unique(temp[ - (startsWith(temp, "RobynModel") | grepl("\\.json+$", temp)) & - grepl("^[^_]*_[^_]*_[^_]*$", temp) - ]) - avlb <- gsub("RobynModel-|\\.json", "", mods) - if (length(ids) == length(mods)) { - chain <- rep_len(chain, length(mods)) - } - } - base_dir <- gsub(sprintf("\\/%s.*", chain[1]), "", plot_folder) - chainData <- list() - for (i in rev(seq_along(ids))) { - if (i == length(ids)) { - json_new <- json_data - } else { - file <- paste0("RobynModel-", json_new$InputCollect$refreshSourceID, ".json") - filename <- paste(c(base_dir, chain[1:i], file), collapse = "/") - if (file.exists(filename)) { - json_new <- robyn_read(filename, quiet = TRUE) - } else { - if (ids[i] %in% avlb) { - filename <- mods[avlb == ids[i]] - json_new <- robyn_read(filename, quiet = TRUE) - } else { - last_try <- gsub(chain[1], "", filename) - if (file.exists(last_try)) { - json_new <- robyn_read(last_try, quiet = TRUE) - message("Stored original model in new file: ", filename) - jsonlite::write_json(json_new, filename, pretty = TRUE) - } else { - message("Skipping chain. File can't be found: ", filename) - } - } - } - } - chainData[[json_new$ExportedModel$select_model]] <- json_new - } - chainData <- chainData[rev(seq_along(chain))] - dirs <- unlist(lapply(chainData, function(x) x$ExportedModel$plot_folder)) - dirs[!dir.exists(dirs)] <- plot_folder - json_files <- paste0(dirs, "RobynModel-", names(dirs), ".json") - attr(chainData, "json_files") <- json_files - attr(chainData, "chain") <- ids # names(chainData) - if (length(ids) != length(names(chainData))) { - warning("Can't replicate chain-like results if you don't follow Robyn's chain structure") - } - return(invisible(chainData)) -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#################################################################### +#' Import and Export Robyn JSON files +#' +#' \code{robyn_write()} generates light JSON files with all the information +#' required to replicate Robyn models. Depending on user inputs, there are +#' 3 use cases: only the inputs data, input data + modeling results data, +#' and input data, modeling results + specifics of a single selected model. +#' To replicate a model, you must provide InputCollect, OutputCollect, and, +#' if OutputCollect contains more than one model, the select_model. +#' +#' @inheritParams robyn_outputs +#' @param InputCollect \code{robyn_inputs()} output. +#' @param select_model Character. Which model ID do you want to export +#' into the JSON file? +#' @param add_data Boolean. Include raw dataset. Useful to recreate models +#' with a single file containing all the required information (no need of CSV). +#' @param dir Character. 
Existing directory to export JSON file to.
+#' @param pareto_df Data.frame. Save all Pareto solutions to the JSON file.
+#' @param ... Additional parameters to export into a custom Extras element.
+#' @examples
+#' \dontrun{
+#' InputCollectJSON <- robyn_inputs(
+#'   dt_input = Robyn::dt_simulated_weekly,
+#'   json_file = "~/Desktop/RobynModel-1_29_12.json"
+#' )
+#' print(InputCollectJSON)
+#' }
+#' @return (invisible) List. Contains all inputs and outputs of the exported model.
+#' Class: \code{robyn_write}.
+#' @export
+robyn_write <- function(InputCollect,
+                        OutputCollect = NULL,
+                        select_model = NULL,
+                        dir = OutputCollect$plot_folder,
+                        add_data = TRUE,
+                        export = TRUE,
+                        quiet = FALSE,
+                        pareto_df = NULL,
+                        ...) {
+  # Checks
+  stopifnot(inherits(InputCollect, "robyn_inputs"))
+  if (!is.null(OutputCollect)) {
+    stopifnot(inherits(OutputCollect, "robyn_outputs"))
+    if (is.null(select_model) && length(OutputCollect$allSolutions) == 1) {
+      select_model <- OutputCollect$allSolutions
+    }
+  }
+  if (is.null(dir)) dir <- getwd()
+
+  # InputCollect JSON
+  ret <- list()
+  skip <- which(unlist(lapply(InputCollect, function(x) is.list(x) | is.null(x))))
+  skip <- skip[!names(skip) %in% c("calibration_input", "hyperparameters", "custom_params")]
+  ret[["InputCollect"]] <- InputCollect[-skip]
+  # toJSON(ret$InputCollect, pretty = TRUE)
+
+  # ExportedModel JSON
+  if (!is.null(OutputCollect)) {
+    # Modeling associated data
+    collect <- list()
+    collect$ts_validation <- OutputCollect$OutputModels$ts_validation
+    collect$train_timestamp <- OutputCollect$OutputModels$train_timestamp
+    collect$export_timestamp <- Sys.time()
+    collect$run_time <- sprintf("%s min", attr(OutputCollect$OutputModels, "runTime"))
+    collect$outputs_time <- sprintf("%s min", attr(OutputCollect, "runTime"))
+    collect$total_time <- sprintf(
+      "%s min", attr(OutputCollect, "runTime") +
+        attr(OutputCollect$OutputModels, "runTime")
+    )
+    collect$total_iters <- OutputCollect$OutputModels$iterations *
+      OutputCollect$OutputModels$trials
+    collect$conv_msg <- gsub("\\:.*", "", OutputCollect$OutputModels$convergence$conv_msg)
+    if ("clusters" %in% names(OutputCollect)) {
+      collect$n_clusters <- OutputCollect$clusters$n_clusters
+    }
+
+    skip <- which(unlist(lapply(OutputCollect, function(x) is.list(x) | is.null(x))))
+    skip <- c(skip, which(names(OutputCollect) %in% "allSolutions"))
+    collect <- append(collect, OutputCollect[-skip])
+    ret[["ModelsCollect"]] <- collect
+
+    # Model associated data
+    if (length(select_model) == 1) {
+      stopifnot(select_model %in% OutputCollect$allSolutions)
+      outputs <- list()
+      outputs$select_model <- select_model
+      df <- filter(OutputCollect$xDecompAgg, .data$solID == select_model)
+      perf_metric <- ifelse(InputCollect$dep_var_type == "revenue", "ROAS", "CPA")
+      outputs$performance <- df %>%
+        filter(.data$rn %in% InputCollect$paid_media_spends) %>%
+        group_by(.data$solID) %>%
+        summarise(
+          metric = perf_metric,
+          performance = ifelse(
+            perf_metric == "ROAS",
+            sum(.data$xDecompAgg) / sum(.data$total_spend),
+            sum(.data$total_spend) / sum(.data$xDecompAgg)
+          ), .groups = "drop"
+        )
+      outputs$summary <- df %>%
+        mutate(
+          metric = perf_metric,
+          performance = ifelse(.data$metric == "ROAS", .data$roi_total, .data$cpa_total)
+        ) %>%
+        select(
+          variable = .data$rn, coef = .data$coef,
+          decompPer = .data$xDecompPerc, decompAgg = .data$xDecompAggRF,
+          .data$performance, .data$mean_response, .data$mean_spend,
+          contains("boot_mean"), contains("ci_")
+        )
+      outputs$errors <- filter(OutputCollect$resultHypParam, .data$solID
== select_model) %>% + select(starts_with("rsq_"), starts_with("nrmse"), .data$decomp.rssd, .data$mape) + outputs$hyper_values <- OutputCollect$resultHypParam %>% + filter(.data$solID == select_model) %>% + select(contains(HYPS_NAMES), dplyr::ends_with("_penalty"), any_of(HYPS_OTHERS)) %>% + select(order(colnames(.))) %>% + as.list() + outputs$hyper_updated <- OutputCollect$hyper_updated + if ("clusters" %in% names(OutputCollect)) { + outputs$clusters <- list( + data = OutputCollect$clusters$data %>% + group_by(.data$cluster) %>% mutate(n = n()) %>% + filter(.data$solID == select_model) %>% + select(any_of(c("solID", "cluster", "n"))) + ) + } + ret[["ExportedModel"]] <- outputs + } else { + select_model <- "models" + } + } else { + select_model <- "inputs" + } + + extras <- list(...) + if (isTRUE(add_data) & !"raw_data" %in% names(extras)) { + extras[["raw_data"]] <- as_tibble(InputCollect$dt_input) + } + if (length(extras) > 0) { + ret[["Extras"]] <- extras + } + + if (!dir.exists(dir) & export) dir.create(dir, recursive = TRUE) + filename <- sprintf("%s/RobynModel-%s.json", dir, select_model) + filename <- gsub("//", "/", filename) + class(ret) <- c("robyn_write", class(ret)) + attr(ret, "json_file") <- filename + if (export) { + if (!quiet) message(sprintf(">> Exported %s as %s", select_model, filename)) + if (!is.null(pareto_df)) { + if (!all(c("solID", "cluster") %in% names(pareto_df))) { + warning(paste( + "Input 'pareto_df' is not a valid data.frame;", + "must contain 'solID' and 'cluster' columns." + )) + } else { + all_c <- unique(pareto_df$cluster) + pareto_df <- lapply(all_c, function(x) { + (pareto_df %>% filter(.data$cluster == x))$solID + }) + names(pareto_df) <- paste0("cluster", all_c) + ret[["OutputCollect"]][["all_sols"]] <- pareto_df + } + } + write_json(ret, filename, pretty = TRUE, digits = 10) + } + return(invisible(ret)) +} + + +#' @rdname robyn_write +#' @aliases robyn_write +#' @param x \code{robyn_read()} or \code{robyn_write()} output. +#' @export +print.robyn_write <- function(x, ...) 
{ + val <- any(c(x$ExportedModel$ts_validation, x$ModelsCollect$ts_validation)) + print(glued( + " + Exported directory: {x$ExportedModel$plot_folder} + Exported model: {x$ExportedModel$select_model} + Window: {start} to {end} ({periods} {type}s) + Time Series Validation: {val} (train size = {val_detail})", + start = x$InputCollect$window_start, + end = x$InputCollect$window_end, + periods = x$InputCollect$rollingWindowLength, + type = x$InputCollect$intervalType, + val_detail = formatNum(100 * x$ExportedModel$hyper_values$train_size, 2, pos = "%") + )) + errors <- x$ExportedModel$errors + print(glued( + "\n\nModel's Performance and Errors:\n {performance}{errors}", + performance = ifelse("performance" %in% names(x$ExportedModel), sprintf( + "Total Model %s = %s\n ", + x$ExportedModel$performance$metric, signif(x$ExportedModel$performance$performance, 4) + ), ""), + errors = paste( + sprintf( + "Adj.R2 (train): %s", + signif(errors$rsq_train, 4) + ), + "| NRMSE =", signif(errors$nrmse, 4), + "| DECOMP.RSSD =", signif(errors$decomp.rssd, 4), + "| MAPE =", signif(errors$mape, 4) + ) + )) + + if ("ExportedModel" %in% names(x)) { + print(glued("\n\nSummary Values on Selected Model:")) + + print(x$ExportedModel$summary %>% + select(-contains("boot"), -contains("ci_")) %>% + dplyr::rename_at("performance", list(~ ifelse(x$InputCollect$dep_var_type == "revenue", "ROAS", "CPA"))) %>% + mutate(decompPer = formatNum(100 * .data$decompPer, pos = "%")) %>% + dplyr::mutate_if(is.numeric, function(x) ifelse(!is.infinite(x), x, 0)) %>% + dplyr::mutate_if(is.numeric, function(x) formatNum(x, 4, abbr = TRUE)) %>% + replace(., . == "NA", "-") %>% as.data.frame()) + + print(glued( + "\n\nHyper-parameters:\n Adstock: {x$InputCollect$adstock}" + )) + + # Nice and tidy table format for hyper-parameters + HYPS_NAMES <- c(HYPS_NAMES, "penalty") + regex <- paste(paste0("_", HYPS_NAMES), collapse = "|") + hyper_df <- as.data.frame(x$ExportedModel$hyper_values) %>% + select(-contains("lambda"), -any_of(HYPS_OTHERS)) %>% + tidyr::gather() %>% + tidyr::separate(.data$key, + into = c("channel", "none"), + sep = regex, remove = FALSE + ) %>% + mutate(hyperparameter = gsub("^.*_", "", .data$key)) %>% + select(.data$channel, .data$hyperparameter, .data$value) %>% + tidyr::spread(key = "hyperparameter", value = "value") + print(hyper_df) + } +} + + +#' @rdname robyn_write +#' @aliases robyn_write +#' @param json_file Character. JSON file name to read and import. +#' @param step Integer. 1 for import only and 2 for import and output. +#' @export +robyn_read <- function(json_file = NULL, step = 1, quiet = FALSE, ...) 
{ + if (!is.null(json_file)) { + if (inherits(json_file, "character")) { + if (lares::right(tolower(json_file), 4) != "json") { + stop("JSON file must be a valid .json file") + } + if (!file.exists(json_file)) { + stop("JSON file can't be imported: ", json_file) + } + json <- read_json(json_file, simplifyVector = TRUE) + json$InputCollect <- json$InputCollect[lapply(json$InputCollect, length) > 0] + json$ExportedModel <- append(json$ModelsCollect, json$ExportedModel) + # Add train_size if not available (<3.9.0) + if (!"train_size" %in% names(json$ExportedModel$hyper_values)) { + json$ExportedModel$hyper_values$train_size <- 1 + } + if (!"InputCollect" %in% names(json) && step == 1) { + stop("JSON file must contain InputCollect element") + } + if (!"ExportedModel" %in% names(json) && step == 2) { + stop("JSON file must contain ExportedModel element") + } + json$ModelsCollect <- NULL + if (!quiet) message("Imported JSON file successfully: ", json_file) + class(json) <- c("robyn_read", class(json)) + return(json) + } + } + return(json_file) +} + +#' @rdname robyn_write +#' @aliases robyn_write +#' @export +print.robyn_read <- function(x, ...) { + a <- x$InputCollect + print(glued( + " +############ InputCollect ############ + +Date: {a$date_var} +Dependent: {a$dep_var} [{a$dep_var_type}] +Paid Media: {paste(a$paid_media_vars, collapse = ', ')} +Paid Media Spend: {paste(a$paid_media_spends, collapse = ', ')} +Context: {paste(a$context_vars, collapse = ', ')} +Organic: {paste(a$organic_vars, collapse = ', ')} +Prophet (Auto-generated): {prophet} +Unused variables: {unused} +Model Window: {windows} ({a$rollingWindowEndWhich - a$rollingWindowStartWhich + 1} {a$intervalType}s) +With Calibration: {!is.null(a$calibration_input)} +Custom parameters: {custom_params} + +Adstock: {a$adstock} +{hyps} +", + windows = paste(a$window_start, a$window_end, sep = ":"), + custom_params = if (length(a$custom_params) > 0) paste("\n", flatten_hyps(a$custom_params)) else "None", + prophet = if (!is.null(a$prophet_vars)) { + sprintf("%s on %s", paste(a$prophet_vars, collapse = ", "), a$prophet_country) + } else { + "\033[0;31mDeactivated\033[0m" + }, + unused = if (length(a$unused_vars) > 0) { + paste(a$unused_vars, collapse = ", ") + } else { + "None" + }, + hyps = glued( + "Hyper-parameters ranges:\n{flatten_hyps(a$hyperparameters)}" + ) + )) + + if (!is.null(x$ExportedModel)) { + temp <- x + class(temp) <- "robyn_write" + print(glued("\n\n############ Exported Model ############\n")) + print(temp) + } + return(invisible(x)) +} + +#' @rdname robyn_write +#' @aliases robyn_write +#' @export +robyn_recreate <- function(json_file, quiet = FALSE, ...) { + json <- robyn_read(json_file, quiet = TRUE) + message(">>> Recreating ", json$ExportedModel$select_model) + args <- list(...) + if (!"InputCollect" %in% names(args)) { + InputCollect <- robyn_inputs( + json_file = json_file, + quiet = quiet, + ... + ) + if (!is.null(json$ExportedModel$select_model)) { + OutputCollect <- robyn_run( + InputCollect = InputCollect, + json_file = json_file, + export = FALSE, + quiet = quiet, + ... + ) + } else { + OutputCollect <- NULL + } + } else { + # Use case: skip feature engineering when InputCollect is provided + InputCollect <- args[["InputCollect"]] + OutputCollect <- robyn_run( + json_file = json_file, + export = FALSE, + quiet = quiet, + ... 
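# Sketch of the branch above (assumed pre-built InputCollect; not package code):
# passing InputCollect via ... skips robyn_inputs() feature engineering and only
# re-fits the exported model.
# robyn_recreate(
#   json_file = "~/robyn_models/RobynModel-1_92_12.json",
#   InputCollect = InputCollect
# )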
+    )
+  }
+  return(invisible(list(
+    InputCollect = InputCollect,
+    OutputCollect = OutputCollect,
+    Extras = json[["Extras"]]
+  )))
+}
+
+# Import the whole chain of any refresh model back to its initial model
+robyn_chain <- function(json_file) {
+  json_data <- robyn_read(json_file, quiet = TRUE)
+  ids <- c(json_data$InputCollect$refreshChain, json_data$ExportedModel$select_model)
+  plot_folder <- json_data$ExportedModel$plot_folder
+  temp <- str_split(plot_folder, "/")[[1]]
+  chain <- temp[startsWith(temp, "Robyn_") & grepl("_init+$|_rf[0-9]+$", temp)]
+  if (length(chain) == 0) chain <- tail(temp[temp != ""], 1)
+  avlb <- NULL
+  if (length(ids) != length(chain)) {
+    temp <- list.files(plot_folder)
+    mods <- unique(temp[
+      (startsWith(temp, "RobynModel") | grepl("\\.json+$", temp)) &
+        grepl("^[^_]*_[^_]*_[^_]*$", temp)
+    ])
+    avlb <- gsub("RobynModel-|\\.json", "", mods)
+    if (length(ids) == length(mods)) {
+      chain <- rep_len(chain, length(mods))
+    }
+  }
+  base_dir <- gsub(sprintf("\\/%s.*", chain[1]), "", plot_folder)
+  chainData <- list()
+  for (i in rev(seq_along(ids))) {
+    if (i == length(ids)) {
+      json_new <- json_data
+    } else {
+      file <- paste0("RobynModel-", json_new$InputCollect$refreshSourceID, ".json")
+      filename <- paste(c(base_dir, chain[1:i], file), collapse = "/")
+      if (file.exists(filename)) {
+        json_new <- robyn_read(filename, quiet = TRUE)
+      } else {
+        if (ids[i] %in% avlb) {
+          filename <- mods[avlb == ids[i]]
+          json_new <- robyn_read(filename, quiet = TRUE)
+        } else {
+          last_try <- gsub(chain[1], "", filename)
+          if (file.exists(last_try)) {
+            json_new <- robyn_read(last_try, quiet = TRUE)
+            message("Stored original model in new file: ", filename)
+            jsonlite::write_json(json_new, filename, pretty = TRUE)
+          } else {
+            message("Skipping chain. File can't be found: ", filename)
+          }
+        }
+      }
+    }
+    chainData[[json_new$ExportedModel$select_model]] <- json_new
+  }
+  chainData <- chainData[rev(seq_along(chain))]
+  dirs <- unlist(lapply(chainData, function(x) x$ExportedModel$plot_folder))
+  dirs[!dir.exists(dirs)] <- plot_folder
+  json_files <- paste0(dirs, "RobynModel-", names(dirs), ".json")
+  attr(chainData, "json_files") <- json_files
+  attr(chainData, "chain") <- ids # names(chainData)
+  if (length(ids) != length(names(chainData))) {
+    warning("Can't replicate chain-like results if you don't follow Robyn's chain structure")
+  }
+  return(invisible(chainData))
+}
diff --git a/R/model.R b/R/model.R
index ded1d7e..64d10e6 100644
--- a/R/model.R
+++ b/R/model.R
@@ -1,1357 +1,1357 @@
-# Copyright (c) Meta Platforms, Inc. and its affiliates.
-
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-####################################################################
-#' Robyn Modelling Function
-#'
-#' \code{robyn_run()} consumes \code{robyn_input()} outputs,
-#' runs \code{robyn_mmm()}, and collects all modeling results.
-#'
-#' @inheritParams robyn_allocator
-#' @inheritParams robyn_outputs
-#' @inheritParams robyn_inputs
-#' @param dt_hyper_fixed data.frame or named list. Only provide when loading
-#' old model results. It consumes hyperparameters from saved csv
-#' \code{pareto_hyperparameters.csv} or JSON file to replicate a model.
-#' @param ts_validation Boolean. When set to \code{TRUE}, Robyn will split data
-#' by test, train, and validation partitions to validate the time series.
By -#' default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be -#' customized or set to a fixed value using the hyperparameters input. For example, -#' if \code{train_size = 0.7}, validation size and test size will both be 0.15 -#' and 0.15. When \code{ts_validation = FALSE}, nrmse_train is the -#' objective function; when \code{ts_validation = TRUE}, nrmse_val is the objective -#' function. -#' @param add_penalty_factor Boolean. Add penalty factor hyperparameters to -#' glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because -#' this feature might add too much hyperparameter space and probably requires -#' more iterations to converge. -#' @param refresh Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}. -#' @param cores Integer. Default to \code{parallel::detectCores() - 1} (all cores -#' except one). Set to 1 if you want to turn parallel computing off. -#' @param iterations Integer. Recommended 2000 for default when using -#' \code{nevergrad_algo = "TwoPointsDE"}. -#' @param trials Integer. Recommended 5 for default -#' \code{nevergrad_algo = "TwoPointsDE"}. -#' @param nevergrad_algo Character. Default to "TwoPointsDE". Options are -#' \code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne", -#' "DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA", -#' "cGA", "RandomSearch")}. -#' @param intercept Boolean. Should intercept(s) be fitted (default=TRUE) or -#' set to zero (FALSE). -#' @param intercept_sign Character. Choose one of "non_negative" (default) or -#' "unconstrained". By default, if intercept is negative, Robyn will drop intercept -#' and refit the model. Consider changing intercept_sign to "unconstrained" when -#' there are \code{context_vars} with large positive values. -#' @param rssd_zero_penalty Boolean. When TRUE, the objective function -#' DECOMP.RSSD will penalize models with more 0 media effects additionally. -#' In other words, given the same DECOMP.RSSD score, a model with 50\% 0-coef -#' variables will get penalized by DECOMP.RSSD * 1.5 (larger error), while -#' another model with no 0-coef variables gets un-penalized with DECOMP.RSSD * 1. -#' @param objective_weights Numeric vector. Default to NULL to give equal weights -#' to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration -#' data is provided). When you are not calibrating, only the first 2 values for -#' \code{objective_weights} must be defined, i.e. set c(2, 1) to give double weight -#' to the 1st (NRMSE). This is an experimental feature. There's no research on -#' optimal weight setting. Subjective weights might strongly bias modeling results. -#' @param seed Integer. For reproducible results when running nevergrad. -#' @param lambda_control Deprecated in v3.6.0. -#' @param outputs Boolean. If set to TRUE, will run \code{robyn_run()} and -#' \code{robyn_outputs()}, returning a list with OutputModels and -#' OutputCollect results. -#' @param ... Additional parameters passed to \code{robyn_outputs()}. -#' @return List. Class: \code{robyn_models}. Contains the results of all trials -#' and iterations modeled. -#' @examples -#' \dontrun{ -#' # Having InputCollect results -#' OutputModels <- robyn_run( -#' InputCollect = InputCollect, -#' cores = 2, -#' iterations = 200, -#' trials = 1 -#' ) -#' } -#' @return List. Contains all trained models. Class: \code{robyn_models}. 
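# Worked example of the ts_validation split documented above (a sketch; robyn_mmm()
# halves the remainder via val_size <- test_size <- (1 - train_size) / 2):
# train_size <- 0.7; n <- 200
# floor(quantile(seq(n), train_size)) # 140 observations to train
# floor((1 - train_size) / 2 * n)     # 30 each to validate and test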
-#' @export -robyn_run <- function(InputCollect = NULL, - dt_hyper_fixed = NULL, - json_file = NULL, - ts_validation = FALSE, - add_penalty_factor = FALSE, - refresh = FALSE, - seed = 123L, - quiet = FALSE, - cores = NULL, - trials = 5, - iterations = 2000, - rssd_zero_penalty = TRUE, - objective_weights = NULL, - nevergrad_algo = "TwoPointsDE", - intercept = TRUE, - intercept_sign = "non_negative", - lambda_control = NULL, - outputs = FALSE, - ...) { - if (isTRUE(outputs)) { - OutputModels <- robyn_run( - InputCollect = InputCollect, - dt_hyper_fixed = dt_hyper_fixed, - json_file = json_file, - add_penalty_factor = add_penalty_factor, - ts_validation = ts_validation, - refresh = refresh, - seed = seed, - quiet = quiet, - cores = cores, - trials = trials, - iterations = iterations, - rssd_zero_penalty = rssd_zero_penalty, - objective_weights = objective_weights, - nevergrad_algo = nevergrad_algo, - intercept = intercept, - intercept_sign = intercept_sign, - lambda_control = lambda_control, - outputs = FALSE, - ... - ) - OutputCollect <- robyn_outputs(InputCollect, OutputModels, ...) - return(list( - OutputModels = OutputModels, - OutputCollect = OutputCollect - )) - } - - t0 <- Sys.time() - - ### Use previously exported model using json_file - if (!is.null(json_file)) { - # InputCollect <- robyn_inputs(json_file = json_file, dt_input = dt_input, dt_holidays = dt_holidays) - if (is.null(InputCollect)) InputCollect <- robyn_inputs(json_file = json_file, ...) - json <- robyn_read(json_file, step = 2, quiet = TRUE) - dt_hyper_fixed <- json$ExportedModel$hyper_values - for (i in seq_along(json$ExportedModel)) { - assign(names(json$ExportedModel)[i], json$ExportedModel[[i]]) - } - bootstrap <- select(json$ExportedModel$summary, any_of(c("variable", "boot_mean", "ci_low", "ci_up"))) - if (is.null(seed) | length(seed) == 0) seed <- 123L - dt_hyper_fixed$solID <- json$ExportedModel$select_model - } else { - bootstrap <- NULL - } - - ##################################### - #### Set local environment - - if (!"hyperparameters" %in% names(InputCollect) || is.null(InputCollect$hyperparameters)) { - stop("Must provide 'hyperparameters' in robyn_inputs()'s output first") - } - - # Check and warn on legacy inputs (using InputCollect params as robyn_run() inputs) - InputCollect <- check_legacy_input(InputCollect, cores, iterations, trials, intercept_sign, nevergrad_algo) - # Overwrite values imported from InputCollect - legacyValues <- InputCollect[LEGACY_PARAMS] - legacyValues <- legacyValues[!unlist(lapply(legacyValues, is.null))] - if (length(legacyValues) > 0) { - for (i in seq_along(InputCollect)) assign(names(InputCollect)[i], InputCollect[[i]]) - } - - # Keep in mind: https://www.jottr.org/2022/12/05/avoid-detectcores/ - max_cores <- max(1L, parallel::detectCores(), na.rm = TRUE) - if (is.null(cores)) { - cores <- max_cores - 1 # It's recommended to always leave at least one core free - } else if (cores > max_cores) { - warning(sprintf("Max possible cores in your machine is %s (your input was %s)", max_cores, cores)) - cores <- max_cores - } - if (cores == 0) cores <- 1 - - hyps_fixed <- !is.null(dt_hyper_fixed) - if (hyps_fixed) trials <- iterations <- 1 - check_run_inputs(cores, iterations, trials, intercept_sign, nevergrad_algo) - check_iteration(InputCollect$calibration_input, iterations, trials, hyps_fixed, refresh) - init_msgs_run(InputCollect, refresh, lambda_control = NULL, quiet) - objective_weights <- check_obj_weight(InputCollect$calibration_input, objective_weights, refresh) - - 
##################################### - #### Prepare hyper-parameters - hyper_collect <- hyper_collector( - InputCollect, - hyper_in = InputCollect$hyperparameters, - ts_validation = ts_validation, - add_penalty_factor = add_penalty_factor, - dt_hyper_fixed = dt_hyper_fixed, - cores = cores - ) - InputCollect$hyper_updated <- hyper_collect$hyper_list_all - - ##################################### - #### Run robyn_mmm() for each trial - - OutputModels <- robyn_train( - InputCollect, hyper_collect, - cores = cores, iterations = iterations, trials = trials, - intercept_sign = intercept_sign, intercept = intercept, - nevergrad_algo = nevergrad_algo, - dt_hyper_fixed = dt_hyper_fixed, - ts_validation = ts_validation, - add_penalty_factor = add_penalty_factor, - rssd_zero_penalty = rssd_zero_penalty, - objective_weights = objective_weights, - refresh, seed, quiet - ) - - attr(OutputModels, "hyper_fixed") <- hyper_collect$all_fixed - attr(OutputModels, "bootstrap") <- bootstrap - attr(OutputModels, "refresh") <- refresh - - if (TRUE) { - OutputModels$train_timestamp <- Sys.time() - OutputModels$cores <- cores - OutputModels$iterations <- iterations - OutputModels$trials <- trials - OutputModels$intercept <- intercept - OutputModels$intercept_sign <- intercept_sign - OutputModels$nevergrad_algo <- nevergrad_algo - OutputModels$ts_validation <- ts_validation - OutputModels$add_penalty_factor <- add_penalty_factor - OutputModels$hyper_updated <- hyper_collect$hyper_list_all - OutputModels$hyper_fixed <- hyper_collect$all_fixed - } - - # Not direct output & not all fixed hyperparameters - if (is.null(dt_hyper_fixed)) { - output <- OutputModels - } else if (!hyper_collect$all_fixed) { - # Direct output & not all fixed hyperparameters, including refresh mode - output <- robyn_outputs(InputCollect, OutputModels, refresh = refresh, ...) - } else { - if (!"clusters" %in% names(list(...))) { - # Direct output & all fixed hyperparameters, thus no cluster - output <- robyn_outputs(InputCollect, OutputModels, clusters = FALSE, ...) - } else { - output <- robyn_outputs(InputCollect, OutputModels, ...) - } - } - - # Created with assign from JSON file - if (exists("clusters")) { - if (!is.integer(get("clusters"))) { - output$clusters <- get("clusters") - } - } - - # Check convergence when more than 1 iteration - if (!hyper_collect$all_fixed) { - output[["convergence"]] <- robyn_converge(OutputModels, ...) - output[["ts_validation_plot"]] <- ts_validation(OutputModels, ...) - } else { - if ("solID" %in% names(dt_hyper_fixed)) { - output[["selectID"]] <- dt_hyper_fixed$solID - } else { - output[["selectID"]] <- OutputModels$trial1$resultCollect$resultHypParam$solID - } - if (!quiet) message("Successfully recreated model ID: ", output$selectID) - } - - # Save hyper-parameters list - output[["hyper_updated"]] <- hyper_collect$hyper_list_all - output[["seed"]] <- seed - - # Report total timing - attr(output, "runTime") <- round(difftime(Sys.time(), t0, units = "mins"), 2) - if (!quiet && iterations > 1) message(paste("Total run time:", attr(output, "runTime"), "mins")) - - class(output) <- unique(c("robyn_models", class(output))) - return(output) -} - -#' @rdname robyn_run -#' @aliases robyn_run -#' @param x \code{robyn_models()} output. -#' @export -print.robyn_models <- function(x, ...) 
{ - is_fixed <- all(lapply(x$hyper_updated, length) == 1) - print(glued( - " - Total trials: {x$trials} - Iterations per trial: {x$iterations} {total_iters} - Runtime (minutes): {attr(x, 'runTime')} - Cores: {x$cores} - - Updated Hyper-parameters{fixed}: - {hypers} - - Nevergrad Algo: {x$nevergrad_algo} - Intercept: {x$intercept} - Intercept sign: {x$intercept_sign} - Time-series validation: {x$ts_validation} - Penalty factor: {x$add_penalty_factor} - Refresh: {isTRUE(attr(x, 'refresh'))} - - Convergence on last quantile (iters {iters}): - {convergence} - - ", - total_iters = sprintf("(%s real)", ifelse( - "trial1" %in% names(x), nrow(x$trial1$resultCollect$resultHypParam), 1 - )), - iters = ifelse(is.null(x$convergence), 1, paste(tail(x$convergence$errors$cuts, 2), collapse = ":")), - fixed = ifelse(is_fixed, " (fixed)", ""), - convergence = if (!is_fixed) paste(x$convergence$conv_msg, collapse = "\n ") else "Fixed hyper-parameters", - hypers = flatten_hyps(x$hyper_updated) - )) - - if ("robyn_outputs" %in% class(x)) { - print(glued( - " -Plot Folder: {x$plot_folder} -Calibration Constraint: {x$calibration_constraint} -Hyper-parameters fixed: {x$hyper_fixed} -Pareto-front ({x$pareto_fronts}) All solutions ({nSols}): {paste(x$allSolutions, collapse = ', ')} -{clusters_info} -", - nSols = length(x$allSolutions), - clusters_info = if ("models" %in% names(x[["clusters"]])) { - glued( - "Clusters (k = {x$clusters$n_clusters}): {paste(x$clusters$models$solID, collapse = ', ')}" - ) - } else { - NULL - } - )) - } -} - -#################################################################### -#' Train Robyn Models -#' -#' \code{robyn_train()} consumes output from \code{robyn_input()} -#' and runs the \code{robyn_mmm()} on each trial. -#' -#' @inheritParams robyn_run -#' @param hyper_collect List. Containing hyperparameter bounds. Defaults to -#' \code{InputCollect$hyperparameters}. -#' @return List. Iteration results to include in \code{robyn_run()} results. -#' @export -robyn_train <- function(InputCollect, hyper_collect, - cores, iterations, trials, - intercept_sign, intercept, - nevergrad_algo, - dt_hyper_fixed = NULL, - ts_validation = TRUE, - add_penalty_factor = FALSE, - objective_weights = NULL, - rssd_zero_penalty = TRUE, - refresh = FALSE, seed = 123, - quiet = FALSE) { - hyper_fixed <- hyper_collect$all_fixed - - if (hyper_fixed) { - OutputModels <- list() - OutputModels[[1]] <- robyn_mmm( - InputCollect = InputCollect, - hyper_collect = hyper_collect, - iterations = iterations, - cores = cores, - nevergrad_algo = nevergrad_algo, - intercept = intercept, - intercept_sign = intercept_sign, - dt_hyper_fixed = dt_hyper_fixed, - ts_validation = ts_validation, - add_penalty_factor = add_penalty_factor, - rssd_zero_penalty = rssd_zero_penalty, - objective_weights = objective_weights, - seed = seed, - quiet = quiet - ) - OutputModels[[1]]$trial <- 1 - # Set original solID (to overwrite default 1_1_1) - if ("solID" %in% names(dt_hyper_fixed)) { - these <- c("resultHypParam", "xDecompVec", "xDecompAgg", "decompSpendDist") - for (tab in these) OutputModels[[1]]$resultCollect[[tab]]$solID <- dt_hyper_fixed$solID - } - } else { - ## Run robyn_mmm() for each trial if hyperparameters are not all fixed - check_init_msg(InputCollect, cores) - if (!quiet) { - message(paste( - ">>> Starting", trials, "trials with", - iterations, "iterations each", - ifelse(is.null(InputCollect$calibration_input), "using", "with calibration using"), - nevergrad_algo, "nevergrad algorithm..." 
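# Reproducibility note on the trial loop below (existing logic, toy numbers):
# each trial ngt is seeded with seed + ngt, so trials explore different samples
# while the full run stays reproducible, e.g.
# 123 + seq_len(5) # trials = 5 -> seeds 124 125 126 127 128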
- )) - } - - OutputModels <- list() - - for (ngt in 1:trials) { # ngt = 1 - if (!quiet) message(paste(" Running trial", ngt, "of", trials)) - model_output <- robyn_mmm( - InputCollect = InputCollect, - hyper_collect = hyper_collect, - iterations = iterations, - cores = cores, - nevergrad_algo = nevergrad_algo, - intercept = intercept, - intercept_sign = intercept_sign, - ts_validation = ts_validation, - add_penalty_factor = add_penalty_factor, - rssd_zero_penalty = rssd_zero_penalty, - objective_weights = objective_weights, - refresh = refresh, - trial = ngt, - seed = seed + ngt, - quiet = quiet - ) - check_coef0 <- any(model_output$resultCollect$decompSpendDist$decomp.rssd == Inf) - if (check_coef0) { - num_coef0_mod <- filter(model_output$resultCollect$decompSpendDist, is.infinite(.data$decomp.rssd)) %>% - distinct(.data$iterNG, .data$iterPar) %>% - nrow() - num_coef0_mod <- ifelse(num_coef0_mod > iterations, iterations, num_coef0_mod) - if (!quiet) { - message(paste( - "This trial contains", num_coef0_mod, "iterations with all media coefficient = 0.", - "Please reconsider your media variable choice if the pareto choices are unreasonable.", - "\n Recommendations:", - "\n1. Increase hyperparameter ranges for 0-coef channels to give Robyn more freedom", - "\n2. Split media into sub-channels, and/or aggregate similar channels, and/or introduce other media", - "\n3. Increase trials to get more samples" - )) - } - } - model_output["trial"] <- ngt - OutputModels[[ngt]] <- model_output - } - } - names(OutputModels) <- paste0("trial", seq_along(OutputModels)) - return(OutputModels) -} - - -#################################################################### -#' Core MMM Function -#' -#' \code{robyn_mmm()} function activates Nevergrad to generate samples of -#' hyperparameters, conducts media transformation within each loop, fits the -#' Ridge regression, calibrates the model optionally, decomposes responses -#' and collects the result. It's an inner function within \code{robyn_run()}. -#' -#' @inheritParams robyn_run -#' @inheritParams robyn_allocator -#' @param hyper_collect List. Containing hyperparameter bounds. Defaults to -#' \code{InputCollect$hyperparameters}. -#' @param iterations Integer. Number of iterations to run. -#' @param trial Integer. Which trial are we running? Used to ID each model. -#' @return List. MMM results with hyperparameters values. -#' @export -robyn_mmm <- function(InputCollect, - hyper_collect, - iterations, - cores, - nevergrad_algo, - intercept = TRUE, - intercept_sign, - ts_validation = TRUE, - add_penalty_factor = FALSE, - objective_weights = NULL, - dt_hyper_fixed = NULL, - # lambda_fixed = NULL, - rssd_zero_penalty = TRUE, - refresh = FALSE, - trial = 1L, - seed = 123L, - quiet = FALSE, ...) 
{ - if (iterations > 1) { - if (reticulate::py_module_available("nevergrad")) { - ng <- reticulate::import("nevergrad", delay_load = TRUE) - if (is.integer(seed)) { - np <- reticulate::import("numpy", delay_load = FALSE) - np$random$seed(seed) - } - } else { - stop( - "You must have nevergrad python library installed.\nPlease check our install demo: ", - "https://github.com/facebookexperimental/Robyn/blob/main/demo/install_nevergrad.R" - ) - } - } - - ################################################ - #### Collect hyperparameters - - if (TRUE) { - hypParamSamName <- names(hyper_collect$hyper_list_all) - # Optimization hyper-parameters - hyper_bound_list_updated <- hyper_collect$hyper_bound_list_updated - hyper_bound_list_updated_name <- names(hyper_bound_list_updated) - hyper_count <- length(hyper_bound_list_updated_name) - # Fixed hyper-parameters - hyper_bound_list_fixed <- hyper_collect$hyper_bound_list_fixed - hyper_bound_list_fixed_name <- names(hyper_bound_list_fixed) - hyper_count_fixed <- length(hyper_bound_list_fixed_name) - dt_hyper_fixed_mod <- hyper_collect$dt_hyper_fixed_mod - hyper_fixed <- hyper_collect$all_fixed - } - - ################################################ - #### Setup environment - - if (is.null(InputCollect$dt_mod)) { - stop("Run InputCollect$dt_mod <- robyn_engineering() first to get the dt_mod") - } - - ## Get environment for parallel backend - if (TRUE) { - dt_mod <- InputCollect$dt_mod - xDecompAggPrev <- InputCollect$xDecompAggPrev - rollingWindowStartWhich <- InputCollect$rollingWindowStartWhich - rollingWindowEndWhich <- InputCollect$rollingWindowEndWhich - refreshAddedStart <- InputCollect$refreshAddedStart - dt_modRollWind <- InputCollect$dt_modRollWind - refresh_steps <- InputCollect$refresh_steps - rollingWindowLength <- InputCollect$rollingWindowLength - paid_media_spends <- InputCollect$paid_media_spends - organic_vars <- InputCollect$organic_vars - context_vars <- InputCollect$context_vars - prophet_vars <- InputCollect$prophet_vars - adstock <- InputCollect$adstock - context_signs <- InputCollect$context_signs - paid_media_signs <- InputCollect$paid_media_signs - prophet_signs <- InputCollect$prophet_signs - organic_signs <- InputCollect$organic_signs - calibration_input <- InputCollect$calibration_input - optimizer_name <- nevergrad_algo - i <- NULL # For parallel iterations (globalVar) - } - - ################################################ - #### Get spend share - - dt_inputTrain <- InputCollect$dt_input[rollingWindowStartWhich:rollingWindowEndWhich, ] - temp <- select(dt_inputTrain, all_of(paid_media_spends)) - dt_spendShare <- data.frame( - rn = paid_media_spends, - total_spend = unlist(summarise_all(temp, sum)), - # mean_spend = unlist(summarise_all(temp, function(x) { - # ifelse(is.na(mean(x[x > 0])), 0, mean(x[x > 0])) - # })) - mean_spend = unlist(summarise_all(temp, mean)) - ) %>% - mutate(spend_share = .data$total_spend / sum(.data$total_spend)) - # When not refreshing, dt_spendShareRF = dt_spendShare - refreshAddedStartWhich <- which(dt_modRollWind$ds == refreshAddedStart) - temp <- select(dt_inputTrain, all_of(paid_media_spends)) %>% - slice(refreshAddedStartWhich:rollingWindowLength) - dt_spendShareRF <- data.frame( - rn = paid_media_spends, - total_spend = unlist(summarise_all(temp, sum)), - # mean_spend = unlist(summarise_all(temp, function(x) { - # ifelse(is.na(mean(x[x > 0])), 0, mean(x[x > 0])) - # })) - mean_spend = unlist(summarise_all(temp, mean)) - ) %>% - mutate(spend_share = .data$total_spend / sum(.data$total_spend)) - 
# Join both dataframes into a single one - dt_spendShare <- left_join(dt_spendShare, dt_spendShareRF, "rn", suffix = c("", "_refresh")) - - ################################################ - #### Get lambda - lambda_min_ratio <- 0.0001 # default value from glmnet - lambdas <- lambda_seq( - x = select(dt_mod, -.data$ds, -.data$dep_var), - y = dt_mod$dep_var, - seq_len = 100, lambda_min_ratio - ) - lambda_max <- max(lambdas) * 0.1 - lambda_min <- lambda_max * lambda_min_ratio - - ################################################ - #### Start Nevergrad loop - t0 <- Sys.time() - - ## Set iterations - # hyper_fixed <- hyper_count == 0 - if (hyper_fixed == FALSE) { - iterTotal <- iterations - iterPar <- cores - iterNG <- ceiling(iterations / cores) # Sometimes the progress bar may not get to 100% - } else { - iterTotal <- iterPar <- iterNG <- 1 - } - - ## Start Nevergrad optimizer - if (!hyper_fixed) { - my_tuple <- tuple(hyper_count) - instrumentation <- ng$p$Array(shape = my_tuple, lower = 0, upper = 1) - optimizer <- ng$optimizers$registry[optimizer_name](instrumentation, budget = iterTotal, num_workers = cores) - - # Set multi-objective dimensions for objective functions (errors) - if (is.null(calibration_input)) { - optimizer$tell(ng$p$MultiobjectiveReference(), tuple(1, 1)) - if (is.null(objective_weights)) { - objective_weights <- tuple(1, 1) - } else { - objective_weights <- tuple(objective_weights[1], objective_weights[2]) - } - optimizer$set_objective_weights(objective_weights) - } else { - optimizer$tell(ng$p$MultiobjectiveReference(), tuple(1, 1, 1)) - if (is.null(objective_weights)) { - objective_weights <- tuple(1, 1, 1) - } else { - objective_weights <- tuple(objective_weights[1], objective_weights[2], objective_weights[3]) - } - optimizer$set_objective_weights(objective_weights) - } - } - - ## Prepare loop - resultCollectNG <- list() - cnt <- 0 - if (!hyper_fixed && !quiet) pb <- txtProgressBar(max = iterTotal, style = 3) - - sysTimeDopar <- tryCatch( - { - system.time({ - for (lng in 1:iterNG) { # lng = 1 - nevergrad_hp <- list() - nevergrad_hp_val <- list() - hypParamSamList <- list() - hypParamSamNG <- NULL - - if (hyper_fixed == FALSE) { - # Setting initial seeds (co = cores) - for (co in 1:iterPar) { # co = 1 - ## Get hyperparameter sample with ask (random) - nevergrad_hp[[co]] <- optimizer$ask() - nevergrad_hp_val[[co]] <- nevergrad_hp[[co]]$value - ## Scale sample to given bounds using uniform distribution - for (hypNameLoop in hyper_bound_list_updated_name) { - index <- which(hypNameLoop == hyper_bound_list_updated_name) - channelBound <- unlist(hyper_bound_list_updated[hypNameLoop]) - hyppar_value <- signif(nevergrad_hp_val[[co]][index], 10) - if (length(channelBound) > 1) { - hypParamSamNG[hypNameLoop] <- qunif(hyppar_value, min(channelBound), max(channelBound)) - } else { - hypParamSamNG[hypNameLoop] <- hyppar_value - } - } - hypParamSamList[[co]] <- data.frame(t(hypParamSamNG)) - } - hypParamSamNG <- bind_rows(hypParamSamList) - names(hypParamSamNG) <- hyper_bound_list_updated_name - ## Add fixed hyperparameters - if (hyper_count_fixed != 0) { - hypParamSamNG <- cbind(hypParamSamNG, dt_hyper_fixed_mod) %>% - select(all_of(hypParamSamName)) - } - } else { - hypParamSamNG <- select(dt_hyper_fixed_mod, all_of(hypParamSamName)) - } - - # Must remain within this function for it to work - robyn_iterations <- function(i, ...) 
{ # i=1 - t1 <- Sys.time() - #### Get hyperparameter sample - hypParamSam <- hypParamSamNG[i, ] - adstock <- check_adstock(adstock) - - #### Transform media for model fitting - temp <- run_transformations(InputCollect, hypParamSam, adstock) - dt_modSaturated <- temp$dt_modSaturated - dt_saturatedImmediate <- temp$dt_saturatedImmediate - dt_saturatedCarryover <- temp$dt_saturatedCarryover - - ##################################### - #### Split train & test and prepare data for modelling - - dt_window <- dt_modSaturated - - ## Contrast matrix because glmnet does not treat categorical variables (one hot encoding) - y_window <- dt_window$dep_var - x_window <- as.matrix(lares::ohse(select(dt_window, -.data$dep_var))) - y_train <- y_val <- y_test <- y_window - x_train <- x_val <- x_test <- x_window - - ## Split train, test, and validation sets - train_size <- hypParamSam[, "train_size"][[1]] - val_size <- test_size <- (1 - train_size) / 2 - if (train_size < 1) { - train_size_index <- floor(quantile(seq(nrow(dt_window)), train_size)) - val_size_index <- train_size_index + floor(val_size * nrow(dt_window)) - y_train <- y_window[1:train_size_index] - y_val <- y_window[(train_size_index + 1):val_size_index] - y_test <- y_window[(val_size_index + 1):length(y_window)] - x_train <- x_window[1:train_size_index, ] - x_val <- x_window[(train_size_index + 1):val_size_index, ] - x_test <- x_window[(val_size_index + 1):length(y_window), ] - } else { - y_val <- y_test <- x_val <- x_test <- NULL - } - - ## Define and set sign control - dt_sign <- select(dt_window, -.data$dep_var) - x_sign <- c(prophet_signs, context_signs, paid_media_signs, organic_signs) - names(x_sign) <- c(prophet_vars, context_vars, paid_media_spends, organic_vars) - check_factor <- unlist(lapply(dt_sign, is.factor)) - lower.limits <- rep(0, length(prophet_signs)) - upper.limits <- rep(1, length(prophet_signs)) - trend_loc <- which(colnames(x_train) == "trend") - if (length(trend_loc) > 0 & sum(x_train[, trend_loc]) < 0) { - trend_loc <- which(prophet_vars == "trend") - lower.limits[trend_loc] <- -1 - upper.limits[trend_loc] <- 0 - } - for (s in (length(prophet_signs) + 1):length(x_sign)) { - if (check_factor[s] == TRUE) { - level.n <- length(levels(unlist(dt_sign[, s, with = FALSE]))) - if (level.n <= 1) { - stop("All factor variables must have more than 1 level") - } - lower_vec <- if (x_sign[s] == "positive") { - rep(0, level.n - 1) - } else { - rep(-Inf, level.n - 1) - } - upper_vec <- if (x_sign[s] == "negative") { - rep(0, level.n - 1) - } else { - rep(Inf, level.n - 1) - } - lower.limits <- c(lower.limits, lower_vec) - upper.limits <- c(upper.limits, upper_vec) - } else { - lower.limits <- c(lower.limits, ifelse(x_sign[s] == "positive", 0, -Inf)) - upper.limits <- c(upper.limits, ifelse(x_sign[s] == "negative", 0, Inf)) - } - } - - ##################################### - #### Fit ridge regression with nevergrad's lambda - # lambdas <- lambda_seq(x_train, y_train, seq_len = 100, lambda_min_ratio = 0.0001) - # lambda_max <- max(lambdas) - lambda_hp <- unlist(hypParamSamNG$lambda[i]) - if (hyper_fixed == FALSE) { - lambda_scaled <- lambda_min + (lambda_max - lambda_min) * lambda_hp - } else { - lambda_scaled <- lambda_hp - } - - if (add_penalty_factor) { - penalty.factor <- unlist(hypParamSamNG[i, grepl("_penalty", names(hypParamSamNG))]) - } else { - penalty.factor <- rep(1, ncol(x_train)) - } - - ##################################### - ## NRMSE: Model's fit error - - ## If no lift calibration, refit using best lambda - mod_out <- 
model_refit( - x_train, y_train, - x_val, y_val, - x_test, y_test, - lambda = lambda_scaled, - lower.limits = lower.limits, - upper.limits = upper.limits, - intercept = intercept, - intercept_sign = intercept_sign, - penalty.factor = penalty.factor, - ... - ) - decompCollect <- model_decomp( - coefs = mod_out$coefs, - y_pred = mod_out$y_pred, - dt_modSaturated = dt_modSaturated, - dt_saturatedImmediate = dt_saturatedImmediate, - dt_saturatedCarryover = dt_saturatedCarryover, - dt_modRollWind = dt_modRollWind, - refreshAddedStart = refreshAddedStart - ) - nrmse <- ifelse(ts_validation, mod_out$nrmse_val, mod_out$nrmse_train) - mape <- 0 - df.int <- mod_out$df.int - - ##################################### - #### MAPE: Calibration error - if (!is.null(calibration_input)) { - liftCollect <- robyn_calibrate( - calibration_input = calibration_input, - df_raw = dt_mod, - hypParamSam = hypParamSam, - wind_start = rollingWindowStartWhich, - wind_end = rollingWindowEndWhich, - dayInterval = InputCollect$dayInterval, - adstock = adstock, - xDecompVec = decompCollect$xDecompVec, - coefs = decompCollect$coefsOutCat - ) - mape <- mean(liftCollect$mape_lift, na.rm = TRUE) - } - - ##################################### - #### DECOMP.RSSD: Business error - # Sum of squared distance between decomp share and spend share to be minimized - dt_decompSpendDist <- decompCollect$xDecompAgg %>% - filter(.data$rn %in% paid_media_spends) %>% - select( - .data$rn, .data$xDecompAgg, .data$xDecompPerc, .data$xDecompMeanNon0Perc, - .data$xDecompMeanNon0, .data$xDecompPercRF, .data$xDecompMeanNon0PercRF, - .data$xDecompMeanNon0RF - ) %>% - left_join( - select( - dt_spendShare, - .data$rn, .data$spend_share, .data$spend_share_refresh, - .data$mean_spend, .data$total_spend - ), - by = "rn" - ) %>% - mutate( - effect_share = .data$xDecompPerc / sum(.data$xDecompPerc), - effect_share_refresh = .data$xDecompPercRF / sum(.data$xDecompPercRF) - ) - dt_decompSpendDist <- left_join( - filter(decompCollect$xDecompAgg, .data$rn %in% paid_media_spends), - select(dt_decompSpendDist, .data$rn, contains("_spend"), contains("_share")), - by = "rn" - ) - if (!refresh) { - decomp.rssd <- sqrt(sum((dt_decompSpendDist$effect_share - dt_decompSpendDist$spend_share)^2)) - # Penalty for models with more 0-coefficients - if (rssd_zero_penalty) { - is_0eff <- round(dt_decompSpendDist$effect_share, 4) == 0 - share_0eff <- sum(is_0eff) / length(dt_decompSpendDist$effect_share) - decomp.rssd <- decomp.rssd * (1 + share_0eff) - } - } else { - dt_decompRF <- select(decompCollect$xDecompAgg, .data$rn, decomp_perc = .data$xDecompPerc) %>% - left_join(select(xDecompAggPrev, .data$rn, decomp_perc_prev = .data$xDecompPerc), - by = "rn" - ) - decomp.rssd.media <- dt_decompRF %>% - filter(.data$rn %in% paid_media_spends) %>% - summarise(rssd.media = sqrt(mean((.data$decomp_perc - .data$decomp_perc_prev)^2))) %>% - pull(.data$rssd.media) - decomp.rssd.nonmedia <- dt_decompRF %>% - filter(!.data$rn %in% paid_media_spends) %>% - summarise(rssd.nonmedia = sqrt(mean((.data$decomp_perc - .data$decomp_perc_prev)^2))) %>% - pull(.data$rssd.nonmedia) - decomp.rssd <- decomp.rssd.media + decomp.rssd.nonmedia / - (1 - refresh_steps / rollingWindowLength) - } - # When all media in this iteration have 0 coefficients - if (is.nan(decomp.rssd)) { - decomp.rssd <- Inf - dt_decompSpendDist$effect_share <- 0 - } - - ##################################### - #### Collect Multi-Objective Errors and Iteration Results - resultCollect <- list() - - # Auxiliary dynamic vector - common 
<- data.frame( - rsq_train = mod_out$rsq_train, - rsq_val = mod_out$rsq_val, - rsq_test = mod_out$rsq_test, - nrmse_train = mod_out$nrmse_train, - nrmse_val = mod_out$nrmse_val, - nrmse_test = mod_out$nrmse_test, - nrmse = nrmse, - decomp.rssd = decomp.rssd, - mape = mape, - lambda = lambda_scaled, - lambda_hp = lambda_hp, - lambda_max = lambda_max, - lambda_min_ratio = lambda_min_ratio, - solID = paste(trial, lng, i, sep = "_"), - trial = trial, - iterNG = lng, - iterPar = i - ) - - total_common <- ncol(common) - split_common <- which(colnames(common) == "lambda_min_ratio") - - resultCollect[["resultHypParam"]] <- as_tibble(hypParamSam) %>% - select(-.data$lambda) %>% - bind_cols(common[, 1:split_common]) %>% - mutate( - pos = prod(decompCollect$xDecompAgg$pos), - Elapsed = as.numeric(difftime(Sys.time(), t1, units = "secs")), - ElapsedAccum = as.numeric(difftime(Sys.time(), t0, units = "secs")) - ) %>% - bind_cols(common[, (split_common + 1):total_common]) %>% - dplyr::mutate_all(unlist) - - resultCollect[["xDecompAgg"]] <- decompCollect$xDecompAgg %>% - mutate(train_size = train_size) %>% - bind_cols(common) - - if (!is.null(calibration_input)) { - resultCollect[["liftCalibration"]] <- liftCollect %>% - bind_cols(common) - } - - resultCollect[["decompSpendDist"]] <- dt_decompSpendDist %>% - bind_cols(common) - - resultCollect <- append(resultCollect, as.list(common)) - return(resultCollect) - } - - ########### Parallel start - nrmse.collect <- NULL - decomp.rssd.collect <- NULL - best_mape <- Inf - if (cores == 1) { - doparCollect <- lapply(1:iterPar, robyn_iterations) - } else { - # Create cluster to minimize overhead for parallel back-end registering - if (check_parallel() && !hyper_fixed) { - registerDoParallel(cores) - } else { - registerDoSEQ() - } - suppressPackageStartupMessages( - doparCollect <- foreach(i = 1:iterPar, .options.RNG = seed) %dorng% robyn_iterations(i) - ) - } - - nrmse.collect <- unlist(lapply(doparCollect, function(x) x$nrmse)) - decomp.rssd.collect <- unlist(lapply(doparCollect, function(x) x$decomp.rssd)) - mape.lift.collect <- unlist(lapply(doparCollect, function(x) x$mape)) - - ##################################### - #### Nevergrad tells objectives - - if (!hyper_fixed) { - if (is.null(calibration_input)) { - for (co in 1:iterPar) { - optimizer$tell(nevergrad_hp[[co]], tuple(nrmse.collect[co], decomp.rssd.collect[co])) - } - } else { - for (co in 1:iterPar) { - optimizer$tell(nevergrad_hp[[co]], tuple(nrmse.collect[co], decomp.rssd.collect[co], mape.lift.collect[co])) - } - } - } - - resultCollectNG[[lng]] <- doparCollect - if (!quiet) { - cnt <- cnt + iterPar - if (!hyper_fixed) setTxtProgressBar(pb, cnt) - } - } ## end NG loop - }) # end system.time - }, - error = function(err) { - if (length(resultCollectNG) > 1) { - msg <- "Error while running robyn_mmm(); providing PARTIAL results" - warning(msg) - message(paste(msg, err, sep = "\n")) - sysTimeDopar <- rep(Sys.time() - t0, 3) - } else { - stop(err) - } - } - ) - - # stop cluster to avoid memory leaks - if (cores > 1) { - stopImplicitCluster() - registerDoSEQ() - getDoParWorkers() - } - - if (!hyper_fixed) { - cat("\r", paste("\n Finished in", round(sysTimeDopar[3] / 60, 2), "mins")) - flush.console() - close(pb) - } - - ##################################### - #### Final result collect - - resultCollect <- list() - - resultCollect[["resultHypParam"]] <- as_tibble(bind_rows( - lapply(resultCollectNG, function(x) { - bind_rows(lapply(x, function(y) y$resultHypParam)) - }) - )) - - # 
resultCollect[["xDecompVec"]] <- as_tibble(bind_rows( - # lapply(resultCollectNG, function(x) { - # bind_rows(lapply(x, function(y) y$xDecompVec)) - # }) - # )) - - resultCollect[["xDecompAgg"]] <- as_tibble(bind_rows( - lapply(resultCollectNG, function(x) { - bind_rows(lapply(x, function(y) y$xDecompAgg)) - }) - )) - - if (!is.null(calibration_input)) { - resultCollect[["liftCalibration"]] <- as_tibble(bind_rows( - lapply(resultCollectNG, function(x) { - bind_rows(lapply(x, function(y) y$liftCalibration)) - }) - ) %>% - arrange(.data$mape, .data$liftMedia, .data$liftStart)) - } - - resultCollect[["decompSpendDist"]] <- as_tibble(bind_rows( - lapply(resultCollectNG, function(x) { - bind_rows(lapply(x, function(y) y$decompSpendDist)) - }) - )) - - resultCollect$iter <- length(resultCollect$mape) - resultCollect$elapsed.min <- sysTimeDopar[3] / 60 - - # Adjust accumulated time - resultCollect$resultHypParam <- resultCollect$resultHypParam %>% - mutate(ElapsedAccum = .data$ElapsedAccum - min(.data$ElapsedAccum) + - .data$Elapsed[which.min(.data$ElapsedAccum)]) - - return(list( - resultCollect = resultCollect, - hyperBoundNG = hyper_bound_list_updated, - hyperBoundFixed = hyper_bound_list_fixed - )) -} - -model_decomp <- function(coefs, y_pred, - dt_modSaturated, dt_saturatedImmediate, - dt_saturatedCarryover, dt_modRollWind, - refreshAddedStart) { - ## Input for decomp - y <- dt_modSaturated$dep_var - # x <- data.frame(x) - - x <- select(dt_modSaturated, -.data$dep_var) - intercept <- coefs[1] - x_name <- names(x) - x_factor <- x_name[sapply(x, is.factor)] - - ## Decomp x - xDecomp <- data.frame(mapply(function(regressor, coeff) { - regressor * coeff - }, regressor = x, coeff = coefs[-1])) - xDecomp <- cbind(data.frame(intercept = rep(intercept, nrow(xDecomp))), xDecomp) - xDecompOut <- cbind(data.frame(ds = dt_modRollWind$ds, y = y, y_pred = y_pred), xDecomp) - - ## Decomp immediate & carryover response - sel_coef <- c(rownames(coefs), names(coefs)) %in% names(dt_saturatedImmediate) - coefs_media <- coefs[sel_coef] - names(coefs_media) <- rownames(coefs)[sel_coef] - mediaDecompImmediate <- data.frame(mapply(function(regressor, coeff) { - regressor * coeff - }, regressor = dt_saturatedImmediate, coeff = coefs_media)) - mediaDecompCarryover <- data.frame(mapply(function(regressor, coeff) { - regressor * coeff - }, regressor = dt_saturatedCarryover, coeff = coefs_media)) - - ## Output decomp - y_hat <- rowSums(xDecomp, na.rm = TRUE) - y_hat.scaled <- rowSums(abs(xDecomp), na.rm = TRUE) - xDecompOutPerc.scaled <- abs(xDecomp) / y_hat.scaled - xDecompOut.scaled <- y_hat * xDecompOutPerc.scaled - - temp <- select(xDecompOut, .data$intercept, all_of(x_name)) - xDecompOutAgg <- sapply(temp, function(x) sum(x)) - xDecompOutAggPerc <- xDecompOutAgg / sum(y_hat) - xDecompOutAggMeanNon0 <- unlist(lapply(temp, function(x) ifelse(is.na(mean(x[x > 0])), 0, mean(x[x != 0])))) - xDecompOutAggMeanNon0[is.nan(xDecompOutAggMeanNon0)] <- 0 - xDecompOutAggMeanNon0Perc <- xDecompOutAggMeanNon0 / sum(xDecompOutAggMeanNon0) - - refreshAddedStartWhich <- which(xDecompOut$ds == refreshAddedStart) - refreshAddedEnd <- max(xDecompOut$ds) - refreshAddedEndWhich <- which(xDecompOut$ds == refreshAddedEnd) - - temp <- select(xDecompOut, .data$intercept, all_of(x_name)) %>% - slice(refreshAddedStartWhich:refreshAddedEndWhich) - xDecompOutAggRF <- unlist(lapply(temp, function(x) sum(x))) - y_hatRF <- y_hat[refreshAddedStartWhich:refreshAddedEndWhich] - xDecompOutAggPercRF <- xDecompOutAggRF / sum(y_hatRF) - 
xDecompOutAggMeanNon0RF <- unlist(lapply(temp, function(x) ifelse(is.na(mean(x[x > 0])), 0, mean(x[x != 0]))))
-  xDecompOutAggMeanNon0RF[is.nan(xDecompOutAggMeanNon0RF)] <- 0
-  xDecompOutAggMeanNon0PercRF <- xDecompOutAggMeanNon0RF / sum(xDecompOutAggMeanNon0RF)
-
-  coefsOutCat <- coefsOut <- data.frame(rn = c(rownames(coefs), names(coefs)), coefs)
-  if (length(x_factor) > 0) {
-    coefsOut$rn <- sapply(x_factor, function(x) str_replace(coefsOut$rn, paste0(x, ".*"), x))
-  }
-  rn_order <- names(xDecompOutAgg)
-  rn_order[rn_order == "intercept"] <- "(Intercept)"
-  coefsOut <- coefsOut %>%
-    group_by(.data$rn) %>%
-    rename("coef" = 2) %>%
-    summarise(coef = mean(.data$coef)) %>%
-    arrange(match(.data$rn, rn_order))
-
-  decompOutAgg <- as_tibble(cbind(coefsOut, data.frame(
-    xDecompAgg = xDecompOutAgg,
-    xDecompPerc = xDecompOutAggPerc,
-    xDecompMeanNon0 = xDecompOutAggMeanNon0,
-    xDecompMeanNon0Perc = xDecompOutAggMeanNon0Perc,
-    xDecompAggRF = xDecompOutAggRF,
-    xDecompPercRF = xDecompOutAggPercRF,
-    xDecompMeanNon0RF = xDecompOutAggMeanNon0RF,
-    xDecompMeanNon0PercRF = xDecompOutAggMeanNon0PercRF,
-    pos = xDecompOutAgg >= 0
-  )))
-
-  decompCollect <- list(
-    xDecompVec = xDecompOut, xDecompVec.scaled = xDecompOut.scaled,
-    xDecompAgg = decompOutAgg, coefsOutCat = coefsOutCat,
-    mediaDecompImmediate = mutate(mediaDecompImmediate, ds = xDecompOut$ds, y = xDecompOut$y),
-    mediaDecompCarryover = mutate(mediaDecompCarryover, ds = xDecompOut$ds, y = xDecompOut$y)
-  )
-  return(decompCollect)
-}
-
-model_refit <- function(x_train, y_train, x_val, y_val, x_test, y_test,
-                        lambda, lower.limits, upper.limits,
-                        intercept = TRUE,
-                        intercept_sign = "non_negative",
-                        penalty.factor = rep(1, ncol(x_train)),
-                        ...) {
-  mod <- glmnet(
-    x_train,
-    y_train,
-    family = "gaussian",
-    alpha = 0, # 0 for ridge regression
-    lambda = lambda,
-    lower.limits = lower.limits,
-    upper.limits = upper.limits,
-    type.measure = "mse",
-    penalty.factor = penalty.factor,
-    intercept = intercept,
-    ...
-  ) # coef(mod)
-
-  df.int <- 1
-
-  ## Drop intercept if negative and intercept_sign == "non_negative"
-  if (intercept_sign == "non_negative" && coef(mod)[1] < 0) {
-    mod <- glmnet(
-      x_train,
-      y_train,
-      family = "gaussian",
-      alpha = 0, # 0 for ridge regression
-      lambda = lambda,
-      lower.limits = lower.limits,
-      upper.limits = upper.limits,
-      penalty.factor = penalty.factor,
-      intercept = FALSE,
-      ...
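# The refit above feeds the error metrics below; as a toy check (not package
# code), NRMSE divides RMSE by the observed range of the dependent variable:
# y <- c(10, 14, 20); pred <- c(11, 13, 21)
# sqrt(mean((y - pred)^2)) / (max(y) - min(y)) # 1 / 10 = 0.1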
-  ) # coef(mod)
-    df.int <- 0
-  } # plot(mod); print(mod)
-
-  # Calculate all Adjusted R2
-  y_train_pred <- as.vector(predict(mod, s = lambda, newx = x_train))
-  rsq_train <- get_rsq(true = y_train, predicted = y_train_pred, p = ncol(x_train), df.int = df.int)
-  if (!is.null(x_val)) {
-    y_val_pred <- as.vector(predict(mod, s = lambda, newx = x_val))
-    rsq_val <- get_rsq(true = y_val, predicted = y_val_pred, p = ncol(x_val), df.int = df.int, n_train = length(y_train))
-    y_test_pred <- as.vector(predict(mod, s = lambda, newx = x_test))
-    rsq_test <- get_rsq(true = y_test, predicted = y_test_pred, p = ncol(x_test), df.int = df.int, n_train = length(y_train))
-    y_pred <- c(y_train_pred, y_val_pred, y_test_pred)
-  } else {
-    rsq_val <- rsq_test <- NA
-    y_pred <- y_train_pred
-  }
-
-  # Calculate all NRMSE
-  nrmse_train <- sqrt(mean((y_train - y_train_pred)^2)) / (max(y_train) - min(y_train))
-  if (!is.null(x_val)) {
-    nrmse_val <- sqrt(mean((y_val - y_val_pred)^2)) / (max(y_val) - min(y_val))
-    nrmse_test <- sqrt(mean((y_test - y_test_pred)^2)) / (max(y_test) - min(y_test))
-  } else {
-    nrmse_val <- nrmse_test <- y_val_pred <- y_test_pred <- NA
-  }
-
-  mod_out <- list(
-    rsq_train = rsq_train,
-    rsq_val = rsq_val,
-    rsq_test = rsq_test,
-    nrmse_train = nrmse_train,
-    nrmse_val = nrmse_val,
-    nrmse_test = nrmse_test,
-    coefs = as.matrix(coef(mod)),
-    y_train_pred = y_train_pred,
-    y_val_pred = y_val_pred,
-    y_test_pred = y_test_pred,
-    y_pred = y_pred,
-    mod = mod,
-    df.int = df.int
-  )
-
-  return(mod_out)
-}
-
-# x = x_train matrix
-# y = y_train (dep_var) vector
-lambda_seq <- function(x, y, seq_len = 100, lambda_min_ratio = 0.0001) {
-  mysd <- function(y) sqrt(sum((y - mean(y))^2) / length(y))
-  sx <- scale(x, scale = apply(x, 2, mysd))
-  check_nan <- apply(sx, 2, function(sxj) all(is.nan(sxj)))
-  sx <- mapply(function(sxj, v) {
-    return(if (v) rep(0, length(sxj)) else sxj)
-  }, sxj = as.data.frame(sx), v = check_nan)
-  sx <- as.matrix(sx, ncol = ncol(x), nrow = nrow(x))
-  # sy <- as.vector(scale(y, scale=mysd(y)))
-  sy <- y
-  # 0.001 is the default smallest alpha value of glmnet for ridge (alpha = 0)
-  lambda_max <- max(abs(colSums(sx * sy))) / (0.001 * nrow(x))
-  lambda_max_log <- log(lambda_max)
-  log_step <- (log(lambda_max) - log(lambda_max * lambda_min_ratio)) / (seq_len - 1)
-  log_seq <- seq(log(lambda_max), log(lambda_max * lambda_min_ratio), length.out = seq_len)
-  lambdas <- exp(log_seq)
-  return(lambdas)
-}
-
-hyper_collector <- function(InputCollect, hyper_in, ts_validation, add_penalty_factor, dt_hyper_fixed = NULL, cores = 1) {
-  # Fetch hyper-parameters based on media
-  hypParamSamName <- hyper_names(
-    adstock = InputCollect$adstock,
-    all_media = InputCollect$all_media,
-    all_vars = names(select(InputCollect$dt_mod, -c("ds", "dep_var")))
-  )
-
-  # Manually add other hyper-parameters
-  hypParamSamName <- c(hypParamSamName, HYPS_OTHERS)
-
-  # Check hyper_fixed condition + add lambda + penalty factor hyper-parameters names
-  all_fixed <- check_hyper_fixed(InputCollect, dt_hyper_fixed, add_penalty_factor)
-  hypParamSamName <- attr(all_fixed, "hypParamSamName")
-
-  if (!all_fixed) {
-    # Collect media hyperparameters
-    hyper_bound_list <- list()
-    for (i in seq_along(hypParamSamName)) {
-      hyper_bound_list <- append(hyper_bound_list, hyper_in[hypParamSamName[i]])
-    }
-
-    # Add lambda hyperparameter
-    if (!"lambda" %in% names(hyper_bound_list)) {
-      hyper_bound_list$lambda <- c(0, 1)
-    }
-
-    # Add train_size hyperparameter
-    if (ts_validation) {
-      if (!"train_size" %in%
names(hyper_bound_list)) { - hyper_bound_list$train_size <- c(0.5, 0.8) - } - message(sprintf( - "Time-series validation with train_size range of %s of the data...", - paste(formatNum(100 * hyper_bound_list$train_size, pos = "%"), collapse = "-") - )) - } else { - if ("train_size" %in% names(hyper_bound_list)) { - warning("Provided train_size but ts_validation = FALSE. Time series validation inactive.") - } - hyper_bound_list$train_size <- 1 - message("Fitting time series with all available data...") - } - - # Add penalty factor hyperparameters - for_penalty <- names(select(InputCollect$dt_mod, -.data$ds, -.data$dep_var)) - penalty_names <- paste0(for_penalty, "_penalty") - if (add_penalty_factor) { - for (penalty in penalty_names) { - if (!penalty %in% names(hyper_bound_list)) { - hyper_bound_list[[penalty]] <- c(0, 1) - } - } - } - - # Get hyperparameters for Nevergrad - hyper_bound_list_updated <- hyper_bound_list[ - which(unlist(lapply(hyper_bound_list, length) == 2)) - ] - - # Get fixed hyperparameters - hyper_bound_list_fixed <- hyper_bound_list[ - which(unlist(lapply(hyper_bound_list, length) == 1)) - ] - - hyper_list_bind <- c(hyper_bound_list_updated, hyper_bound_list_fixed) - hyper_list_all <- list() - for (i in seq_along(hypParamSamName)) { - hyper_list_all[[i]] <- hyper_list_bind[[hypParamSamName[i]]] - names(hyper_list_all)[i] <- hypParamSamName[i] - } - - dt_hyper_fixed_mod <- data.frame(bind_cols(lapply( - hyper_bound_list_fixed, function(x) rep(x, cores) - ))) - } else { - hyper_bound_list_fixed <- list() - for (i in seq_along(hypParamSamName)) { - hyper_bound_list_fixed[[i]] <- dt_hyper_fixed[[hypParamSamName[i]]] - names(hyper_bound_list_fixed)[i] <- hypParamSamName[i] - } - - hyper_list_all <- hyper_bound_list_fixed - hyper_bound_list_updated <- hyper_bound_list_fixed[ - which(unlist(lapply(hyper_bound_list_fixed, length) == 2)) - ] - - dt_hyper_fixed_mod <- data.frame(matrix(hyper_bound_list_fixed, nrow = 1)) - names(dt_hyper_fixed_mod) <- names(hyper_bound_list_fixed) - } - - return(list( - hyper_list_all = hyper_list_all, - hyper_bound_list_updated = hyper_bound_list_updated, - hyper_bound_list_fixed = hyper_bound_list_fixed, - dt_hyper_fixed_mod = dt_hyper_fixed_mod, - all_fixed = all_fixed - )) -} - -init_msgs_run <- function(InputCollect, refresh, lambda_control = NULL, quiet = FALSE) { - if (!is.null(lambda_control)) { - message("Input 'lambda_control' deprecated in v3.6.0; lambda is now selected by hyperparameter optimization") - } - if (!quiet) { - message(sprintf( - "Input data has %s %ss in total: %s to %s", - nrow(InputCollect$dt_mod), - InputCollect$intervalType, - min(InputCollect$dt_mod$ds), - max(InputCollect$dt_mod$ds) - )) - depth <- ifelse( - "refreshDepth" %in% names(InputCollect), - InputCollect$refreshDepth, - ifelse("refreshCounter" %in% names(InputCollect), - InputCollect$refreshCounter, 0 - ) - ) - refresh <- as.integer(depth) > 0 - message(sprintf( - "%s model is built on rolling window of %s %s: %s to %s", - ifelse(!refresh, "Initial", paste0("Refresh #", depth)), - InputCollect$rollingWindowLength, - InputCollect$intervalType, - InputCollect$window_start, - InputCollect$window_end - )) - if (refresh) { - message(sprintf( - "Rolling window moving forward: %s %ss", - InputCollect$refresh_steps, InputCollect$intervalType - )) - } - } -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+####################################################################
+#' Robyn Modelling Function
+#'
+#' \code{robyn_run()} consumes \code{robyn_inputs()} outputs,
+#' runs \code{robyn_mmm()}, and collects all modeling results.
+#'
+#' @inheritParams robyn_allocator
+#' @inheritParams robyn_outputs
+#' @inheritParams robyn_inputs
+#' @param dt_hyper_fixed data.frame or named list. Only provide when loading
+#' old model results. It consumes hyperparameters from the saved
+#' \code{pareto_hyperparameters.csv} or JSON file to replicate a model.
+#' @param ts_validation Boolean. When set to \code{TRUE}, Robyn will split the
+#' data into train, validation, and test partitions to validate the time series.
+#' By default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be
+#' customized or set to a fixed value using the hyperparameters input. For example,
+#' if \code{train_size = 0.7}, validation size and test size will both be 0.15.
+#' When \code{ts_validation = FALSE}, nrmse_train is the objective function;
+#' when \code{ts_validation = TRUE}, nrmse_val is the objective function.
+#' @param add_penalty_factor Boolean. Add penalty factor hyperparameters to
+#' glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because
+#' this feature considerably enlarges the hyperparameter space and will likely
+#' require more iterations to converge.
+#' @param refresh Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}.
+#' @param cores Integer. Defaults to \code{parallel::detectCores() - 1} (all cores
+#' except one). Set to 1 to turn parallel computing off.
+#' @param iterations Integer. Recommended default is 2000 when using
+#' \code{nevergrad_algo = "TwoPointsDE"}.
+#' @param trials Integer. Recommended default is 5 when using
+#' \code{nevergrad_algo = "TwoPointsDE"}.
+#' @param nevergrad_algo Character. Defaults to "TwoPointsDE". Options are
+#' \code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne",
+#' "DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA",
+#' "cGA", "RandomSearch")}.
+#' @param intercept Boolean. Should the intercept(s) be fitted (default TRUE)
+#' or set to zero (FALSE)?
+#' @param intercept_sign Character. Choose one of "non_negative" (default) or
+#' "unconstrained". By default, if the intercept is negative, Robyn will drop it
+#' and refit the model. Consider changing intercept_sign to "unconstrained" when
+#' there are \code{context_vars} with large positive values.
+#' @param rssd_zero_penalty Boolean. When TRUE, the objective function
+#' DECOMP.RSSD additionally penalizes models with more zero media effects.
+#' In other words, given the same DECOMP.RSSD score, a model with 50\% zero-coef
+#' variables is penalized by DECOMP.RSSD * 1.5 (larger error), while another
+#' model with no zero-coef variables remains unpenalized (DECOMP.RSSD * 1).
+#' @param objective_weights Numeric vector. Defaults to NULL, which gives equal
+#' weights to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when
+#' calibration data is provided). When you are not calibrating, only the first 2
+#' values of \code{objective_weights} need to be defined, e.g. set \code{c(2, 1)}
+#' to give double weight to the 1st (NRMSE). This is an experimental feature;
+#' there's no research on optimal weight setting, and subjective weights might
+#' strongly bias modeling results.
+#' @param seed Integer. For reproducible results when running nevergrad.
+#' @param lambda_control Deprecated in v3.6.0.
+#' @param outputs Boolean. If set to TRUE, will run \code{robyn_run()} and
+#' \code{robyn_outputs()}, returning a list with OutputModels and
+#' OutputCollect results.
+#' @param ... Additional parameters passed to \code{robyn_outputs()}.
+#' @return List. Class: \code{robyn_models}. Contains the results of all trials
+#' and iterations modeled.
+#' @examples
+#' \dontrun{
+#' # Having InputCollect results
+#' OutputModels <- robyn_run(
+#'   InputCollect = InputCollect,
+#'   cores = 2,
+#'   iterations = 200,
+#'   trials = 1
+#' )
+#' }
+#' @export
+robyn_run <- function(InputCollect = NULL,
+                      dt_hyper_fixed = NULL,
+                      json_file = NULL,
+                      ts_validation = FALSE,
+                      add_penalty_factor = FALSE,
+                      refresh = FALSE,
+                      seed = 123L,
+                      quiet = FALSE,
+                      cores = NULL,
+                      trials = 5,
+                      iterations = 2000,
+                      rssd_zero_penalty = TRUE,
+                      objective_weights = NULL,
+                      nevergrad_algo = "TwoPointsDE",
+                      intercept = TRUE,
+                      intercept_sign = "non_negative",
+                      lambda_control = NULL,
+                      outputs = FALSE,
+                      ...) {
+  if (isTRUE(outputs)) {
+    OutputModels <- robyn_run(
+      InputCollect = InputCollect,
+      dt_hyper_fixed = dt_hyper_fixed,
+      json_file = json_file,
+      add_penalty_factor = add_penalty_factor,
+      ts_validation = ts_validation,
+      refresh = refresh,
+      seed = seed,
+      quiet = quiet,
+      cores = cores,
+      trials = trials,
+      iterations = iterations,
+      rssd_zero_penalty = rssd_zero_penalty,
+      objective_weights = objective_weights,
+      nevergrad_algo = nevergrad_algo,
+      intercept = intercept,
+      intercept_sign = intercept_sign,
+      lambda_control = lambda_control,
+      outputs = FALSE,
+      ...
+    )
+    OutputCollect <- robyn_outputs(InputCollect, OutputModels, ...)
+    return(list(
+      OutputModels = OutputModels,
+      OutputCollect = OutputCollect
+    ))
+  }
+
+  t0 <- Sys.time()
+
+  ### Use previously exported model using json_file
+  if (!is.null(json_file)) {
+    # InputCollect <- robyn_inputs(json_file = json_file, dt_input = dt_input, dt_holidays = dt_holidays)
+    if (is.null(InputCollect)) InputCollect <- robyn_inputs(json_file = json_file, ...)
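+    # Illustrative sketch only (the JSON file name below is hypothetical):
+    # recreating a previously exported model from its JSON typically looks like
+    #   InputCollect <- robyn_inputs(
+    #     json_file = "RobynModel-1_100_6.json",
+    #     dt_input = Robyn::dt_simulated_weekly,
+    #     dt_holidays = Robyn::dt_prophet_holidays
+    #   )
+    #   OutputModels <- robyn_run(InputCollect = InputCollect, json_file = "RobynModel-1_100_6.json")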
+ json <- robyn_read(json_file, step = 2, quiet = TRUE) + dt_hyper_fixed <- json$ExportedModel$hyper_values + for (i in seq_along(json$ExportedModel)) { + assign(names(json$ExportedModel)[i], json$ExportedModel[[i]]) + } + bootstrap <- select(json$ExportedModel$summary, any_of(c("variable", "boot_mean", "ci_low", "ci_up"))) + if (is.null(seed) | length(seed) == 0) seed <- 123L + dt_hyper_fixed$solID <- json$ExportedModel$select_model + } else { + bootstrap <- NULL + } + + ##################################### + #### Set local environment + + if (!"hyperparameters" %in% names(InputCollect) || is.null(InputCollect$hyperparameters)) { + stop("Must provide 'hyperparameters' in robyn_inputs()'s output first") + } + + # Check and warn on legacy inputs (using InputCollect params as robyn_run() inputs) + InputCollect <- check_legacy_input(InputCollect, cores, iterations, trials, intercept_sign, nevergrad_algo) + # Overwrite values imported from InputCollect + legacyValues <- InputCollect[LEGACY_PARAMS] + legacyValues <- legacyValues[!unlist(lapply(legacyValues, is.null))] + if (length(legacyValues) > 0) { + for (i in seq_along(InputCollect)) assign(names(InputCollect)[i], InputCollect[[i]]) + } + + # Keep in mind: https://www.jottr.org/2022/12/05/avoid-detectcores/ + max_cores <- max(1L, parallel::detectCores(), na.rm = TRUE) + if (is.null(cores)) { + cores <- max_cores - 1 # It's recommended to always leave at least one core free + } else if (cores > max_cores) { + warning(sprintf("Max possible cores in your machine is %s (your input was %s)", max_cores, cores)) + cores <- max_cores + } + if (cores == 0) cores <- 1 + + hyps_fixed <- !is.null(dt_hyper_fixed) + if (hyps_fixed) trials <- iterations <- 1 + check_run_inputs(cores, iterations, trials, intercept_sign, nevergrad_algo) + check_iteration(InputCollect$calibration_input, iterations, trials, hyps_fixed, refresh) + init_msgs_run(InputCollect, refresh, lambda_control = NULL, quiet) + objective_weights <- check_obj_weight(InputCollect$calibration_input, objective_weights, refresh) + + ##################################### + #### Prepare hyper-parameters + hyper_collect <- hyper_collector( + InputCollect, + hyper_in = InputCollect$hyperparameters, + ts_validation = ts_validation, + add_penalty_factor = add_penalty_factor, + dt_hyper_fixed = dt_hyper_fixed, + cores = cores + ) + InputCollect$hyper_updated <- hyper_collect$hyper_list_all + + ##################################### + #### Run robyn_mmm() for each trial + + OutputModels <- robyn_train( + InputCollect, hyper_collect, + cores = cores, iterations = iterations, trials = trials, + intercept_sign = intercept_sign, intercept = intercept, + nevergrad_algo = nevergrad_algo, + dt_hyper_fixed = dt_hyper_fixed, + ts_validation = ts_validation, + add_penalty_factor = add_penalty_factor, + rssd_zero_penalty = rssd_zero_penalty, + objective_weights = objective_weights, + refresh, seed, quiet + ) + + attr(OutputModels, "hyper_fixed") <- hyper_collect$all_fixed + attr(OutputModels, "bootstrap") <- bootstrap + attr(OutputModels, "refresh") <- refresh + + if (TRUE) { + OutputModels$train_timestamp <- Sys.time() + OutputModels$cores <- cores + OutputModels$iterations <- iterations + OutputModels$trials <- trials + OutputModels$intercept <- intercept + OutputModels$intercept_sign <- intercept_sign + OutputModels$nevergrad_algo <- nevergrad_algo + OutputModels$ts_validation <- ts_validation + OutputModels$add_penalty_factor <- add_penalty_factor + OutputModels$hyper_updated <- hyper_collect$hyper_list_all + 
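+    # These run settings are attached to OutputModels so downstream consumers
+    # (e.g. print.robyn_models() and robyn_outputs()) can report them without
+    # access to the original call.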
OutputModels$hyper_fixed <- hyper_collect$all_fixed + } + + # Not direct output & not all fixed hyperparameters + if (is.null(dt_hyper_fixed)) { + output <- OutputModels + } else if (!hyper_collect$all_fixed) { + # Direct output & not all fixed hyperparameters, including refresh mode + output <- robyn_outputs(InputCollect, OutputModels, refresh = refresh, ...) + } else { + if (!"clusters" %in% names(list(...))) { + # Direct output & all fixed hyperparameters, thus no cluster + output <- robyn_outputs(InputCollect, OutputModels, clusters = FALSE, ...) + } else { + output <- robyn_outputs(InputCollect, OutputModels, ...) + } + } + + # Created with assign from JSON file + if (exists("clusters")) { + if (!is.integer(get("clusters"))) { + output$clusters <- get("clusters") + } + } + + # Check convergence when more than 1 iteration + if (!hyper_collect$all_fixed) { + output[["convergence"]] <- robyn_converge(OutputModels, ...) + output[["ts_validation_plot"]] <- ts_validation(OutputModels, ...) + } else { + if ("solID" %in% names(dt_hyper_fixed)) { + output[["selectID"]] <- dt_hyper_fixed$solID + } else { + output[["selectID"]] <- OutputModels$trial1$resultCollect$resultHypParam$solID + } + if (!quiet) message("Successfully recreated model ID: ", output$selectID) + } + + # Save hyper-parameters list + output[["hyper_updated"]] <- hyper_collect$hyper_list_all + output[["seed"]] <- seed + + # Report total timing + attr(output, "runTime") <- round(difftime(Sys.time(), t0, units = "mins"), 2) + if (!quiet && iterations > 1) message(paste("Total run time:", attr(output, "runTime"), "mins")) + + class(output) <- unique(c("robyn_models", class(output))) + return(output) +} + +#' @rdname robyn_run +#' @aliases robyn_run +#' @param x \code{robyn_models()} output. +#' @export +print.robyn_models <- function(x, ...) 
{
+  is_fixed <- all(lapply(x$hyper_updated, length) == 1)
+  print(glued(
+    "
+  Total trials: {x$trials}
+  Iterations per trial: {x$iterations} {total_iters}
+  Runtime (minutes): {attr(x, 'runTime')}
+  Cores: {x$cores}
+
+  Updated Hyper-parameters{fixed}:
+  {hypers}
+
+  Nevergrad Algo: {x$nevergrad_algo}
+  Intercept: {x$intercept}
+  Intercept sign: {x$intercept_sign}
+  Time-series validation: {x$ts_validation}
+  Penalty factor: {x$add_penalty_factor}
+  Refresh: {isTRUE(attr(x, 'refresh'))}
+
+  Convergence on last quantile (iters {iters}):
+    {convergence}
+
+  ",
+    total_iters = sprintf("(%s real)", ifelse(
+      "trial1" %in% names(x), nrow(x$trial1$resultCollect$resultHypParam), 1
+    )),
+    iters = ifelse(is.null(x$convergence), 1, paste(tail(x$convergence$errors$cuts, 2), collapse = ":")),
+    fixed = ifelse(is_fixed, " (fixed)", ""),
+    convergence = if (!is_fixed) paste(x$convergence$conv_msg, collapse = "\n  ") else "Fixed hyper-parameters",
+    hypers = flatten_hyps(x$hyper_updated)
+  ))
+
+  if ("robyn_outputs" %in% class(x)) {
+    print(glued(
+      "
+Plot Folder: {x$plot_folder}
+Calibration Constraint: {x$calibration_constraint}
+Hyper-parameters fixed: {x$hyper_fixed}
+Pareto-front ({x$pareto_fronts}) All solutions ({nSols}): {paste(x$allSolutions, collapse = ', ')}
+{clusters_info}
+",
+      nSols = length(x$allSolutions),
+      clusters_info = if ("models" %in% names(x[["clusters"]])) {
+        glued(
+          "Clusters (k = {x$clusters$n_clusters}): {paste(x$clusters$models$solID, collapse = ', ')}"
+        )
+      } else {
+        NULL
+      }
+    ))
+  }
+}
+
+####################################################################
+#' Train Robyn Models
+#'
+#' \code{robyn_train()} consumes output from \code{robyn_inputs()}
+#' and runs \code{robyn_mmm()} on each trial.
+#'
+#' @inheritParams robyn_run
+#' @param hyper_collect List containing hyperparameter bounds. Defaults to
+#' \code{InputCollect$hyperparameters}.
+#' @return List. Iteration results to include in \code{robyn_run()} results.
+#' @export
+robyn_train <- function(InputCollect, hyper_collect,
+                        cores, iterations, trials,
+                        intercept_sign, intercept,
+                        nevergrad_algo,
+                        dt_hyper_fixed = NULL,
+                        ts_validation = TRUE,
+                        add_penalty_factor = FALSE,
+                        objective_weights = NULL,
+                        rssd_zero_penalty = TRUE,
+                        refresh = FALSE, seed = 123,
+                        quiet = FALSE) {
+  hyper_fixed <- hyper_collect$all_fixed
+
+  if (hyper_fixed) {
+    OutputModels <- list()
+    OutputModels[[1]] <- robyn_mmm(
+      InputCollect = InputCollect,
+      hyper_collect = hyper_collect,
+      iterations = iterations,
+      cores = cores,
+      nevergrad_algo = nevergrad_algo,
+      intercept = intercept,
+      intercept_sign = intercept_sign,
+      dt_hyper_fixed = dt_hyper_fixed,
+      ts_validation = ts_validation,
+      add_penalty_factor = add_penalty_factor,
+      rssd_zero_penalty = rssd_zero_penalty,
+      objective_weights = objective_weights,
+      seed = seed,
+      quiet = quiet
+    )
+    OutputModels[[1]]$trial <- 1
+    # Set original solID (to overwrite default 1_1_1)
+    if ("solID" %in% names(dt_hyper_fixed)) {
+      these <- c("resultHypParam", "xDecompVec", "xDecompAgg", "decompSpendDist")
+      for (tab in these) OutputModels[[1]]$resultCollect[[tab]]$solID <- dt_hyper_fixed$solID
+    }
+  } else {
+    ## Run robyn_mmm() for each trial if hyperparameters are not all fixed
+    check_init_msg(InputCollect, cores)
+    if (!quiet) {
+      message(paste(
+        ">>> Starting", trials, "trials with",
+        iterations, "iterations each",
+        ifelse(is.null(InputCollect$calibration_input), "using", "with calibration using"),
+        nevergrad_algo, "nevergrad algorithm..."
+      ))
+    }
+
+    OutputModels <- list()
+
+    for (ngt in 1:trials) { # ngt = 1
+      if (!quiet) message(paste("  Running trial", ngt, "of", trials))
+      model_output <- robyn_mmm(
+        InputCollect = InputCollect,
+        hyper_collect = hyper_collect,
+        iterations = iterations,
+        cores = cores,
+        nevergrad_algo = nevergrad_algo,
+        intercept = intercept,
+        intercept_sign = intercept_sign,
+        ts_validation = ts_validation,
+        add_penalty_factor = add_penalty_factor,
+        rssd_zero_penalty = rssd_zero_penalty,
+        objective_weights = objective_weights,
+        refresh = refresh,
+        trial = ngt,
+        seed = seed + ngt,
+        quiet = quiet
+      )
+      check_coef0 <- any(model_output$resultCollect$decompSpendDist$decomp.rssd == Inf)
+      if (check_coef0) {
+        num_coef0_mod <- filter(model_output$resultCollect$decompSpendDist, is.infinite(.data$decomp.rssd)) %>%
+          distinct(.data$iterNG, .data$iterPar) %>%
+          nrow()
+        num_coef0_mod <- ifelse(num_coef0_mod > iterations, iterations, num_coef0_mod)
+        if (!quiet) {
+          message(paste(
+            "This trial contains", num_coef0_mod, "iterations with all media coefficients = 0.",
+            "Please reconsider your media variable choice if the Pareto choices are unreasonable.",
+            "\n  Recommendations:",
+            "\n1. Increase hyperparameter ranges for 0-coef channels to give Robyn more freedom",
+            "\n2. Split media into sub-channels, and/or aggregate similar channels, and/or introduce other media",
+            "\n3. Increase trials to get more samples"
+          ))
+        }
+      }
+      model_output["trial"] <- ngt
+      OutputModels[[ngt]] <- model_output
+    }
+  }
+  names(OutputModels) <- paste0("trial", seq_along(OutputModels))
+  return(OutputModels)
+}
+
+
+####################################################################
+#' Core MMM Function
+#'
+#' \code{robyn_mmm()} activates Nevergrad to generate samples of
+#' hyperparameters, conducts media transformations within each loop, fits the
+#' ridge regression, optionally calibrates the model, decomposes responses,
+#' and collects the results. It's an inner function within \code{robyn_run()}.
+#'
+#' @inheritParams robyn_run
+#' @inheritParams robyn_allocator
+#' @param hyper_collect List containing hyperparameter bounds. Defaults to
+#' \code{InputCollect$hyperparameters}.
+#' @param iterations Integer. Number of iterations to run.
+#' @param trial Integer. Which trial is being run; used to ID each model.
+#' @return List. MMM results with hyperparameter values.
+#' @export
+robyn_mmm <- function(InputCollect,
+                      hyper_collect,
+                      iterations,
+                      cores,
+                      nevergrad_algo,
+                      intercept = TRUE,
+                      intercept_sign,
+                      ts_validation = TRUE,
+                      add_penalty_factor = FALSE,
+                      objective_weights = NULL,
+                      dt_hyper_fixed = NULL,
+                      # lambda_fixed = NULL,
+                      rssd_zero_penalty = TRUE,
+                      refresh = FALSE,
+                      trial = 1L,
+                      seed = 123L,
+                      quiet = FALSE, ...)
{ + if (iterations > 1) { + if (reticulate::py_module_available("nevergrad")) { + ng <- reticulate::import("nevergrad", delay_load = TRUE) + if (is.integer(seed)) { + np <- reticulate::import("numpy", delay_load = FALSE) + np$random$seed(seed) + } + } else { + stop( + "You must have nevergrad python library installed.\nPlease check our install demo: ", + "https://github.com/facebookexperimental/Robyn/blob/main/demo/install_nevergrad.R" + ) + } + } + + ################################################ + #### Collect hyperparameters + + if (TRUE) { + hypParamSamName <- names(hyper_collect$hyper_list_all) + # Optimization hyper-parameters + hyper_bound_list_updated <- hyper_collect$hyper_bound_list_updated + hyper_bound_list_updated_name <- names(hyper_bound_list_updated) + hyper_count <- length(hyper_bound_list_updated_name) + # Fixed hyper-parameters + hyper_bound_list_fixed <- hyper_collect$hyper_bound_list_fixed + hyper_bound_list_fixed_name <- names(hyper_bound_list_fixed) + hyper_count_fixed <- length(hyper_bound_list_fixed_name) + dt_hyper_fixed_mod <- hyper_collect$dt_hyper_fixed_mod + hyper_fixed <- hyper_collect$all_fixed + } + + ################################################ + #### Setup environment + + if (is.null(InputCollect$dt_mod)) { + stop("Run InputCollect$dt_mod <- robyn_engineering() first to get the dt_mod") + } + + ## Get environment for parallel backend + if (TRUE) { + dt_mod <- InputCollect$dt_mod + xDecompAggPrev <- InputCollect$xDecompAggPrev + rollingWindowStartWhich <- InputCollect$rollingWindowStartWhich + rollingWindowEndWhich <- InputCollect$rollingWindowEndWhich + refreshAddedStart <- InputCollect$refreshAddedStart + dt_modRollWind <- InputCollect$dt_modRollWind + refresh_steps <- InputCollect$refresh_steps + rollingWindowLength <- InputCollect$rollingWindowLength + paid_media_spends <- InputCollect$paid_media_spends + organic_vars <- InputCollect$organic_vars + context_vars <- InputCollect$context_vars + prophet_vars <- InputCollect$prophet_vars + adstock <- InputCollect$adstock + context_signs <- InputCollect$context_signs + paid_media_signs <- InputCollect$paid_media_signs + prophet_signs <- InputCollect$prophet_signs + organic_signs <- InputCollect$organic_signs + calibration_input <- InputCollect$calibration_input + optimizer_name <- nevergrad_algo + i <- NULL # For parallel iterations (globalVar) + } + + ################################################ + #### Get spend share + + dt_inputTrain <- InputCollect$dt_input[rollingWindowStartWhich:rollingWindowEndWhich, ] + temp <- select(dt_inputTrain, all_of(paid_media_spends)) + dt_spendShare <- data.frame( + rn = paid_media_spends, + total_spend = unlist(summarise_all(temp, sum)), + # mean_spend = unlist(summarise_all(temp, function(x) { + # ifelse(is.na(mean(x[x > 0])), 0, mean(x[x > 0])) + # })) + mean_spend = unlist(summarise_all(temp, mean)) + ) %>% + mutate(spend_share = .data$total_spend / sum(.data$total_spend)) + # When not refreshing, dt_spendShareRF = dt_spendShare + refreshAddedStartWhich <- which(dt_modRollWind$ds == refreshAddedStart) + temp <- select(dt_inputTrain, all_of(paid_media_spends)) %>% + slice(refreshAddedStartWhich:rollingWindowLength) + dt_spendShareRF <- data.frame( + rn = paid_media_spends, + total_spend = unlist(summarise_all(temp, sum)), + # mean_spend = unlist(summarise_all(temp, function(x) { + # ifelse(is.na(mean(x[x > 0])), 0, mean(x[x > 0])) + # })) + mean_spend = unlist(summarise_all(temp, mean)) + ) %>% + mutate(spend_share = .data$total_spend / sum(.data$total_spend)) + 
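+  # Worked example with assumed numbers (illustrative only): if two channels
+  # spend 300 and 700 over the window, spend_share = c(0.3, 0.7). DECOMP.RSSD
+  # below measures the distance between these spend shares and the modeled
+  # effect shares.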
# Join both dataframes into a single one + dt_spendShare <- left_join(dt_spendShare, dt_spendShareRF, "rn", suffix = c("", "_refresh")) + + ################################################ + #### Get lambda + lambda_min_ratio <- 0.0001 # default value from glmnet + lambdas <- lambda_seq( + x = select(dt_mod, -.data$ds, -.data$dep_var), + y = dt_mod$dep_var, + seq_len = 100, lambda_min_ratio + ) + lambda_max <- max(lambdas) * 0.1 + lambda_min <- lambda_max * lambda_min_ratio + + ################################################ + #### Start Nevergrad loop + t0 <- Sys.time() + + ## Set iterations + # hyper_fixed <- hyper_count == 0 + if (hyper_fixed == FALSE) { + iterTotal <- iterations + iterPar <- cores + iterNG <- ceiling(iterations / cores) # Sometimes the progress bar may not get to 100% + } else { + iterTotal <- iterPar <- iterNG <- 1 + } + + ## Start Nevergrad optimizer + if (!hyper_fixed) { + my_tuple <- tuple(hyper_count) + instrumentation <- ng$p$Array(shape = my_tuple, lower = 0, upper = 1) + optimizer <- ng$optimizers$registry[optimizer_name](instrumentation, budget = iterTotal, num_workers = cores) + + # Set multi-objective dimensions for objective functions (errors) + if (is.null(calibration_input)) { + optimizer$tell(ng$p$MultiobjectiveReference(), tuple(1, 1)) + if (is.null(objective_weights)) { + objective_weights <- tuple(1, 1) + } else { + objective_weights <- tuple(objective_weights[1], objective_weights[2]) + } + optimizer$set_objective_weights(objective_weights) + } else { + optimizer$tell(ng$p$MultiobjectiveReference(), tuple(1, 1, 1)) + if (is.null(objective_weights)) { + objective_weights <- tuple(1, 1, 1) + } else { + objective_weights <- tuple(objective_weights[1], objective_weights[2], objective_weights[3]) + } + optimizer$set_objective_weights(objective_weights) + } + } + + ## Prepare loop + resultCollectNG <- list() + cnt <- 0 + if (!hyper_fixed && !quiet) pb <- txtProgressBar(max = iterTotal, style = 3) + + sysTimeDopar <- tryCatch( + { + system.time({ + for (lng in 1:iterNG) { # lng = 1 + nevergrad_hp <- list() + nevergrad_hp_val <- list() + hypParamSamList <- list() + hypParamSamNG <- NULL + + if (hyper_fixed == FALSE) { + # Setting initial seeds (co = cores) + for (co in 1:iterPar) { # co = 1 + ## Get hyperparameter sample with ask (random) + nevergrad_hp[[co]] <- optimizer$ask() + nevergrad_hp_val[[co]] <- nevergrad_hp[[co]]$value + ## Scale sample to given bounds using uniform distribution + for (hypNameLoop in hyper_bound_list_updated_name) { + index <- which(hypNameLoop == hyper_bound_list_updated_name) + channelBound <- unlist(hyper_bound_list_updated[hypNameLoop]) + hyppar_value <- signif(nevergrad_hp_val[[co]][index], 10) + if (length(channelBound) > 1) { + hypParamSamNG[hypNameLoop] <- qunif(hyppar_value, min(channelBound), max(channelBound)) + } else { + hypParamSamNG[hypNameLoop] <- hyppar_value + } + } + hypParamSamList[[co]] <- data.frame(t(hypParamSamNG)) + } + hypParamSamNG <- bind_rows(hypParamSamList) + names(hypParamSamNG) <- hyper_bound_list_updated_name + ## Add fixed hyperparameters + if (hyper_count_fixed != 0) { + hypParamSamNG <- cbind(hypParamSamNG, dt_hyper_fixed_mod) %>% + select(all_of(hypParamSamName)) + } + } else { + hypParamSamNG <- select(dt_hyper_fixed_mod, all_of(hypParamSamName)) + } + + # Must remain within this function for it to work + robyn_iterations <- function(i, ...) 
{ # i=1
+        t1 <- Sys.time()
+        #### Get hyperparameter sample
+        hypParamSam <- hypParamSamNG[i, ]
+        adstock <- check_adstock(adstock)
+
+        #### Transform media for model fitting
+        temp <- run_transformations(InputCollect, hypParamSam, adstock)
+        dt_modSaturated <- temp$dt_modSaturated
+        dt_saturatedImmediate <- temp$dt_saturatedImmediate
+        dt_saturatedCarryover <- temp$dt_saturatedCarryover
+
+        #####################################
+        #### Split train & test and prepare data for modelling
+
+        dt_window <- dt_modSaturated
+
+        ## Build contrast matrix: glmnet does not handle categorical variables
+        ## directly, so one-hot encode them
+        y_window <- dt_window$dep_var
+        x_window <- as.matrix(lares::ohse(select(dt_window, -.data$dep_var)))
+        y_train <- y_val <- y_test <- y_window
+        x_train <- x_val <- x_test <- x_window
+
+        ## Split train, test, and validation sets
+        train_size <- hypParamSam[, "train_size"][[1]]
+        val_size <- test_size <- (1 - train_size) / 2
+        if (train_size < 1) {
+          train_size_index <- floor(quantile(seq(nrow(dt_window)), train_size))
+          val_size_index <- train_size_index + floor(val_size * nrow(dt_window))
+          y_train <- y_window[1:train_size_index]
+          y_val <- y_window[(train_size_index + 1):val_size_index]
+          y_test <- y_window[(val_size_index + 1):length(y_window)]
+          x_train <- x_window[1:train_size_index, ]
+          x_val <- x_window[(train_size_index + 1):val_size_index, ]
+          x_test <- x_window[(val_size_index + 1):length(y_window), ]
+        } else {
+          y_val <- y_test <- x_val <- x_test <- NULL
+        }
+
+        ## Define and set sign control
+        dt_sign <- select(dt_window, -.data$dep_var)
+        x_sign <- c(prophet_signs, context_signs, paid_media_signs, organic_signs)
+        names(x_sign) <- c(prophet_vars, context_vars, paid_media_spends, organic_vars)
+        check_factor <- unlist(lapply(dt_sign, is.factor))
+        lower.limits <- rep(0, length(prophet_signs))
+        upper.limits <- rep(1, length(prophet_signs))
+        trend_loc <- which(colnames(x_train) == "trend")
+        if (length(trend_loc) > 0 && sum(x_train[, trend_loc]) < 0) {
+          trend_loc <- which(prophet_vars == "trend")
+          lower.limits[trend_loc] <- -1
+          upper.limits[trend_loc] <- 0
+        }
+        for (s in (length(prophet_signs) + 1):length(x_sign)) {
+          if (check_factor[s] == TRUE) {
+            # dt_sign is a data.frame, so extract the factor column directly
+            level.n <- length(levels(dt_sign[[s]]))
+            if (level.n <= 1) {
+              stop("All factor variables must have more than 1 level")
+            }
+            lower_vec <- if (x_sign[s] == "positive") {
+              rep(0, level.n - 1)
+            } else {
+              rep(-Inf, level.n - 1)
+            }
+            upper_vec <- if (x_sign[s] == "negative") {
+              rep(0, level.n - 1)
+            } else {
+              rep(Inf, level.n - 1)
+            }
+            lower.limits <- c(lower.limits, lower_vec)
+            upper.limits <- c(upper.limits, upper_vec)
+          } else {
+            lower.limits <- c(lower.limits, ifelse(x_sign[s] == "positive", 0, -Inf))
+            upper.limits <- c(upper.limits, ifelse(x_sign[s] == "negative", 0, Inf))
+          }
+        }
+
+        #####################################
+        #### Fit ridge regression with nevergrad's lambda
+        # lambdas <- lambda_seq(x_train, y_train, seq_len = 100, lambda_min_ratio = 0.0001)
+        # lambda_max <- max(lambdas)
+        lambda_hp <- unlist(hypParamSamNG$lambda[i])
+        if (hyper_fixed == FALSE) {
+          lambda_scaled <- lambda_min + (lambda_max - lambda_min) * lambda_hp
+        } else {
+          lambda_scaled <- lambda_hp
+        }
+
+        if (add_penalty_factor) {
+          penalty.factor <- unlist(hypParamSamNG[i, grepl("_penalty", names(hypParamSamNG))])
+        } else {
+          penalty.factor <- rep(1, ncol(x_train))
+        }
+
+        #####################################
+        ## NRMSE: Model's fit error
+
+        ## If no lift calibration, refit using best lambda
+        mod_out <-
model_refit( + x_train, y_train, + x_val, y_val, + x_test, y_test, + lambda = lambda_scaled, + lower.limits = lower.limits, + upper.limits = upper.limits, + intercept = intercept, + intercept_sign = intercept_sign, + penalty.factor = penalty.factor, + ... + ) + decompCollect <- model_decomp( + coefs = mod_out$coefs, + y_pred = mod_out$y_pred, + dt_modSaturated = dt_modSaturated, + dt_saturatedImmediate = dt_saturatedImmediate, + dt_saturatedCarryover = dt_saturatedCarryover, + dt_modRollWind = dt_modRollWind, + refreshAddedStart = refreshAddedStart + ) + nrmse <- ifelse(ts_validation, mod_out$nrmse_val, mod_out$nrmse_train) + mape <- 0 + df.int <- mod_out$df.int + + ##################################### + #### MAPE: Calibration error + if (!is.null(calibration_input)) { + liftCollect <- robyn_calibrate( + calibration_input = calibration_input, + df_raw = dt_mod, + hypParamSam = hypParamSam, + wind_start = rollingWindowStartWhich, + wind_end = rollingWindowEndWhich, + dayInterval = InputCollect$dayInterval, + adstock = adstock, + xDecompVec = decompCollect$xDecompVec, + coefs = decompCollect$coefsOutCat + ) + mape <- mean(liftCollect$mape_lift, na.rm = TRUE) + } + + ##################################### + #### DECOMP.RSSD: Business error + # Sum of squared distance between decomp share and spend share to be minimized + dt_decompSpendDist <- decompCollect$xDecompAgg %>% + filter(.data$rn %in% paid_media_spends) %>% + select( + .data$rn, .data$xDecompAgg, .data$xDecompPerc, .data$xDecompMeanNon0Perc, + .data$xDecompMeanNon0, .data$xDecompPercRF, .data$xDecompMeanNon0PercRF, + .data$xDecompMeanNon0RF + ) %>% + left_join( + select( + dt_spendShare, + .data$rn, .data$spend_share, .data$spend_share_refresh, + .data$mean_spend, .data$total_spend + ), + by = "rn" + ) %>% + mutate( + effect_share = .data$xDecompPerc / sum(.data$xDecompPerc), + effect_share_refresh = .data$xDecompPercRF / sum(.data$xDecompPercRF) + ) + dt_decompSpendDist <- left_join( + filter(decompCollect$xDecompAgg, .data$rn %in% paid_media_spends), + select(dt_decompSpendDist, .data$rn, contains("_spend"), contains("_share")), + by = "rn" + ) + if (!refresh) { + decomp.rssd <- sqrt(sum((dt_decompSpendDist$effect_share - dt_decompSpendDist$spend_share)^2)) + # Penalty for models with more 0-coefficients + if (rssd_zero_penalty) { + is_0eff <- round(dt_decompSpendDist$effect_share, 4) == 0 + share_0eff <- sum(is_0eff) / length(dt_decompSpendDist$effect_share) + decomp.rssd <- decomp.rssd * (1 + share_0eff) + } + } else { + dt_decompRF <- select(decompCollect$xDecompAgg, .data$rn, decomp_perc = .data$xDecompPerc) %>% + left_join(select(xDecompAggPrev, .data$rn, decomp_perc_prev = .data$xDecompPerc), + by = "rn" + ) + decomp.rssd.media <- dt_decompRF %>% + filter(.data$rn %in% paid_media_spends) %>% + summarise(rssd.media = sqrt(mean((.data$decomp_perc - .data$decomp_perc_prev)^2))) %>% + pull(.data$rssd.media) + decomp.rssd.nonmedia <- dt_decompRF %>% + filter(!.data$rn %in% paid_media_spends) %>% + summarise(rssd.nonmedia = sqrt(mean((.data$decomp_perc - .data$decomp_perc_prev)^2))) %>% + pull(.data$rssd.nonmedia) + decomp.rssd <- decomp.rssd.media + decomp.rssd.nonmedia / + (1 - refresh_steps / rollingWindowLength) + } + # When all media in this iteration have 0 coefficients + if (is.nan(decomp.rssd)) { + decomp.rssd <- Inf + dt_decompSpendDist$effect_share <- 0 + } + + ##################################### + #### Collect Multi-Objective Errors and Iteration Results + resultCollect <- list() + + # Auxiliary dynamic vector + common 
<- data.frame( + rsq_train = mod_out$rsq_train, + rsq_val = mod_out$rsq_val, + rsq_test = mod_out$rsq_test, + nrmse_train = mod_out$nrmse_train, + nrmse_val = mod_out$nrmse_val, + nrmse_test = mod_out$nrmse_test, + nrmse = nrmse, + decomp.rssd = decomp.rssd, + mape = mape, + lambda = lambda_scaled, + lambda_hp = lambda_hp, + lambda_max = lambda_max, + lambda_min_ratio = lambda_min_ratio, + solID = paste(trial, lng, i, sep = "_"), + trial = trial, + iterNG = lng, + iterPar = i + ) + + total_common <- ncol(common) + split_common <- which(colnames(common) == "lambda_min_ratio") + + resultCollect[["resultHypParam"]] <- as_tibble(hypParamSam) %>% + select(-.data$lambda) %>% + bind_cols(common[, 1:split_common]) %>% + mutate( + pos = prod(decompCollect$xDecompAgg$pos), + Elapsed = as.numeric(difftime(Sys.time(), t1, units = "secs")), + ElapsedAccum = as.numeric(difftime(Sys.time(), t0, units = "secs")) + ) %>% + bind_cols(common[, (split_common + 1):total_common]) %>% + dplyr::mutate_all(unlist) + + resultCollect[["xDecompAgg"]] <- decompCollect$xDecompAgg %>% + mutate(train_size = train_size) %>% + bind_cols(common) + + if (!is.null(calibration_input)) { + resultCollect[["liftCalibration"]] <- liftCollect %>% + bind_cols(common) + } + + resultCollect[["decompSpendDist"]] <- dt_decompSpendDist %>% + bind_cols(common) + + resultCollect <- append(resultCollect, as.list(common)) + return(resultCollect) + } + + ########### Parallel start + nrmse.collect <- NULL + decomp.rssd.collect <- NULL + best_mape <- Inf + if (cores == 1) { + doparCollect <- lapply(1:iterPar, robyn_iterations) + } else { + # Create cluster to minimize overhead for parallel back-end registering + if (check_parallel() && !hyper_fixed) { + registerDoParallel(cores) + } else { + registerDoSEQ() + } + suppressPackageStartupMessages( + doparCollect <- foreach(i = 1:iterPar, .options.RNG = seed) %dorng% robyn_iterations(i) + ) + } + + nrmse.collect <- unlist(lapply(doparCollect, function(x) x$nrmse)) + decomp.rssd.collect <- unlist(lapply(doparCollect, function(x) x$decomp.rssd)) + mape.lift.collect <- unlist(lapply(doparCollect, function(x) x$mape)) + + ##################################### + #### Nevergrad tells objectives + + if (!hyper_fixed) { + if (is.null(calibration_input)) { + for (co in 1:iterPar) { + optimizer$tell(nevergrad_hp[[co]], tuple(nrmse.collect[co], decomp.rssd.collect[co])) + } + } else { + for (co in 1:iterPar) { + optimizer$tell(nevergrad_hp[[co]], tuple(nrmse.collect[co], decomp.rssd.collect[co], mape.lift.collect[co])) + } + } + } + + resultCollectNG[[lng]] <- doparCollect + if (!quiet) { + cnt <- cnt + iterPar + if (!hyper_fixed) setTxtProgressBar(pb, cnt) + } + } ## end NG loop + }) # end system.time + }, + error = function(err) { + if (length(resultCollectNG) > 1) { + msg <- "Error while running robyn_mmm(); providing PARTIAL results" + warning(msg) + message(paste(msg, err, sep = "\n")) + sysTimeDopar <- rep(Sys.time() - t0, 3) + } else { + stop(err) + } + } + ) + + # stop cluster to avoid memory leaks + if (cores > 1) { + stopImplicitCluster() + registerDoSEQ() + getDoParWorkers() + } + + if (!hyper_fixed) { + cat("\r", paste("\n Finished in", round(sysTimeDopar[3] / 60, 2), "mins")) + flush.console() + close(pb) + } + + ##################################### + #### Final result collect + + resultCollect <- list() + + resultCollect[["resultHypParam"]] <- as_tibble(bind_rows( + lapply(resultCollectNG, function(x) { + bind_rows(lapply(x, function(y) y$resultHypParam)) + }) + )) + + # 
resultCollect[["xDecompVec"]] <- as_tibble(bind_rows( + # lapply(resultCollectNG, function(x) { + # bind_rows(lapply(x, function(y) y$xDecompVec)) + # }) + # )) + + resultCollect[["xDecompAgg"]] <- as_tibble(bind_rows( + lapply(resultCollectNG, function(x) { + bind_rows(lapply(x, function(y) y$xDecompAgg)) + }) + )) + + if (!is.null(calibration_input)) { + resultCollect[["liftCalibration"]] <- as_tibble(bind_rows( + lapply(resultCollectNG, function(x) { + bind_rows(lapply(x, function(y) y$liftCalibration)) + }) + ) %>% + arrange(.data$mape, .data$liftMedia, .data$liftStart)) + } + + resultCollect[["decompSpendDist"]] <- as_tibble(bind_rows( + lapply(resultCollectNG, function(x) { + bind_rows(lapply(x, function(y) y$decompSpendDist)) + }) + )) + + resultCollect$iter <- length(resultCollect$mape) + resultCollect$elapsed.min <- sysTimeDopar[3] / 60 + + # Adjust accumulated time + resultCollect$resultHypParam <- resultCollect$resultHypParam %>% + mutate(ElapsedAccum = .data$ElapsedAccum - min(.data$ElapsedAccum) + + .data$Elapsed[which.min(.data$ElapsedAccum)]) + + return(list( + resultCollect = resultCollect, + hyperBoundNG = hyper_bound_list_updated, + hyperBoundFixed = hyper_bound_list_fixed + )) +} + +model_decomp <- function(coefs, y_pred, + dt_modSaturated, dt_saturatedImmediate, + dt_saturatedCarryover, dt_modRollWind, + refreshAddedStart) { + ## Input for decomp + y <- dt_modSaturated$dep_var + # x <- data.frame(x) + + x <- select(dt_modSaturated, -.data$dep_var) + intercept <- coefs[1] + x_name <- names(x) + x_factor <- x_name[sapply(x, is.factor)] + + ## Decomp x + xDecomp <- data.frame(mapply(function(regressor, coeff) { + regressor * coeff + }, regressor = x, coeff = coefs[-1])) + xDecomp <- cbind(data.frame(intercept = rep(intercept, nrow(xDecomp))), xDecomp) + xDecompOut <- cbind(data.frame(ds = dt_modRollWind$ds, y = y, y_pred = y_pred), xDecomp) + + ## Decomp immediate & carryover response + sel_coef <- c(rownames(coefs), names(coefs)) %in% names(dt_saturatedImmediate) + coefs_media <- coefs[sel_coef] + names(coefs_media) <- rownames(coefs)[sel_coef] + mediaDecompImmediate <- data.frame(mapply(function(regressor, coeff) { + regressor * coeff + }, regressor = dt_saturatedImmediate, coeff = coefs_media)) + mediaDecompCarryover <- data.frame(mapply(function(regressor, coeff) { + regressor * coeff + }, regressor = dt_saturatedCarryover, coeff = coefs_media)) + + ## Output decomp + y_hat <- rowSums(xDecomp, na.rm = TRUE) + y_hat.scaled <- rowSums(abs(xDecomp), na.rm = TRUE) + xDecompOutPerc.scaled <- abs(xDecomp) / y_hat.scaled + xDecompOut.scaled <- y_hat * xDecompOutPerc.scaled + + temp <- select(xDecompOut, .data$intercept, all_of(x_name)) + xDecompOutAgg <- sapply(temp, function(x) sum(x)) + xDecompOutAggPerc <- xDecompOutAgg / sum(y_hat) + xDecompOutAggMeanNon0 <- unlist(lapply(temp, function(x) ifelse(is.na(mean(x[x > 0])), 0, mean(x[x != 0])))) + xDecompOutAggMeanNon0[is.nan(xDecompOutAggMeanNon0)] <- 0 + xDecompOutAggMeanNon0Perc <- xDecompOutAggMeanNon0 / sum(xDecompOutAggMeanNon0) + + refreshAddedStartWhich <- which(xDecompOut$ds == refreshAddedStart) + refreshAddedEnd <- max(xDecompOut$ds) + refreshAddedEndWhich <- which(xDecompOut$ds == refreshAddedEnd) + + temp <- select(xDecompOut, .data$intercept, all_of(x_name)) %>% + slice(refreshAddedStartWhich:refreshAddedEndWhich) + xDecompOutAggRF <- unlist(lapply(temp, function(x) sum(x))) + y_hatRF <- y_hat[refreshAddedStartWhich:refreshAddedEndWhich] + xDecompOutAggPercRF <- xDecompOutAggRF / sum(y_hatRF) + 
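+  # Note (refresh behavior): when not refreshing, refreshAddedStart equals the
+  # window start, so these *RF aggregates cover the full modeling window; in
+  # refresh mode they summarize only the newly added periods.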
xDecompOutAggMeanNon0RF <- unlist(lapply(temp, function(x) ifelse(is.na(mean(x[x > 0])), 0, mean(x[x != 0])))) + xDecompOutAggMeanNon0RF[is.nan(xDecompOutAggMeanNon0RF)] <- 0 + xDecompOutAggMeanNon0PercRF <- xDecompOutAggMeanNon0RF / sum(xDecompOutAggMeanNon0RF) + + coefsOutCat <- coefsOut <- data.frame(rn = c(rownames(coefs), names(coefs)), coefs) + if (length(x_factor) > 0) { + coefsOut$rn <- sapply(x_factor, function(x) str_replace(coefsOut$rn, paste0(x, ".*"), x)) + } + rn_order <- names(xDecompOutAgg) + rn_order[rn_order == "intercept"] <- "(Intercept)" + coefsOut <- coefsOut %>% + group_by(.data$rn) %>% + rename("coef" = 2) %>% + summarise(coef = mean(.data$coef)) %>% + arrange(match(.data$rn, rn_order)) + + decompOutAgg <- as_tibble(cbind(coefsOut, data.frame( + xDecompAgg = xDecompOutAgg, + xDecompPerc = xDecompOutAggPerc, + xDecompMeanNon0 = xDecompOutAggMeanNon0, + xDecompMeanNon0Perc = xDecompOutAggMeanNon0Perc, + xDecompAggRF = xDecompOutAggRF, + xDecompPercRF = xDecompOutAggPercRF, + xDecompMeanNon0RF = xDecompOutAggMeanNon0RF, + xDecompMeanNon0PercRF = xDecompOutAggMeanNon0PercRF, + pos = xDecompOutAgg >= 0 + ))) + + decompCollect <- list( + xDecompVec = xDecompOut, xDecompVec.scaled = xDecompOut.scaled, + xDecompAgg = decompOutAgg, coefsOutCat = coefsOutCat, + mediaDecompImmediate = mutate(mediaDecompImmediate, ds = xDecompOut$ds, y = xDecompOut$y), + mediaDecompCarryover = mutate(mediaDecompCarryover, ds = xDecompOut$ds, y = xDecompOut$y) + ) + return(decompCollect) +} + +model_refit <- function(x_train, y_train, x_val, y_val, x_test, y_test, + lambda, lower.limits, upper.limits, + intercept = TRUE, + intercept_sign = "non_negative", + penalty.factor = rep(1, ncol(y_train)), + ...) { + mod <- glmnet( + x_train, + y_train, + family = "gaussian", + alpha = 0, # 0 for ridge regression + lambda = lambda, + lower.limits = lower.limits, + upper.limits = upper.limits, + type.measure = "mse", + penalty.factor = penalty.factor, + intercept = intercept, + ... + ) # coef(mod) + + df.int <- 1 + + ## Drop intercept if negative and intercept_sign == "non_negative" + if (intercept_sign == "non_negative" && coef(mod)[1] < 0) { + mod <- glmnet( + x_train, + y_train, + family = "gaussian", + alpha = 0, # 0 for ridge regression + lambda = lambda, + lower.limits = lower.limits, + upper.limits = upper.limits, + penalty.factor = penalty.factor, + intercept = FALSE, + ... 
+    ) # coef(mod)
+    df.int <- 0
+  } # plot(mod); print(mod)
+
+  # Calculate all Adjusted R2
+  y_train_pred <- as.vector(predict(mod, s = lambda, newx = x_train))
+  rsq_train <- get_rsq(true = y_train, predicted = y_train_pred, p = ncol(x_train), df.int = df.int)
+  if (!is.null(x_val)) {
+    y_val_pred <- as.vector(predict(mod, s = lambda, newx = x_val))
+    rsq_val <- get_rsq(true = y_val, predicted = y_val_pred, p = ncol(x_val), df.int = df.int, n_train = length(y_train))
+    y_test_pred <- as.vector(predict(mod, s = lambda, newx = x_test))
+    rsq_test <- get_rsq(true = y_test, predicted = y_test_pred, p = ncol(x_test), df.int = df.int, n_train = length(y_train))
+    y_pred <- c(y_train_pred, y_val_pred, y_test_pred)
+  } else {
+    rsq_val <- rsq_test <- NA
+    y_pred <- y_train_pred
+  }
+
+  # Calculate all NRMSE
+  nrmse_train <- sqrt(mean((y_train - y_train_pred)^2)) / (max(y_train) - min(y_train))
+  if (!is.null(x_val)) {
+    nrmse_val <- sqrt(mean((y_val - y_val_pred)^2)) / (max(y_val) - min(y_val))
+    nrmse_test <- sqrt(mean((y_test - y_test_pred)^2)) / (max(y_test) - min(y_test))
+  } else {
+    nrmse_val <- nrmse_test <- y_val_pred <- y_test_pred <- NA
+  }
+
+  mod_out <- list(
+    rsq_train = rsq_train,
+    rsq_val = rsq_val,
+    rsq_test = rsq_test,
+    nrmse_train = nrmse_train,
+    nrmse_val = nrmse_val,
+    nrmse_test = nrmse_test,
+    coefs = as.matrix(coef(mod)),
+    y_train_pred = y_train_pred,
+    y_val_pred = y_val_pred,
+    y_test_pred = y_test_pred,
+    y_pred = y_pred,
+    mod = mod,
+    df.int = df.int
+  )
+
+  return(mod_out)
+}
+
+# x = x_train matrix
+# y = y_train (dep_var) vector
+lambda_seq <- function(x, y, seq_len = 100, lambda_min_ratio = 0.0001) {
+  mysd <- function(y) sqrt(sum((y - mean(y))^2) / length(y))
+  sx <- scale(x, scale = apply(x, 2, mysd))
+  check_nan <- apply(sx, 2, function(sxj) all(is.nan(sxj)))
+  sx <- mapply(function(sxj, v) {
+    return(if (v) rep(0, length(sxj)) else sxj)
+  }, sxj = as.data.frame(sx), v = check_nan)
+  sx <- as.matrix(sx, ncol = ncol(x), nrow = nrow(x))
+  # sy <- as.vector(scale(y, scale=mysd(y)))
+  sy <- y
+  # 0.001 is the default smallest alpha value in glmnet for ridge (alpha = 0)
+  lambda_max <- max(abs(colSums(sx * sy))) / (0.001 * nrow(x))
+  lambda_max_log <- log(lambda_max)
+  log_step <- (log(lambda_max) - log(lambda_max * lambda_min_ratio)) / (seq_len - 1)
+  log_seq <- seq(log(lambda_max), log(lambda_max * lambda_min_ratio), length.out = seq_len)
+  lambdas <- exp(log_seq)
+  return(lambdas)
+}
+
+hyper_collector <- function(InputCollect, hyper_in, ts_validation, add_penalty_factor, dt_hyper_fixed = NULL, cores = 1) {
+  # Fetch hyper-parameters based on media
+  hypParamSamName <- hyper_names(
+    adstock = InputCollect$adstock,
+    all_media = InputCollect$all_media,
+    all_vars = names(select(InputCollect$dt_mod, -c("ds", "dep_var")))
+  )
+
+  # Manually add other hyper-parameters
+  hypParamSamName <- c(hypParamSamName, HYPS_OTHERS)
+
+  # Check hyper_fixed condition + add lambda + penalty factor hyper-parameters names
+  all_fixed <- check_hyper_fixed(InputCollect, dt_hyper_fixed, add_penalty_factor)
+  hypParamSamName <- attr(all_fixed, "hypParamSamName")
+
+  if (!all_fixed) {
+    # Collect media hyperparameters
+    hyper_bound_list <- list()
+    for (i in seq_along(hypParamSamName)) {
+      hyper_bound_list <- append(hyper_bound_list, hyper_in[hypParamSamName[i]])
+    }
+
+    # Add lambda hyperparameter
+    if (!"lambda" %in% names(hyper_bound_list)) {
+      hyper_bound_list$lambda <- c(0, 1)
+    }
+
+    # Add train_size hyperparameter
+    if (ts_validation) {
+      if (!"train_size" %in%
names(hyper_bound_list)) { + hyper_bound_list$train_size <- c(0.5, 0.8) + } + message(sprintf( + "Time-series validation with train_size range of %s of the data...", + paste(formatNum(100 * hyper_bound_list$train_size, pos = "%"), collapse = "-") + )) + } else { + if ("train_size" %in% names(hyper_bound_list)) { + warning("Provided train_size but ts_validation = FALSE. Time series validation inactive.") + } + hyper_bound_list$train_size <- 1 + message("Fitting time series with all available data...") + } + + # Add penalty factor hyperparameters + for_penalty <- names(select(InputCollect$dt_mod, -.data$ds, -.data$dep_var)) + penalty_names <- paste0(for_penalty, "_penalty") + if (add_penalty_factor) { + for (penalty in penalty_names) { + if (!penalty %in% names(hyper_bound_list)) { + hyper_bound_list[[penalty]] <- c(0, 1) + } + } + } + + # Get hyperparameters for Nevergrad + hyper_bound_list_updated <- hyper_bound_list[ + which(unlist(lapply(hyper_bound_list, length) == 2)) + ] + + # Get fixed hyperparameters + hyper_bound_list_fixed <- hyper_bound_list[ + which(unlist(lapply(hyper_bound_list, length) == 1)) + ] + + hyper_list_bind <- c(hyper_bound_list_updated, hyper_bound_list_fixed) + hyper_list_all <- list() + for (i in seq_along(hypParamSamName)) { + hyper_list_all[[i]] <- hyper_list_bind[[hypParamSamName[i]]] + names(hyper_list_all)[i] <- hypParamSamName[i] + } + + dt_hyper_fixed_mod <- data.frame(bind_cols(lapply( + hyper_bound_list_fixed, function(x) rep(x, cores) + ))) + } else { + hyper_bound_list_fixed <- list() + for (i in seq_along(hypParamSamName)) { + hyper_bound_list_fixed[[i]] <- dt_hyper_fixed[[hypParamSamName[i]]] + names(hyper_bound_list_fixed)[i] <- hypParamSamName[i] + } + + hyper_list_all <- hyper_bound_list_fixed + hyper_bound_list_updated <- hyper_bound_list_fixed[ + which(unlist(lapply(hyper_bound_list_fixed, length) == 2)) + ] + + dt_hyper_fixed_mod <- data.frame(matrix(hyper_bound_list_fixed, nrow = 1)) + names(dt_hyper_fixed_mod) <- names(hyper_bound_list_fixed) + } + + return(list( + hyper_list_all = hyper_list_all, + hyper_bound_list_updated = hyper_bound_list_updated, + hyper_bound_list_fixed = hyper_bound_list_fixed, + dt_hyper_fixed_mod = dt_hyper_fixed_mod, + all_fixed = all_fixed + )) +} + +init_msgs_run <- function(InputCollect, refresh, lambda_control = NULL, quiet = FALSE) { + if (!is.null(lambda_control)) { + message("Input 'lambda_control' deprecated in v3.6.0; lambda is now selected by hyperparameter optimization") + } + if (!quiet) { + message(sprintf( + "Input data has %s %ss in total: %s to %s", + nrow(InputCollect$dt_mod), + InputCollect$intervalType, + min(InputCollect$dt_mod$ds), + max(InputCollect$dt_mod$ds) + )) + depth <- ifelse( + "refreshDepth" %in% names(InputCollect), + InputCollect$refreshDepth, + ifelse("refreshCounter" %in% names(InputCollect), + InputCollect$refreshCounter, 0 + ) + ) + refresh <- as.integer(depth) > 0 + message(sprintf( + "%s model is built on rolling window of %s %s: %s to %s", + ifelse(!refresh, "Initial", paste0("Refresh #", depth)), + InputCollect$rollingWindowLength, + InputCollect$intervalType, + InputCollect$window_start, + InputCollect$window_end + )) + if (refresh) { + message(sprintf( + "Rolling window moving forward: %s %ss", + InputCollect$refresh_steps, InputCollect$intervalType + )) + } + } +} diff --git a/R/outputs.R b/R/outputs.R index 3505f50..6693a8e 100644 --- a/R/outputs.R +++ b/R/outputs.R @@ -1,328 +1,328 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. 
- -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Evaluate Models and Output Results into Local Files -#' -#' Pack \code{robyn_plots()}, \code{robyn_csv()}, and \code{robyn_clusters()} -#' outcomes on \code{robyn_run()} results. When \code{UI=TRUE}, enriched -#' \code{OutputModels} results with additional plots and objects. -#' -#' @param InputCollect,OutputModels \code{robyn_inputs()} and \code{robyn_run()} -#' outcomes. -#' @param pareto_fronts Integer. Number of Pareto fronts for the output. -#' \code{pareto_fronts = 1} returns the best models trading off \code{NRMSE} & -#' \code{DECOMP.RSSD}. Increase \code{pareto_fronts} to get more model choices. -#' \code{pareto_fronts = "auto"} selects the min fronts that include at least 100 -#' candidates. To customize this threshold, set value with \code{min_candidates}. -#' @param calibration_constraint Numeric. Default to 0.1 and allows 0.01-0.1. When -#' calibrating, 0.1 means top 10% calibrated models are used for pareto-optimal -#' selection. Lower \code{calibration_constraint} increases calibration accuracy. -#' @param plot_folder Character. Path for saving plots and files. Default -#' to \code{robyn_object} and saves plot in the same directory as \code{robyn_object}. -#' @param plot_folder_sub Character. Sub path for saving plots. Will overwrite the -#' default path with timestamp or, for refresh and allocator, simply overwrite files. -#' @param plot_pareto Boolean. Set to \code{FALSE} to deactivate plotting -#' and saving model one-pagers. Used when testing models. -#' @param clusters Boolean. Apply \code{robyn_clusters()} to output models? -#' @param select_model Character vector. Which models (by \code{solID}) do you -#' wish to plot the one-pagers and export? Default will take top -#' \code{robyn_clusters()} results. -#' @param csv_out Character. Accepts "pareto" or "all". Default to "pareto". Set -#' to "all" will output all iterations as csv. Set NULL to skip exports into CSVs. -#' @param ui Boolean. Save additional outputs for UI usage. List outcome. -#' @param export Boolean. Export outcomes into local files? -#' @param all_sol_json Logical. Add all pareto solutions to json export? -#' @param quiet Boolean. Keep messages off? -#' @param refresh Boolean. Refresh mode -#' @param ... Additional parameters passed to \code{robyn_clusters()} -#' @return (Invisible) list. Class: \code{robyn_outputs}. Contains processed -#' results based on \code{robyn_run()} results. -#' @export -robyn_outputs <- function(InputCollect, OutputModels, - pareto_fronts = "auto", - calibration_constraint = 0.1, - plot_folder = NULL, - plot_folder_sub = NULL, - plot_pareto = TRUE, - csv_out = "pareto", - clusters = TRUE, - select_model = "clusters", - ui = FALSE, export = TRUE, - all_sol_json = FALSE, - quiet = FALSE, - refresh = FALSE, ...) 
{ - t0 <- Sys.time() - if (is.null(plot_folder)) plot_folder <- getwd() - if (export) plot_folder <- check_dir(plot_folder) - - # Check calibration constrains - calibrated <- !is.null(InputCollect$calibration_input) - all_fixed <- length(OutputModels$trial1$hyperBoundFixed) == length(OutputModels$hyper_updated) - if (!all_fixed) { - calibration_constraint <- check_calibconstr( - calibration_constraint, - OutputModels$iterations, - OutputModels$trials, - InputCollect$calibration_input, - refresh = refresh - ) - } - - ##################################### - #### Run robyn_pareto on OutputModels - - totalModels <- OutputModels$iterations * OutputModels$trials - if (!isTRUE(OutputModels$hyper_fixed)) { - message(sprintf( - ">>> Running Pareto calculations for %s models on %s front%s...", - totalModels, pareto_fronts, ifelse(pareto_fronts > 1, "s", "") - )) - } - pareto_results <- robyn_pareto( - InputCollect, OutputModels, - pareto_fronts = pareto_fronts, - calibration_constraint = calibration_constraint, - quiet = quiet, - calibrated = calibrated, - ... - ) - pareto_fronts <- pareto_results$pareto_fronts - allSolutions <- pareto_results$pareto_solutions - - ##################################### - #### Gather the results into output object - - # Auxiliary list with all results (wasn't previously exported but needed for robyn_outputs()) - allPareto <- list( - resultHypParam = pareto_results$resultHypParam, - xDecompAgg = pareto_results$xDecompAgg, - resultCalibration = pareto_results$resultCalibration, - plotDataCollect = pareto_results$plotDataCollect, - df_caov_pct = pareto_results$df_caov_pct_all - ) - - # Set folder to save outputs - depth <- ifelse( - "refreshDepth" %in% names(InputCollect), - InputCollect$refreshDepth, - ifelse("refreshCounter" %in% names(InputCollect), - InputCollect$refreshCounter, 0 - ) - ) - folder_var <- ifelse(!as.integer(depth) > 0, "init", paste0("rf", depth)) - if (is.null(plot_folder_sub)) { - plot_folder_sub <- paste("Robyn", format(Sys.time(), "%Y%m%d%H%M"), folder_var, sep = "_") - } - plot_folder <- gsub("//+", "/", paste0(plot_folder, "/", plot_folder_sub, "/")) - if (!dir.exists(plot_folder) && export) { - message("Creating directory for outputs: ", plot_folder) - dir.create(plot_folder) - } - - # Final results object - OutputCollect <- list( - resultHypParam = filter(pareto_results$resultHypParam, .data$solID %in% allSolutions), - xDecompAgg = filter(pareto_results$xDecompAgg, .data$solID %in% allSolutions), - mediaVecCollect = pareto_results$mediaVecCollect, - xDecompVecCollect = pareto_results$xDecompVecCollect, - resultCalibration = if (calibrated) { - filter(pareto_results$resultCalibration, .data$solID %in% allSolutions) - } else { - NULL - }, - allSolutions = allSolutions, - allPareto = allPareto, - calibration_constraint = calibration_constraint, - OutputModels = OutputModels, - cores = OutputModels$cores, - iterations = OutputModels$iterations, - trials = OutputModels$trials, - intercept = OutputModels$intercept, - intercept_sign = OutputModels$intercept_sign, - nevergrad_algo = OutputModels$nevergrad_algo, - add_penalty_factor = OutputModels$add_penalty_factor, - seed = OutputModels$seed, - UI = NULL, - pareto_fronts = pareto_fronts, - hyper_fixed = OutputModels$hyper_fixed, - plot_folder = plot_folder - ) - class(OutputCollect) <- c("robyn_outputs", class(OutputCollect)) - - # Cluster results and amend cluster output - if (clusters) { - if (!quiet) message(">>> Calculating clusters for model selection using Pareto fronts...") - clusterCollect <- 
try(robyn_clusters( - OutputCollect, - dep_var_type = InputCollect$dep_var_type, - quiet = quiet, export = export, ... - )) - if ("data" %in% names(clusterCollect)) { - OutputCollect$resultHypParam <- left_join( - OutputCollect$resultHypParam, - select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol), - by = "solID" - ) - OutputCollect$xDecompAgg <- left_join( - OutputCollect$xDecompAgg, - select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol), - by = "solID" - ) %>% - left_join( - select( - clusterCollect$df_cluster_ci, .data$rn, .data$cluster, .data$boot_mean, - .data$boot_se, .data$ci_low, .data$ci_up, .data$rn - ), - by = c("rn", "cluster") - ) %>% - left_join( - pareto_results$df_caov_pct_all, - by = c("solID", "rn") - ) - OutputCollect$mediaVecCollect <- left_join( - OutputCollect$mediaVecCollect, - select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol), - by = "solID" - ) - OutputCollect$xDecompVecCollect <- left_join( - OutputCollect$xDecompVecCollect, - select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol), - by = "solID" - ) - if (calibrated) { - OutputCollect$resultCalibration <- left_join( - OutputCollect$resultCalibration, - select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol), - by = "solID" - ) - } - } else { - warning("> Skipped clustering because of memory issues") - clusters <- FALSE - } - OutputCollect[["clusters"]] <- clusterCollect - } - - if (export) { - tryCatch( - { - if (!quiet) message(paste0(">>> Collecting ", length(allSolutions), " pareto-optimum results into: ", plot_folder)) - - if (!quiet) message(">> Exporting general plots into directory...") - all_plots <- robyn_plots(InputCollect, OutputCollect, export = export, ...) - - if (csv_out %in% c("all", "pareto")) { - if (!quiet) message(paste(">> Exporting", csv_out, "results as CSVs into directory...")) - robyn_csv(InputCollect, OutputCollect, csv_out, export = export, calibrated = calibrated) - } - - if (plot_pareto) { - if (!quiet) { - message(sprintf( - ">>> Exporting %sone-pagers into directory...", ifelse(!OutputCollect$hyper_fixed, "pareto ", "") - )) - } - select_model <- if (!clusters || is.null(OutputCollect[["clusters"]])) NULL else select_model - pareto_onepagers <- robyn_onepagers( - InputCollect, OutputCollect, - select_model = select_model, - quiet = quiet, export = export, ... - ) - } - - if (all_sol_json) { - pareto_df <- OutputCollect$resultHypParam %>% - filter(.data$solID %in% allSolutions) %>% - select(any_of(c("solID", "cluster", "top_sol"))) %>% - arrange(.data$cluster, -.data$top_sol, .data$solID) - } else { - pareto_df <- NULL - } - attr(OutputCollect, "runTime") <- round( - difftime(Sys.time(), t0, units = "mins"), 2 - ) - robyn_write( - InputCollect = InputCollect, - OutputCollect = OutputCollect, - dir = plot_folder, quiet = quiet, - pareto_df = pareto_df, ... 
-        )
-
-        # For internal use -> UI Code
-        if (ui && plot_pareto) OutputCollect$UI$pareto_onepagers <- pareto_onepagers
-        OutputCollect[["UI"]] <- if (ui) list(pParFront = all_plots[["pParFront"]]) else NULL
-      },
-      error = function(err) {
-        message(paste("Failed exporting results, but returned model results anyways:\n", err))
-      }
-    )
-  }
-
-  if (!is.null(OutputModels$hyper_updated)) OutputCollect$hyper_updated <- OutputModels$hyper_updated
-  attr(OutputCollect, "runTime") <- round(difftime(Sys.time(), t0, units = "mins"), 2)
-  class(OutputCollect) <- c("robyn_outputs", class(OutputCollect))
-  return(invisible(OutputCollect))
-}
-
-#' @rdname robyn_outputs
-#' @aliases robyn_outputs
-#' @param x \code{robyn_outputs()} output.
-#' @export
-print.robyn_outputs <- function(x, ...) {
-  print(glued(
-    "
-Plot Folder: {x$plot_folder}
-Calibration Constraint: {x$calibration_constraint}
-Hyper-parameters fixed: {x$hyper_fixed}
-Pareto-front ({x$pareto_fronts}) All solutions ({nSols}): {paste(x$allSolutions, collapse = ', ')}
-{clusters_info}
-",
-    nSols = length(x$allSolutions),
-    clusters_info = if ("clusters" %in% names(x)) {
-      glued(
-        "Clusters (k = {x$clusters$n_clusters}): {paste(x$clusters$models$solID, collapse = ', ')}"
-      )
-    } else {
-      NULL
-    }
-  ))
-}
-
-
-####################################################################
-#' Output results into local files: CSV files
-#'
-#' @param OutputCollect \code{robyn_run(..., export = FALSE)} output.
-#' @param calibrated Logical
-#' @rdname robyn_outputs
-#' @return Invisible \code{NULL}.
-#' @export
-robyn_csv <- function(InputCollect, OutputCollect, csv_out = NULL, export = TRUE, calibrated = FALSE) {
-  if (export) {
-    check_class("robyn_outputs", OutputCollect)
-    temp_all <- OutputCollect$allPareto
-    plot_folder <- OutputCollect$plot_folder
-    if ("pareto" %in% csv_out) {
-      write.csv(OutputCollect$resultHypParam, paste0(plot_folder, "pareto_hyperparameters.csv"))
-      write.csv(OutputCollect$xDecompAgg, paste0(plot_folder, "pareto_aggregated.csv"))
-      if (calibrated) {
-        write.csv(OutputCollect$resultCalibration, paste0(plot_folder, "pareto_calibration.csv"))
-      }
-    }
-    if ("all" %in% csv_out) {
-      write.csv(temp_all$resultHypParam, paste0(plot_folder, "all_hyperparameters.csv"))
-      write.csv(temp_all$xDecompAgg, paste0(plot_folder, "all_aggregated.csv"))
-      if (calibrated) {
-        write.csv(temp_all$resultCalibration, paste0(plot_folder, "all_calibration.csv"))
-      }
-    }
-    if (!is.null(csv_out)) {
-      write.csv(InputCollect$dt_input, paste0(plot_folder, "raw_data.csv"))
-      write.csv(OutputCollect$mediaVecCollect, paste0(plot_folder, "pareto_media_transform_matrix.csv"))
-      write.csv(OutputCollect$xDecompVecCollect, paste0(plot_folder, "pareto_alldecomp_matrix.csv"))
-    }
-  }
-}
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+####################################################################
+#' Evaluate Models and Output Results into Local Files
+#'
+#' Packs \code{robyn_plots()}, \code{robyn_csv()}, and \code{robyn_clusters()}
+#' outcomes on \code{robyn_run()} results. When \code{ui = TRUE}, enriches the
+#' \code{OutputModels} results with additional plots and objects.
+#'
+#' @param InputCollect,OutputModels \code{robyn_inputs()} and \code{robyn_run()}
+#' outcomes.
+#' @param pareto_fronts Integer. Number of Pareto fronts for the output.
+#' \code{pareto_fronts = 1} returns the best models trading off \code{NRMSE} &
+#' \code{DECOMP.RSSD}. Increase \code{pareto_fronts} to get more model choices.
+#' \code{pareto_fronts = "auto"} selects the minimum number of fronts that
+#' include at least 100 candidates. To customize this threshold, set
+#' \code{min_candidates}.
+#' @param calibration_constraint Numeric. Defaults to 0.1 and allows 0.01-0.1.
+#' When calibrating, 0.1 means the top 10\% best-calibrated models are used for
+#' Pareto-optimal selection. A lower \code{calibration_constraint} increases
+#' calibration accuracy.
+#' @param plot_folder Character. Path for saving plots and files. Defaults to
+#' the current working directory when not provided.
+#' @param plot_folder_sub Character. Sub-directory for saving plots. Overrides
+#' the default timestamped sub-directory; for refresh and allocator runs, files
+#' are simply overwritten.
+#' @param plot_pareto Boolean. Set to \code{FALSE} to deactivate plotting
+#' and saving model one-pagers. Used when testing models.
+#' @param clusters Boolean. Apply \code{robyn_clusters()} to output models?
+#' @param select_model Character vector. Which models (by \code{solID}) do you
+#' wish to plot one-pagers for and export? Defaults to the top
+#' \code{robyn_clusters()} results.
+#' @param csv_out Character. Accepts "pareto" or "all". Defaults to "pareto".
+#' Setting it to "all" exports all iterations as CSV. Set to \code{NULL} to
+#' skip CSV exports.
+#' @param ui Boolean. Save additional outputs for UI usage within the
+#' returned list?
+#' @param export Boolean. Export outcomes into local files?
+#' @param all_sol_json Logical. Add all pareto solutions to the JSON export?
+#' @param quiet Boolean. Keep messages off?
+#' @param refresh Boolean. Refresh mode.
+#' @param ... Additional parameters passed to \code{robyn_clusters()}.
+#' @return (Invisible) list. Class: \code{robyn_outputs}. Contains processed
+#' results based on \code{robyn_run()} results.
+#' @export
+robyn_outputs <- function(InputCollect, OutputModels,
+                          pareto_fronts = "auto",
+                          calibration_constraint = 0.1,
+                          plot_folder = NULL,
+                          plot_folder_sub = NULL,
+                          plot_pareto = TRUE,
+                          csv_out = "pareto",
+                          clusters = TRUE,
+                          select_model = "clusters",
+                          ui = FALSE, export = TRUE,
+                          all_sol_json = FALSE,
+                          quiet = FALSE,
+                          refresh = FALSE, ...) {
+  t0 <- Sys.time()
+  if (is.null(plot_folder)) plot_folder <- getwd()
+  if (export) plot_folder <- check_dir(plot_folder)
+
+  # Check calibration constraints
+  calibrated <- !is.null(InputCollect$calibration_input)
+  all_fixed <- length(OutputModels$trial1$hyperBoundFixed) == length(OutputModels$hyper_updated)
+  if (!all_fixed) {
+    calibration_constraint <- check_calibconstr(
+      calibration_constraint,
+      OutputModels$iterations,
+      OutputModels$trials,
+      InputCollect$calibration_input,
+      refresh = refresh
+    )
+  }
+
+  #####################################
+  #### Run robyn_pareto on OutputModels
+
+  totalModels <- OutputModels$iterations * OutputModels$trials
+  if (!isTRUE(OutputModels$hyper_fixed)) {
+    message(sprintf(
+      ">>> Running Pareto calculations for %s models on %s front%s...",
+      totalModels, pareto_fronts, ifelse(pareto_fronts > 1, "s", "")
+    ))
+  }
+  pareto_results <- robyn_pareto(
+    InputCollect, OutputModels,
+    pareto_fronts = pareto_fronts,
+    calibration_constraint = calibration_constraint,
+    quiet = quiet,
+    calibrated = calibrated,
+    ...
+ ) + pareto_fronts <- pareto_results$pareto_fronts + allSolutions <- pareto_results$pareto_solutions + + ##################################### + #### Gather the results into output object + + # Auxiliary list with all results (wasn't previously exported but needed for robyn_outputs()) + allPareto <- list( + resultHypParam = pareto_results$resultHypParam, + xDecompAgg = pareto_results$xDecompAgg, + resultCalibration = pareto_results$resultCalibration, + plotDataCollect = pareto_results$plotDataCollect, + df_caov_pct = pareto_results$df_caov_pct_all + ) + + # Set folder to save outputs + depth <- ifelse( + "refreshDepth" %in% names(InputCollect), + InputCollect$refreshDepth, + ifelse("refreshCounter" %in% names(InputCollect), + InputCollect$refreshCounter, 0 + ) + ) + folder_var <- ifelse(!as.integer(depth) > 0, "init", paste0("rf", depth)) + if (is.null(plot_folder_sub)) { + plot_folder_sub <- paste("Robyn", format(Sys.time(), "%Y%m%d%H%M"), folder_var, sep = "_") + } + plot_folder <- gsub("//+", "/", paste0(plot_folder, "/", plot_folder_sub, "/")) + if (!dir.exists(plot_folder) && export) { + message("Creating directory for outputs: ", plot_folder) + dir.create(plot_folder) + } + + # Final results object + OutputCollect <- list( + resultHypParam = filter(pareto_results$resultHypParam, .data$solID %in% allSolutions), + xDecompAgg = filter(pareto_results$xDecompAgg, .data$solID %in% allSolutions), + mediaVecCollect = pareto_results$mediaVecCollect, + xDecompVecCollect = pareto_results$xDecompVecCollect, + resultCalibration = if (calibrated) { + filter(pareto_results$resultCalibration, .data$solID %in% allSolutions) + } else { + NULL + }, + allSolutions = allSolutions, + allPareto = allPareto, + calibration_constraint = calibration_constraint, + OutputModels = OutputModels, + cores = OutputModels$cores, + iterations = OutputModels$iterations, + trials = OutputModels$trials, + intercept = OutputModels$intercept, + intercept_sign = OutputModels$intercept_sign, + nevergrad_algo = OutputModels$nevergrad_algo, + add_penalty_factor = OutputModels$add_penalty_factor, + seed = OutputModels$seed, + UI = NULL, + pareto_fronts = pareto_fronts, + hyper_fixed = OutputModels$hyper_fixed, + plot_folder = plot_folder + ) + class(OutputCollect) <- c("robyn_outputs", class(OutputCollect)) + + # Cluster results and amend cluster output + if (clusters) { + if (!quiet) message(">>> Calculating clusters for model selection using Pareto fronts...") + clusterCollect <- try(robyn_clusters( + OutputCollect, + dep_var_type = InputCollect$dep_var_type, + quiet = quiet, export = export, ... 
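+      # Note: robyn_clusters() applies k-means to group the pareto-optimal
+      # models, so one representative "top" solution per cluster can be
+      # inspected instead of reviewing every model on the fronts.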
+    ))
+    if ("data" %in% names(clusterCollect)) {
+      OutputCollect$resultHypParam <- left_join(
+        OutputCollect$resultHypParam,
+        select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol),
+        by = "solID"
+      )
+      OutputCollect$xDecompAgg <- left_join(
+        OutputCollect$xDecompAgg,
+        select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol),
+        by = "solID"
+      ) %>%
+        left_join(
+          select(
+            clusterCollect$df_cluster_ci, .data$rn, .data$cluster, .data$boot_mean,
+            .data$boot_se, .data$ci_low, .data$ci_up
+          ),
+          by = c("rn", "cluster")
+        ) %>%
+        left_join(
+          pareto_results$df_caov_pct_all,
+          by = c("solID", "rn")
+        )
+      OutputCollect$mediaVecCollect <- left_join(
+        OutputCollect$mediaVecCollect,
+        select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol),
+        by = "solID"
+      )
+      OutputCollect$xDecompVecCollect <- left_join(
+        OutputCollect$xDecompVecCollect,
+        select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol),
+        by = "solID"
+      )
+      if (calibrated) {
+        OutputCollect$resultCalibration <- left_join(
+          OutputCollect$resultCalibration,
+          select(clusterCollect$data, .data$solID, .data$cluster, .data$top_sol),
+          by = "solID"
+        )
+      }
+    } else {
+      warning("> Skipped clustering because of memory issues")
+      clusters <- FALSE
+    }
+    OutputCollect[["clusters"]] <- clusterCollect
+  }
+
+  if (export) {
+    tryCatch(
+      {
+        if (!quiet) message(paste0(">>> Collecting ", length(allSolutions), " pareto-optimal results into: ", plot_folder))
+
+        if (!quiet) message(">> Exporting general plots into directory...")
+        all_plots <- robyn_plots(InputCollect, OutputCollect, export = export, ...)
+
+        if (csv_out %in% c("all", "pareto")) {
+          if (!quiet) message(paste(">> Exporting", csv_out, "results as CSVs into directory..."))
+          robyn_csv(InputCollect, OutputCollect, csv_out, export = export, calibrated = calibrated)
+        }
+
+        if (plot_pareto) {
+          if (!quiet) {
+            message(sprintf(
+              ">>> Exporting %sone-pagers into directory...", ifelse(!OutputCollect$hyper_fixed, "pareto ", "")
+            ))
+          }
+          select_model <- if (!clusters || is.null(OutputCollect[["clusters"]])) NULL else select_model
+          pareto_onepagers <- robyn_onepagers(
+            InputCollect, OutputCollect,
+            select_model = select_model,
+            quiet = quiet, export = export, ...
+          )
+        }
+
+        if (all_sol_json) {
+          pareto_df <- OutputCollect$resultHypParam %>%
+            filter(.data$solID %in% allSolutions) %>%
+            select(any_of(c("solID", "cluster", "top_sol"))) %>%
+            arrange(.data$cluster, -.data$top_sol, .data$solID)
+        } else {
+          pareto_df <- NULL
+        }
+        attr(OutputCollect, "runTime") <- round(
+          difftime(Sys.time(), t0, units = "mins"), 2
+        )
+        robyn_write(
+          InputCollect = InputCollect,
+          OutputCollect = OutputCollect,
+          dir = plot_folder, quiet = quiet,
+          pareto_df = pareto_df, ...
+        )
+
+        # For internal use -> UI Code
+        if (ui && plot_pareto) OutputCollect$UI$pareto_onepagers <- pareto_onepagers
+        OutputCollect[["UI"]] <- if (ui) list(pParFront = all_plots[["pParFront"]]) else NULL
+      },
+      error = function(err) {
+        message(paste("Failed exporting results, but returned model results anyway:\n", err))
+      }
+    )
+  }
+
+  if (!is.null(OutputModels$hyper_updated)) OutputCollect$hyper_updated <- OutputModels$hyper_updated
+  attr(OutputCollect, "runTime") <- round(difftime(Sys.time(), t0, units = "mins"), 2)
+  class(OutputCollect) <- c("robyn_outputs", class(OutputCollect))
+  return(invisible(OutputCollect))
+}
+
+#' @rdname robyn_outputs
+#' @aliases robyn_outputs
+#' @param x \code{robyn_outputs()} output.
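+#' @examples
+#' \dontrun{
+#' # Minimal usage sketch. Assumes InputCollect and OutputModels were created
+#' # beforehand with robyn_inputs() and robyn_run(); export = FALSE keeps the
+#' # run free of side effects while testing.
+#' OutputCollect <- robyn_outputs(
+#'   InputCollect, OutputModels,
+#'   pareto_fronts = "auto", # pick enough fronts to reach ~100 candidates
+#'   clusters = TRUE, # run robyn_clusters() for model selection
+#'   export = FALSE
+#' )
+#' print(OutputCollect)
+#' }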
+#' @export +print.robyn_outputs <- function(x, ...) { + print(glued( + " +Plot Folder: {x$plot_folder} +Calibration Constraint: {x$calibration_constraint} +Hyper-parameters fixed: {x$hyper_fixed} +Pareto-front ({x$pareto_fronts}) All solutions ({nSols}): {paste(x$allSolutions, collapse = ', ')} +{clusters_info} +", + nSols = length(x$allSolutions), + clusters_info = if ("clusters" %in% names(x)) { + glued( + "Clusters (k = {x$clusters$n_clusters}): {paste(x$clusters$models$solID, collapse = ', ')}" + ) + } else { + NULL + } + )) +} + + +#################################################################### +#' Output results into local files: CSV files +#' +#' @param OutputCollect \code{robyn_run(..., export = FALSE)} output. +#' @param calibrated Logical +#' @rdname robyn_outputs +#' @return Invisible \code{NULL}. +#' @export +robyn_csv <- function(InputCollect, OutputCollect, csv_out = NULL, export = TRUE, calibrated = FALSE) { + if (export) { + check_class("robyn_outputs", OutputCollect) + temp_all <- OutputCollect$allPareto + plot_folder <- OutputCollect$plot_folder + if ("pareto" %in% csv_out) { + write.csv(OutputCollect$resultHypParam, paste0(plot_folder, "pareto_hyperparameters.csv")) + write.csv(OutputCollect$xDecompAgg, paste0(plot_folder, "pareto_aggregated.csv")) + if (calibrated) { + write.csv(OutputCollect$resultCalibration, paste0(plot_folder, "pareto_calibration.csv")) + } + } + if ("all" %in% csv_out) { + write.csv(temp_all$resultHypParam, paste0(plot_folder, "all_hyperparameters.csv")) + write.csv(temp_all$xDecompAgg, paste0(plot_folder, "all_aggregated.csv")) + if (calibrated) { + write.csv(temp_all$resultCalibration, paste0(plot_folder, "all_calibration.csv")) + } + } + if (!is.null(csv_out)) { + write.csv(InputCollect$dt_input, paste0(plot_folder, "raw_data.csv")) + write.csv(OutputCollect$mediaVecCollect, paste0(plot_folder, "pareto_media_transform_matrix.csv")) + write.csv(OutputCollect$xDecompVecCollect, paste0(plot_folder, "pareto_alldecomp_matrix.csv")) + } + } +} diff --git a/R/pareto.R b/R/pareto.R index 3b99cda..8029d4a 100644 --- a/R/pareto.R +++ b/R/pareto.R @@ -1,620 +1,620 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -robyn_pareto <- function(InputCollect, OutputModels, - pareto_fronts = "auto", - min_candidates = 100, - calibration_constraint = 0.1, - quiet = FALSE, - calibrated = FALSE, - ...) 
{ - hyper_fixed <- OutputModels$hyper_fixed - OutModels <- OutputModels[unlist(lapply(OutputModels, function(x) "resultCollect" %in% names(x)))] - - resultHypParam <- bind_rows(lapply(OutModels, function(x) { - mutate(x$resultCollect$resultHypParam, trial = x$trial) - })) - - xDecompAgg <- bind_rows(lapply(OutModels, function(x) { - mutate(x$resultCollect$xDecompAgg, trial = x$trial) - })) - - if (calibrated) { - resultCalibration <- bind_rows(lapply(OutModels, function(x) { - x$resultCollect$liftCalibration %>% - mutate(trial = x$trial) %>% - rename(rn = .data$liftMedia) - })) - } else { - resultCalibration <- NULL - } - - if (!hyper_fixed) { - df_names <- if (calibrated) { - c("resultHypParam", "xDecompAgg", "resultCalibration") - } else { - c("resultHypParam", "xDecompAgg") - } - for (df in df_names) { - assign(df, get(df) %>% mutate( - iterations = (.data$iterNG - 1) * OutputModels$cores + .data$iterPar - )) - } - } else if (hyper_fixed & calibrated) { - df_names <- "resultCalibration" - for (df in df_names) { - assign(df, get(df) %>% mutate( - iterations = (.data$iterNG - 1) * OutputModels$cores + .data$iterPar - )) - } - } - - # If recreated model, inherit bootstrap results - if (length(unique(xDecompAgg$solID)) == 1 & !"boot_mean" %in% colnames(xDecompAgg)) { - bootstrap <- attr(OutputModels, "bootstrap") - if (!is.null(bootstrap)) { - xDecompAgg <- left_join(xDecompAgg, bootstrap, by = c("rn" = "variable")) - } - } - - xDecompAggCoef0 <- xDecompAgg %>% - filter(.data$rn %in% InputCollect$paid_media_spends) %>% - group_by(.data$solID) %>% - summarise(coef0 = min(.data$coef, na.rm = TRUE) == 0) - - if (!hyper_fixed) { - mape_lift_quantile10 <- quantile(resultHypParam$mape, probs = calibration_constraint, na.rm = TRUE) - nrmse_quantile90 <- quantile(resultHypParam$nrmse, probs = 0.90, na.rm = TRUE) - decomprssd_quantile90 <- quantile(resultHypParam$decomp.rssd, probs = 0.90, na.rm = TRUE) - resultHypParam <- left_join(resultHypParam, xDecompAggCoef0, by = "solID") %>% - mutate( - mape.qt10 = - .data$mape <= mape_lift_quantile10 & - .data$nrmse <= nrmse_quantile90 & - .data$decomp.rssd <= decomprssd_quantile90 - ) - # Calculate Pareto-fronts (for "all" or pareto_fronts) - resultHypParamPareto <- filter(resultHypParam, .data$mape.qt10 == TRUE) - paretoResults <- pareto_front( - x = resultHypParamPareto$nrmse, - y = resultHypParamPareto$decomp.rssd, - fronts = ifelse("auto" %in% pareto_fronts, Inf, pareto_fronts), - sort = FALSE - ) - resultHypParamPareto <- resultHypParamPareto %>% - left_join(paretoResults, by = c("nrmse" = "x", "decomp.rssd" = "y")) %>% - rename("robynPareto" = "pareto_front") %>% - arrange(.data$iterNG, .data$iterPar, .data$nrmse) %>% - select(.data$solID, .data$robynPareto) %>% - group_by(.data$solID) %>% - arrange(.data$robynPareto) %>% - slice(1) - resultHypParam <- left_join(resultHypParam, resultHypParamPareto, by = "solID") - } else { - resultHypParam <- mutate(resultHypParam, mape.qt10 = TRUE, robynPareto = 1, coef0 = NA) - } - - # Calculate combined weighted error scores - resultHypParam$error_score <- errors_scores(resultHypParam, ts_validation = OutputModels$ts_validation, ...) - - # Bind robynPareto results - xDecompAgg <- left_join(xDecompAgg, select(resultHypParam, .data$robynPareto, .data$solID), by = "solID") - decompSpendDist <- bind_rows(lapply(OutModels, function(x) { - mutate(x$resultCollect$decompSpendDist, trial = x$trial) - })) %>% - { - if (!hyper_fixed) mutate(., solID = paste(.data$trial, .data$iterNG, .data$iterPar, sep = "_")) else . 
- } %>% - left_join(select(resultHypParam, .data$robynPareto, .data$solID), by = "solID") - - # Prepare parallel loop - if (TRUE) { - if (OutputModels$cores > 1) { - registerDoParallel(OutputModels$cores) - registerDoSEQ() - } - if (hyper_fixed) pareto_fronts <- 1 - # Get at least 100 candidates for better clustering - if (nrow(resultHypParam) == 1) pareto_fronts <- 1 - if ("auto" %in% pareto_fronts) { - n_pareto <- resultHypParam %>% - filter(!is.na(.data$robynPareto)) %>% - nrow() - if (n_pareto <= min_candidates & nrow(resultHypParam) > 1 & !calibrated) { - stop(paste( - "Less than", min_candidates, "candidates in pareto fronts.", - "Increase iterations to get more model candidates or decrease min_candidates in robyn_output()" - )) - } - auto_pareto <- resultHypParam %>% - filter(!is.na(.data$robynPareto)) %>% - group_by(.data$robynPareto) %>% - summarise(n = n_distinct(.data$solID)) %>% - mutate(n_cum = cumsum(.data$n)) %>% - filter(.data$n_cum >= min_candidates) %>% - slice(1) - message(sprintf( - ">> Automatically selected %s Pareto-fronts to contain at least %s pareto-optimal models (%s)", - auto_pareto$robynPareto, min_candidates, auto_pareto$n_cum - )) - pareto_fronts <- as.integer(auto_pareto$robynPareto) - } - pareto_fronts_vec <- 1:pareto_fronts - - decompSpendDistPar <- decompSpendDist[decompSpendDist$robynPareto %in% pareto_fronts_vec, ] - resultHypParamPar <- resultHypParam[resultHypParam$robynPareto %in% pareto_fronts_vec, ] - xDecompAggPar <- xDecompAgg[xDecompAgg$robynPareto %in% pareto_fronts_vec, ] - respN <- NULL - } - - if (!quiet) { - message(sprintf( - ">>> Calculating response curves for all models' media variables (%s)...", - nrow(decompSpendDistPar) - )) - } - run_dt_resp <- function(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) { - get_solID <- decompSpendDistPar$solID[respN] - get_spendname <- decompSpendDistPar$rn[respN] - startRW <- InputCollect$rollingWindowStartWhich - endRW <- InputCollect$rollingWindowEndWhich - - get_resp <- robyn_response( - select_model = get_solID, - metric_name = get_spendname, - # metric_value = decompSpendDistPar$total_spend[respN], - # date_range = range(InputCollect$dt_modRollWind$ds), - date_range = "all", - dt_hyppar = resultHypParamPar, - dt_coef = xDecompAggPar, - InputCollect = InputCollect, - OutputCollect = OutputModels, - quiet = TRUE, - ... 
- ) - # Median value (but must be within the curve) - # med_in_curve <- sort(get_resp$response_total)[round(length(get_resp$response_total) / 2)] - - ## simulate mean response adstock from get_resp$input_carryover - # mean_response <- mean(get_resp$response_total) - mean_spend_adstocked <- mean(get_resp$input_total[startRW:endRW]) - mean_carryover <- mean(get_resp$input_carryover[startRW:endRW]) - dt_hyppar <- resultHypParamPar %>% filter(.data$solID == get_solID) - chnAdstocked <- data.frame(v1 = get_resp$input_total[startRW:endRW]) - colnames(chnAdstocked) <- get_spendname - dt_coef <- xDecompAggPar %>% - filter(.data$solID == get_solID & .data$rn == get_spendname) %>% - select(c("rn", "coef")) - hills <- get_hill_params( - InputCollect, NULL, dt_hyppar, dt_coef, - mediaSpendSorted = get_spendname, - select_model = get_solID, chnAdstocked - ) - mean_response <- fx_objective( - x = decompSpendDistPar$mean_spend[respN], - coeff = hills$coefs_sorted, - alpha = hills$alphas, - inflexion = hills$inflexions, - x_hist_carryover = mean_carryover, - get_sum = FALSE - ) - dt_resp <- data.frame( - mean_response = mean_response, - mean_spend_adstocked = mean_spend_adstocked, - mean_carryover = mean_carryover, - rn = decompSpendDistPar$rn[respN], - solID = decompSpendDistPar$solID[respN] - ) - return(dt_resp) - } - if (OutputModels$cores > 1) { - resp_collect <- foreach( - respN = seq_along(decompSpendDistPar$rn), .combine = bind_rows - ) %dorng% { - run_dt_resp(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) - } - stopImplicitCluster() - } else { - resp_collect <- bind_rows(lapply(seq_along(decompSpendDistPar$rn), function(respN) { - run_dt_resp(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) 
- })) - } - - decompSpendDist <- left_join( - decompSpendDist, - resp_collect, - by = c("solID", "rn") - ) %>% - mutate( - roi_mean = .data$mean_response / .data$mean_spend, - roi_total = .data$xDecompAgg / .data$total_spend, - cpa_mean = .data$mean_spend / .data$mean_response, - cpa_total = .data$total_spend / .data$xDecompAgg - ) - # decompSpendDist %>% filter(solID == select_model) %>% arrange(rn) %>% select(rn, mean_spend, mean_response, roi_mean) - xDecompAgg <- left_join( - xDecompAgg, - select( - decompSpendDist, .data$rn, .data$solID, .data$total_spend, .data$mean_spend, .data$mean_spend_adstocked, .data$mean_carryover, - .data$mean_response, .data$spend_share, .data$effect_share, .data$roi_mean, .data$roi_total, .data$cpa_total - ), - by = c("solID", "rn") - ) - - # Pareto loop (no plots) - mediaVecCollect <- list() - xDecompVecCollect <- list() - plotDataCollect <- list() - df_caov_pct_all <- dplyr::tibble() - dt_mod <- InputCollect$dt_mod - dt_modRollWind <- InputCollect$dt_modRollWind - rw_start_loc <- InputCollect$rollingWindowStartWhich - rw_end_loc <- InputCollect$rollingWindowEndWhich - - for (pf in pareto_fronts_vec) { - plotMediaShare <- filter( - xDecompAgg, - .data$robynPareto == pf, - .data$rn %in% InputCollect$paid_media_spends - ) - uniqueSol <- unique(plotMediaShare$solID) - plotWaterfall <- xDecompAgg %>% filter(.data$robynPareto == pf) - if (!quiet & length(unique(xDecompAgg$solID)) > 1) { - message(sprintf(">> Pareto-Front: %s [%s models]", pf, length(uniqueSol))) - } - - # # To recreate "xDecompVec", "xDecompVecImmediate", "xDecompVecCarryover" for each model - # temp <- OutputModels[names(OutputModels) %in% paste0("trial", 1:OutputModels$trials)] - # xDecompVecImmCarr <- bind_rows(lapply(temp, function(x) x$resultCollect$xDecompVec)) - # if (!"solID" %in% colnames(xDecompVecImmCarr)) { - # xDecompVecImmCarr <- xDecompVecImmCarr %>% - # mutate(solID = paste(.data$trial, .data$iterNG, .data$iterPar, sep = "_")) %>% - # filter(.data$solID %in% uniqueSol) - # } - - # Calculations for pareto AND pareto plots - for (sid in uniqueSol) { - # parallelResult <- foreach(sid = uniqueSol) %dorng% { - if (!quiet & length(unique(xDecompAgg$solID)) > 1) { - lares::statusbar(which(sid == uniqueSol), length(uniqueSol), type = "equal") - } - - ## 1. Spend x effect share comparison - temp <- plotMediaShare[plotMediaShare$solID == sid, ] %>% - tidyr::gather( - "variable", "value", - c("spend_share", "effect_share", "roi_total", "cpa_total") - ) %>% - select(c("rn", "nrmse", "decomp.rssd", "rsq_train", "variable", "value")) %>% - mutate(rn = factor(.data$rn, levels = sort(InputCollect$paid_media_spends))) - plotMediaShareLoopBar <- filter(temp, .data$variable %in% c("spend_share", "effect_share")) - plotMediaShareLoopLine <- filter(temp, .data$variable == ifelse( - InputCollect$dep_var_type == "conversion", "cpa_total", "roi_total" - )) - line_rm_inf <- !is.infinite(plotMediaShareLoopLine$value) - ySecScale <- max(plotMediaShareLoopLine$value[line_rm_inf]) / - max(plotMediaShareLoopBar$value) * 1.1 - plot1data <- list( - plotMediaShareLoopBar = plotMediaShareLoopBar, - plotMediaShareLoopLine = plotMediaShareLoopLine, - ySecScale = ySecScale - ) - - ## 2. 
Waterfall - plotWaterfallLoop <- plotWaterfall %>% - filter(.data$solID == sid) %>% - arrange(.data$xDecompPerc) %>% - mutate( - end = 1 - cumsum(.data$xDecompPerc), - start = lag(.data$end), - start = ifelse(is.na(.data$start), 1, .data$start), - id = row_number(), - rn = as.factor(.data$rn), - sign = as.factor(ifelse(.data$xDecompPerc >= 0, "Positive", "Negative")) - ) %>% - select( - .data$id, .data$rn, .data$coef, - .data$xDecompAgg, .data$xDecompPerc, - .data$start, .data$end, .data$sign - ) - plot2data <- list(plotWaterfallLoop = plotWaterfallLoop) - - ## 3. Adstock rate - dt_geometric <- weibullCollect <- wb_type <- NULL - resultHypParamLoop <- resultHypParam[resultHypParam$solID == sid, ] - get_hp_names <- !endsWith(names(InputCollect$hyperparameters), "_penalty") - get_hp_names <- names(InputCollect$hyperparameters)[get_hp_names] - hypParam <- resultHypParamLoop[, get_hp_names] - if (InputCollect$adstock == "geometric") { - hypParam_thetas <- unlist(hypParam[paste0(InputCollect$all_media, "_thetas")]) - dt_geometric <- data.frame(channels = InputCollect$all_media, thetas = hypParam_thetas) - } - if (InputCollect$adstock %in% c("weibull_cdf", "weibull_pdf")) { - shapeVec <- unlist(hypParam[paste0(InputCollect$all_media, "_shapes")]) - scaleVec <- unlist(hypParam[paste0(InputCollect$all_media, "_scales")]) - wb_type <- substr(InputCollect$adstock, 9, 11) - weibullCollect <- list() - n <- 1 - for (v1 in seq_along(InputCollect$all_media)) { - dt_weibull <- data.frame( - x = 1:InputCollect$rollingWindowLength, - decay_accumulated = adstock_weibull( - 1:InputCollect$rollingWindowLength, - shape = shapeVec[v1], - scale = scaleVec[v1], - type = wb_type - )$thetaVecCum, - type = wb_type, - channel = InputCollect$all_media[v1] - ) %>% - mutate(halflife = which.min(abs(.data$decay_accumulated - 0.5))) - max_non0 <- max(which(dt_weibull$decay_accumulated > 0.001), na.rm = TRUE) - dt_weibull$cut_time <- ifelse(max_non0 <= 5, max_non0 * 2, floor(max_non0 + max_non0 / 3)) - weibullCollect[[n]] <- dt_weibull - n <- n + 1 - } - weibullCollect <- bind_rows(weibullCollect) - weibullCollect <- filter(weibullCollect, .data$x <= max(weibullCollect$cut_time)) - } - - plot3data <- list( - dt_geometric = dt_geometric, - weibullCollect = weibullCollect, - wb_type = toupper(wb_type) - ) - - ## 4. 
Spend response curve - dt_transformPlot <- select(dt_mod, .data$ds, all_of(InputCollect$all_media)) # independent variables - dt_transformSpend <- cbind(dt_transformPlot[, "ds"], InputCollect$dt_input[, c(InputCollect$paid_media_spends)]) # spends of indep vars - dt_transformSpendMod <- dt_transformPlot[rw_start_loc:rw_end_loc, ] - # update non-spend variables - # if (length(InputCollect$exposure_vars) > 0) { - # for (expo in InputCollect$exposure_vars) { - # sel_nls <- ifelse(InputCollect$modNLSCollect[channel == expo, rsq_nls > rsq_lm], "nls", "lm") - # dt_transformSpendMod[, (expo) := InputCollect$yhatNLSCollect[channel == expo & models == sel_nls, yhat]] - # } - # } - dt_transformAdstock <- dt_transformPlot - dt_transformSaturation <- dt_transformPlot[ - rw_start_loc:rw_end_loc, - ] - - m_decayRate <- list() - for (med in seq_along(InputCollect$all_media)) { - med_select <- InputCollect$all_media[med] - m <- dt_transformPlot[, med_select][[1]] - # Adstocking - adstock <- InputCollect$adstock - if (adstock == "geometric") { - theta <- hypParam[paste0(InputCollect$all_media[med], "_thetas")][[1]] - } - if (grepl("weibull", adstock)) { - shape <- hypParam[paste0(InputCollect$all_media[med], "_shapes")][[1]] - scale <- hypParam[paste0(InputCollect$all_media[med], "_scales")][[1]] - } - x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale) - m_adstocked <- x_list$x_decayed - dt_transformAdstock[med_select] <- m_adstocked - m_adstockedRollWind <- m_adstocked[ - rw_start_loc:rw_end_loc - ] - ## Saturation - alpha <- hypParam[paste0(InputCollect$all_media[med], "_alphas")][[1]] - gamma <- hypParam[paste0(InputCollect$all_media[med], "_gammas")][[1]] - dt_transformSaturation[med_select] <- saturation_hill( - x = m_adstockedRollWind, alpha = alpha, gamma = gamma - ) - } - dt_transformSaturationDecomp <- dt_transformSaturation - for (i in seq_along(InputCollect$all_media)) { - coef <- plotWaterfallLoop$coef[plotWaterfallLoop$rn == InputCollect$all_media[i]] - dt_transformSaturationDecomp[InputCollect$all_media[i]] <- coef * - dt_transformSaturationDecomp[InputCollect$all_media[i]] - } - dt_transformSaturationSpendReverse <- dt_transformAdstock[ - rw_start_loc:rw_end_loc, - ] - - ## Reverse MM fitting - # dt_transformSaturationSpendReverse <- copy(dt_transformAdstock[, c("ds", InputCollect$all_media), with = FALSE]) - # for (i in seq_along(InputCollect$paid_media_spends)) { - # chn <- InputCollect$paid_media_vars[i] - # if (chn %in% InputCollect$paid_media_vars[InputCollect$exposure_selector]) { - # # Get Michaelis Menten nls fitting param - # get_chn <- dt_transformSaturationSpendReverse[, chn, with = FALSE] - # Vmax <- InputCollect$modNLSCollect[channel == chn, Vmax] - # Km <- InputCollect$modNLSCollect[channel == chn, Km] - # # Reverse exposure to spend - # dt_transformSaturationSpendReverse[, (chn) := mic_men(x = .SD, Vmax = Vmax, Km = Km, reverse = TRUE), .SDcols = chn] # .SD * Km / (Vmax - .SD) exposure to spend, reverse Michaelis Menthen: x = y*Km/(Vmax-y) - # } else if (chn %in% InputCollect$exposure_vars) { - # coef_lm <- InputCollect$modNLSCollect[channel == chn, coef_lm] - # dt_transformSaturationSpendReverse[, (chn) := .SD / coef_lm, .SDcols = chn] - # } - # } - # dt_transformSaturationSpendReverse <- dt_transformSaturationSpendReverse[rw_start_loc:rw_end_loc] - - dt_scurvePlot <- tidyr::gather( - dt_transformSaturationDecomp, "channel", "response", - 2:ncol(dt_transformSaturationDecomp) - ) %>% - mutate(spend = tidyr::gather( - 
dt_transformSaturationSpendReverse, "channel", "spend", - 2:ncol(dt_transformSaturationSpendReverse) - )$spend) - - # Remove outlier introduced by MM nls fitting - dt_scurvePlot <- dt_scurvePlot[dt_scurvePlot$spend >= 0, ] - dt_scurvePlotMean <- plotWaterfall %>% - filter(.data$solID == sid & !is.na(.data$mean_spend)) %>% - select(c(channel = "rn", "mean_spend", "mean_spend_adstocked", "mean_carryover", "mean_response", "solID")) - - # Exposure response curve - plot4data <- list( - dt_scurvePlot = dt_scurvePlot, - dt_scurvePlotMean = dt_scurvePlotMean - ) - - ## 5. Fitted vs actual - col_order <- c("ds", "dep_var", InputCollect$all_ind_vars) - dt_transformDecomp <- select( - dt_modRollWind, .data$ds, .data$dep_var, - any_of(c(InputCollect$prophet_vars, InputCollect$context_vars)) - ) %>% - bind_cols(select(dt_transformSaturation, all_of(InputCollect$all_media))) %>% - select(all_of(col_order)) - xDecompVec <- xDecompAgg %>% - filter(.data$solID == sid) %>% - select(.data$solID, .data$rn, .data$coef) %>% - tidyr::spread(.data$rn, .data$coef) - if (!("(Intercept)" %in% names(xDecompVec))) xDecompVec[["(Intercept)"]] <- 0 - xDecompVec <- select(xDecompVec, c("solID", "(Intercept)", col_order[!(col_order %in% c("ds", "dep_var"))])) - intercept <- xDecompVec$`(Intercept)` - xDecompVec <- data.frame(mapply( - function(scurved, coefs) scurved * coefs, - scurved = select(dt_transformDecomp, -.data$ds, -.data$dep_var), - coefs = select(xDecompVec, -.data$solID, -.data$`(Intercept)`) - )) - xDecompVec <- mutate(xDecompVec, - intercept = intercept, - depVarHat = rowSums(xDecompVec) + intercept, solID = sid - ) - xDecompVec <- bind_cols(select(dt_transformDecomp, .data$ds, .data$dep_var), xDecompVec) - xDecompVecPlot <- select(xDecompVec, .data$ds, .data$dep_var, .data$depVarHat) %>% - rename("actual" = "dep_var", "predicted" = "depVarHat") - xDecompVecPlotMelted <- tidyr::gather( - xDecompVecPlot, - key = "variable", value = "value", -.data$ds - ) - rsq <- filter(xDecompAgg, .data$solID == sid) %>% - pull(.data$rsq_train) %>% - .[1] - plot5data <- list(xDecompVecPlotMelted = xDecompVecPlotMelted, rsq = rsq) - - ## 6. Diagnostic: fitted vs residual - plot6data <- list(xDecompVecPlot = xDecompVecPlot) - - ## 7. Immediate vs carryover response - # temp <- filter(xDecompVecImmCarr, .data$solID == sid) - hypParamSam <- resultHypParam[resultHypParam$solID == sid, ] - dt_saturated_dfs <- run_transformations(InputCollect, hypParamSam, adstock) - coefs <- xDecompAgg$coef[xDecompAgg$solID == sid] - names(coefs) <- xDecompAgg$rn[xDecompAgg$solID == sid] - decompCollect <- model_decomp( - coefs = coefs, - y_pred = dt_saturated_dfs$dt_modSaturated$dep_var, # IS THIS RIGHT? 
- dt_modSaturated = dt_saturated_dfs$dt_modSaturated, - dt_saturatedImmediate = dt_saturated_dfs$dt_saturatedImmediate, - dt_saturatedCarryover = dt_saturated_dfs$dt_saturatedCarryover, - dt_modRollWind = dt_modRollWind, - refreshAddedStart = InputCollect$refreshAddedStart - ) - mediaDecompImmediate <- select(decompCollect$mediaDecompImmediate, -.data$ds, -.data$y) - colnames(mediaDecompImmediate) <- paste0(colnames(mediaDecompImmediate), "_MDI") - mediaDecompCarryover <- select(decompCollect$mediaDecompCarryover, -.data$ds, -.data$y) - colnames(mediaDecompCarryover) <- paste0(colnames(mediaDecompCarryover), "_MDC") - temp <- bind_cols( - decompCollect$xDecompVec, - mediaDecompImmediate, - mediaDecompCarryover - ) %>% mutate(solID = sid) - vec_collect <- list( - xDecompVec = select(temp, -dplyr::ends_with("_MDI"), -dplyr::ends_with("_MDC")), - xDecompVecImmediate = select(temp, -dplyr::ends_with("_MDC"), -all_of(InputCollect$all_media)), - xDecompVecCarryover = select(temp, -dplyr::ends_with("_MDI"), -all_of(InputCollect$all_media)) - ) - this <- gsub("_MDI", "", colnames(vec_collect$xDecompVecImmediate)) - colnames(vec_collect$xDecompVecImmediate) <- colnames(vec_collect$xDecompVecCarryover) <- this - df_caov <- vec_collect$xDecompVecCarryover %>% - group_by(.data$solID) %>% - summarise(across(InputCollect$all_media, sum)) - df_total <- vec_collect$xDecompVec %>% - group_by(.data$solID) %>% - summarise(across(InputCollect$all_media, sum)) - df_caov_pct <- bind_cols( - select(df_caov, .data$solID), - select(df_caov, -.data$solID) / select(df_total, -.data$solID) - ) %>% - pivot_longer(cols = InputCollect$all_media, names_to = "rn", values_to = "carryover_pct") - df_caov_pct[is.na(as.matrix(df_caov_pct))] <- 0 - df_caov_pct_all <- bind_rows(df_caov_pct_all, df_caov_pct) - # Gather everything in an aggregated format - xDecompVecImmeCaov <- bind_rows( - select(vec_collect$xDecompVecImmediate, c("ds", InputCollect$all_media, "solID")) %>% - mutate(type = "Immediate"), - select(vec_collect$xDecompVecCarryover, c("ds", InputCollect$all_media, "solID")) %>% - mutate(type = "Carryover") - ) %>% - pivot_longer(cols = InputCollect$all_media, names_to = "rn") %>% - select(c("solID", "type", "rn", "value")) %>% - group_by(.data$solID, .data$rn, .data$type) %>% - summarise(response = sum(.data$value), .groups = "drop_last") %>% - mutate(percentage = .data$response / sum(.data$response)) %>% - replace(., is.na(.), 0) %>% - left_join(df_caov_pct, c("solID", "rn")) - if (length(unique(xDecompAgg$solID)) == 1) { - xDecompVecImmeCaov$solID <- OutModels$trial1$resultCollect$resultHypParam$solID - } - plot7data <- xDecompVecImmeCaov - - ## 8. 
Bootstrapped ROI/CPA with CIs - # plot8data <- "Empty" # Filled when running robyn_onepagers() with clustering data - - # Gather all results - mediaVecCollect <- bind_rows(mediaVecCollect, list( - mutate(dt_transformPlot, type = "rawMedia", solID = sid), - mutate(dt_transformSpend, type = "rawSpend", solID = sid), - mutate(dt_transformSpendMod, type = "predictedExposure", solID = sid), - mutate(dt_transformAdstock, type = "adstockedMedia", solID = sid), - mutate(dt_transformSaturation, type = "saturatedMedia", solID = sid), - mutate(dt_transformSaturationSpendReverse, type = "saturatedSpendReversed", solID = sid), - mutate(dt_transformSaturationDecomp, type = "decompMedia", solID = sid) - )) - xDecompVecCollect <- bind_rows(xDecompVecCollect, xDecompVec) - plotDataCollect[[sid]] <- list( - plot1data = plot1data, - plot2data = plot2data, - plot3data = plot3data, - plot4data = plot4data, - plot5data = plot5data, - plot6data = plot6data, - plot7data = plot7data - # plot8data = plot8data - ) - } - } # end pareto front loopdev - - pareto_results <- list( - pareto_solutions = unique(xDecompVecCollect$solID), - pareto_fronts = pareto_fronts, - resultHypParam = resultHypParam, - xDecompAgg = xDecompAgg, - resultCalibration = resultCalibration, - mediaVecCollect = mediaVecCollect, - xDecompVecCollect = xDecompVecCollect, - plotDataCollect = plotDataCollect, - df_caov_pct_all = df_caov_pct_all - ) - - if (OutputModels$cores > 1) stopImplicitCluster() - - return(pareto_results) -} - -pareto_front <- function(x, y, fronts = 1, sort = TRUE) { - stopifnot(length(x) == length(y)) - d <- data.frame(x, y) - Dtemp <- D <- d[order(d$x, d$y, decreasing = FALSE), ] - df <- data.frame() - i <- 1 - while (nrow(Dtemp) >= 1 & i <= max(fronts)) { - these <- Dtemp[which(!duplicated(cummin(Dtemp$y))), ] - these$pareto_front <- i - df <- rbind(df, these) - Dtemp <- Dtemp[!row.names(Dtemp) %in% row.names(these), ] - i <- i + 1 - } - ret <- merge(x = d, y = df, by = c("x", "y"), all.x = TRUE, sort = sort) - return(ret) -} +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +robyn_pareto <- function(InputCollect, OutputModels, + pareto_fronts = "auto", + min_candidates = 100, + calibration_constraint = 0.1, + quiet = FALSE, + calibrated = FALSE, + ...) 
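+# robyn_pareto() scores every trained model on the NRMSE x DECOMP.RSSD plane
+# (plus MAPE when calibrated) and assigns Pareto fronts: a model lands on front 1
+# when no other model beats it on both objectives at once. Illustrative sketch of
+# the pareto_front() helper defined at the bottom of this file (toy inputs, not
+# package output):
+#   pareto_front(x = c(0.1, 0.2, 0.3), y = c(0.3, 0.1, 0.2), fronts = 2)
+#   # -> (0.1, 0.3) and (0.2, 0.1) are non-dominated, so front 1; (0.3, 0.2) gets front 2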
{ + hyper_fixed <- OutputModels$hyper_fixed + OutModels <- OutputModels[unlist(lapply(OutputModels, function(x) "resultCollect" %in% names(x)))] + + resultHypParam <- bind_rows(lapply(OutModels, function(x) { + mutate(x$resultCollect$resultHypParam, trial = x$trial) + })) + + xDecompAgg <- bind_rows(lapply(OutModels, function(x) { + mutate(x$resultCollect$xDecompAgg, trial = x$trial) + })) + + if (calibrated) { + resultCalibration <- bind_rows(lapply(OutModels, function(x) { + x$resultCollect$liftCalibration %>% + mutate(trial = x$trial) %>% + rename(rn = .data$liftMedia) + })) + } else { + resultCalibration <- NULL + } + + if (!hyper_fixed) { + df_names <- if (calibrated) { + c("resultHypParam", "xDecompAgg", "resultCalibration") + } else { + c("resultHypParam", "xDecompAgg") + } + for (df in df_names) { + assign(df, get(df) %>% mutate( + iterations = (.data$iterNG - 1) * OutputModels$cores + .data$iterPar + )) + } + } else if (hyper_fixed & calibrated) { + df_names <- "resultCalibration" + for (df in df_names) { + assign(df, get(df) %>% mutate( + iterations = (.data$iterNG - 1) * OutputModels$cores + .data$iterPar + )) + } + } + + # If recreated model, inherit bootstrap results + if (length(unique(xDecompAgg$solID)) == 1 & !"boot_mean" %in% colnames(xDecompAgg)) { + bootstrap <- attr(OutputModels, "bootstrap") + if (!is.null(bootstrap)) { + xDecompAgg <- left_join(xDecompAgg, bootstrap, by = c("rn" = "variable")) + } + } + + xDecompAggCoef0 <- xDecompAgg %>% + filter(.data$rn %in% InputCollect$paid_media_spends) %>% + group_by(.data$solID) %>% + summarise(coef0 = min(.data$coef, na.rm = TRUE) == 0) + + if (!hyper_fixed) { + mape_lift_quantile10 <- quantile(resultHypParam$mape, probs = calibration_constraint, na.rm = TRUE) + nrmse_quantile90 <- quantile(resultHypParam$nrmse, probs = 0.90, na.rm = TRUE) + decomprssd_quantile90 <- quantile(resultHypParam$decomp.rssd, probs = 0.90, na.rm = TRUE) + resultHypParam <- left_join(resultHypParam, xDecompAggCoef0, by = "solID") %>% + mutate( + mape.qt10 = + .data$mape <= mape_lift_quantile10 & + .data$nrmse <= nrmse_quantile90 & + .data$decomp.rssd <= decomprssd_quantile90 + ) + # Calculate Pareto-fronts (for "all" or pareto_fronts) + resultHypParamPareto <- filter(resultHypParam, .data$mape.qt10 == TRUE) + paretoResults <- pareto_front( + x = resultHypParamPareto$nrmse, + y = resultHypParamPareto$decomp.rssd, + fronts = ifelse("auto" %in% pareto_fronts, Inf, pareto_fronts), + sort = FALSE + ) + resultHypParamPareto <- resultHypParamPareto %>% + left_join(paretoResults, by = c("nrmse" = "x", "decomp.rssd" = "y")) %>% + rename("robynPareto" = "pareto_front") %>% + arrange(.data$iterNG, .data$iterPar, .data$nrmse) %>% + select(.data$solID, .data$robynPareto) %>% + group_by(.data$solID) %>% + arrange(.data$robynPareto) %>% + slice(1) + resultHypParam <- left_join(resultHypParam, resultHypParamPareto, by = "solID") + } else { + resultHypParam <- mutate(resultHypParam, mape.qt10 = TRUE, robynPareto = 1, coef0 = NA) + } + + # Calculate combined weighted error scores + resultHypParam$error_score <- errors_scores(resultHypParam, ts_validation = OutputModels$ts_validation, ...) + + # Bind robynPareto results + xDecompAgg <- left_join(xDecompAgg, select(resultHypParam, .data$robynPareto, .data$solID), by = "solID") + decompSpendDist <- bind_rows(lapply(OutModels, function(x) { + mutate(x$resultCollect$decompSpendDist, trial = x$trial) + })) %>% + { + if (!hyper_fixed) mutate(., solID = paste(.data$trial, .data$iterNG, .data$iterPar, sep = "_")) else . 
+    } %>%
+    left_join(select(resultHypParam, .data$robynPareto, .data$solID), by = "solID")
+
+  # Prepare parallel loop
+  if (TRUE) {
+    if (OutputModels$cores > 1) {
+      registerDoParallel(OutputModels$cores)
+    } else {
+      registerDoSEQ()
+    }
+    if (hyper_fixed) pareto_fronts <- 1
+    # Get at least 100 candidates for better clustering
+    if (nrow(resultHypParam) == 1) pareto_fronts <- 1
+    if ("auto" %in% pareto_fronts) {
+      n_pareto <- resultHypParam %>%
+        filter(!is.na(.data$robynPareto)) %>%
+        nrow()
+      if (n_pareto <= min_candidates & nrow(resultHypParam) > 1 & !calibrated) {
+        stop(paste(
+          "Fewer than", min_candidates, "candidates in pareto fronts.",
+          "Increase iterations to get more model candidates or decrease min_candidates in robyn_outputs()"
+        ))
+      }
+      auto_pareto <- resultHypParam %>%
+        filter(!is.na(.data$robynPareto)) %>%
+        group_by(.data$robynPareto) %>%
+        summarise(n = n_distinct(.data$solID)) %>%
+        mutate(n_cum = cumsum(.data$n)) %>%
+        filter(.data$n_cum >= min_candidates) %>%
+        slice(1)
+      message(sprintf(
+        ">> Automatically selected %s Pareto-fronts to contain at least %s pareto-optimal models (%s)",
+        auto_pareto$robynPareto, min_candidates, auto_pareto$n_cum
+      ))
+      pareto_fronts <- as.integer(auto_pareto$robynPareto)
+    }
+    pareto_fronts_vec <- 1:pareto_fronts
+
+    decompSpendDistPar <- decompSpendDist[decompSpendDist$robynPareto %in% pareto_fronts_vec, ]
+    resultHypParamPar <- resultHypParam[resultHypParam$robynPareto %in% pareto_fronts_vec, ]
+    xDecompAggPar <- xDecompAgg[xDecompAgg$robynPareto %in% pareto_fronts_vec, ]
+    respN <- NULL
+  }
+
+  if (!quiet) {
+    message(sprintf(
+      ">>> Calculating response curves for all models' media variables (%s)...",
+      nrow(decompSpendDistPar)
+    ))
+  }
+  run_dt_resp <- function(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) {
+    get_solID <- decompSpendDistPar$solID[respN]
+    get_spendname <- decompSpendDistPar$rn[respN]
+    startRW <- InputCollect$rollingWindowStartWhich
+    endRW <- InputCollect$rollingWindowEndWhich
+
+    get_resp <- robyn_response(
+      select_model = get_solID,
+      metric_name = get_spendname,
+      # metric_value = decompSpendDistPar$total_spend[respN],
+      # date_range = range(InputCollect$dt_modRollWind$ds),
+      date_range = "all",
+      dt_hyppar = resultHypParamPar,
+      dt_coef = xDecompAggPar,
+      InputCollect = InputCollect,
+      OutputCollect = OutputModels,
+      quiet = TRUE,
+      ...
+ ) + # Median value (but must be within the curve) + # med_in_curve <- sort(get_resp$response_total)[round(length(get_resp$response_total) / 2)] + + ## simulate mean response adstock from get_resp$input_carryover + # mean_response <- mean(get_resp$response_total) + mean_spend_adstocked <- mean(get_resp$input_total[startRW:endRW]) + mean_carryover <- mean(get_resp$input_carryover[startRW:endRW]) + dt_hyppar <- resultHypParamPar %>% filter(.data$solID == get_solID) + chnAdstocked <- data.frame(v1 = get_resp$input_total[startRW:endRW]) + colnames(chnAdstocked) <- get_spendname + dt_coef <- xDecompAggPar %>% + filter(.data$solID == get_solID & .data$rn == get_spendname) %>% + select(c("rn", "coef")) + hills <- get_hill_params( + InputCollect, NULL, dt_hyppar, dt_coef, + mediaSpendSorted = get_spendname, + select_model = get_solID, chnAdstocked + ) + mean_response <- fx_objective( + x = decompSpendDistPar$mean_spend[respN], + coeff = hills$coefs_sorted, + alpha = hills$alphas, + inflexion = hills$inflexions, + x_hist_carryover = mean_carryover, + get_sum = FALSE + ) + dt_resp <- data.frame( + mean_response = mean_response, + mean_spend_adstocked = mean_spend_adstocked, + mean_carryover = mean_carryover, + rn = decompSpendDistPar$rn[respN], + solID = decompSpendDistPar$solID[respN] + ) + return(dt_resp) + } + if (OutputModels$cores > 1) { + resp_collect <- foreach( + respN = seq_along(decompSpendDistPar$rn), .combine = bind_rows + ) %dorng% { + run_dt_resp(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) + } + stopImplicitCluster() + } else { + resp_collect <- bind_rows(lapply(seq_along(decompSpendDistPar$rn), function(respN) { + run_dt_resp(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) 
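+      # run_dt_resp() evaluates each channel's fitted Hill curve at its mean
+      # spend: with x_adstocked = mean_spend + mean_carryover, mean_response is
+      # roughly coeff * x_adstocked^alpha / (x_adstocked^alpha + inflexion^alpha)
+      # (paraphrased from fx_objective(); see its implementation for details).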
+ })) + } + + decompSpendDist <- left_join( + decompSpendDist, + resp_collect, + by = c("solID", "rn") + ) %>% + mutate( + roi_mean = .data$mean_response / .data$mean_spend, + roi_total = .data$xDecompAgg / .data$total_spend, + cpa_mean = .data$mean_spend / .data$mean_response, + cpa_total = .data$total_spend / .data$xDecompAgg + ) + # decompSpendDist %>% filter(solID == select_model) %>% arrange(rn) %>% select(rn, mean_spend, mean_response, roi_mean) + xDecompAgg <- left_join( + xDecompAgg, + select( + decompSpendDist, .data$rn, .data$solID, .data$total_spend, .data$mean_spend, .data$mean_spend_adstocked, .data$mean_carryover, + .data$mean_response, .data$spend_share, .data$effect_share, .data$roi_mean, .data$roi_total, .data$cpa_total + ), + by = c("solID", "rn") + ) + + # Pareto loop (no plots) + mediaVecCollect <- list() + xDecompVecCollect <- list() + plotDataCollect <- list() + df_caov_pct_all <- dplyr::tibble() + dt_mod <- InputCollect$dt_mod + dt_modRollWind <- InputCollect$dt_modRollWind + rw_start_loc <- InputCollect$rollingWindowStartWhich + rw_end_loc <- InputCollect$rollingWindowEndWhich + + for (pf in pareto_fronts_vec) { + plotMediaShare <- filter( + xDecompAgg, + .data$robynPareto == pf, + .data$rn %in% InputCollect$paid_media_spends + ) + uniqueSol <- unique(plotMediaShare$solID) + plotWaterfall <- xDecompAgg %>% filter(.data$robynPareto == pf) + if (!quiet & length(unique(xDecompAgg$solID)) > 1) { + message(sprintf(">> Pareto-Front: %s [%s models]", pf, length(uniqueSol))) + } + + # # To recreate "xDecompVec", "xDecompVecImmediate", "xDecompVecCarryover" for each model + # temp <- OutputModels[names(OutputModels) %in% paste0("trial", 1:OutputModels$trials)] + # xDecompVecImmCarr <- bind_rows(lapply(temp, function(x) x$resultCollect$xDecompVec)) + # if (!"solID" %in% colnames(xDecompVecImmCarr)) { + # xDecompVecImmCarr <- xDecompVecImmCarr %>% + # mutate(solID = paste(.data$trial, .data$iterNG, .data$iterPar, sep = "_")) %>% + # filter(.data$solID %in% uniqueSol) + # } + + # Calculations for pareto AND pareto plots + for (sid in uniqueSol) { + # parallelResult <- foreach(sid = uniqueSol) %dorng% { + if (!quiet & length(unique(xDecompAgg$solID)) > 1) { + lares::statusbar(which(sid == uniqueSol), length(uniqueSol), type = "equal") + } + + ## 1. Spend x effect share comparison + temp <- plotMediaShare[plotMediaShare$solID == sid, ] %>% + tidyr::gather( + "variable", "value", + c("spend_share", "effect_share", "roi_total", "cpa_total") + ) %>% + select(c("rn", "nrmse", "decomp.rssd", "rsq_train", "variable", "value")) %>% + mutate(rn = factor(.data$rn, levels = sort(InputCollect$paid_media_spends))) + plotMediaShareLoopBar <- filter(temp, .data$variable %in% c("spend_share", "effect_share")) + plotMediaShareLoopLine <- filter(temp, .data$variable == ifelse( + InputCollect$dep_var_type == "conversion", "cpa_total", "roi_total" + )) + line_rm_inf <- !is.infinite(plotMediaShareLoopLine$value) + ySecScale <- max(plotMediaShareLoopLine$value[line_rm_inf]) / + max(plotMediaShareLoopBar$value) * 1.1 + plot1data <- list( + plotMediaShareLoopBar = plotMediaShareLoopBar, + plotMediaShareLoopLine = plotMediaShareLoopLine, + ySecScale = ySecScale + ) + + ## 2. 
Waterfall + plotWaterfallLoop <- plotWaterfall %>% + filter(.data$solID == sid) %>% + arrange(.data$xDecompPerc) %>% + mutate( + end = 1 - cumsum(.data$xDecompPerc), + start = lag(.data$end), + start = ifelse(is.na(.data$start), 1, .data$start), + id = row_number(), + rn = as.factor(.data$rn), + sign = as.factor(ifelse(.data$xDecompPerc >= 0, "Positive", "Negative")) + ) %>% + select( + .data$id, .data$rn, .data$coef, + .data$xDecompAgg, .data$xDecompPerc, + .data$start, .data$end, .data$sign + ) + plot2data <- list(plotWaterfallLoop = plotWaterfallLoop) + + ## 3. Adstock rate + dt_geometric <- weibullCollect <- wb_type <- NULL + resultHypParamLoop <- resultHypParam[resultHypParam$solID == sid, ] + get_hp_names <- !endsWith(names(InputCollect$hyperparameters), "_penalty") + get_hp_names <- names(InputCollect$hyperparameters)[get_hp_names] + hypParam <- resultHypParamLoop[, get_hp_names] + if (InputCollect$adstock == "geometric") { + hypParam_thetas <- unlist(hypParam[paste0(InputCollect$all_media, "_thetas")]) + dt_geometric <- data.frame(channels = InputCollect$all_media, thetas = hypParam_thetas) + } + if (InputCollect$adstock %in% c("weibull_cdf", "weibull_pdf")) { + shapeVec <- unlist(hypParam[paste0(InputCollect$all_media, "_shapes")]) + scaleVec <- unlist(hypParam[paste0(InputCollect$all_media, "_scales")]) + wb_type <- substr(InputCollect$adstock, 9, 11) + weibullCollect <- list() + n <- 1 + for (v1 in seq_along(InputCollect$all_media)) { + dt_weibull <- data.frame( + x = 1:InputCollect$rollingWindowLength, + decay_accumulated = adstock_weibull( + 1:InputCollect$rollingWindowLength, + shape = shapeVec[v1], + scale = scaleVec[v1], + type = wb_type + )$thetaVecCum, + type = wb_type, + channel = InputCollect$all_media[v1] + ) %>% + mutate(halflife = which.min(abs(.data$decay_accumulated - 0.5))) + max_non0 <- max(which(dt_weibull$decay_accumulated > 0.001), na.rm = TRUE) + dt_weibull$cut_time <- ifelse(max_non0 <= 5, max_non0 * 2, floor(max_non0 + max_non0 / 3)) + weibullCollect[[n]] <- dt_weibull + n <- n + 1 + } + weibullCollect <- bind_rows(weibullCollect) + weibullCollect <- filter(weibullCollect, .data$x <= max(weibullCollect$cut_time)) + } + + plot3data <- list( + dt_geometric = dt_geometric, + weibullCollect = weibullCollect, + wb_type = toupper(wb_type) + ) + + ## 4. 
Spend response curve
+      dt_transformPlot <- select(dt_mod, .data$ds, all_of(InputCollect$all_media)) # independent variables
+      dt_transformSpend <- cbind(dt_transformPlot[, "ds"], InputCollect$dt_input[, c(InputCollect$paid_media_spends)]) # spends of indep vars
+      dt_transformSpendMod <- dt_transformPlot[rw_start_loc:rw_end_loc, ]
+      # update non-spend variables
+      # if (length(InputCollect$exposure_vars) > 0) {
+      #   for (expo in InputCollect$exposure_vars) {
+      #     sel_nls <- ifelse(InputCollect$modNLSCollect[channel == expo, rsq_nls > rsq_lm], "nls", "lm")
+      #     dt_transformSpendMod[, (expo) := InputCollect$yhatNLSCollect[channel == expo & models == sel_nls, yhat]]
+      #   }
+      # }
+      dt_transformAdstock <- dt_transformPlot
+      dt_transformSaturation <- dt_transformPlot[
+        rw_start_loc:rw_end_loc,
+      ]
+
+      m_decayRate <- list()
+      for (med in seq_along(InputCollect$all_media)) {
+        med_select <- InputCollect$all_media[med]
+        m <- dt_transformPlot[, med_select][[1]]
+        # Adstocking
+        adstock <- InputCollect$adstock
+        if (adstock == "geometric") {
+          theta <- hypParam[paste0(InputCollect$all_media[med], "_thetas")][[1]]
+        }
+        if (grepl("weibull", adstock)) {
+          shape <- hypParam[paste0(InputCollect$all_media[med], "_shapes")][[1]]
+          scale <- hypParam[paste0(InputCollect$all_media[med], "_scales")][[1]]
+        }
+        x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale)
+        m_adstocked <- x_list$x_decayed
+        dt_transformAdstock[med_select] <- m_adstocked
+        m_adstockedRollWind <- m_adstocked[
+          rw_start_loc:rw_end_loc
+        ]
+        ## Saturation
+        alpha <- hypParam[paste0(InputCollect$all_media[med], "_alphas")][[1]]
+        gamma <- hypParam[paste0(InputCollect$all_media[med], "_gammas")][[1]]
+        dt_transformSaturation[med_select] <- saturation_hill(
+          x = m_adstockedRollWind, alpha = alpha, gamma = gamma
+        )
+      }
+      dt_transformSaturationDecomp <- dt_transformSaturation
+      for (i in seq_along(InputCollect$all_media)) {
+        coef <- plotWaterfallLoop$coef[plotWaterfallLoop$rn == InputCollect$all_media[i]]
+        dt_transformSaturationDecomp[InputCollect$all_media[i]] <- coef *
+          dt_transformSaturationDecomp[InputCollect$all_media[i]]
+      }
+      dt_transformSaturationSpendReverse <- dt_transformAdstock[
+        rw_start_loc:rw_end_loc,
+      ]
+
+      ## Reverse MM fitting
+      # dt_transformSaturationSpendReverse <- copy(dt_transformAdstock[, c("ds", InputCollect$all_media), with = FALSE])
+      # for (i in seq_along(InputCollect$paid_media_spends)) {
+      #   chn <- InputCollect$paid_media_vars[i]
+      #   if (chn %in% InputCollect$paid_media_vars[InputCollect$exposure_selector]) {
+      #     # Get Michaelis Menten nls fitting param
+      #     get_chn <- dt_transformSaturationSpendReverse[, chn, with = FALSE]
+      #     Vmax <- InputCollect$modNLSCollect[channel == chn, Vmax]
+      #     Km <- InputCollect$modNLSCollect[channel == chn, Km]
+      #     # Reverse exposure to spend
+      #     dt_transformSaturationSpendReverse[, (chn) := mic_men(x = .SD, Vmax = Vmax, Km = Km, reverse = TRUE), .SDcols = chn] # .SD * Km / (Vmax - .SD) exposure to spend, reverse Michaelis-Menten: x = y*Km/(Vmax-y)
+      #   } else if (chn %in% InputCollect$exposure_vars) {
+      #     coef_lm <- InputCollect$modNLSCollect[channel == chn, coef_lm]
+      #     dt_transformSaturationSpendReverse[, (chn) := .SD / coef_lm, .SDcols = chn]
+      #   }
+      # }
+      # dt_transformSaturationSpendReverse <- dt_transformSaturationSpendReverse[rw_start_loc:rw_end_loc]
+
+      dt_scurvePlot <- tidyr::gather(
+        dt_transformSaturationDecomp, "channel", "response",
+        2:ncol(dt_transformSaturationDecomp)
+      ) %>%
+        mutate(spend = tidyr::gather(
dt_transformSaturationSpendReverse, "channel", "spend", + 2:ncol(dt_transformSaturationSpendReverse) + )$spend) + + # Remove outlier introduced by MM nls fitting + dt_scurvePlot <- dt_scurvePlot[dt_scurvePlot$spend >= 0, ] + dt_scurvePlotMean <- plotWaterfall %>% + filter(.data$solID == sid & !is.na(.data$mean_spend)) %>% + select(c(channel = "rn", "mean_spend", "mean_spend_adstocked", "mean_carryover", "mean_response", "solID")) + + # Exposure response curve + plot4data <- list( + dt_scurvePlot = dt_scurvePlot, + dt_scurvePlotMean = dt_scurvePlotMean + ) + + ## 5. Fitted vs actual + col_order <- c("ds", "dep_var", InputCollect$all_ind_vars) + dt_transformDecomp <- select( + dt_modRollWind, .data$ds, .data$dep_var, + any_of(c(InputCollect$prophet_vars, InputCollect$context_vars)) + ) %>% + bind_cols(select(dt_transformSaturation, all_of(InputCollect$all_media))) %>% + select(all_of(col_order)) + xDecompVec <- xDecompAgg %>% + filter(.data$solID == sid) %>% + select(.data$solID, .data$rn, .data$coef) %>% + tidyr::spread(.data$rn, .data$coef) + if (!("(Intercept)" %in% names(xDecompVec))) xDecompVec[["(Intercept)"]] <- 0 + xDecompVec <- select(xDecompVec, c("solID", "(Intercept)", col_order[!(col_order %in% c("ds", "dep_var"))])) + intercept <- xDecompVec$`(Intercept)` + xDecompVec <- data.frame(mapply( + function(scurved, coefs) scurved * coefs, + scurved = select(dt_transformDecomp, -.data$ds, -.data$dep_var), + coefs = select(xDecompVec, -.data$solID, -.data$`(Intercept)`) + )) + xDecompVec <- mutate(xDecompVec, + intercept = intercept, + depVarHat = rowSums(xDecompVec) + intercept, solID = sid + ) + xDecompVec <- bind_cols(select(dt_transformDecomp, .data$ds, .data$dep_var), xDecompVec) + xDecompVecPlot <- select(xDecompVec, .data$ds, .data$dep_var, .data$depVarHat) %>% + rename("actual" = "dep_var", "predicted" = "depVarHat") + xDecompVecPlotMelted <- tidyr::gather( + xDecompVecPlot, + key = "variable", value = "value", -.data$ds + ) + rsq <- filter(xDecompAgg, .data$solID == sid) %>% + pull(.data$rsq_train) %>% + .[1] + plot5data <- list(xDecompVecPlotMelted = xDecompVecPlotMelted, rsq = rsq) + + ## 6. Diagnostic: fitted vs residual + plot6data <- list(xDecompVecPlot = xDecompVecPlot) + + ## 7. Immediate vs carryover response + # temp <- filter(xDecompVecImmCarr, .data$solID == sid) + hypParamSam <- resultHypParam[resultHypParam$solID == sid, ] + dt_saturated_dfs <- run_transformations(InputCollect, hypParamSam, adstock) + coefs <- xDecompAgg$coef[xDecompAgg$solID == sid] + names(coefs) <- xDecompAgg$rn[xDecompAgg$solID == sid] + decompCollect <- model_decomp( + coefs = coefs, + y_pred = dt_saturated_dfs$dt_modSaturated$dep_var, # IS THIS RIGHT? 
+ dt_modSaturated = dt_saturated_dfs$dt_modSaturated, + dt_saturatedImmediate = dt_saturated_dfs$dt_saturatedImmediate, + dt_saturatedCarryover = dt_saturated_dfs$dt_saturatedCarryover, + dt_modRollWind = dt_modRollWind, + refreshAddedStart = InputCollect$refreshAddedStart + ) + mediaDecompImmediate <- select(decompCollect$mediaDecompImmediate, -.data$ds, -.data$y) + colnames(mediaDecompImmediate) <- paste0(colnames(mediaDecompImmediate), "_MDI") + mediaDecompCarryover <- select(decompCollect$mediaDecompCarryover, -.data$ds, -.data$y) + colnames(mediaDecompCarryover) <- paste0(colnames(mediaDecompCarryover), "_MDC") + temp <- bind_cols( + decompCollect$xDecompVec, + mediaDecompImmediate, + mediaDecompCarryover + ) %>% mutate(solID = sid) + vec_collect <- list( + xDecompVec = select(temp, -dplyr::ends_with("_MDI"), -dplyr::ends_with("_MDC")), + xDecompVecImmediate = select(temp, -dplyr::ends_with("_MDC"), -all_of(InputCollect$all_media)), + xDecompVecCarryover = select(temp, -dplyr::ends_with("_MDI"), -all_of(InputCollect$all_media)) + ) + this <- gsub("_MDI", "", colnames(vec_collect$xDecompVecImmediate)) + colnames(vec_collect$xDecompVecImmediate) <- colnames(vec_collect$xDecompVecCarryover) <- this + df_caov <- vec_collect$xDecompVecCarryover %>% + group_by(.data$solID) %>% + summarise(across(InputCollect$all_media, sum)) + df_total <- vec_collect$xDecompVec %>% + group_by(.data$solID) %>% + summarise(across(InputCollect$all_media, sum)) + df_caov_pct <- bind_cols( + select(df_caov, .data$solID), + select(df_caov, -.data$solID) / select(df_total, -.data$solID) + ) %>% + pivot_longer(cols = InputCollect$all_media, names_to = "rn", values_to = "carryover_pct") + df_caov_pct[is.na(as.matrix(df_caov_pct))] <- 0 + df_caov_pct_all <- bind_rows(df_caov_pct_all, df_caov_pct) + # Gather everything in an aggregated format + xDecompVecImmeCaov <- bind_rows( + select(vec_collect$xDecompVecImmediate, c("ds", InputCollect$all_media, "solID")) %>% + mutate(type = "Immediate"), + select(vec_collect$xDecompVecCarryover, c("ds", InputCollect$all_media, "solID")) %>% + mutate(type = "Carryover") + ) %>% + pivot_longer(cols = InputCollect$all_media, names_to = "rn") %>% + select(c("solID", "type", "rn", "value")) %>% + group_by(.data$solID, .data$rn, .data$type) %>% + summarise(response = sum(.data$value), .groups = "drop_last") %>% + mutate(percentage = .data$response / sum(.data$response)) %>% + replace(., is.na(.), 0) %>% + left_join(df_caov_pct, c("solID", "rn")) + if (length(unique(xDecompAgg$solID)) == 1) { + xDecompVecImmeCaov$solID <- OutModels$trial1$resultCollect$resultHypParam$solID + } + plot7data <- xDecompVecImmeCaov + + ## 8. 
Bootstrapped ROI/CPA with CIs
+    # plot8data <- "Empty" # Filled when running robyn_onepagers() with clustering data
+
+    # Gather all results
+    mediaVecCollect <- bind_rows(mediaVecCollect, list(
+      mutate(dt_transformPlot, type = "rawMedia", solID = sid),
+      mutate(dt_transformSpend, type = "rawSpend", solID = sid),
+      mutate(dt_transformSpendMod, type = "predictedExposure", solID = sid),
+      mutate(dt_transformAdstock, type = "adstockedMedia", solID = sid),
+      mutate(dt_transformSaturation, type = "saturatedMedia", solID = sid),
+      mutate(dt_transformSaturationSpendReverse, type = "saturatedSpendReversed", solID = sid),
+      mutate(dt_transformSaturationDecomp, type = "decompMedia", solID = sid)
+    ))
+    xDecompVecCollect <- bind_rows(xDecompVecCollect, xDecompVec)
+    plotDataCollect[[sid]] <- list(
+      plot1data = plot1data,
+      plot2data = plot2data,
+      plot3data = plot3data,
+      plot4data = plot4data,
+      plot5data = plot5data,
+      plot6data = plot6data,
+      plot7data = plot7data
+      # plot8data = plot8data
+    )
+    }
+  } # end pareto front loop
+
+  pareto_results <- list(
+    pareto_solutions = unique(xDecompVecCollect$solID),
+    pareto_fronts = pareto_fronts,
+    resultHypParam = resultHypParam,
+    xDecompAgg = xDecompAgg,
+    resultCalibration = resultCalibration,
+    mediaVecCollect = mediaVecCollect,
+    xDecompVecCollect = xDecompVecCollect,
+    plotDataCollect = plotDataCollect,
+    df_caov_pct_all = df_caov_pct_all
+  )
+
+  if (OutputModels$cores > 1) stopImplicitCluster()
+
+  return(pareto_results)
+}
+
+pareto_front <- function(x, y, fronts = 1, sort = TRUE) {
+  stopifnot(length(x) == length(y))
+  d <- data.frame(x, y)
+  Dtemp <- D <- d[order(d$x, d$y, decreasing = FALSE), ]
+  df <- data.frame()
+  i <- 1
+  while (nrow(Dtemp) >= 1 & i <= max(fronts)) {
+    these <- Dtemp[which(!duplicated(cummin(Dtemp$y))), ]
+    these$pareto_front <- i
+    df <- rbind(df, these)
+    Dtemp <- Dtemp[!row.names(Dtemp) %in% row.names(these), ]
+    i <- i + 1
+  }
+  ret <- merge(x = d, y = df, by = c("x", "y"), all.x = TRUE, sort = sort)
+  return(ret)
+}
diff --git a/R/plots.R b/R/plots.R
index 80be083..a8a2530 100644
--- a/R/plots.R
+++ b/R/plots.R
@@ -427,7 +427,7 @@ robyn_onepagers <- function(
         sign = as.factor(ifelse(.data$xDecompPerc >= 0, "Positive", "Negative"))
       )
 
-    p2 <- ggplot(plotWaterfallLoop, aes(x = .data$id, fill = .data$sign)) +
+    p2 <- ggplot(plotWaterfallLoop, aes(x = .data$rn, fill = .data$sign)) +
       geom_rect(aes(
         xmin = .data$id - 0.45, xmax = .data$id + 0.45,
         ymin = .data$end, ymax = .data$start
@@ -437,6 +437,7 @@
       scale_fill_manual(values = c("Positive" = "#59B3D2", "Negative" = "#E5586E")) +
       theme_lares(background = "white", legend = "top") +
       geom_text(mapping = aes(
+        x = .data$id,
         label = paste0(
           formatNum(.data$xDecompAgg, abbr = TRUE), "\n",
           round(.data$xDecompPerc * 100, 1), "%"
diff --git a/R/refresh.R b/R/refresh.R
index a72a3d6..a992ac5 100644
--- a/R/refresh.R
+++ b/R/refresh.R
@@ -1,604 +1,604 @@
-# Copyright (c) Meta Platforms, Inc. and its affiliates.
-
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-####################################################################
-#' Build Refresh Model
-#'
-#' @description
-#' \code{robyn_refresh()} builds updated models based on
-#' the previously built models saved in the \code{Robyn.RDS} object specified
-#' in \code{robyn_object}.
For example, when updating the initial build with 4 -#' weeks of new data, \code{robyn_refresh()} consumes the selected model of -#' the initial build, sets lower and upper bounds of hyperparameters for the -#' new build around the selected hyperparameters of the previous build, -#' stabilizes the effect of baseline variables across old and new builds, and -#' regulates the new effect share of media variables towards the latest -#' spend level. It returns the aggregated results with all previous builds for -#' reporting purposes and produces reporting plots. -#' -#' You must run \code{robyn_save()} to select and save an initial model first, -#' before refreshing. -#' -#' \strong{When should \code{robyn_refresh()} NOT be used:} -#' The \code{robyn_refresh()} function is suitable for -#' updating within "reasonable periods". Two situations are considered better -#' to rebuild model instead of refreshing: -#' -#' 1. Most data is new: If initial model was trained with 100 weeks worth of -#' data but we add +50 weeks of new data. -#' -#' 2. New variables are added: If initial model had less variables than the ones -#' we want to start using on new refresh model. -#' -#' @inheritParams robyn_run -#' @inheritParams robyn_allocator -#' @inheritParams robyn_outputs -#' @inheritParams robyn_inputs -#' @inheritParams robyn_outputs -#' @param dt_input data.frame. Should include all previous data and newly added -#' data for the refresh. -#' @param dt_holidays data.frame. Raw input holiday data. Load standard -#' Prophet holidays using \code{data("dt_prophet_holidays")}. -#' @param refresh_steps Integer. It controls how many time units the refresh -#' model build move forward. For example, \code{refresh_steps = 4} on weekly data -#' means the \code{InputCollect$window_start} & \code{InputCollect$window_end} -#' move forward 4 weeks. If \code{refresh_steps} is smaller than the number of -#' newly provided data points, then Robyn would only use the first N steps of the -#' new data. -#' @param refresh_mode Character. Options are "auto" and "manual". In auto mode, -#' the \code{robyn_refresh()} function builds refresh models with given -#' \code{refresh_steps} repeatedly until there's no more data available. I -#' manual mode, the \code{robyn_refresh()} only moves forward \code{refresh_steps} -#' only once. "auto" mode has been deprecated when using \code{json_file} input. -#' @param refresh_iters Integer. Iterations per refresh. Rule of thumb is, the -#' more new data added, the more iterations needed. More reliable recommendation -#' still needs to be investigated. -#' @param refresh_trials Integer. Trials per refresh. Defaults to 5 trials. -#' More reliable recommendation still needs to be investigated. -#' @param bounds_freedom Numeric. Percentage of freedom we'd like to allow for the -#' new hyperparameters values compared with the model to be refreshed. -#' If set to NULL (default) the value will be calculated as -#' refresh_steps / rollingWindowLength. Applies to all hyperparameters. -#' @param version_prompt Logical. If FALSE, the model refresh version will be -#' selected based on the smallest combined error of normalized NRMSE, DECOMP.RSSD, MAPE. -#' If \code{TRUE}, a prompt will be presented to the user to select one of the refreshed -#' models (one-pagers and Pareto CSV files will already be generated). -#' @param ... Additional parameters to overwrite original custom parameters -#' passed into initial model. -#' @return List. The Robyn object, class \code{robyn_refresh}. 
-#' @examples -#' \dontrun{ -#' # Loading dummy data -#' data("dt_simulated_weekly") -#' data("dt_prophet_holidays") -#' # Set the (pre-trained and exported) Robyn model JSON file -#' json_file <- "~/Robyn_202208081444_init/RobynModel-2_55_4.json" -#' -#' # Run \code{robyn_refresh()} with 13 weeks cadence in auto mode -#' Robyn <- robyn_refresh( -#' json_file = json_file, -#' dt_input = dt_simulated_weekly, -#' dt_holidays = Robyn::dt_prophet_holidays, -#' refresh_steps = 13, -#' refresh_mode = "auto", -#' refresh_iters = 200, -#' refresh_trials = 5 -#' ) -#' -#' # Run \code{robyn_refresh()} with 4 weeks cadence in manual mode -#' json_file2 <- "~/Robyn_202208081444_init/Robyn_202208090847_rf/RobynModel-1_2_3.json" -#' Robyn <- robyn_refresh( -#' json_file = json_file2, -#' dt_input = dt_simulated_weekly, -#' dt_holidays = Robyn::dt_prophet_holidays, -#' refresh_steps = 4, -#' refresh_mode = "manual", -#' refresh_iters = 200, -#' refresh_trials = 5 -#' ) -#' } -#' @return List. Same as \code{robyn_run()} but with refreshed models. -#' @export -robyn_refresh <- function(json_file = NULL, - robyn_object = NULL, - dt_input = NULL, - dt_holidays = Robyn::dt_prophet_holidays, - refresh_steps = 4, - refresh_mode = "manual", - refresh_iters = 1000, - refresh_trials = 3, - bounds_freedom = NULL, - plot_folder = NULL, - plot_pareto = TRUE, - version_prompt = FALSE, - export = TRUE, - calibration_input = NULL, - objective_weights = NULL, - ...) { - refreshControl <- TRUE - while (refreshControl) { - ## Check for NA values - check_nas(dt_input) - check_nas(dt_holidays) - - ## Load initial model - if (!is.null(json_file)) { - Robyn <- list() - json <- robyn_read(json_file, step = 2, quiet = TRUE) - if (is.null(plot_folder)) plot_folder <- json$ExportedModel$plot_folder - if (!dir.exists(plot_folder) & export) { - message(sprintf( - paste0( - "NOTE: Directory from JSON file doesn't exist: %s\n", - ">> Using current working directory for outputs: %s" - ), - plot_folder, getwd() - )) - plot_folder <- getwd() - } - listInit <- suppressWarnings(robyn_recreate( - json_file = json_file, - dt_input = if (!is.null(dt_input)) dt_input else json$Extras[["raw_data"]], - dt_holidays = dt_holidays, - plot_folder = plot_folder, - quiet = FALSE, ... - )) - listInit$InputCollect$refreshSourceID <- json$ExportedModel$select_model - chainData <- robyn_chain(json_file) - listInit$InputCollect$refreshChain <- attr(chainData, "chain") - listInit$InputCollect$refreshDepth <- refreshDepth <- length(attr(chainData, "chain")) - listInit$OutputCollect$hyper_updated <- json$ExportedModel$hyper_updated - listInit$InputCollect$window_end <- json$InputCollect$window_end - Robyn[["listInit"]] <- listInit - refreshCounter <- 1 # Dummy for now (legacy) - } - if (!is.null(robyn_object)) { - RobynImported <- robyn_load(robyn_object) - Robyn <- RobynImported$Robyn - plot_folder <- RobynImported$objectPath - robyn_object <- RobynImported$robyn_object - refreshCounter <- length(Robyn) - sum(names(Robyn) == "refresh") - refreshDepth <- NULL # Dummy for now (legacy) - } - depth <- ifelse(!is.null(refreshDepth), refreshDepth, refreshCounter) - - objectCheck <- if (refreshCounter == 1) { - "listInit" - } else { - c("listInit", paste0("listRefresh", 1:(refreshCounter - 1))) - } - if (!all(objectCheck %in% names(Robyn))) { - stop( - "Saved Robyn object is corrupted. It should contain these elements:\n ", - paste(objectCheck, collapse = ", "), - ".\n Please, re run the model or fix it manually." 
- ) - } - - ## Check rule of thumb: 50% of data shouldn't be new - dt_input <- Robyn$listInit$InputCollect$dt_input - check_refresh_data(Robyn, dt_input) - - ## Get previous data - if (refreshCounter == 1) { - InputCollectRF <- Robyn$listInit$InputCollect - listOutputPrev <- Robyn$listInit$OutputCollect - InputCollectRF$xDecompAggPrev <- listOutputPrev$xDecompAgg - if (length(unique(Robyn$listInit$OutputCollect$resultHypParam$solID)) > 1) { - stop("Run robyn_write() first to select and export any Robyn model") - } - } else { - listName <- paste0("listRefresh", refreshCounter - 1) - InputCollectRF <- Robyn[[listName]][["InputCollect"]] - listOutputPrev <- Robyn[[listName]][["OutputCollect"]] - listReportPrev <- Robyn[[listName]][["ReportCollect"]] - ## Model selection from previous build (new normalization range for error_score) - if (!"error_score" %in% names(listOutputPrev$resultHypParam)) { - listOutputPrev$resultHypParam <- as.data.frame(listOutputPrev$resultHypParam) %>% - mutate(error_score = errors_scores(., ts_validation = listOutputPrev$OutputModels$ts_validation, ...)) - } - which_bestModRF <- which.min(listOutputPrev$resultHypParam$error_score)[1] - listOutputPrev$resultHypParam <- listOutputPrev$resultHypParam[which_bestModRF, ] - listOutputPrev$xDecompAgg <- listOutputPrev$xDecompAgg[which_bestModRF, ] - listOutputPrev$mediaVecCollect <- listOutputPrev$mediaVecCollect[which_bestModRF, ] - listOutputPrev$xDecompVecCollect <- listOutputPrev$xDecompVecCollect[which_bestModRF, ] - } - - InputCollectRF$refreshCounter <- refreshCounter - InputCollectRF$refresh_steps <- refresh_steps - if (refresh_steps >= InputCollectRF$rollingWindowLength) { - stop("Refresh input data is completely new. Please rebuild model using robyn_run().") - } - - ## Load new data - if (TRUE) { - dt_input <- as_tibble(as.data.frame(dt_input)) - date_input <- check_datevar(dt_input, InputCollectRF$date_var) - dt_input <- date_input$dt_input # sort date by ascending - InputCollectRF$dt_input <- dt_input - dt_holidays <- as_tibble(as.data.frame(dt_holidays)) - InputCollectRF$dt_holidays <- dt_holidays - } - - #### Update refresh model parameters - - ## Refresh rolling window - if (TRUE) { - InputCollectRF$refreshAddedStart <- as.Date(InputCollectRF$window_end) + InputCollectRF$dayInterval - totalDates <- as.Date(dt_input[, InputCollectRF$date_var][[1]]) - refreshStart <- InputCollectRF$window_start <- as.Date(InputCollectRF$window_start) + InputCollectRF$dayInterval * refresh_steps - refreshStartWhich <- InputCollectRF$rollingWindowStartWhich <- which.min(abs(difftime(totalDates, refreshStart, units = "days"))) - refreshEnd <- InputCollectRF$window_end <- as.Date(InputCollectRF$window_end) + InputCollectRF$dayInterval * refresh_steps - refreshEndWhich <- InputCollectRF$rollingWindowEndWhich <- which.min(abs(difftime(totalDates, refreshEnd, units = "days"))) - InputCollectRF$rollingWindowLength <- refreshEndWhich - refreshStartWhich + 1 - } - - if (refreshEnd > max(totalDates)) { - stop("Not enough data for this refresh. Input data from date ", refreshEnd, " or later required") - } - if (!is.null(json_file) && refresh_mode == "auto") { - message("Input 'refresh_mode' = 'auto' has been deprecated. 
Changed to 'manual'") - refresh_mode <- "manual" - } - if (refresh_mode == "manual") { - refreshLooper <- 1 - message(sprintf("\n>>> Building refresh model #%s in %s mode", depth, refresh_mode)) - refreshControl <- FALSE - } else { - refreshLooper <- floor(as.numeric(difftime(max(totalDates), refreshEnd, units = "days")) / - InputCollectRF$dayInterval / refresh_steps) - message(sprintf( - "\n>>> Building refresh model #%s in %s mode. %s more to go...", - depth, refresh_mode, refreshLooper - )) - } - - #### Update refresh model parameters - - ## Calibration new data - if (!is.null(calibration_input)) { - calibration_input <- bind_rows( - InputCollectRF$calibration_input %>% - mutate( - liftStartDate = as.Date(.data$liftStartDate), - liftEndDate = as.Date(.data$liftEndDate) - ), calibration_input - ) %>% distinct() - ## Check calibration data - calibration_input <- check_calibration( - dt_input = InputCollectRF$dt_input, - date_var = InputCollectRF$date_var, - calibration_input = calibration_input, - dayInterval = InputCollectRF$dayInterval, - dep_var = InputCollectRF$dep_var, - window_start = InputCollectRF$window_start, - window_end = InputCollectRF$window_end, - paid_media_spends = InputCollectRF$paid_media_spends, - organic_vars = InputCollectRF$organic_vars - ) - InputCollectRF$calibration_input <- calibration_input - } - - ## Refresh hyperparameter bounds - ts_validation <- ifelse( - "ts_validation" %in% names(list(...)), - isTRUE(list(...)[["ts_validation"]]), - isTRUE(Robyn$listInit$OutputCollect$OutputModels$ts_validation) - ) - InputCollectRF$hyperparameters <- refresh_hyps( - initBounds = Robyn$listInit$OutputCollect$hyper_updated, - listOutputPrev, - refresh_steps = refresh_steps, - rollingWindowLength = InputCollectRF$rollingWindowLength, - ts_validation = ts_validation, - bounds_freedom = bounds_freedom - ) - - ## Feature engineering for refreshed data - # Note that if custom prophet parameters were passed initially, - # will be used again unless changed in ... - InputCollectRF <- robyn_engineering(InputCollectRF, ...) - - ## Refresh model with adjusted decomp.rssd - # OutputCollectRF <- Robyn$listRefresh1$OutputCollect - if (is.null(InputCollectRF$calibration_input)) { - rf_cal_constr <- listOutputPrev[["calibration_constraint"]] - } else { - rf_cal_constr <- 1 - } - OutputModelsRF <- robyn_run( - InputCollect = InputCollectRF, - iterations = refresh_iters, - trials = refresh_trials, - refresh = TRUE, - add_penalty_factor = listOutputPrev[["add_penalty_factor"]], - ts_validation = ts_validation, - ... - ) - OutputCollectRF <- robyn_outputs( - InputCollectRF, OutputModelsRF, - select_model = "refreshed", - plot_folder = plot_folder, - calibration_constraint = rf_cal_constr, - export = export, - plot_pareto = plot_pareto, - objective_weights = objective_weights, - ... 
- ) - - ## Select winner model for current refresh (the lower error_score the better) - OutputCollectRF$resultHypParam <- OutputCollectRF$resultHypParam %>% - ungroup() %>% - arrange(.data$decomp.rssd) %>% - select(.data$solID, everything()) - bestMod <- OutputCollectRF$resultHypParam$solID[1] - # OutputCollectRF$clusters$data %>% filter(solID == bestMod) - - # Pick best model (and don't crash if not valid) - selectID <- NULL - while (length(selectID) == 0) { - if (version_prompt) { - selectID <- readline("Input model ID to use for the refresh: ") - message( - "Selected model ID: ", selectID, " for refresh model #", - depth, " based on your input" - ) - if (!selectID %in% OutputCollectRF$allSolutions) { - message(sprintf( - "Selected model (%s) NOT valid.\n Choose any of: %s", - selectID, v2t(OutputCollectRF$allSolutions) - )) - } - } else { - selectID <- bestMod - message( - "Selected model ID: ", selectID, " for refresh model #", - depth, " based on the smallest DECOMP.RSSD error" - ) - if (export) { - robyn_write(InputCollectRF, OutputCollectRF, select_model = selectID, ...) - } - } - if (!isTRUE(selectID %in% OutputCollectRF$allSolutions)) { - version_prompt <- TRUE - selectID <- NULL - } - } - OutputCollectRF$selectID <- selectID - - #### Result collect & save - - # Add refreshStatus column to multiple OutputCollectRF data.frames - these <- c("resultHypParam", "xDecompAgg", "mediaVecCollect", "xDecompVecCollect") - for (tb in these) { - OutputCollectRF[[tb]] <- OutputCollectRF[[tb]] %>% - mutate( - refreshStatus = refreshCounter, - bestModRF = .data$solID %in% bestMod - ) - } - - # Create bestModRF and refreshStatus columns to listOutputPrev data.frames - if (refreshCounter == 1) { - for (tb in these) { - listOutputPrev[[tb]] <- mutate( - listOutputPrev[[tb]], - bestModRF = TRUE, - refreshStatus = 0 - ) - } - listReportPrev <- listOutputPrev - names(listReportPrev) <- paste0(names(listReportPrev), "Report") - listReportPrev$mediaVecReport <- listOutputPrev$mediaVecCollect %>% - filter( - .data$ds >= (refreshStart - InputCollectRF$dayInterval * refresh_steps), - .data$ds <= (refreshEnd - InputCollectRF$dayInterval * refresh_steps) - ) %>% - bind_rows( - OutputCollectRF$mediaVecCollect %>% - filter( - .data$bestModRF == TRUE, - .data$ds >= InputCollectRF$refreshAddedStart, - .data$ds <= refreshEnd - ) - ) %>% - arrange(.data$type, .data$ds, .data$refreshStatus) - listReportPrev$xDecompVecReport <- listOutputPrev$xDecompVecCollect %>% - bind_rows( - OutputCollectRF$xDecompVecCollect %>% - filter( - .data$bestModRF == TRUE, - .data$ds >= InputCollectRF$refreshAddedStart, - .data$ds <= refreshEnd - ) - ) - } - - resultHypParamReport <- listReportPrev$resultHypParamReport %>% - bind_rows( - filter(OutputCollectRF$resultHypParam, .data$bestModRF == TRUE) - ) %>% - mutate(refreshStatus = row_number() - 1) - - xDecompAggReport <- listReportPrev$xDecompAggReport %>% - bind_rows( - filter(OutputCollectRF$xDecompAgg, .data$bestModRF == TRUE) %>% - mutate(refreshStatus = refreshCounter) - ) - - mediaVecReport <- as_tibble(listReportPrev$mediaVecReport) %>% - mutate(ds = as.Date(.data$ds, origin = "1970-01-01")) %>% - bind_rows( - filter( - mutate(OutputCollectRF$mediaVecCollect, - ds = as.Date(.data$ds, origin = "1970-01-01") - ), - .data$bestModRF == TRUE, - .data$ds >= InputCollectRF$refreshAddedStart, - .data$ds <= refreshEnd - ) %>% - mutate(refreshStatus = refreshCounter) - ) %>% - arrange(.data$type, .data$ds, .data$refreshStatus) - - xDecompVecReport <- listReportPrev$xDecompVecReport %>% - 
mutate(ds = as.Date(.data$ds, origin = "1970-01-01")) %>% - bind_rows( - filter( - mutate(OutputCollectRF$xDecompVecCollect, - ds = as.Date(.data$ds, origin = "1970-01-01") - ), - .data$bestModRF == TRUE, - .data$ds >= InputCollectRF$refreshAddedStart, - .data$ds <= refreshEnd - ) %>% - mutate(refreshStatus = refreshCounter) - ) - - #### Result objects to export - ReportCollect <- list( - resultHypParamReport = resultHypParamReport, - xDecompAggReport = xDecompAggReport, - mediaVecReport = mediaVecReport, - xDecompVecReport = xDecompVecReport, - # Selected models (original + refresh IDs) - selectIDs = resultHypParamReport$solID - ) - listNameUpdate <- paste0("listRefresh", refreshCounter) - Robyn[[listNameUpdate]] <- list( - InputCollect = InputCollectRF, - OutputCollect = OutputCollectRF, - ReportCollect = ReportCollect - ) - - #### Reporting plots - # InputCollectRF <- Robyn$listRefresh1$InputCollect - # OutputCollectRF <- Robyn$listRefresh1$OutputCollect - # ReportCollect <- Robyn$listRefresh1$ReportCollect - if (!is.null(json_file)) { - json_temp <- robyn_write( - InputCollectRF, OutputCollectRF, - select_model = selectID, - export = TRUE, quiet = TRUE, ... - ) - df <- OutputCollectRF$allPareto$plotDataCollect[[selectID]] - plots <- try(refresh_plots_json( - json_file = attr(json_temp, "json_file"), - plot_folder = OutputCollectRF$plot_folder, - df = df, listInit = listInit, export = export, ... - )) - } else { - plots <- try(refresh_plots( - InputCollectRF, OutputCollectRF, ReportCollect, export, ... - )) - } - - if (export) { - csv_folder <- OutputCollectRF$plot_folder - message(paste(">>> Exporting refresh CSVs into directory:", csv_folder)) - write.csv(resultHypParamReport, paste0(csv_folder, "report_hyperparameters.csv")) - write.csv(xDecompAggReport, paste0(csv_folder, "report_aggregated.csv")) - write.csv(mediaVecReport, paste0(csv_folder, "report_media_transform_matrix.csv")) - write.csv(xDecompVecReport, paste0(csv_folder, "report_alldecomp_matrix.csv")) - } - - if (refreshLooper == 0) { - refreshControl <- FALSE - message("Reached maximum available date. No further refresh possible") - } - } - - # Save some parameters to print - Robyn[["refresh"]] <- list( - selectIDs = ReportCollect$selectIDs, - refresh_steps = refresh_steps, - refresh_mode = refresh_mode, - refresh_trials = refresh_trials, - refresh_iters = refresh_iters, - plots = plots - ) - - # Save Robyn object locally - Robyn <- Robyn[order(names(Robyn))] - class(Robyn) <- c("robyn_refresh", class(Robyn)) - if (is.null(json_file)) { - message(">> Exporting results: ", robyn_object) - saveRDS(Robyn, file = robyn_object) - } - - return(invisible(Robyn)) -} - -#' @rdname robyn_refresh -#' @aliases robyn_refresh -#' @param x \code{robyn_refresh()} output. -#' @export -print.robyn_refresh <- function(x, ...) { - top_models <- x$refresh$selectIDs - top_models <- paste(top_models, sprintf("(%s)", 0:(length(top_models) - 1))) - print(glued( - " -Refresh Models: {length(top_models)} -Mode: {x$refresh$refresh_mode} -Steps: {x$refresh$refresh_steps} -Trials: {x$refresh$refresh_trials} -Iterations: {x$refresh$refresh_iters} - -Models (IDs): - {paste(top_models, collapse = ', ')} -" - )) -} - -#' @rdname robyn_refresh -#' @aliases robyn_refresh -#' @param x \code{robyn_refresh()} output. -#' @export -plot.robyn_refresh <- function(x, ...) plot((x$refresh$plots[[1]] / x$refresh$plots[[2]]), ...) 
-
-refresh_hyps <- function(initBounds, listOutputPrev, refresh_steps, rollingWindowLength,
-                         ts_validation = FALSE, bounds_freedom = NULL) {
-  initBoundsDis <- unlist(lapply(initBounds, function(x) ifelse(length(x) == 2, x[2] - x[1], 0)))
-  if (is.null(bounds_freedom)) {
-    newBoundsFreedom <- refresh_steps / rollingWindowLength
-  } else {
-    newBoundsFreedom <- abs(bounds_freedom)
-  }
-  message(">>> New bounds freedom: ", round(100 * newBoundsFreedom, 2), "%")
-  hyper_updated_prev <- initBounds
-  hypNames <- names(initBounds)
-  resultHypParam <- as_tibble(listOutputPrev$resultHypParam)
-  for (h in seq_along(hypNames)) {
-    hn <- hypNames[h]
-    getHyp <- resultHypParam[, hn][[1]]
-    getDis <- initBoundsDis[hn]
-    if (hn == "lambda") {
-      lambda_max <- unique(resultHypParam$lambda_max)
-      lambda_min <- lambda_max * 0.0001
-      getHyp <- getHyp / (lambda_max - lambda_min)
-    }
-    getRange <- initBounds[hn][[1]]
-    if (length(getRange) == 2) {
-      newLowB <- getHyp - getDis * newBoundsFreedom
-      if (newLowB < getRange[1]) {
-        newLowB <- getRange[1]
-      }
-      newUpB <- getHyp + getDis * newBoundsFreedom
-      if (newUpB > getRange[2]) {
-        newUpB <- getRange[2]
-      }
-      newBounds <- unname(c(newLowB, newUpB))
-      hyper_updated_prev[[hn]] <- newBounds
-    } else {
-      fixed <- hyper_updated_prev[hn][[1]]
-      hyper_updated_prev[[hn]] <- c(
-        fixed * (1 - newBoundsFreedom),
-        fixed * (1 + newBoundsFreedom)
-      )
-    }
-  }
-  if (!ts_validation) hyper_updated_prev[["train_size"]] <- NULL
-  return(hyper_updated_prev)
-}
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+####################################################################
+#' Build Refresh Model
+#'
+#' @description
+#' \code{robyn_refresh()} builds updated models based on
+#' the previously built models saved in the \code{Robyn.RDS} object specified
+#' in \code{robyn_object}. For example, when updating the initial build with 4
+#' weeks of new data, \code{robyn_refresh()} consumes the selected model of
+#' the initial build, sets lower and upper bounds of hyperparameters for the
+#' new build around the selected hyperparameters of the previous build,
+#' stabilizes the effect of baseline variables across old and new builds, and
+#' regulates the new effect share of media variables towards the latest
+#' spend level. It returns the aggregated results with all previous builds for
+#' reporting purposes and produces reporting plots.
+#'
+#' You must run \code{robyn_save()} to select and save an initial model first,
+#' before refreshing.
+#'
+#' \strong{When should \code{robyn_refresh()} NOT be used:}
+#' The \code{robyn_refresh()} function is suitable for
+#' updating within "reasonable periods". In two situations it is better to
+#' rebuild the model instead of refreshing:
+#'
+#' 1. Most data is new: e.g. if the initial model was trained on 100 weeks of
+#' data and 50 more weeks of new data are added.
+#'
+#' 2. New variables are added: if the initial model had fewer variables than
+#' the ones we want to start using in the new refresh model.
+#'
+#' @inheritParams robyn_run
+#' @inheritParams robyn_allocator
+#' @inheritParams robyn_outputs
+#' @inheritParams robyn_inputs
+#' @param dt_input data.frame. Should include all previous data and newly added
+#' data for the refresh.
+#' @param dt_holidays data.frame. Raw input holiday data. Load standard
+#' Prophet holidays using \code{data("dt_prophet_holidays")}.
+#' @param refresh_steps Integer. It controls how many time units the refresh
+#' model build moves forward. For example, \code{refresh_steps = 4} on weekly data
+#' means the \code{InputCollect$window_start} & \code{InputCollect$window_end}
+#' move forward 4 weeks. If \code{refresh_steps} is smaller than the number of
+#' newly provided data points, then Robyn will only use the first N steps of the
+#' new data.
+#' @param refresh_mode Character. Options are "auto" and "manual". In auto mode,
+#' the \code{robyn_refresh()} function builds refresh models with the given
+#' \code{refresh_steps} repeatedly until there's no more data available. In
+#' manual mode, \code{robyn_refresh()} moves forward \code{refresh_steps}
+#' only once. "auto" mode has been deprecated when using \code{json_file} input.
+#' @param refresh_iters Integer. Iterations per refresh. Rule of thumb: the
+#' more new data is added, the more iterations are needed. A more reliable
+#' recommendation still needs to be investigated.
+#' @param refresh_trials Integer. Trials per refresh. Defaults to 3 trials.
+#' A more reliable recommendation still needs to be investigated.
+#' @param bounds_freedom Numeric. Percentage of freedom we'd like to allow for the
+#' new hyperparameter values compared with the model to be refreshed.
+#' If set to NULL (default) the value will be calculated as
+#' refresh_steps / rollingWindowLength. Applies to all hyperparameters.
+#' @param version_prompt Logical. If \code{FALSE}, the model refresh version will be
+#' selected based on the smallest combined error of normalized NRMSE, DECOMP.RSSD, MAPE.
+#' If \code{TRUE}, a prompt will be presented to the user to select one of the refreshed
+#' models (one-pagers and Pareto CSV files will already be generated).
+#' @param ... Additional parameters to overwrite original custom parameters
+#' passed into the initial model.
+#' @return List. The Robyn object, class \code{robyn_refresh}. Same as
+#' \code{robyn_run()} but with refreshed models.
+#' @examples
+#' \dontrun{
+#' # Loading dummy data
+#' data("dt_simulated_weekly")
+#' data("dt_prophet_holidays")
+#' # Set the (pre-trained and exported) Robyn model JSON file
+#' json_file <- "~/Robyn_202208081444_init/RobynModel-2_55_4.json"
+#'
+#' # Run \code{robyn_refresh()} with a 13-week cadence in auto mode
+#' Robyn <- robyn_refresh(
+#'   json_file = json_file,
+#'   dt_input = dt_simulated_weekly,
+#'   dt_holidays = Robyn::dt_prophet_holidays,
+#'   refresh_steps = 13,
+#'   refresh_mode = "auto",
+#'   refresh_iters = 200,
+#'   refresh_trials = 5
+#' )
+#'
+#' # Run \code{robyn_refresh()} with a 4-week cadence in manual mode
+#' json_file2 <- "~/Robyn_202208081444_init/Robyn_202208090847_rf/RobynModel-1_2_3.json"
+#' Robyn <- robyn_refresh(
+#'   json_file = json_file2,
+#'   dt_input = dt_simulated_weekly,
+#'   dt_holidays = Robyn::dt_prophet_holidays,
+#'   refresh_steps = 4,
+#'   refresh_mode = "manual",
+#'   refresh_iters = 200,
+#'   refresh_trials = 5
+#' )
+#' }
+#' @export
+robyn_refresh <- function(json_file = NULL,
+                          robyn_object = NULL,
+                          dt_input = NULL,
+                          dt_holidays = Robyn::dt_prophet_holidays,
+                          refresh_steps = 4,
+                          refresh_mode = "manual",
+                          refresh_iters = 1000,
+                          refresh_trials = 3,
+                          bounds_freedom = NULL,
+                          plot_folder = NULL,
+                          plot_pareto = TRUE,
+                          version_prompt = FALSE,
+                          export = TRUE,
+                          calibration_input = NULL,
+                          objective_weights = NULL,
+                          ...)
{
+  refreshControl <- TRUE
+  while (refreshControl) {
+    ## Check for NA values
+    check_nas(dt_input)
+    check_nas(dt_holidays)
+
+    ## Load initial model
+    if (!is.null(json_file)) {
+      Robyn <- list()
+      json <- robyn_read(json_file, step = 2, quiet = TRUE)
+      if (is.null(plot_folder)) plot_folder <- json$ExportedModel$plot_folder
+      if (!dir.exists(plot_folder) & export) {
+        message(sprintf(
+          paste0(
+            "NOTE: Directory from JSON file doesn't exist: %s\n",
+            ">> Using current working directory for outputs: %s"
+          ),
+          plot_folder, getwd()
+        ))
+        plot_folder <- getwd()
+      }
+      listInit <- suppressWarnings(robyn_recreate(
+        json_file = json_file,
+        dt_input = if (!is.null(dt_input)) dt_input else json$Extras[["raw_data"]],
+        dt_holidays = dt_holidays,
+        plot_folder = plot_folder,
+        quiet = FALSE, ...
+      ))
+      listInit$InputCollect$refreshSourceID <- json$ExportedModel$select_model
+      chainData <- robyn_chain(json_file)
+      listInit$InputCollect$refreshChain <- attr(chainData, "chain")
+      listInit$InputCollect$refreshDepth <- refreshDepth <- length(attr(chainData, "chain"))
+      listInit$OutputCollect$hyper_updated <- json$ExportedModel$hyper_updated
+      listInit$InputCollect$window_end <- json$InputCollect$window_end
+      Robyn[["listInit"]] <- listInit
+      refreshCounter <- 1 # Dummy for now (legacy)
+    }
+    if (!is.null(robyn_object)) {
+      RobynImported <- robyn_load(robyn_object)
+      Robyn <- RobynImported$Robyn
+      plot_folder <- RobynImported$objectPath
+      robyn_object <- RobynImported$robyn_object
+      refreshCounter <- length(Robyn) - sum(names(Robyn) == "refresh")
+      refreshDepth <- NULL # Dummy for now (legacy)
+    }
+    depth <- ifelse(!is.null(refreshDepth), refreshDepth, refreshCounter)
+
+    objectCheck <- if (refreshCounter == 1) {
+      "listInit"
+    } else {
+      c("listInit", paste0("listRefresh", 1:(refreshCounter - 1)))
+    }
+    if (!all(objectCheck %in% names(Robyn))) {
+      stop(
+        "Saved Robyn object is corrupted. It should contain these elements:\n ",
+        paste(objectCheck, collapse = ", "),
+        ".\n Please re-run the model or fix it manually."
+ ) + } + + ## Check rule of thumb: 50% of data shouldn't be new + dt_input <- Robyn$listInit$InputCollect$dt_input + check_refresh_data(Robyn, dt_input) + + ## Get previous data + if (refreshCounter == 1) { + InputCollectRF <- Robyn$listInit$InputCollect + listOutputPrev <- Robyn$listInit$OutputCollect + InputCollectRF$xDecompAggPrev <- listOutputPrev$xDecompAgg + if (length(unique(Robyn$listInit$OutputCollect$resultHypParam$solID)) > 1) { + stop("Run robyn_write() first to select and export any Robyn model") + } + } else { + listName <- paste0("listRefresh", refreshCounter - 1) + InputCollectRF <- Robyn[[listName]][["InputCollect"]] + listOutputPrev <- Robyn[[listName]][["OutputCollect"]] + listReportPrev <- Robyn[[listName]][["ReportCollect"]] + ## Model selection from previous build (new normalization range for error_score) + if (!"error_score" %in% names(listOutputPrev$resultHypParam)) { + listOutputPrev$resultHypParam <- as.data.frame(listOutputPrev$resultHypParam) %>% + mutate(error_score = errors_scores(., ts_validation = listOutputPrev$OutputModels$ts_validation, ...)) + } + which_bestModRF <- which.min(listOutputPrev$resultHypParam$error_score)[1] + listOutputPrev$resultHypParam <- listOutputPrev$resultHypParam[which_bestModRF, ] + listOutputPrev$xDecompAgg <- listOutputPrev$xDecompAgg[which_bestModRF, ] + listOutputPrev$mediaVecCollect <- listOutputPrev$mediaVecCollect[which_bestModRF, ] + listOutputPrev$xDecompVecCollect <- listOutputPrev$xDecompVecCollect[which_bestModRF, ] + } + + InputCollectRF$refreshCounter <- refreshCounter + InputCollectRF$refresh_steps <- refresh_steps + if (refresh_steps >= InputCollectRF$rollingWindowLength) { + stop("Refresh input data is completely new. Please rebuild model using robyn_run().") + } + + ## Load new data + if (TRUE) { + dt_input <- as_tibble(as.data.frame(dt_input)) + date_input <- check_datevar(dt_input, InputCollectRF$date_var) + dt_input <- date_input$dt_input # sort date by ascending + InputCollectRF$dt_input <- dt_input + dt_holidays <- as_tibble(as.data.frame(dt_holidays)) + InputCollectRF$dt_holidays <- dt_holidays + } + + #### Update refresh model parameters + + ## Refresh rolling window + if (TRUE) { + InputCollectRF$refreshAddedStart <- as.Date(InputCollectRF$window_end) + InputCollectRF$dayInterval + totalDates <- as.Date(dt_input[, InputCollectRF$date_var][[1]]) + refreshStart <- InputCollectRF$window_start <- as.Date(InputCollectRF$window_start) + InputCollectRF$dayInterval * refresh_steps + refreshStartWhich <- InputCollectRF$rollingWindowStartWhich <- which.min(abs(difftime(totalDates, refreshStart, units = "days"))) + refreshEnd <- InputCollectRF$window_end <- as.Date(InputCollectRF$window_end) + InputCollectRF$dayInterval * refresh_steps + refreshEndWhich <- InputCollectRF$rollingWindowEndWhich <- which.min(abs(difftime(totalDates, refreshEnd, units = "days"))) + InputCollectRF$rollingWindowLength <- refreshEndWhich - refreshStartWhich + 1 + } + + if (refreshEnd > max(totalDates)) { + stop("Not enough data for this refresh. Input data from date ", refreshEnd, " or later required") + } + if (!is.null(json_file) && refresh_mode == "auto") { + message("Input 'refresh_mode' = 'auto' has been deprecated. 
Changed to 'manual'") + refresh_mode <- "manual" + } + if (refresh_mode == "manual") { + refreshLooper <- 1 + message(sprintf("\n>>> Building refresh model #%s in %s mode", depth, refresh_mode)) + refreshControl <- FALSE + } else { + refreshLooper <- floor(as.numeric(difftime(max(totalDates), refreshEnd, units = "days")) / + InputCollectRF$dayInterval / refresh_steps) + message(sprintf( + "\n>>> Building refresh model #%s in %s mode. %s more to go...", + depth, refresh_mode, refreshLooper + )) + } + + #### Update refresh model parameters + + ## Calibration new data + if (!is.null(calibration_input)) { + calibration_input <- bind_rows( + InputCollectRF$calibration_input %>% + mutate( + liftStartDate = as.Date(.data$liftStartDate), + liftEndDate = as.Date(.data$liftEndDate) + ), calibration_input + ) %>% distinct() + ## Check calibration data + calibration_input <- check_calibration( + dt_input = InputCollectRF$dt_input, + date_var = InputCollectRF$date_var, + calibration_input = calibration_input, + dayInterval = InputCollectRF$dayInterval, + dep_var = InputCollectRF$dep_var, + window_start = InputCollectRF$window_start, + window_end = InputCollectRF$window_end, + paid_media_spends = InputCollectRF$paid_media_spends, + organic_vars = InputCollectRF$organic_vars + ) + InputCollectRF$calibration_input <- calibration_input + } + + ## Refresh hyperparameter bounds + ts_validation <- ifelse( + "ts_validation" %in% names(list(...)), + isTRUE(list(...)[["ts_validation"]]), + isTRUE(Robyn$listInit$OutputCollect$OutputModels$ts_validation) + ) + InputCollectRF$hyperparameters <- refresh_hyps( + initBounds = Robyn$listInit$OutputCollect$hyper_updated, + listOutputPrev, + refresh_steps = refresh_steps, + rollingWindowLength = InputCollectRF$rollingWindowLength, + ts_validation = ts_validation, + bounds_freedom = bounds_freedom + ) + + ## Feature engineering for refreshed data + # Note that if custom prophet parameters were passed initially, + # will be used again unless changed in ... + InputCollectRF <- robyn_engineering(InputCollectRF, ...) + + ## Refresh model with adjusted decomp.rssd + # OutputCollectRF <- Robyn$listRefresh1$OutputCollect + if (is.null(InputCollectRF$calibration_input)) { + rf_cal_constr <- listOutputPrev[["calibration_constraint"]] + } else { + rf_cal_constr <- 1 + } + OutputModelsRF <- robyn_run( + InputCollect = InputCollectRF, + iterations = refresh_iters, + trials = refresh_trials, + refresh = TRUE, + add_penalty_factor = listOutputPrev[["add_penalty_factor"]], + ts_validation = ts_validation, + ... + ) + OutputCollectRF <- robyn_outputs( + InputCollectRF, OutputModelsRF, + select_model = "refreshed", + plot_folder = plot_folder, + calibration_constraint = rf_cal_constr, + export = export, + plot_pareto = plot_pareto, + objective_weights = objective_weights, + ... 
+ ) + + ## Select winner model for current refresh (the lower error_score the better) + OutputCollectRF$resultHypParam <- OutputCollectRF$resultHypParam %>% + ungroup() %>% + arrange(.data$decomp.rssd) %>% + select(.data$solID, everything()) + bestMod <- OutputCollectRF$resultHypParam$solID[1] + # OutputCollectRF$clusters$data %>% filter(solID == bestMod) + + # Pick best model (and don't crash if not valid) + selectID <- NULL + while (length(selectID) == 0) { + if (version_prompt) { + selectID <- readline("Input model ID to use for the refresh: ") + message( + "Selected model ID: ", selectID, " for refresh model #", + depth, " based on your input" + ) + if (!selectID %in% OutputCollectRF$allSolutions) { + message(sprintf( + "Selected model (%s) NOT valid.\n Choose any of: %s", + selectID, v2t(OutputCollectRF$allSolutions) + )) + } + } else { + selectID <- bestMod + message( + "Selected model ID: ", selectID, " for refresh model #", + depth, " based on the smallest DECOMP.RSSD error" + ) + if (export) { + robyn_write(InputCollectRF, OutputCollectRF, select_model = selectID, ...) + } + } + if (!isTRUE(selectID %in% OutputCollectRF$allSolutions)) { + version_prompt <- TRUE + selectID <- NULL + } + } + OutputCollectRF$selectID <- selectID + + #### Result collect & save + + # Add refreshStatus column to multiple OutputCollectRF data.frames + these <- c("resultHypParam", "xDecompAgg", "mediaVecCollect", "xDecompVecCollect") + for (tb in these) { + OutputCollectRF[[tb]] <- OutputCollectRF[[tb]] %>% + mutate( + refreshStatus = refreshCounter, + bestModRF = .data$solID %in% bestMod + ) + } + + # Create bestModRF and refreshStatus columns to listOutputPrev data.frames + if (refreshCounter == 1) { + for (tb in these) { + listOutputPrev[[tb]] <- mutate( + listOutputPrev[[tb]], + bestModRF = TRUE, + refreshStatus = 0 + ) + } + listReportPrev <- listOutputPrev + names(listReportPrev) <- paste0(names(listReportPrev), "Report") + listReportPrev$mediaVecReport <- listOutputPrev$mediaVecCollect %>% + filter( + .data$ds >= (refreshStart - InputCollectRF$dayInterval * refresh_steps), + .data$ds <= (refreshEnd - InputCollectRF$dayInterval * refresh_steps) + ) %>% + bind_rows( + OutputCollectRF$mediaVecCollect %>% + filter( + .data$bestModRF == TRUE, + .data$ds >= InputCollectRF$refreshAddedStart, + .data$ds <= refreshEnd + ) + ) %>% + arrange(.data$type, .data$ds, .data$refreshStatus) + listReportPrev$xDecompVecReport <- listOutputPrev$xDecompVecCollect %>% + bind_rows( + OutputCollectRF$xDecompVecCollect %>% + filter( + .data$bestModRF == TRUE, + .data$ds >= InputCollectRF$refreshAddedStart, + .data$ds <= refreshEnd + ) + ) + } + + resultHypParamReport <- listReportPrev$resultHypParamReport %>% + bind_rows( + filter(OutputCollectRF$resultHypParam, .data$bestModRF == TRUE) + ) %>% + mutate(refreshStatus = row_number() - 1) + + xDecompAggReport <- listReportPrev$xDecompAggReport %>% + bind_rows( + filter(OutputCollectRF$xDecompAgg, .data$bestModRF == TRUE) %>% + mutate(refreshStatus = refreshCounter) + ) + + mediaVecReport <- as_tibble(listReportPrev$mediaVecReport) %>% + mutate(ds = as.Date(.data$ds, origin = "1970-01-01")) %>% + bind_rows( + filter( + mutate(OutputCollectRF$mediaVecCollect, + ds = as.Date(.data$ds, origin = "1970-01-01") + ), + .data$bestModRF == TRUE, + .data$ds >= InputCollectRF$refreshAddedStart, + .data$ds <= refreshEnd + ) %>% + mutate(refreshStatus = refreshCounter) + ) %>% + arrange(.data$type, .data$ds, .data$refreshStatus) + + xDecompVecReport <- listReportPrev$xDecompVecReport %>% + 
mutate(ds = as.Date(.data$ds, origin = "1970-01-01")) %>% + bind_rows( + filter( + mutate(OutputCollectRF$xDecompVecCollect, + ds = as.Date(.data$ds, origin = "1970-01-01") + ), + .data$bestModRF == TRUE, + .data$ds >= InputCollectRF$refreshAddedStart, + .data$ds <= refreshEnd + ) %>% + mutate(refreshStatus = refreshCounter) + ) + + #### Result objects to export + ReportCollect <- list( + resultHypParamReport = resultHypParamReport, + xDecompAggReport = xDecompAggReport, + mediaVecReport = mediaVecReport, + xDecompVecReport = xDecompVecReport, + # Selected models (original + refresh IDs) + selectIDs = resultHypParamReport$solID + ) + listNameUpdate <- paste0("listRefresh", refreshCounter) + Robyn[[listNameUpdate]] <- list( + InputCollect = InputCollectRF, + OutputCollect = OutputCollectRF, + ReportCollect = ReportCollect + ) + + #### Reporting plots + # InputCollectRF <- Robyn$listRefresh1$InputCollect + # OutputCollectRF <- Robyn$listRefresh1$OutputCollect + # ReportCollect <- Robyn$listRefresh1$ReportCollect + if (!is.null(json_file)) { + json_temp <- robyn_write( + InputCollectRF, OutputCollectRF, + select_model = selectID, + export = TRUE, quiet = TRUE, ... + ) + df <- OutputCollectRF$allPareto$plotDataCollect[[selectID]] + plots <- try(refresh_plots_json( + json_file = attr(json_temp, "json_file"), + plot_folder = OutputCollectRF$plot_folder, + df = df, listInit = listInit, export = export, ... + )) + } else { + plots <- try(refresh_plots( + InputCollectRF, OutputCollectRF, ReportCollect, export, ... + )) + } + + if (export) { + csv_folder <- OutputCollectRF$plot_folder + message(paste(">>> Exporting refresh CSVs into directory:", csv_folder)) + write.csv(resultHypParamReport, paste0(csv_folder, "report_hyperparameters.csv")) + write.csv(xDecompAggReport, paste0(csv_folder, "report_aggregated.csv")) + write.csv(mediaVecReport, paste0(csv_folder, "report_media_transform_matrix.csv")) + write.csv(xDecompVecReport, paste0(csv_folder, "report_alldecomp_matrix.csv")) + } + + if (refreshLooper == 0) { + refreshControl <- FALSE + message("Reached maximum available date. No further refresh possible") + } + } + + # Save some parameters to print + Robyn[["refresh"]] <- list( + selectIDs = ReportCollect$selectIDs, + refresh_steps = refresh_steps, + refresh_mode = refresh_mode, + refresh_trials = refresh_trials, + refresh_iters = refresh_iters, + plots = plots + ) + + # Save Robyn object locally + Robyn <- Robyn[order(names(Robyn))] + class(Robyn) <- c("robyn_refresh", class(Robyn)) + if (is.null(json_file)) { + message(">> Exporting results: ", robyn_object) + saveRDS(Robyn, file = robyn_object) + } + + return(invisible(Robyn)) +} + +#' @rdname robyn_refresh +#' @aliases robyn_refresh +#' @param x \code{robyn_refresh()} output. +#' @export +print.robyn_refresh <- function(x, ...) { + top_models <- x$refresh$selectIDs + top_models <- paste(top_models, sprintf("(%s)", 0:(length(top_models) - 1))) + print(glued( + " +Refresh Models: {length(top_models)} +Mode: {x$refresh$refresh_mode} +Steps: {x$refresh$refresh_steps} +Trials: {x$refresh$refresh_trials} +Iterations: {x$refresh$refresh_iters} + +Models (IDs): + {paste(top_models, collapse = ', ')} +" + )) +} + +#' @rdname robyn_refresh +#' @aliases robyn_refresh +#' @param x \code{robyn_refresh()} output. +#' @export +plot.robyn_refresh <- function(x, ...) plot((x$refresh$plots[[1]] / x$refresh$plots[[2]]), ...) 
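+
+# Illustration (hypothetical values, kept commented out so nothing executes):
+# how refresh_hyps() below narrows a length-2 hyperparameter range. With
+# refresh_steps = 4 and rollingWindowLength = 100, the bounds freedom is
+# 4 / 100 = 0.04; for an initial range c(0, 0.3) and a previously selected
+# value of 0.2, the refreshed bounds become 0.2 +/- (0.3 - 0) * 0.04, clipped
+# to the initial range:
+# freedom <- 4 / 100 # refresh_steps / rollingWindowLength
+# range0 <- c(0, 0.3) # e.g. initial facebook_S_thetas bounds (hypothetical)
+# prev <- 0.2 # theta selected in the previous build (hypothetical)
+# new_bounds <- pmin(pmax(prev + c(-1, 1) * diff(range0) * freedom, range0[1]), range0[2])
+# new_bounds # 0.188 0.212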
+ +refresh_hyps <- function(initBounds, listOutputPrev, refresh_steps, rollingWindowLength, + ts_validation = FALSE, bounds_freedom = NULL) { + initBoundsDis <- unlist(lapply(initBounds, function(x) ifelse(length(x) == 2, x[2] - x[1], 0))) + if (is.null(bounds_freedom)) { + newBoundsFreedom <- refresh_steps / rollingWindowLength + } else { + newBoundsFreedom <- abs(bounds_freedom) + } + message(">>> New bounds freedom: ", round(100 * newBoundsFreedom, 2), "%") + hyper_updated_prev <- initBounds + hypNames <- names(initBounds) + resultHypParam <- as_tibble(listOutputPrev$resultHypParam) + for (h in seq_along(hypNames)) { + hn <- hypNames[h] + getHyp <- resultHypParam[, hn][[1]] + getDis <- initBoundsDis[hn] + if (hn == "lambda") { + lambda_max <- unique(resultHypParam$lambda_max) + lambda_min <- lambda_max * 0.0001 + getHyp <- getHyp / (lambda_max - lambda_min) + } + getRange <- initBounds[hn][[1]] + if (length(getRange) == 2) { + newLowB <- getHyp - getDis * newBoundsFreedom + if (newLowB < getRange[1]) { + newLowB <- getRange[1] + } + newUpB <- getHyp + getDis * newBoundsFreedom + if (newUpB > getRange[2]) { + newUpB <- getRange[2] + } + newBounds <- unname(c(newLowB, newUpB)) + hyper_updated_prev[[hn]] <- newBounds + } else { + fixed <- hyper_updated_prev[hn][[1]] + hyper_updated_prev[[hn]] <- c( + fixed * (1 - newBoundsFreedom), + fixed * (1 + newBoundsFreedom) + ) + } + } + if (!ts_validation) hyper_updated_prev[["train_size"]] <- NULL + return(hyper_updated_prev) +} diff --git a/R/response.R b/R/response.R index 011d8b4..7334a7b 100644 --- a/R/response.R +++ b/R/response.R @@ -1,402 +1,402 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -#################################################################### -#' Response and Saturation Curves -#' -#' \code{robyn_response()} returns the response for a given -#' spend level of a given \code{paid_media_vars} from a selected model -#' result and selected model build (initial model, refresh model, etc.). -#' -#' @inheritParams robyn_allocator -#' @param metric_name A character. Selected media variable for the response. -#' Must be one value from paid_media_spends, paid_media_vars or organic_vars -#' @param metric_value Numeric. Desired metric value to return a response for. -#' @param dt_hyppar A data.frame. When \code{robyn_object} is not provided, use -#' \code{dt_hyppar = OutputCollect$resultHypParam}. It must be provided along -#' \code{select_model}, \code{dt_coef} and \code{InputCollect}. -#' @param dt_coef A data.frame. When \code{robyn_object} is not provided, use -#' \code{dt_coef = OutputCollect$xDecompAgg}. It must be provided along -#' \code{select_model}, \code{dt_hyppar} and \code{InputCollect}. 
-#' @examples -#' \dontrun{ -#' # Having InputCollect and OutputCollect objects -#' ## Recreate original saturation curve -#' Response <- robyn_response( -#' InputCollect = InputCollect, -#' OutputCollect = OutputCollect, -#' select_model = select_model, -#' metric_name = "facebook_S" -#' ) -#' Response$plot -#' -#' ## Or you can call a JSON file directly (a bit slower) -#' # Response <- robyn_response( -#' # json_file = "your_json_path.json", -#' # dt_input = dt_simulated_weekly, -#' # dt_holidays = dt_prophet_holidays, -#' # metric_name = "facebook_S" -#' # ) -#' -#' ## Get the "next 100 dollar" marginal response on Spend1 -#' Spend1 <- 20000 -#' Response1 <- robyn_response( -#' InputCollect = InputCollect, -#' OutputCollect = OutputCollect, -#' select_model = select_model, -#' metric_name = "facebook_S", -#' metric_value = Spend1, # total budget for date_range -#' date_range = "last_1" # last two periods -#' ) -#' Response1$plot -#' -#' Spend2 <- Spend1 + 100 -#' Response2 <- robyn_response( -#' InputCollect = InputCollect, -#' OutputCollect = OutputCollect, -#' select_model = select_model, -#' metric_name = "facebook_S", -#' metric_value = Spend2, -#' date_range = "last_1" -#' ) -#' # ROAS for the 100$ from Spend1 level -#' (Response2$response_total - Response1$response_total) / (Spend2 - Spend1) -#' -#' ## Get response from for a given budget and date_range -#' Spend3 <- 100000 -#' Response3 <- robyn_response( -#' InputCollect = InputCollect, -#' OutputCollect = OutputCollect, -#' select_model = select_model, -#' metric_name = "facebook_S", -#' metric_value = Spend3, # total budget for date_range -#' date_range = "last_5" # last 5 periods -#' ) -#' Response3$plot -#' -#' ## Example of getting paid media exposure response curves -#' imps <- 10000000 -#' response_imps <- robyn_response( -#' InputCollect = InputCollect, -#' OutputCollect = OutputCollect, -#' select_model = select_model, -#' metric_name = "facebook_I", -#' metric_value = imps -#' ) -#' response_imps$response_total / imps * 1000 -#' response_imps$plot -#' -#' ## Example of getting organic media exposure response curves -#' sendings <- 30000 -#' response_sending <- robyn_response( -#' InputCollect = InputCollect, -#' OutputCollect = OutputCollect, -#' select_model = select_model, -#' metric_name = "newsletter", -#' metric_value = sendings -#' ) -#' # response per 1000 sendings -#' response_sending$response_total / sendings * 1000 -#' response_sending$plot -#' } -#' @return List. Response value and plot. Class: \code{robyn_response}. -#' @export -robyn_response <- function(InputCollect = NULL, - OutputCollect = NULL, - json_file = NULL, - robyn_object = NULL, - select_build = NULL, - select_model = NULL, - metric_name = NULL, - metric_value = NULL, - date_range = NULL, - dt_hyppar = NULL, - dt_coef = NULL, - quiet = FALSE, - ...) { - ## Get input - - ### Use previously exported model using json_file - if (!is.null(json_file)) { - if (is.null(InputCollect)) InputCollect <- robyn_inputs(json_file = json_file, ...) - if (is.null(OutputCollect)) { - OutputCollect <- robyn_run( - InputCollect = InputCollect, - json_file = json_file, - export = FALSE, - quiet = quiet, - ... - ) - } - if (is.null(dt_hyppar)) dt_hyppar <- OutputCollect$resultHypParam - if (is.null(dt_coef)) dt_coef <- OutputCollect$xDecompAgg - } else { - if (!is.null(robyn_object)) { - if (!file.exists(robyn_object)) { - stop("File does not exist or is somewhere else. 
Check: ", robyn_object) - } else { - Robyn <- readRDS(robyn_object) - objectPath <- dirname(robyn_object) - objectName <- sub("'\\..*$", "", basename(robyn_object)) - } - select_build_all <- 0:(length(Robyn) - 1) - if (is.null(select_build)) { - select_build <- max(select_build_all) - if (!quiet && length(select_build_all) > 1) { - message( - "Using latest model: ", ifelse(select_build == 0, "initial model", paste0("refresh model #", select_build)), - " for the response function. Use parameter 'select_build' to specify which run to use" - ) - } - } - if (!(select_build %in% select_build_all) || length(select_build) != 1) { - stop("'select_build' must be one value of ", paste(select_build_all, collapse = ", ")) - } - listName <- ifelse(select_build == 0, "listInit", paste0("listRefresh", select_build)) - InputCollect <- Robyn[[listName]][["InputCollect"]] - OutputCollect <- Robyn[[listName]][["OutputCollect"]] - dt_hyppar <- OutputCollect$resultHypParam - dt_coef <- OutputCollect$xDecompAgg - } else { - # Try to get some pre-filled values - if (is.null(dt_hyppar)) dt_hyppar <- OutputCollect$resultHypParam - if (is.null(dt_coef)) dt_coef <- OutputCollect$xDecompAgg - if (any(is.null(dt_hyppar), is.null(dt_coef), is.null(InputCollect), is.null(OutputCollect))) { - stop("When 'robyn_object' is not provided, 'InputCollect' & 'OutputCollect' must be provided") - } - } - } - - if ("selectID" %in% names(OutputCollect)) { - select_model <- OutputCollect$selectID - } - - ## Prep environment - if (TRUE) { - dt_input <- InputCollect$dt_input - startRW <- InputCollect$rollingWindowStartWhich - endRW <- InputCollect$rollingWindowEndWhich - adstock <- InputCollect$adstock - spendExpoMod <- InputCollect$modNLS$results - paid_media_vars <- InputCollect$paid_media_vars - paid_media_spends <- InputCollect$paid_media_spends - exposure_vars <- InputCollect$exposure_vars - organic_vars <- InputCollect$organic_vars - allSolutions <- unique(dt_hyppar$solID) - dayInterval <- InputCollect$dayInterval - } - - if (!isTRUE(select_model %in% allSolutions) || is.null(select_model)) { - stop(paste0( - "Input 'select_model' must be one of these values: ", - paste(allSolutions, collapse = ", ") - )) - } - - ## Get use case based on inputs - usecase <- which_usecase(metric_value, date_range) - - ## Check inputs with usecases - metric_type <- check_metric_type(metric_name, paid_media_spends, paid_media_vars, exposure_vars, organic_vars) - all_dates <- pull(dt_input, InputCollect$date_var) - all_values <- pull(dt_input, metric_name) - - if (usecase == "all_historical_vec") { - ds_list <- check_metric_dates(date_range = "all", all_dates[1:endRW], dayInterval, quiet, ...) - metric_value <- NULL - # val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) - } else if (usecase == "unit_metric_default_last_n") { - ds_list <- check_metric_dates(date_range = paste0("last_", length(metric_value)), all_dates[1:endRW], dayInterval, quiet, ...) - # val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) - } else { - ds_list <- check_metric_dates(date_range, all_dates[1:endRW], dayInterval, quiet, ...) 
- } - val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) - date_range_updated <- ds_list$date_range_updated - metric_value_updated <- val_list$metric_value_updated - all_values_updated <- val_list$all_values_updated - - ## Transform exposure to spend when necessary - if (metric_type == "exposure") { - get_spend_name <- paid_media_spends[which(paid_media_vars == metric_name)] - # expo_vec <- dt_input[, metric_name][[1]] - # Use non-0 mean as marginal level if metric_value not provided - # if (is.null(metric_value)) { - # metric_value <- mean(expo_vec[startRW:endRW][expo_vec[startRW:endRW] > 0]) - # if (!quiet) message("Input 'metric_value' not provided. Using mean of ", metric_name, " instead") - # } - # Fit spend to exposure - # spend_vec <- dt_input[, get_spend_name][[1]] - if (is.null(spendExpoMod)) { - stop("Can't calculate exposure to spend response. Please, recreate your InputCollect object") - } - temp <- filter(spendExpoMod, .data$channel == metric_name) - nls_select <- temp$rsq_nls > temp$rsq_lm - if (nls_select) { - Vmax <- spendExpoMod$Vmax[spendExpoMod$channel == metric_name] - Km <- spendExpoMod$Km[spendExpoMod$channel == metric_name] - input_immediate <- mic_men(x = metric_value_updated, Vmax = Vmax, Km = Km, reverse = TRUE) - } else { - coef_lm <- spendExpoMod$coef_lm[spendExpoMod$channel == metric_name] - input_immediate <- metric_value_updated / coef_lm - } - all_values_updated[ds_list$metric_loc] <- input_immediate - hpm_name <- get_spend_name - } else { - # use non-0 means marginal level if spend not provided - # if (is.null(metric_value)) { - # metric_value <- mean(media_vec[startRW:endRW][media_vec[startRW:endRW] > 0]) - # if (!quiet) message("Input 'metric_value' not provided. Using mean of ", metric_name, " instead") - # } - input_immediate <- metric_value_updated - hpm_name <- metric_name - } - - ## Adstocking original - media_vec_origin <- dt_input[, metric_name][[1]] - theta <- scale <- shape <- NULL - if (adstock == "geometric") { - theta <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_thetas")]][[1]] - } - if (grepl("weibull", adstock)) { - shape <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_shapes")]][[1]] - scale <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_scales")]][[1]] - } - x_list <- transform_adstock(media_vec_origin, adstock, theta = theta, shape = shape, scale = scale) - m_adstocked <- x_list$x_decayed - # net_carryover_ref <- m_adstocked - media_vec_origin - - ## Adstocking simulation - x_list_sim <- transform_adstock(all_values_updated, adstock, theta = theta, shape = shape, scale = scale) - media_vec_sim <- x_list_sim$x_decayed - media_vec_sim_imme <- if (adstock == "weibull_pdf") x_list_sim$x_imme else x_list_sim$x - input_total <- media_vec_sim[ds_list$metric_loc] - input_immediate <- media_vec_sim_imme[ds_list$metric_loc] - input_carryover <- input_total - input_immediate - - ## Saturation - m_adstockedRW <- m_adstocked[startRW:endRW] - alpha <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_alphas")]], 1) - gamma <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_gammas")]], 1) - if (usecase == "all_historical_vec") { - metric_saturated_total <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma) - metric_saturated_carryover <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma) - } else { - metric_saturated_total <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = 
gamma, x_marginal = input_total) - metric_saturated_carryover <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma, x_marginal = input_carryover) - } - metric_saturated_immediate <- metric_saturated_total - metric_saturated_carryover - - ## Decomp - coeff <- dt_coef[dt_coef$solID == select_model & dt_coef$rn == hpm_name, ][["coef"]] - m_saturated <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma) - m_resposne <- m_saturated * coeff - response_total <- as.numeric(metric_saturated_total * coeff) - response_carryover <- as.numeric(metric_saturated_carryover * coeff) - response_immediate <- response_total - response_carryover - - dt_line <- data.frame(metric = m_adstockedRW, response = m_resposne, channel = metric_name) - if (usecase == "all_historical_vec") { - dt_point <- data.frame(input = input_total[startRW:endRW], output = response_total, ds = date_range_updated[startRW:endRW]) - dt_point_caov <- data.frame(input = input_carryover[startRW:endRW], output = response_carryover) - dt_point_imme <- data.frame(input = input_immediate[startRW:endRW], output = response_immediate) - } else { - dt_point <- data.frame(input = input_total, output = response_total, ds = date_range_updated) - dt_point_caov <- data.frame(input = input_carryover, output = response_carryover) - dt_point_imme <- data.frame(input = input_immediate, output = response_immediate) - } - - ## Plot optimal response - p_res <- ggplot(dt_line, aes(x = .data$metric, y = .data$response)) + - geom_line(color = "steelblue") + - geom_point(data = dt_point, aes(x = .data$input, y = .data$output), size = 3) + - labs( - title = paste( - "Saturation curve of", - ifelse(metric_type == "organic", "organic", "paid"), - "media:", metric_name, - ifelse(!is.null(date_range_updated), "adstocked", ""), - ifelse(metric_type == "spend", "spend metric", "exposure metric") - ), - subtitle = ifelse(length(unique(input_total)) == 1, sprintf( - paste( - "Carryover* Response: %s @ Input %s", - "Immediate Response: %s @ Input %s", - "Total (C+I) Response: %s @ Input %s", - sep = "\n" - ), - num_abbr(dt_point_caov$output), num_abbr(dt_point_caov$input), - num_abbr(dt_point_imme$output), num_abbr(dt_point_imme$input), - num_abbr(dt_point$output), num_abbr(dt_point$input) - ), ""), - x = "Input", y = "Response", - caption = sprintf( - "Response period: %s%s%s", - head(date_range_updated, 1), - ifelse(length(date_range_updated) > 1, paste(" to", tail(date_range_updated, 1)), ""), - ifelse(length(date_range_updated) > 1, paste0(" [", length(date_range_updated), " periods]"), "") - ) - ) + - theme_lares(background = "white") + - scale_x_abbr() + - scale_y_abbr() - if (length(unique(metric_value)) == 1) { - p_res <- p_res + - geom_point(data = dt_point_caov, aes(x = .data$input, y = .data$output), size = 3, shape = 8) - } - - ret <- list( - metric_name = metric_name, - date = date_range_updated, - input_total = input_total, - input_carryover = input_carryover, - input_immediate = input_immediate, - response_total = response_total, - response_carryover = response_carryover, - response_immediate = response_immediate, - usecase = usecase, - plot = p_res - ) - class(ret) <- unique(c("robyn_response", class(ret))) - return(ret) -} - -which_usecase <- function(metric_value, date_range) { - usecase <- case_when( - # Case 1: raw historical spend and all dates -> model decomp as out of the model (no mean spends) - is.null(metric_value) & is.null(date_range) ~ "all_historical_vec", - # Case 2: same as case 1 for date_range - 
is.null(metric_value) & !is.null(date_range) ~ "selected_historical_vec", - ######### Simulations: use metric_value, not the historical real spend anymore - # Cases 3-4: metric_value for "total budget" for date_range period - length(metric_value) == 1 & is.null(date_range) ~ "total_metric_default_range", - length(metric_value) == 1 & !is.null(date_range) ~ "total_metric_selected_range", - # Cases 5-6: individual period values, not total; requires date_range to be the same length as metric_value - length(metric_value) > 1 & is.null(date_range) ~ "unit_metric_default_last_n", - TRUE ~ "unit_metric_selected_dates" - ) - if (!is.null(date_range)) { - if (length(date_range) == 1 & as.character(date_range[1]) == "all") { - usecase <- "all_historical_vec" - } - } - return(usecase) -} - -# ####### SCENARIOS CHECK FOR date_range -# metric_value <- 71427 -# all_dates <- dt_input$DATE -# check_metric_dates(metric_value, date_range = NULL, all_dates, quiet = FALSE) -# check_metric_dates(metric_value, date_range = "last", all_dates, quiet = FALSE) -# check_metric_dates(metric_value, date_range = "last_5", all_dates, quiet = FALSE) -# check_metric_dates(metric_value, date_range = "all", all_dates, quiet = FALSE) -# check_metric_dates(metric_value, date_range = c("2018-01-01"), all_dates, quiet = FALSE) -# check_metric_dates(metric_value, date_range = c("2018-01-01", "2018-07-11"), all_dates, quiet = FALSE) # WARNING -# check_metric_dates(metric_value, date_range = c("2018-01-01", "2018-07-09"), all_dates, quiet = FALSE) -# check_metric_dates(c(50000, 60000), date_range = "last_4", all_dates, quiet = FALSE) # ERROR -# check_metric_dates(c(50000, 60000), date_range = "last_2", all_dates, quiet = FALSE) -# check_metric_dates(c(50000, 60000), date_range = c("2018-12-31", "2019-01-07"), all_dates, quiet = FALSE) -# check_metric_dates(c(50000, 60000), date_range = c("2018-12-31"), all_dates, quiet = FALSE) # ERROR -# check_metric_dates(0, date_range = c("2018-12-31"), all_dates, quiet = FALSE) +# Copyright (c) Meta Platforms, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#################################################################### +#' Response and Saturation Curves +#' +#' \code{robyn_response()} returns the response for a given +#' spend level of a given \code{paid_media_vars} from a selected model +#' result and selected model build (initial model, refresh model, etc.). +#' +#' @inheritParams robyn_allocator +#' @param metric_name A character. Selected media variable for the response. +#' Must be one value from paid_media_spends, paid_media_vars or organic_vars +#' @param metric_value Numeric. Desired metric value to return a response for. +#' @param dt_hyppar A data.frame. When \code{robyn_object} is not provided, use +#' \code{dt_hyppar = OutputCollect$resultHypParam}. It must be provided along +#' \code{select_model}, \code{dt_coef} and \code{InputCollect}. +#' @param dt_coef A data.frame. When \code{robyn_object} is not provided, use +#' \code{dt_coef = OutputCollect$xDecompAgg}. It must be provided along +#' \code{select_model}, \code{dt_hyppar} and \code{InputCollect}. 
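As a reference for the use cases handled by robyn_response() below, a small sketch of how the internal which_usecase() helper (defined at the bottom of this file) maps metric_value/date_range combinations; the triple-colon access is shown for illustration only.

    Robyn:::which_usecase(metric_value = NULL, date_range = NULL)        # "all_historical_vec"
    Robyn:::which_usecase(metric_value = NULL, date_range = "last_5")    # "selected_historical_vec"
    Robyn:::which_usecase(metric_value = 20000, date_range = NULL)       # "total_metric_default_range"
    Robyn:::which_usecase(metric_value = 20000, date_range = "last_1")   # "total_metric_selected_range"
    Robyn:::which_usecase(metric_value = c(1e4, 2e4), date_range = NULL) # "unit_metric_default_last_n"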
+#' @examples
+#' \dontrun{
+#' # Having InputCollect and OutputCollect objects
+#' ## Recreate original saturation curve
+#' Response <- robyn_response(
+#'   InputCollect = InputCollect,
+#'   OutputCollect = OutputCollect,
+#'   select_model = select_model,
+#'   metric_name = "facebook_S"
+#' )
+#' Response$plot
+#'
+#' ## Or you can call a JSON file directly (a bit slower)
+#' # Response <- robyn_response(
+#' #   json_file = "your_json_path.json",
+#' #   dt_input = dt_simulated_weekly,
+#' #   dt_holidays = dt_prophet_holidays,
+#' #   metric_name = "facebook_S"
+#' # )
+#'
+#' ## Get the "next 100 dollar" marginal response on Spend1
+#' Spend1 <- 20000
+#' Response1 <- robyn_response(
+#'   InputCollect = InputCollect,
+#'   OutputCollect = OutputCollect,
+#'   select_model = select_model,
+#'   metric_name = "facebook_S",
+#'   metric_value = Spend1, # total budget for date_range
+#'   date_range = "last_1" # last period
+#' )
+#' Response1$plot
+#'
+#' Spend2 <- Spend1 + 100
+#' Response2 <- robyn_response(
+#'   InputCollect = InputCollect,
+#'   OutputCollect = OutputCollect,
+#'   select_model = select_model,
+#'   metric_name = "facebook_S",
+#'   metric_value = Spend2,
+#'   date_range = "last_1"
+#' )
+#' # ROAS for the 100$ from Spend1 level
+#' (Response2$response_total - Response1$response_total) / (Spend2 - Spend1)
+#'
+#' ## Get response for a given budget and date_range
+#' Spend3 <- 100000
+#' Response3 <- robyn_response(
+#'   InputCollect = InputCollect,
+#'   OutputCollect = OutputCollect,
+#'   select_model = select_model,
+#'   metric_name = "facebook_S",
+#'   metric_value = Spend3, # total budget for date_range
+#'   date_range = "last_5" # last 5 periods
+#' )
+#' Response3$plot
+#'
+#' ## Example of getting paid media exposure response curves
+#' imps <- 10000000
+#' response_imps <- robyn_response(
+#'   InputCollect = InputCollect,
+#'   OutputCollect = OutputCollect,
+#'   select_model = select_model,
+#'   metric_name = "facebook_I",
+#'   metric_value = imps
+#' )
+#' response_imps$response_total / imps * 1000
+#' response_imps$plot
+#'
+#' ## Example of getting organic media exposure response curves
+#' sendings <- 30000
+#' response_sending <- robyn_response(
+#'   InputCollect = InputCollect,
+#'   OutputCollect = OutputCollect,
+#'   select_model = select_model,
+#'   metric_name = "newsletter",
+#'   metric_value = sendings
+#' )
+#' # response per 1000 sendings
+#' response_sending$response_total / sendings * 1000
+#' response_sending$plot
+#' }
+#' @return List. Response value and plot. Class: \code{robyn_response}.
+#' @export
+robyn_response <- function(InputCollect = NULL,
+                           OutputCollect = NULL,
+                           json_file = NULL,
+                           robyn_object = NULL,
+                           select_build = NULL,
+                           select_model = NULL,
+                           metric_name = NULL,
+                           metric_value = NULL,
+                           date_range = NULL,
+                           dt_hyppar = NULL,
+                           dt_coef = NULL,
+                           quiet = FALSE,
+                           ...) {
+  ## Get input
+
+  ### Use previously exported model using json_file
+  if (!is.null(json_file)) {
+    if (is.null(InputCollect)) InputCollect <- robyn_inputs(json_file = json_file, ...)
+    if (is.null(OutputCollect)) {
+      OutputCollect <- robyn_run(
+        InputCollect = InputCollect,
+        json_file = json_file,
+        export = FALSE,
+        quiet = quiet,
+        ...
+      )
+    }
+    if (is.null(dt_hyppar)) dt_hyppar <- OutputCollect$resultHypParam
+    if (is.null(dt_coef)) dt_coef <- OutputCollect$xDecompAgg
+  } else {
+    if (!is.null(robyn_object)) {
+      if (!file.exists(robyn_object)) {
+        stop("File does not exist or is somewhere else. Check: ", robyn_object)
Check: ", robyn_object) + } else { + Robyn <- readRDS(robyn_object) + objectPath <- dirname(robyn_object) + objectName <- sub("'\\..*$", "", basename(robyn_object)) + } + select_build_all <- 0:(length(Robyn) - 1) + if (is.null(select_build)) { + select_build <- max(select_build_all) + if (!quiet && length(select_build_all) > 1) { + message( + "Using latest model: ", ifelse(select_build == 0, "initial model", paste0("refresh model #", select_build)), + " for the response function. Use parameter 'select_build' to specify which run to use" + ) + } + } + if (!(select_build %in% select_build_all) || length(select_build) != 1) { + stop("'select_build' must be one value of ", paste(select_build_all, collapse = ", ")) + } + listName <- ifelse(select_build == 0, "listInit", paste0("listRefresh", select_build)) + InputCollect <- Robyn[[listName]][["InputCollect"]] + OutputCollect <- Robyn[[listName]][["OutputCollect"]] + dt_hyppar <- OutputCollect$resultHypParam + dt_coef <- OutputCollect$xDecompAgg + } else { + # Try to get some pre-filled values + if (is.null(dt_hyppar)) dt_hyppar <- OutputCollect$resultHypParam + if (is.null(dt_coef)) dt_coef <- OutputCollect$xDecompAgg + if (any(is.null(dt_hyppar), is.null(dt_coef), is.null(InputCollect), is.null(OutputCollect))) { + stop("When 'robyn_object' is not provided, 'InputCollect' & 'OutputCollect' must be provided") + } + } + } + + if ("selectID" %in% names(OutputCollect)) { + select_model <- OutputCollect$selectID + } + + ## Prep environment + if (TRUE) { + dt_input <- InputCollect$dt_input + startRW <- InputCollect$rollingWindowStartWhich + endRW <- InputCollect$rollingWindowEndWhich + adstock <- InputCollect$adstock + spendExpoMod <- InputCollect$modNLS$results + paid_media_vars <- InputCollect$paid_media_vars + paid_media_spends <- InputCollect$paid_media_spends + exposure_vars <- InputCollect$exposure_vars + organic_vars <- InputCollect$organic_vars + allSolutions <- unique(dt_hyppar$solID) + dayInterval <- InputCollect$dayInterval + } + + if (!isTRUE(select_model %in% allSolutions) || is.null(select_model)) { + stop(paste0( + "Input 'select_model' must be one of these values: ", + paste(allSolutions, collapse = ", ") + )) + } + + ## Get use case based on inputs + usecase <- which_usecase(metric_value, date_range) + + ## Check inputs with usecases + metric_type <- check_metric_type(metric_name, paid_media_spends, paid_media_vars, exposure_vars, organic_vars) + all_dates <- pull(dt_input, InputCollect$date_var) + all_values <- pull(dt_input, metric_name) + + if (usecase == "all_historical_vec") { + ds_list <- check_metric_dates(date_range = "all", all_dates[1:endRW], dayInterval, quiet, ...) + metric_value <- NULL + # val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) + } else if (usecase == "unit_metric_default_last_n") { + ds_list <- check_metric_dates(date_range = paste0("last_", length(metric_value)), all_dates[1:endRW], dayInterval, quiet, ...) + # val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) + } else { + ds_list <- check_metric_dates(date_range, all_dates[1:endRW], dayInterval, quiet, ...) 
+  }
+  val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc)
+  date_range_updated <- ds_list$date_range_updated
+  metric_value_updated <- val_list$metric_value_updated
+  all_values_updated <- val_list$all_values_updated
+
+  ## Transform exposure to spend when necessary
+  if (metric_type == "exposure") {
+    get_spend_name <- paid_media_spends[which(paid_media_vars == metric_name)]
+    # expo_vec <- dt_input[, metric_name][[1]]
+    # Use non-0 mean as marginal level if metric_value not provided
+    # if (is.null(metric_value)) {
+    #   metric_value <- mean(expo_vec[startRW:endRW][expo_vec[startRW:endRW] > 0])
+    #   if (!quiet) message("Input 'metric_value' not provided. Using mean of ", metric_name, " instead")
+    # }
+    # Fit spend to exposure
+    # spend_vec <- dt_input[, get_spend_name][[1]]
+    if (is.null(spendExpoMod)) {
+      stop("Can't calculate exposure to spend response. Please recreate your InputCollect object")
+    }
+    temp <- filter(spendExpoMod, .data$channel == metric_name)
+    nls_select <- temp$rsq_nls > temp$rsq_lm
+    if (nls_select) {
+      Vmax <- spendExpoMod$Vmax[spendExpoMod$channel == metric_name]
+      Km <- spendExpoMod$Km[spendExpoMod$channel == metric_name]
+      input_immediate <- mic_men(x = metric_value_updated, Vmax = Vmax, Km = Km, reverse = TRUE)
+    } else {
+      coef_lm <- spendExpoMod$coef_lm[spendExpoMod$channel == metric_name]
+      input_immediate <- metric_value_updated / coef_lm
+    }
+    all_values_updated[ds_list$metric_loc] <- input_immediate
+    hpm_name <- get_spend_name
+  } else {
+    # Use non-0 mean as marginal level if spend not provided
+    # if (is.null(metric_value)) {
+    #   metric_value <- mean(media_vec[startRW:endRW][media_vec[startRW:endRW] > 0])
+    #   if (!quiet) message("Input 'metric_value' not provided. Using mean of ", metric_name, " instead")
+    # }
+    input_immediate <- metric_value_updated
+    hpm_name <- metric_name
+  }
+
+  ## Adstocking original
+  media_vec_origin <- dt_input[, metric_name][[1]]
+  theta <- scale <- shape <- NULL
+  if (adstock == "geometric") {
+    theta <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_thetas")]][[1]]
+  }
+  if (grepl("weibull", adstock)) {
+    shape <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_shapes")]][[1]]
+    scale <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_scales")]][[1]]
+  }
+  x_list <- transform_adstock(media_vec_origin, adstock, theta = theta, shape = shape, scale = scale)
+  m_adstocked <- x_list$x_decayed
+  # net_carryover_ref <- m_adstocked - media_vec_origin
+
+  ## Adstocking simulation
+  x_list_sim <- transform_adstock(all_values_updated, adstock, theta = theta, shape = shape, scale = scale)
+  media_vec_sim <- x_list_sim$x_decayed
+  media_vec_sim_imme <- if (adstock == "weibull_pdf") x_list_sim$x_imme else x_list_sim$x
+  input_total <- media_vec_sim[ds_list$metric_loc]
+  input_immediate <- media_vec_sim_imme[ds_list$metric_loc]
+  input_carryover <- input_total - input_immediate
+
+  ## Saturation
+  m_adstockedRW <- m_adstocked[startRW:endRW]
+  alpha <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_alphas")]], 1)
+  gamma <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_gammas")]], 1)
+  if (usecase == "all_historical_vec") {
+    metric_saturated_total <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma)
+    metric_saturated_carryover <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma)
+  } else {
+    metric_saturated_total <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma, x_marginal = input_total)
+    metric_saturated_carryover <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma, x_marginal = input_carryover)
+  }
+  metric_saturated_immediate <- metric_saturated_total - metric_saturated_carryover
+
+  ## Decomp
+  coeff <- dt_coef[dt_coef$solID == select_model & dt_coef$rn == hpm_name, ][["coef"]]
+  m_saturated <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma)
+  m_response <- m_saturated * coeff
+  response_total <- as.numeric(metric_saturated_total * coeff)
+  response_carryover <- as.numeric(metric_saturated_carryover * coeff)
+  response_immediate <- response_total - response_carryover
+
+  dt_line <- data.frame(metric = m_adstockedRW, response = m_response, channel = metric_name)
+  if (usecase == "all_historical_vec") {
+    dt_point <- data.frame(input = input_total[startRW:endRW], output = response_total, ds = date_range_updated[startRW:endRW])
+    dt_point_caov <- data.frame(input = input_carryover[startRW:endRW], output = response_carryover)
+    dt_point_imme <- data.frame(input = input_immediate[startRW:endRW], output = response_immediate)
+  } else {
+    dt_point <- data.frame(input = input_total, output = response_total, ds = date_range_updated)
+    dt_point_caov <- data.frame(input = input_carryover, output = response_carryover)
+    dt_point_imme <- data.frame(input = input_immediate, output = response_immediate)
+  }
+
+  ## Plot optimal response
+  p_res <- ggplot(dt_line, aes(x = .data$metric, y = .data$response)) +
+    geom_line(color = "steelblue") +
+    geom_point(data = dt_point, aes(x = .data$input, y = .data$output), size = 3) +
+    labs(
+      title = paste(
+        "Saturation curve of",
+        ifelse(metric_type == "organic", "organic", "paid"),
+        "media:", metric_name,
+        ifelse(!is.null(date_range_updated), "adstocked", ""),
+        ifelse(metric_type == "spend", "spend metric", "exposure metric")
+      ),
+      subtitle = ifelse(length(unique(input_total)) == 1, sprintf(
+        paste(
+          "Carryover* Response: %s @ Input %s",
+          "Immediate Response: %s @ Input %s",
+          "Total (C+I) Response: %s @ Input %s",
+          sep = "\n"
+        ),
+        num_abbr(dt_point_caov$output), num_abbr(dt_point_caov$input),
+        num_abbr(dt_point_imme$output), num_abbr(dt_point_imme$input),
+        num_abbr(dt_point$output), num_abbr(dt_point$input)
+      ), ""),
+      x = "Input", y = "Response",
+      caption = sprintf(
+        "Response period: %s%s%s",
+        head(date_range_updated, 1),
+        ifelse(length(date_range_updated) > 1, paste(" to", tail(date_range_updated, 1)), ""),
+        ifelse(length(date_range_updated) > 1, paste0(" [", length(date_range_updated), " periods]"), "")
+      )
+    ) +
+    theme_lares(background = "white") +
+    scale_x_abbr() +
+    scale_y_abbr()
+  if (length(unique(metric_value)) == 1) {
+    p_res <- p_res +
+      geom_point(data = dt_point_caov, aes(x = .data$input, y = .data$output), size = 3, shape = 8)
+  }
+
+  ret <- list(
+    metric_name = metric_name,
+    date = date_range_updated,
+    input_total = input_total,
+    input_carryover = input_carryover,
+    input_immediate = input_immediate,
+    response_total = response_total,
+    response_carryover = response_carryover,
+    response_immediate = response_immediate,
+    usecase = usecase,
+    plot = p_res
+  )
+  class(ret) <- unique(c("robyn_response", class(ret)))
+  return(ret)
+}
+
+which_usecase <- function(metric_value, date_range) {
+  usecase <- case_when(
+    # Case 1: raw historical spend and all dates -> model decomp as out of the model (no mean spends)
+    is.null(metric_value) & is.null(date_range) ~ "all_historical_vec",
+    # Case 2: same as case 1 for date_range
+    
is.null(metric_value) & !is.null(date_range) ~ "selected_historical_vec", + ######### Simulations: use metric_value, not the historical real spend anymore + # Cases 3-4: metric_value for "total budget" for date_range period + length(metric_value) == 1 & is.null(date_range) ~ "total_metric_default_range", + length(metric_value) == 1 & !is.null(date_range) ~ "total_metric_selected_range", + # Cases 5-6: individual period values, not total; requires date_range to be the same length as metric_value + length(metric_value) > 1 & is.null(date_range) ~ "unit_metric_default_last_n", + TRUE ~ "unit_metric_selected_dates" + ) + if (!is.null(date_range)) { + if (length(date_range) == 1 & as.character(date_range[1]) == "all") { + usecase <- "all_historical_vec" + } + } + return(usecase) +} + +# ####### SCENARIOS CHECK FOR date_range +# metric_value <- 71427 +# all_dates <- dt_input$DATE +# check_metric_dates(metric_value, date_range = NULL, all_dates, quiet = FALSE) +# check_metric_dates(metric_value, date_range = "last", all_dates, quiet = FALSE) +# check_metric_dates(metric_value, date_range = "last_5", all_dates, quiet = FALSE) +# check_metric_dates(metric_value, date_range = "all", all_dates, quiet = FALSE) +# check_metric_dates(metric_value, date_range = c("2018-01-01"), all_dates, quiet = FALSE) +# check_metric_dates(metric_value, date_range = c("2018-01-01", "2018-07-11"), all_dates, quiet = FALSE) # WARNING +# check_metric_dates(metric_value, date_range = c("2018-01-01", "2018-07-09"), all_dates, quiet = FALSE) +# check_metric_dates(c(50000, 60000), date_range = "last_4", all_dates, quiet = FALSE) # ERROR +# check_metric_dates(c(50000, 60000), date_range = "last_2", all_dates, quiet = FALSE) +# check_metric_dates(c(50000, 60000), date_range = c("2018-12-31", "2019-01-07"), all_dates, quiet = FALSE) +# check_metric_dates(c(50000, 60000), date_range = c("2018-12-31"), all_dates, quiet = FALSE) # ERROR +# check_metric_dates(0, date_range = c("2018-12-31"), all_dates, quiet = FALSE) diff --git a/R/transformation.R b/R/transformation.R index 3ef08fe..dbf6512 100644 --- a/R/transformation.R +++ b/R/transformation.R @@ -1,442 +1,442 @@ -# Copyright (c) Meta Platforms, Inc. and its affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# Includes function mic_men, adstock_geometric, adstock_weibull, -# saturation_hill, plot_adstock, plot_saturation - -#################################################################### -#' Michaelis-Menten Transformation -#' -#' The Michaelis-Menten \code{mic_men()} function is used to fit the spend -#' exposure relationship for paid media variables, when exposure metrics like -#' impressions, clicks or GRPs are provided in \code{paid_media_vars} instead -#' of spend metric. -#' -#' @family Transformations -#' @param x Numeric value or vector. Input media spend when -#' \code{reverse = FALSE}. Input media exposure metrics (impression, clicks, -#' GRPs, etc.) when \code{reverse = TRUE}. -#' @param Vmax Numeric Indicates maximum rate achieved by the system. -#' @param Km Numeric. The Michaelis constant. -#' @param reverse Boolean. Input media spend when \code{reverse = FALSE}. -#' Input media exposure metrics (impression, clicks, GRPs etc.) when \code{reverse = TRUE}. -#' @examples -#' mic_men(x = 5:10, Vmax = 5, Km = 0.5) -#' @return Numeric values. Transformed values. 
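To see the forward and reverse Michaelis-Menten transforms documented above in action, a quick round-trip check; the Vmax and Km values are arbitrary stand-ins for fitted parameters.

    library(Robyn)
    spend <- c(100, 500, 1000)
    Vmax <- 5e6; Km <- 2000                                   # arbitrary fitted parameters
    exposure <- mic_men(spend, Vmax = Vmax, Km = Km)          # spend -> exposure
    mic_men(exposure, Vmax = Vmax, Km = Km, reverse = TRUE)   # recovers 100, 500, 1000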
-#' @export -mic_men <- function(x, Vmax, Km, reverse = FALSE) { - if (!reverse) { - mm_out <- Vmax * x / (Km + x) - } else { - mm_out <- spend <- x * Km / (Vmax - x) - } - return(mm_out) -} - - -#################################################################### -#' Adstocking Transformation (Geometric and Weibull) -#' -#' \code{adstock_geometric()} for Geometric Adstocking is the classic one-parametric -#' adstock function. -#' -#' @family Transformations -#' @param x A numeric vector. -#' @param theta Numeric. Theta is the only parameter on Geometric Adstocking and means -#' fixed decay rate. Assuming TV spend on day 1 is 100€ and theta = 0.7, then day 2 has -#' 100 x 0.7 = 70€ worth of effect carried-over from day 1, day 3 has 70 x 0.7 = 49€ -#' from day 2 etc. Rule-of-thumb for common media genre: TV c(0.3, 0.8), OOH/Print/ -#' Radio c(0.1, 0.4), digital c(0, 0.3). -#' @examples -#' adstock_geometric(rep(100, 5), theta = 0.5) -#' @return Numeric values. Transformed values. -#' @rdname adstocks -#' @export -adstock_geometric <- function(x, theta) { - stopifnot(length(theta) == 1) - if (length(x) > 1) { - x_decayed <- c(x[1], rep(0, length(x) - 1)) - for (xi in 2:length(x_decayed)) { - x_decayed[xi] <- x[xi] + theta * x_decayed[xi - 1] - } - thetaVecCum <- theta - for (t in 2:length(x)) { - thetaVecCum[t] <- thetaVecCum[t - 1] * theta - } # plot(thetaVecCum) - } else { - x_decayed <- x - thetaVecCum <- theta - } - inflation_total <- sum(x_decayed) / sum(x) - return(list(x = x, x_decayed = x_decayed, thetaVecCum = thetaVecCum, inflation_total = inflation_total)) -} - - -#################################################################### -#' Adstocking Transformation (Geometric and Weibull) -#' -#' \code{adstock_weibull()} for Weibull Adstocking is a two-parametric adstock -#' function that allows changing decay rate over time, as opposed to the fixed -#' decay rate over time as in Geometric adstock. It has two options, the cumulative -#' density function "CDF" or the probability density function "PDF". -#' -#' \describe{ -#' \item{Weibull's CDF (Cumulative Distribution Function)}{has -#' two parameters, shape & scale, and has flexible decay rate, compared to Geometric -#' adstock with fixed decay rate. The shape parameter controls the shape of the decay -#' curve. Recommended bound is c(0.0001, 2). The larger the shape, the more S-shape. The -#' smaller, the more L-shape. Scale controls the inflexion point of the decay curve. We -#' recommend very conservative bounce of c(0, 0.1), because scale increases the adstock -#' half-life greatly.} -#' \item{Weibull's PDF (Probability Density Function)}{also shape & scale as parameter -#' and also has flexible decay rate as Weibull CDF. The difference is that Weibull PDF -#' offers lagged effect. When shape > 2, the curve peaks after x = 0 and has NULL slope at -#' x = 0, enabling lagged effect and sharper increase and decrease of adstock, while the -#' scale parameter indicates the limit of the relative position of the peak at x axis; when -#' 1 < shape < 2, the curve peaks after x = 0 and has infinite positive slope at x = 0, -#' enabling lagged effect and slower increase and decrease of adstock, while scale has the -#' same effect as above; when shape = 1, the curve peaks at x = 0 and reduces to exponential -#' decay, while scale controls the inflexion point; when 0 < shape < 1, the curve peaks at -#' x = 0 and has increasing decay, while scale controls the inflexion point. 
When all -#' possible shapes are relevant, we recommend c(0.0001, 10) as bounds for shape; when only -#' strong lagged effect is of interest, we recommend c(2.0001, 10) as bound for shape. In -#' all cases, we recommend conservative bound of c(0, 0.1) for scale. Due to the great -#' flexibility of Weibull PDF, meaning more freedom in hyperparameter spaces for Nevergrad -#' to explore, it also requires larger iterations to converge.} -#' } -#' -#' Run \code{plot_adstock()} to see the difference visually. -#' -#' @param shape,scale Numeric. Check "Details" section for more details. -#' @param windlen Integer. Length of modelling window. By default, same length as \code{x}. -#' @param type Character. Accepts "CDF" or "PDF". CDF, or cumulative density -#' function of the Weibull function allows changing decay rate over time in both -#' C and S shape, while the peak value will always stay at the first period, -#' meaning no lagged effect. PDF, or the probability density function, enables -#' peak value occurring after the first period when shape >=1, allowing lagged -#' effect. -#' @examples -#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "CDF") -#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "PDF") -#' -#' # Wrapped function for either adstock -#' transform_adstock(rep(100, 10), "weibull_pdf", shape = 1, scale = 0.5) -#' @rdname adstocks -#' @export -adstock_weibull <- function(x, shape, scale, windlen = length(x), type = "cdf") { - stopifnot(length(shape) == 1) - stopifnot(length(scale) == 1) - if (length(x) > 1) { - check_opts(tolower(type), c("cdf", "pdf")) - x_bin <- 1:windlen - scaleTrans <- round(quantile(1:windlen, scale), 0) - if (shape == 0 | scale == 0) { - x_decayed <- x - thetaVecCum <- thetaVec <- rep(0, windlen) - x_imme <- x - } else { - if ("cdf" %in% tolower(type)) { - thetaVec <- c(1, 1 - pweibull(head(x_bin, -1), shape = shape, scale = scaleTrans)) # plot(thetaVec) - thetaVecCum <- cumprod(thetaVec) # plot(thetaVecCum) - } else if ("pdf" %in% tolower(type)) { - thetaVecCum <- .normalize(dweibull(x_bin, shape = shape, scale = scaleTrans)) # plot(thetaVecCum) - } - x_decayed <- mapply(function(x_val, x_pos) { - x.vec <- c(rep(0, x_pos - 1), rep(x_val, windlen - x_pos + 1)) - thetaVecCumLag <- lag(thetaVecCum, x_pos - 1, default = 0) - x.prod <- x.vec * thetaVecCumLag - return(x.prod) - }, x_val = x, x_pos = seq_along(x)) - x_imme <- diag(x_decayed) - x_decayed <- rowSums(x_decayed)[seq_along(x)] - } - } else { - x_decayed <- x_imme <- x - thetaVecCum <- 1 - } - inflation_total <- sum(x_decayed) / sum(x) - return(list( - x = x, - x_decayed = x_decayed, - thetaVecCum = thetaVecCum, - inflation_total = inflation_total, - x_imme = x_imme - )) -} - -#' @rdname adstocks -#' @param adstock Character. One of: "geometric", "weibull_cdf", "weibull_pdf". 
-#' @export -transform_adstock <- function(x, adstock, theta = NULL, shape = NULL, scale = NULL, windlen = length(x)) { - check_adstock(adstock) - if (adstock == "geometric") { - x_list_sim <- adstock_geometric(x = x, theta = theta) - } else if (adstock == "weibull_cdf") { - x_list_sim <- adstock_weibull(x = x, shape = shape, scale = scale, windlen = windlen, type = "cdf") - } else if (adstock == "weibull_pdf") { - x_list_sim <- adstock_weibull(x = x, shape = shape, scale = scale, windlen = windlen, type = "pdf") - } - return(x_list_sim) -} - -.normalize <- function(x) { - if (diff(range(x)) == 0) { - return(c(1, rep(0, length(x) - 1))) - } else { - return((x - min(x)) / (max(x) - min(x))) - } -} - -#################################################################### -#' Hill Saturation Transformation -#' -#' \code{saturation_hill} is a two-parametric version of the Hill -#' function that allows the saturation curve to flip between S and C shape. -#' -#' @family Transformations -#' @param x Numeric vector. -#' @param alpha Numeric. Alpha controls the shape of the saturation curve. -#' The larger the alpha, the more S-shape. The smaller, the more C-shape. -#' @param gamma Numeric. Gamma controls the inflexion point of the -#' saturation curve. The larger the gamma, the later the inflexion point occurs. -#' @param x_marginal Numeric. When provided, the function returns the -#' Hill-transformed value of the x_marginal input. -#' @examples -#' saturation_hill(c(100, 150, 170, 190, 200), alpha = 3, gamma = 0.5) -#' @return Numeric values. Transformed values. -#' @export -saturation_hill <- function(x, alpha, gamma, x_marginal = NULL) { - stopifnot(length(alpha) == 1) - stopifnot(length(gamma) == 1) - inflexion <- c(range(x) %*% c(1 - gamma, gamma)) # linear interpolation by dot product - if (is.null(x_marginal)) { - x_scurve <- x**alpha / (x**alpha + inflexion**alpha) # plot(x_scurve) summary(x_scurve) - } else { - x_scurve <- x_marginal**alpha / (x_marginal**alpha + inflexion**alpha) - } - return(x_scurve) -} - - -#################################################################### -#' Adstocking Help Plot -#' -#' @param plot Boolean. Do you wish to return the plot? 
-#' @rdname adstocks -#' @export -plot_adstock <- function(plot = TRUE) { - if (plot) { - ## Plot geometric - geomCollect <- list() - thetaVec <- c(0.01, 0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9) - - for (v in seq_along(thetaVec)) { - thetaVecCum <- 1 - for (t in 2:100) { - thetaVecCum[t] <- thetaVecCum[t - 1] * thetaVec[v] - } - dt_geom <- data.frame( - x = 1:100, - decay_accumulated = thetaVecCum, - theta = thetaVec[v] - ) - dt_geom$halflife <- which.min(abs(dt_geom$decay_accumulated - 0.5)) - geomCollect[[v]] <- dt_geom - } - geomCollect <- bind_rows(geomCollect) - geomCollect$theta_halflife <- paste(geomCollect$theta, geomCollect$halflife, sep = "_") - - p1 <- ggplot(geomCollect, aes(x = .data$x, y = .data$decay_accumulated)) + - geom_line(aes(color = .data$theta_halflife)) + - geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray") + - geom_text(aes(x = max(.data$x), y = 0.5, vjust = -0.5, hjust = 1, label = "Halflife"), colour = "gray") + - labs( - title = "Geometric Adstock\n(Fixed decay rate)", - subtitle = "Halflife = time until effect reduces to 50%", - x = "Time unit", - y = "Media decay accumulated" - ) + - theme_lares(background = "white", pal = 2) - - ## Plot weibull - weibullCollect <- list() - shapeVec <- c(0.5, 1, 2, 9) - scaleVec <- c(0.01, 0.05, 0.1, 0.15, 0.2, 0.5) - types <- c("CDF", "PDF") - n <- 1 - for (t in seq_along(types)) { - for (v1 in seq_along(shapeVec)) { - for (v2 in seq_along(scaleVec)) { - dt_weibull <- data.frame( - x = 1:100, - decay_accumulated = adstock_weibull( - 1:100, - shape = shapeVec[v1], scale = scaleVec[v2], - type = tolower(types[t]) - )$thetaVecCum, - shape = paste0("shape=", shapeVec[v1]), - scale = as.factor(scaleVec[v2]), - type = types[t] - ) - dt_weibull$halflife <- which.min(abs(dt_weibull$decay_accumulated - 0.5)) - weibullCollect[[n]] <- dt_weibull - n <- n + 1 - } - } - } - weibullCollect <- bind_rows(weibullCollect) - - p2 <- ggplot(weibullCollect, aes(x = .data$x, y = .data$decay_accumulated)) + - geom_line(aes(color = .data$scale)) + - facet_grid(.data$shape ~ .data$type) + - geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray") + - geom_text(aes(x = max(.data$x), y = 0.5, vjust = -0.5, hjust = 1, label = "Halflife"), - colour = "gray" - ) + - labs( - title = "Weibull Adstock CDF vs PDF\n(Flexible decay rate)", - subtitle = "Halflife = time until effect reduces to 50%", - x = "Time unit", - y = "Media decay accumulated" - ) + - theme_lares(background = "white", pal = 2) - return(wrap_plots(A = p1, B = p2, design = "ABB")) - } -} - - -#################################################################### -#' Saturation Help Plot -#' -#' Produce example plots for the Hill saturation curve. 
-#' -#' @inheritParams plot_adstock -#' @rdname saturation_hill -#' @export -plot_saturation <- function(plot = TRUE) { - if (plot) { - xSample <- 1:100 - alphaSamp <- c(0.1, 0.5, 1, 2, 3) - gammaSamp <- c(0.1, 0.3, 0.5, 0.7, 0.9) - - ## Plot alphas - hillAlphaCollect <- list() - for (i in seq_along(alphaSamp)) { - hillAlphaCollect[[i]] <- data.frame( - x = xSample, - y = xSample**alphaSamp[i] / (xSample**alphaSamp[i] + (0.5 * 100)**alphaSamp[i]), - alpha = alphaSamp[i] - ) - } - hillAlphaCollect <- bind_rows(hillAlphaCollect) - hillAlphaCollect$alpha <- as.factor(hillAlphaCollect$alpha) - p1 <- ggplot(hillAlphaCollect, aes(x = .data$x, y = .data$y, color = .data$alpha)) + - geom_line() + - labs( - title = "Cost response with hill function", - subtitle = "Alpha changes while gamma = 0.5" - ) + - theme_lares(background = "white", pal = 2) - - ## Plot gammas - hillGammaCollect <- list() - for (i in seq_along(gammaSamp)) { - hillGammaCollect[[i]] <- data.frame( - x = xSample, - y = xSample**2 / (xSample**2 + (gammaSamp[i] * 100)**2), - gamma = gammaSamp[i] - ) - } - hillGammaCollect <- bind_rows(hillGammaCollect) - hillGammaCollect$gamma <- as.factor(hillGammaCollect$gamma) - p2 <- ggplot(hillGammaCollect, aes(x = .data$x, y = .data$y, color = .data$gamma)) + - geom_line() + - labs( - title = "Cost response with hill function", - subtitle = "Gamma changes while alpha = 2" - ) + - theme_lares(background = "white", pal = 2) - - return(p1 + p2) - } -} - -#### Transform media for model fitting -run_transformations <- function(InputCollect, hypParamSam, adstock) { - all_media <- InputCollect$all_media - rollingWindowStartWhich <- InputCollect$rollingWindowStartWhich - rollingWindowEndWhich <- InputCollect$rollingWindowEndWhich - dt_modAdstocked <- select(InputCollect$dt_mod, -.data$ds) - - mediaAdstocked <- list() - # mediaImmediate <- list() - # mediaCarryover <- list() - # mediaVecCum <- list() - mediaSaturated <- list() - mediaSaturatedImmediate <- list() - mediaSaturatedCarryover <- list() - - for (v in seq_along(all_media)) { - ################################################ - ## 1. Adstocking (whole data) - # Decayed/adstocked response = Immediate response + Carryover response - m <- dt_modAdstocked[, all_media[v]][[1]] - if (adstock == "geometric") { - theta <- hypParamSam[paste0(all_media[v], "_thetas")][[1]][[1]] - } - if (grepl("weibull", adstock)) { - shape <- hypParamSam[paste0(all_media[v], "_shapes")][[1]][[1]] - scale <- hypParamSam[paste0(all_media[v], "_scales")][[1]][[1]] - } - x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale) - m_imme <- if (adstock == "weibull_pdf") x_list$x_imme else m - m_adstocked <- x_list$x_decayed - mediaAdstocked[[v]] <- m_adstocked - m_carryover <- m_adstocked - m_imme - # mediaImmediate[[v]] <- m_imme - # mediaCarryover[[v]] <- m_carryover - # mediaVecCum[[v]] <- x_list$thetaVecCum - - ################################################ - ## 2. 
Saturation (only window data)
-    # Saturated response = Immediate response + carryover response
-    m_adstockedRollWind <- m_adstocked[rollingWindowStartWhich:rollingWindowEndWhich]
-    m_carryoverRollWind <- m_carryover[rollingWindowStartWhich:rollingWindowEndWhich]
-
-    alpha <- hypParamSam[paste0(all_media[v], "_alphas")][[1]][[1]]
-    gamma <- hypParamSam[paste0(all_media[v], "_gammas")][[1]][[1]]
-    mediaSaturated[[v]] <- saturation_hill(
-      m_adstockedRollWind,
-      alpha = alpha, gamma = gamma
-    )
-    mediaSaturatedCarryover[[v]] <- saturation_hill(
-      m_adstockedRollWind,
-      alpha = alpha, gamma = gamma, x_marginal = m_carryoverRollWind
-    )
-    mediaSaturatedImmediate[[v]] <- mediaSaturated[[v]] - mediaSaturatedCarryover[[v]]
-    # plot(m_adstockedRollWind, mediaSaturated[[1]])
-  }
-
-  names(mediaAdstocked) <- names(mediaSaturated) <- names(mediaSaturatedImmediate) <-
-    names(mediaSaturatedCarryover) <- all_media
-  dt_modAdstocked <- dt_modAdstocked %>%
-    select(-all_of(all_media)) %>%
-    bind_cols(mediaAdstocked)
-  # dt_mediaImmediate <- bind_cols(mediaImmediate)
-  # dt_mediaCarryover <- bind_cols(mediaCarryover)
-  # mediaVecCum <- bind_cols(mediaVecCum)
-  dt_modSaturated <- dt_modAdstocked[rollingWindowStartWhich:rollingWindowEndWhich, ] %>%
-    select(-all_of(all_media)) %>%
-    bind_cols(mediaSaturated)
-  dt_saturatedImmediate <- bind_cols(mediaSaturatedImmediate)
-  dt_saturatedImmediate[is.na(dt_saturatedImmediate)] <- 0
-  dt_saturatedCarryover <- bind_cols(mediaSaturatedCarryover)
-  dt_saturatedCarryover[is.na(dt_saturatedCarryover)] <- 0
-  return(list(
-    dt_modSaturated = dt_modSaturated,
-    dt_saturatedImmediate = dt_saturatedImmediate,
-    dt_saturatedCarryover = dt_saturatedCarryover
-  ))
-}
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Includes function mic_men, adstock_geometric, adstock_weibull,
+# saturation_hill, plot_adstock, plot_saturation
+
+####################################################################
+#' Michaelis-Menten Transformation
+#'
+#' The Michaelis-Menten \code{mic_men()} function is used to fit the spend
+#' exposure relationship for paid media variables, when exposure metrics like
+#' impressions, clicks or GRPs are provided in \code{paid_media_vars} instead
+#' of spend metrics.
+#'
+#' @family Transformations
+#' @param x Numeric value or vector. Input media spend when
+#' \code{reverse = FALSE}. Input media exposure metrics (impression, clicks,
+#' GRPs, etc.) when \code{reverse = TRUE}.
+#' @param Vmax Numeric. Indicates maximum rate achieved by the system.
+#' @param Km Numeric. The Michaelis constant.
+#' @param reverse Boolean. Input media spend when \code{reverse = FALSE}.
+#' Input media exposure metrics (impression, clicks, GRPs etc.) when \code{reverse = TRUE}.
+#' @examples
+#' mic_men(x = 5:10, Vmax = 5, Km = 0.5)
+#' @return Numeric values. Transformed values.
+#' @export
+mic_men <- function(x, Vmax, Km, reverse = FALSE) {
+  if (!reverse) {
+    mm_out <- Vmax * x / (Km + x)
+  } else {
+    mm_out <- x * Km / (Vmax - x)
+  }
+  return(mm_out)
+}
+
+
+####################################################################
+#' Adstocking Transformation (Geometric and Weibull)
+#'
+#' \code{adstock_geometric()} for Geometric Adstocking is the classic one-parametric
+#' adstock function.
+#'
+#' @family Transformations
+#' @param x A numeric vector.
+#' @param theta Numeric. Theta is the only parameter on Geometric Adstocking and means
+#' fixed decay rate. Assuming TV spend on day 1 is 100€ and theta = 0.7, then day 2 has
+#' 100 x 0.7 = 70€ worth of effect carried-over from day 1, day 3 has 70 x 0.7 = 49€
+#' from day 2 etc. Rule-of-thumb for common media genres: TV c(0.3, 0.8), OOH/Print/
+#' Radio c(0.1, 0.4), digital c(0, 0.3).
+#' @examples
+#' adstock_geometric(rep(100, 5), theta = 0.5)
+#' @return Numeric values. Transformed values.
+#' @rdname adstocks
+#' @export
+adstock_geometric <- function(x, theta) {
+  stopifnot(length(theta) == 1)
+  if (length(x) > 1) {
+    x_decayed <- c(x[1], rep(0, length(x) - 1))
+    for (xi in 2:length(x_decayed)) {
+      x_decayed[xi] <- x[xi] + theta * x_decayed[xi - 1]
+    }
+    thetaVecCum <- theta
+    for (t in 2:length(x)) {
+      thetaVecCum[t] <- thetaVecCum[t - 1] * theta
+    } # plot(thetaVecCum)
+  } else {
+    x_decayed <- x
+    thetaVecCum <- theta
+  }
+  inflation_total <- sum(x_decayed) / sum(x)
+  return(list(x = x, x_decayed = x_decayed, thetaVecCum = thetaVecCum, inflation_total = inflation_total))
+}
+
+
+####################################################################
+#' Adstocking Transformation (Geometric and Weibull)
+#'
+#' \code{adstock_weibull()} for Weibull Adstocking is a two-parametric adstock
+#' function that allows changing decay rate over time, as opposed to the fixed
+#' decay rate over time as in Geometric adstock. It has two options, the cumulative
+#' distribution function "CDF" or the probability density function "PDF".
+#'
+#' \describe{
+#' \item{Weibull's CDF (Cumulative Distribution Function)}{has
+#' two parameters, shape & scale, and has flexible decay rate, compared to Geometric
+#' adstock with fixed decay rate. The shape parameter controls the shape of the decay
+#' curve. Recommended bound is c(0.0001, 2). The larger the shape, the more S-shape. The
+#' smaller, the more L-shape. Scale controls the inflexion point of the decay curve. We
+#' recommend very conservative bounds of c(0, 0.1), because scale increases the adstock
+#' half-life greatly.}
+#' \item{Weibull's PDF (Probability Density Function)}{also has shape & scale as
+#' parameters and flexible decay rate like Weibull CDF. The difference is that Weibull PDF
+#' offers lagged effect. When shape > 2, the curve peaks after x = 0 and has NULL slope at
+#' x = 0, enabling lagged effect and sharper increase and decrease of adstock, while the
+#' scale parameter indicates the limit of the relative position of the peak at x axis; when
+#' 1 < shape < 2, the curve peaks after x = 0 and has infinite positive slope at x = 0,
+#' enabling lagged effect and slower increase and decrease of adstock, while scale has the
+#' same effect as above; when shape = 1, the curve peaks at x = 0 and reduces to exponential
+#' decay, while scale controls the inflexion point; when 0 < shape < 1, the curve peaks at
+#' x = 0 and has increasing decay, while scale controls the inflexion point. When all
+#' possible shapes are relevant, we recommend c(0.0001, 10) as bounds for shape; when only
+#' strong lagged effect is of interest, we recommend c(2.0001, 10) as bound for shape. In
+#' all cases, we recommend conservative bounds of c(0, 0.1) for scale. Due to the great
+#' flexibility of Weibull PDF, meaning more freedom in hyperparameter spaces for Nevergrad
+#' to explore, it also requires more iterations to converge.}
+#' }
+#'
+#' Run \code{plot_adstock()} to see the difference visually.
+#'
+#' @param shape,scale Numeric. Check "Details" section for more details.
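A worked example of the geometric decay just described, assuming theta = 0.7: a single burst of 100 decays by 30% each period, and the cumulative decay vector is what plot_adstock() visualizes.

    library(Robyn)
    out <- adstock_geometric(c(100, 0, 0, 0, 0), theta = 0.7)
    out$x_decayed      # 100, 70, 49, 34.3, 24.01: each period keeps 70% of the previous effect
    out$thetaVecCum    # 0.7, 0.49, 0.343, ...: cumulative decay rates
    which(out$thetaVecCum < 0.5)[1]   # 2: halflife, first period below 50% of the original effect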
+#' @param windlen Integer. Length of modelling window. By default, same length as \code{x}. +#' @param type Character. Accepts "CDF" or "PDF". CDF, or cumulative density +#' function of the Weibull function allows changing decay rate over time in both +#' C and S shape, while the peak value will always stay at the first period, +#' meaning no lagged effect. PDF, or the probability density function, enables +#' peak value occurring after the first period when shape >=1, allowing lagged +#' effect. +#' @examples +#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "CDF") +#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "PDF") +#' +#' # Wrapped function for either adstock +#' transform_adstock(rep(100, 10), "weibull_pdf", shape = 1, scale = 0.5) +#' @rdname adstocks +#' @export +adstock_weibull <- function(x, shape, scale, windlen = length(x), type = "cdf") { + stopifnot(length(shape) == 1) + stopifnot(length(scale) == 1) + if (length(x) > 1) { + check_opts(tolower(type), c("cdf", "pdf")) + x_bin <- 1:windlen + scaleTrans <- round(quantile(1:windlen, scale), 0) + if (shape == 0 | scale == 0) { + x_decayed <- x + thetaVecCum <- thetaVec <- rep(0, windlen) + x_imme <- x + } else { + if ("cdf" %in% tolower(type)) { + thetaVec <- c(1, 1 - pweibull(head(x_bin, -1), shape = shape, scale = scaleTrans)) # plot(thetaVec) + thetaVecCum <- cumprod(thetaVec) # plot(thetaVecCum) + } else if ("pdf" %in% tolower(type)) { + thetaVecCum <- .normalize(dweibull(x_bin, shape = shape, scale = scaleTrans)) # plot(thetaVecCum) + } + x_decayed <- mapply(function(x_val, x_pos) { + x.vec <- c(rep(0, x_pos - 1), rep(x_val, windlen - x_pos + 1)) + thetaVecCumLag <- lag(thetaVecCum, x_pos - 1, default = 0) + x.prod <- x.vec * thetaVecCumLag + return(x.prod) + }, x_val = x, x_pos = seq_along(x)) + x_imme <- diag(x_decayed) + x_decayed <- rowSums(x_decayed)[seq_along(x)] + } + } else { + x_decayed <- x_imme <- x + thetaVecCum <- 1 + } + inflation_total <- sum(x_decayed) / sum(x) + return(list( + x = x, + x_decayed = x_decayed, + thetaVecCum = thetaVecCum, + inflation_total = inflation_total, + x_imme = x_imme + )) +} + +#' @rdname adstocks +#' @param adstock Character. One of: "geometric", "weibull_cdf", "weibull_pdf". +#' @export +transform_adstock <- function(x, adstock, theta = NULL, shape = NULL, scale = NULL, windlen = length(x)) { + check_adstock(adstock) + if (adstock == "geometric") { + x_list_sim <- adstock_geometric(x = x, theta = theta) + } else if (adstock == "weibull_cdf") { + x_list_sim <- adstock_weibull(x = x, shape = shape, scale = scale, windlen = windlen, type = "cdf") + } else if (adstock == "weibull_pdf") { + x_list_sim <- adstock_weibull(x = x, shape = shape, scale = scale, windlen = windlen, type = "pdf") + } + return(x_list_sim) +} + +.normalize <- function(x) { + if (diff(range(x)) == 0) { + return(c(1, rep(0, length(x) - 1))) + } else { + return((x - min(x)) / (max(x) - min(x))) + } +} + +#################################################################### +#' Hill Saturation Transformation +#' +#' \code{saturation_hill} is a two-parametric version of the Hill +#' function that allows the saturation curve to flip between S and C shape. +#' +#' @family Transformations +#' @param x Numeric vector. +#' @param alpha Numeric. Alpha controls the shape of the saturation curve. +#' The larger the alpha, the more S-shape. The smaller, the more C-shape. +#' @param gamma Numeric. Gamma controls the inflexion point of the +#' saturation curve. 
The larger the gamma, the later the inflexion point occurs. +#' @param x_marginal Numeric. When provided, the function returns the +#' Hill-transformed value of the x_marginal input. +#' @examples +#' saturation_hill(c(100, 150, 170, 190, 200), alpha = 3, gamma = 0.5) +#' @return Numeric values. Transformed values. +#' @export +saturation_hill <- function(x, alpha, gamma, x_marginal = NULL) { + stopifnot(length(alpha) == 1) + stopifnot(length(gamma) == 1) + inflexion <- c(range(x) %*% c(1 - gamma, gamma)) # linear interpolation by dot product + if (is.null(x_marginal)) { + x_scurve <- x**alpha / (x**alpha + inflexion**alpha) # plot(x_scurve) summary(x_scurve) + } else { + x_scurve <- x_marginal**alpha / (x_marginal**alpha + inflexion**alpha) + } + return(x_scurve) +} + + +#################################################################### +#' Adstocking Help Plot +#' +#' @param plot Boolean. Do you wish to return the plot? +#' @rdname adstocks +#' @export +plot_adstock <- function(plot = TRUE) { + if (plot) { + ## Plot geometric + geomCollect <- list() + thetaVec <- c(0.01, 0.05, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9) + + for (v in seq_along(thetaVec)) { + thetaVecCum <- 1 + for (t in 2:100) { + thetaVecCum[t] <- thetaVecCum[t - 1] * thetaVec[v] + } + dt_geom <- data.frame( + x = 1:100, + decay_accumulated = thetaVecCum, + theta = thetaVec[v] + ) + dt_geom$halflife <- which.min(abs(dt_geom$decay_accumulated - 0.5)) + geomCollect[[v]] <- dt_geom + } + geomCollect <- bind_rows(geomCollect) + geomCollect$theta_halflife <- paste(geomCollect$theta, geomCollect$halflife, sep = "_") + + p1 <- ggplot(geomCollect, aes(x = .data$x, y = .data$decay_accumulated)) + + geom_line(aes(color = .data$theta_halflife)) + + geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray") + + geom_text(aes(x = max(.data$x), y = 0.5, vjust = -0.5, hjust = 1, label = "Halflife"), colour = "gray") + + labs( + title = "Geometric Adstock\n(Fixed decay rate)", + subtitle = "Halflife = time until effect reduces to 50%", + x = "Time unit", + y = "Media decay accumulated" + ) + + theme_lares(background = "white", pal = 2) + + ## Plot weibull + weibullCollect <- list() + shapeVec <- c(0.5, 1, 2, 9) + scaleVec <- c(0.01, 0.05, 0.1, 0.15, 0.2, 0.5) + types <- c("CDF", "PDF") + n <- 1 + for (t in seq_along(types)) { + for (v1 in seq_along(shapeVec)) { + for (v2 in seq_along(scaleVec)) { + dt_weibull <- data.frame( + x = 1:100, + decay_accumulated = adstock_weibull( + 1:100, + shape = shapeVec[v1], scale = scaleVec[v2], + type = tolower(types[t]) + )$thetaVecCum, + shape = paste0("shape=", shapeVec[v1]), + scale = as.factor(scaleVec[v2]), + type = types[t] + ) + dt_weibull$halflife <- which.min(abs(dt_weibull$decay_accumulated - 0.5)) + weibullCollect[[n]] <- dt_weibull + n <- n + 1 + } + } + } + weibullCollect <- bind_rows(weibullCollect) + + p2 <- ggplot(weibullCollect, aes(x = .data$x, y = .data$decay_accumulated)) + + geom_line(aes(color = .data$scale)) + + facet_grid(.data$shape ~ .data$type) + + geom_hline(yintercept = 0.5, linetype = "dashed", color = "gray") + + geom_text(aes(x = max(.data$x), y = 0.5, vjust = -0.5, hjust = 1, label = "Halflife"), + colour = "gray" + ) + + labs( + title = "Weibull Adstock CDF vs PDF\n(Flexible decay rate)", + subtitle = "Halflife = time until effect reduces to 50%", + x = "Time unit", + y = "Media decay accumulated" + ) + + theme_lares(background = "white", pal = 2) + return(wrap_plots(A = p1, B = p2, design = "ABB")) + } +} + + 
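A short numeric check of the inflexion-point interpolation inside saturation_hill() above: with gamma = 0.5 the inflexion sits halfway between min(x) and max(x), and the dot product range(x) %*% c(1 - gamma, gamma) is exactly that linear interpolation.

    library(Robyn)
    x <- c(100, 150, 170, 190, 200)
    alpha <- 3; gamma <- 0.5
    inflexion <- (1 - gamma) * min(x) + gamma * max(x)   # 150
    x^alpha / (x^alpha + inflexion^alpha)                # 0.2286 0.5000 0.5928 0.6702 0.7033
    saturation_hill(x, alpha = alpha, gamma = gamma)     # identical output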
+####################################################################
+#' Saturation Help Plot
+#'
+#' Produce example plots for the Hill saturation curve.
+#'
+#' @inheritParams plot_adstock
+#' @rdname saturation_hill
+#' @export
+plot_saturation <- function(plot = TRUE) {
+  if (plot) {
+    xSample <- 1:100
+    alphaSamp <- c(0.1, 0.5, 1, 2, 3)
+    gammaSamp <- c(0.1, 0.3, 0.5, 0.7, 0.9)
+
+    ## Plot alphas
+    hillAlphaCollect <- list()
+    for (i in seq_along(alphaSamp)) {
+      hillAlphaCollect[[i]] <- data.frame(
+        x = xSample,
+        y = xSample**alphaSamp[i] / (xSample**alphaSamp[i] + (0.5 * 100)**alphaSamp[i]),
+        alpha = alphaSamp[i]
+      )
+    }
+    hillAlphaCollect <- bind_rows(hillAlphaCollect)
+    hillAlphaCollect$alpha <- as.factor(hillAlphaCollect$alpha)
+    p1 <- ggplot(hillAlphaCollect, aes(x = .data$x, y = .data$y, color = .data$alpha)) +
+      geom_line() +
+      labs(
+        title = "Cost response with hill function",
+        subtitle = "Alpha changes while gamma = 0.5"
+      ) +
+      theme_lares(background = "white", pal = 2)
+
+    ## Plot gammas
+    hillGammaCollect <- list()
+    for (i in seq_along(gammaSamp)) {
+      hillGammaCollect[[i]] <- data.frame(
+        x = xSample,
+        y = xSample**2 / (xSample**2 + (gammaSamp[i] * 100)**2),
+        gamma = gammaSamp[i]
+      )
+    }
+    hillGammaCollect <- bind_rows(hillGammaCollect)
+    hillGammaCollect$gamma <- as.factor(hillGammaCollect$gamma)
+    p2 <- ggplot(hillGammaCollect, aes(x = .data$x, y = .data$y, color = .data$gamma)) +
+      geom_line() +
+      labs(
+        title = "Cost response with hill function",
+        subtitle = "Gamma changes while alpha = 2"
+      ) +
+      theme_lares(background = "white", pal = 2)
+
+    return(p1 + p2)
+  }
+}
+
+#### Transform media for model fitting
+run_transformations <- function(InputCollect, hypParamSam, adstock) {
+  all_media <- InputCollect$all_media
+  rollingWindowStartWhich <- InputCollect$rollingWindowStartWhich
+  rollingWindowEndWhich <- InputCollect$rollingWindowEndWhich
+  dt_modAdstocked <- select(InputCollect$dt_mod, -.data$ds)
+
+  mediaAdstocked <- list()
+  # mediaImmediate <- list()
+  # mediaCarryover <- list()
+  # mediaVecCum <- list()
+  mediaSaturated <- list()
+  mediaSaturatedImmediate <- list()
+  mediaSaturatedCarryover <- list()
+
+  for (v in seq_along(all_media)) {
+    ################################################
+    ## 1. Adstocking (whole data)
+    # Decayed/adstocked response = Immediate response + Carryover response
+    m <- dt_modAdstocked[, all_media[v]][[1]]
+    if (adstock == "geometric") {
+      theta <- hypParamSam[paste0(all_media[v], "_thetas")][[1]][[1]]
+    }
+    if (grepl("weibull", adstock)) {
+      shape <- hypParamSam[paste0(all_media[v], "_shapes")][[1]][[1]]
+      scale <- hypParamSam[paste0(all_media[v], "_scales")][[1]][[1]]
+    }
+    x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale)
+    m_imme <- if (adstock == "weibull_pdf") x_list$x_imme else m
+    m_adstocked <- x_list$x_decayed
+    mediaAdstocked[[v]] <- m_adstocked
+    m_carryover <- m_adstocked - m_imme
+    # mediaImmediate[[v]] <- m_imme
+    # mediaCarryover[[v]] <- m_carryover
+    # mediaVecCum[[v]] <- x_list$thetaVecCum
+
+    ################################################
+    ## 2. Saturation (only window data)
+    # Saturated response = Immediate response + carryover response
+    m_adstockedRollWind <- m_adstocked[rollingWindowStartWhich:rollingWindowEndWhich]
+    m_carryoverRollWind <- m_carryover[rollingWindowStartWhich:rollingWindowEndWhich]
+
+    alpha <- hypParamSam[paste0(all_media[v], "_alphas")][[1]][[1]]
+    gamma <- hypParamSam[paste0(all_media[v], "_gammas")][[1]][[1]]
+    mediaSaturated[[v]] <- saturation_hill(
+      m_adstockedRollWind,
+      alpha = alpha, gamma = gamma
+    )
+    mediaSaturatedCarryover[[v]] <- saturation_hill(
+      m_adstockedRollWind,
+      alpha = alpha, gamma = gamma, x_marginal = m_carryoverRollWind
+    )
+    mediaSaturatedImmediate[[v]] <- mediaSaturated[[v]] - mediaSaturatedCarryover[[v]]
+    # plot(m_adstockedRollWind, mediaSaturated[[1]])
+  }
+
+  names(mediaAdstocked) <- names(mediaSaturated) <- names(mediaSaturatedImmediate) <-
+    names(mediaSaturatedCarryover) <- all_media
+  dt_modAdstocked <- dt_modAdstocked %>%
+    select(-all_of(all_media)) %>%
+    bind_cols(mediaAdstocked)
+  # dt_mediaImmediate <- bind_cols(mediaImmediate)
+  # dt_mediaCarryover <- bind_cols(mediaCarryover)
+  # mediaVecCum <- bind_cols(mediaVecCum)
+  dt_modSaturated <- dt_modAdstocked[rollingWindowStartWhich:rollingWindowEndWhich, ] %>%
+    select(-all_of(all_media)) %>%
+    bind_cols(mediaSaturated)
+  dt_saturatedImmediate <- bind_cols(mediaSaturatedImmediate)
+  dt_saturatedImmediate[is.na(dt_saturatedImmediate)] <- 0
+  dt_saturatedCarryover <- bind_cols(mediaSaturatedCarryover)
+  dt_saturatedCarryover[is.na(dt_saturatedCarryover)] <- 0
+  return(list(
+    dt_modSaturated = dt_modSaturated,
+    dt_saturatedImmediate = dt_saturatedImmediate,
+    dt_saturatedCarryover = dt_saturatedCarryover
+  ))
+}
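The additive split used in the loop above can be replayed on a single channel; a minimal sketch (hyperparameter values are arbitrary):

    library(Robyn)

    x <- rep(100, 10)
    x_list <- transform_adstock(x, "weibull_pdf", shape = 2, scale = 0.5)
    m_imme <- x_list$x_imme
    m_carryover <- x_list$x_decayed - m_imme # decayed = immediate + carryover

    # Saturating the total and the carryover separately recovers the immediate
    # part, mirroring mediaSaturatedImmediate above
    sat_total <- saturation_hill(x_list$x_decayed, alpha = 2, gamma = 0.5)
    sat_carry <- saturation_hill(x_list$x_decayed, alpha = 2, gamma = 0.5, x_marginal = m_carryover)
    sat_imme <- sat_total - sat_carry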
diff --git a/R/zzz.R b/R/zzz.R
index 76e68e8..79889cd 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -1,8 +1,8 @@
-# Copyright (c) Meta Platforms, Inc. and its affiliates.
-
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-.onLoad <- function(libname, pkgname) {
-  reticulate::configure_environment(pkgname)
-}
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+.onLoad <- function(libname, pkgname) {
+  reticulate::configure_environment(pkgname)
+}
diff --git a/man/Robyn.Rd b/man/Robyn.Rd
index 382a92a..e2712bd 100644
--- a/man/Robyn.Rd
+++ b/man/Robyn.Rd
@@ -1,31 +1,31 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/imports.R
-\docType{package}
-\name{Robyn}
-\alias{Robyn}
-\alias{Robyn-package}
-\title{Robyn MMM Project from Meta Marketing Science}
-\description{
-Robyn is an automated Marketing Mix Modeling (MMM) code. It aims to reduce human
-bias by means of ridge regression and evolutionary algorithms, enables actionable
-decision making providing a budget allocator and diminishing returns curves and
-allows ground-truth calibration to account for causation.
-}
-\seealso{
-Useful links:
-\itemize{
-  \item \url{https://github.com/facebookexperimental/Robyn}
-  \item \url{https://facebookexperimental.github.io/Robyn/}
-  \item Report bugs at \url{https://github.com/facebookexperimental/Robyn/issues}
-}
-
-}
-\author{
-Gufeng Zhou (gufeng@meta.com)
-
-Leonel Sentana (leonelsentana@meta.com)
-
-Igor Skokan (igorskokan@meta.com)
-
-Bernardo Lares (bernardolares@meta.com)
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/imports.R
+\docType{package}
+\name{Robyn}
+\alias{Robyn}
+\alias{Robyn-package}
+\title{Robyn MMM Project from Meta Marketing Science}
+\description{
+Robyn is an automated Marketing Mix Modeling (MMM) code. It aims to reduce human
+bias by means of ridge regression and evolutionary algorithms, enables actionable
+decision making providing a budget allocator and diminishing returns curves and
+allows ground-truth calibration to account for causation.
+}
+\seealso{
+Useful links:
+\itemize{
+  \item \url{https://github.com/facebookexperimental/Robyn}
+  \item \url{https://facebookexperimental.github.io/Robyn/}
+  \item Report bugs at \url{https://github.com/facebookexperimental/Robyn/issues}
+}
+
+}
+\author{
+Gufeng Zhou (gufeng@meta.com)
+
+Leonel Sentana (leonelsentana@meta.com)
+
+Igor Skokan (igorskokan@meta.com)
+
+Bernardo Lares (bernardolares@meta.com)
+}
diff --git a/man/adstocks.Rd b/man/adstocks.Rd
index 035e2de..6da57ef 100644
--- a/man/adstocks.Rd
+++ b/man/adstocks.Rd
@@ -1,102 +1,102 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/transformation.R
-\name{adstock_geometric}
-\alias{adstock_geometric}
-\alias{adstock_weibull}
-\alias{transform_adstock}
-\alias{plot_adstock}
-\title{Adstocking Transformation (Geometric and Weibull)}
-\usage{
-adstock_geometric(x, theta)
-
-adstock_weibull(x, shape, scale, windlen = length(x), type = "cdf")
-
-transform_adstock(
-  x,
-  adstock,
-  theta = NULL,
-  shape = NULL,
-  scale = NULL,
-  windlen = length(x)
-)
-
-plot_adstock(plot = TRUE)
-}
-\arguments{
-\item{x}{A numeric vector.}
-
-\item{theta}{Numeric. Theta is the only parameter on Geometric Adstocking and means
-fixed decay rate. Assuming TV spend on day 1 is 100€ and theta = 0.7, then day 2 has
-100 x 0.7 = 70€ worth of effect carried-over from day 1, day 3 has 70 x 0.7 = 49€
-from day 2 etc. Rule-of-thumb for common media genre: TV c(0.3, 0.8), OOH/Print/
-Radio c(0.1, 0.4), digital c(0, 0.3).}
-
-\item{shape, scale}{Numeric. Check "Details" section for more details.}
-
-\item{windlen}{Integer. Length of modelling window. By default, same length as \code{x}.}
-
-\item{type}{Character. Accepts "CDF" or "PDF". CDF, or cumulative density
-function of the Weibull function allows changing decay rate over time in both
-C and S shape, while the peak value will always stay at the first period,
-meaning no lagged effect. PDF, or the probability density function, enables
-peak value occurring after the first period when shape >=1, allowing lagged
-effect.}
-
-\item{adstock}{Character. One of: "geometric", "weibull_cdf", "weibull_pdf".}
-
-\item{plot}{Boolean. Do you wish to return the plot?}
-}
-\value{
-Numeric values. Transformed values.
-}
-\description{
-\code{adstock_geometric()} for Geometric Adstocking is the classic one-parametric
-adstock function.
-
-\code{adstock_weibull()} for Weibull Adstocking is a two-parametric adstock
-function that allows changing decay rate over time, as opposed to the fixed
-decay rate over time as in Geometric adstock. It has two options, the cumulative
-density function "CDF" or the probability density function "PDF".
-}
-\details{
-\describe{
-  \item{Weibull's CDF (Cumulative Distribution Function)}{has
-two parameters, shape & scale, and has flexible decay rate, compared to Geometric
-adstock with fixed decay rate. The shape parameter controls the shape of the decay
-curve. Recommended bound is c(0.0001, 2). The larger the shape, the more S-shape. The
-smaller, the more L-shape. Scale controls the inflexion point of the decay curve. We
-recommend very conservative bounce of c(0, 0.1), because scale increases the adstock
-half-life greatly.}
-  \item{Weibull's PDF (Probability Density Function)}{also shape & scale as parameter
-and also has flexible decay rate as Weibull CDF. The difference is that Weibull PDF
-offers lagged effect. When shape > 2, the curve peaks after x = 0 and has NULL slope at
-x = 0, enabling lagged effect and sharper increase and decrease of adstock, while the
-scale parameter indicates the limit of the relative position of the peak at x axis; when
-1 < shape < 2, the curve peaks after x = 0 and has infinite positive slope at x = 0,
-enabling lagged effect and slower increase and decrease of adstock, while scale has the
-same effect as above; when shape = 1, the curve peaks at x = 0 and reduces to exponential
-decay, while scale controls the inflexion point; when 0 < shape < 1, the curve peaks at
-x = 0 and has increasing decay, while scale controls the inflexion point. When all
-possible shapes are relevant, we recommend c(0.0001, 10) as bounds for shape; when only
-strong lagged effect is of interest, we recommend c(2.0001, 10) as bound for shape. In
-all cases, we recommend conservative bound of c(0, 0.1) for scale. Due to the great
-flexibility of Weibull PDF, meaning more freedom in hyperparameter spaces for Nevergrad
-to explore, it also requires larger iterations to converge.}
-}
-
-Run \code{plot_adstock()} to see the difference visually.
-}
-\examples{
-adstock_geometric(rep(100, 5), theta = 0.5)
-adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "CDF")
-adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "PDF")
-
-# Wrapped function for either adstock
-transform_adstock(rep(100, 10), "weibull_pdf", shape = 1, scale = 0.5)
-}
-\seealso{
-Other Transformations:
-\code{\link{mic_men}()},
-\code{\link{saturation_hill}()}
-}
-\concept{Transformations}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transformation.R
+\name{adstock_geometric}
+\alias{adstock_geometric}
+\alias{adstock_weibull}
+\alias{transform_adstock}
+\alias{plot_adstock}
+\title{Adstocking Transformation (Geometric and Weibull)}
+\usage{
+adstock_geometric(x, theta)
+
+adstock_weibull(x, shape, scale, windlen = length(x), type = "cdf")
+
+transform_adstock(
+  x,
+  adstock,
+  theta = NULL,
+  shape = NULL,
+  scale = NULL,
+  windlen = length(x)
+)
+
+plot_adstock(plot = TRUE)
+}
+\arguments{
+\item{x}{A numeric vector.}
+
+\item{theta}{Numeric. Theta is the only parameter on Geometric Adstocking and means
+fixed decay rate. Assuming TV spend on day 1 is 100€ and theta = 0.7, then day 2 has
+100 x 0.7 = 70€ worth of effect carried-over from day 1, day 3 has 70 x 0.7 = 49€
+from day 2 etc. Rule-of-thumb for common media genre: TV c(0.3, 0.8), OOH/Print/
+Radio c(0.1, 0.4), digital c(0, 0.3).}
+
+\item{shape, scale}{Numeric. Check "Details" section for more details.}
+
+\item{windlen}{Integer. Length of modelling window. By default, same length as \code{x}.}
+
+\item{type}{Character. Accepts "CDF" or "PDF". CDF, or cumulative density
+function of the Weibull function allows changing decay rate over time in both
+C and S shape, while the peak value will always stay at the first period,
+meaning no lagged effect. PDF, or the probability density function, enables
+peak value occurring after the first period when shape >= 1, allowing lagged
+effect.}
+
+\item{adstock}{Character. One of: "geometric", "weibull_cdf", "weibull_pdf".}
+
+\item{plot}{Boolean. Do you wish to return the plot?}
+}
+\value{
+Numeric values. Transformed values.
+}
+\description{
+\code{adstock_geometric()} for Geometric Adstocking is the classic one-parametric
+adstock function.
+
+\code{adstock_weibull()} for Weibull Adstocking is a two-parametric adstock
+function that allows changing decay rate over time, as opposed to the fixed
+decay rate over time as in Geometric adstock. It has two options, the cumulative
+density function "CDF" or the probability density function "PDF".
+}
+\details{
+\describe{
+  \item{Weibull's CDF (Cumulative Distribution Function)}{has
+two parameters, shape & scale, and has a flexible decay rate, compared to Geometric
+adstock with its fixed decay rate. The shape parameter controls the shape of the decay
+curve. Recommended bound is c(0.0001, 2). The larger the shape, the more S-shape. The
+smaller, the more L-shape. Scale controls the inflexion point of the decay curve. We
+recommend very conservative bounds of c(0, 0.1), because scale increases the adstock
+half-life greatly.}
+  \item{Weibull's PDF (Probability Density Function)}{also takes shape & scale as parameters
+and also has a flexible decay rate, like Weibull's CDF. The difference is that Weibull PDF
+offers a lagged effect. When shape > 2, the curve peaks after x = 0 and has zero slope at
+x = 0, enabling a lagged effect and sharper increase and decrease of adstock, while the
+scale parameter indicates the limit of the relative position of the peak on the x axis; when
+1 < shape < 2, the curve peaks after x = 0 and has infinite positive slope at x = 0,
+enabling a lagged effect and slower increase and decrease of adstock, while scale has the
+same effect as above; when shape = 1, the curve peaks at x = 0 and reduces to exponential
+decay, while scale controls the inflexion point; when 0 < shape < 1, the curve peaks at
+x = 0 and has increasing decay, while scale controls the inflexion point. When all
+possible shapes are relevant, we recommend c(0.0001, 10) as bounds for shape; when only
+a strong lagged effect is of interest, we recommend c(2.0001, 10) as bounds for shape. In
+all cases, we recommend conservative bounds of c(0, 0.1) for scale. Due to the great
+flexibility of Weibull PDF, meaning more freedom in hyperparameter spaces for Nevergrad
+to explore, it also requires more iterations to converge.}
+}
+
+Run \code{plot_adstock()} to see the difference visually.
+}
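The lag behaviour described in the details can be read directly off the cumulative decay vector (illustrative sketch; parameter values are arbitrary):

    library(Robyn)

    x <- rep(100, 10)

    # CDF: decay always peaks in period 1, i.e. no lag
    cum_cdf <- adstock_weibull(x, shape = 2, scale = 0.5, type = "CDF")$thetaVecCum
    which.max(cum_cdf) # 1

    # PDF with shape > 1: the peak shifts past period 1, i.e. a lagged effect
    cum_pdf <- adstock_weibull(x, shape = 2, scale = 0.5, type = "PDF")$thetaVecCum
    which.max(cum_pdf) # > 1 (period 4 with these values)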
+\examples{
+adstock_geometric(rep(100, 5), theta = 0.5)
+adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "CDF")
+adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "PDF")
+
+# Wrapped function for either adstock
+transform_adstock(rep(100, 10), "weibull_pdf", shape = 1, scale = 0.5)
+}
+\seealso{
+Other Transformations:
+\code{\link{mic_men}()},
+\code{\link{saturation_hill}()}
+}
+\concept{Transformations}
diff --git a/man/dt_prophet_holidays.Rd b/man/dt_prophet_holidays.Rd
index 64b564f..2ec9cde 100644
--- a/man/dt_prophet_holidays.Rd
+++ b/man/dt_prophet_holidays.Rd
@@ -1,38 +1,38 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data.R
-\docType{data}
-\name{dt_prophet_holidays}
-\alias{dt_prophet_holidays}
-\title{Robyn Dataset: Holidays by Country}
-\format{
-An object of class \code{"data.frame"}
-\describe{
-  \item{ds}{Date}
-  \item{holiday}{Name of celebrated holiday}
-  \item{country}{Code for the country (Alpha-2)}
-  \item{year}{Year of \code{ds}}
-}
-}
-\usage{
-data(dt_prophet_holidays)
-}
-\value{
-data.frame
-
-Dataframe. Contains \code{prophet}'s default holidays by country.
-}
-\description{
-Contains \code{prophet}'s "new" default holidays by country.
-When using own holidays, please keep the header
-\code{c("ds", "holiday", "country", "year")}.
-}
-\examples{
-data(dt_prophet_holidays)
-head(dt_prophet_holidays)
-}
-\seealso{
-Other Dataset:
-\code{\link{dt_simulated_weekly}}
-}
-\concept{Dataset}
-\keyword{datasets}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{dt_prophet_holidays}
+\alias{dt_prophet_holidays}
+\title{Robyn Dataset: Holidays by Country}
+\format{
+An object of class \code{"data.frame"}
+\describe{
+  \item{ds}{Date}
+  \item{holiday}{Name of celebrated holiday}
+  \item{country}{Code for the country (Alpha-2)}
+  \item{year}{Year of \code{ds}}
+}
+}
+\usage{
+data(dt_prophet_holidays)
+}
+\value{
+data.frame
+
+Dataframe. Contains \code{prophet}'s default holidays by country.
+}
+\description{
+Contains \code{prophet}'s "new" default holidays by country.
+When using your own holidays, please keep the header
+\code{c("ds", "holiday", "country", "year")}.
+}
+\examples{
+data(dt_prophet_holidays)
+head(dt_prophet_holidays)
+}
+\seealso{
+Other Dataset:
+\code{\link{dt_simulated_weekly}}
+}
+\concept{Dataset}
+\keyword{datasets}
diff --git a/man/dt_simulated_weekly.Rd b/man/dt_simulated_weekly.Rd
index 6234623..026ffea 100644
--- a/man/dt_simulated_weekly.Rd
+++ b/man/dt_simulated_weekly.Rd
@@ -1,37 +1,37 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data.R
-\docType{data}
-\name{dt_simulated_weekly}
-\alias{dt_simulated_weekly}
-\title{Robyn Dataset: MMM Demo Data}
-\format{
-An object of class \code{"data.frame"}
-\describe{
-  \item{DATE}{Date}
-  \item{revenue}{Daily total revenue}
-  \item{tv_S}{Television}
-  \item{ooh_S}{Out of home}
-  \item{...}{...}
-}
-}
-\usage{
-data(dt_simulated_weekly)
-}
-\value{
-data.frame
-
-Dataframe. Contains simulated dummy dataset to test and run demo.
-}
-\description{
-Simulated MMM data. Input time series should be daily, weekly or monthly.
-}
-\examples{
-data(dt_simulated_weekly)
-head(dt_simulated_weekly)
-}
-\seealso{
-Other Dataset:
-\code{\link{dt_prophet_holidays}}
-}
-\concept{Dataset}
-\keyword{datasets}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{dt_simulated_weekly}
+\alias{dt_simulated_weekly}
+\title{Robyn Dataset: MMM Demo Data}
+\format{
+An object of class \code{"data.frame"}
+\describe{
+  \item{DATE}{Date}
+  \item{revenue}{Daily total revenue}
+  \item{tv_S}{Television}
+  \item{ooh_S}{Out of home}
+  \item{...}{...}
+}
+}
+\usage{
+data(dt_simulated_weekly)
+}
+\value{
+data.frame
+
+Dataframe. Contains a simulated dummy dataset to test and run the demo.
+}
+\description{
+Simulated MMM data. Input time series should be daily, weekly or monthly.
+}
+\examples{
+data(dt_simulated_weekly)
+head(dt_simulated_weekly)
+}
+\seealso{
+Other Dataset:
+\code{\link{dt_prophet_holidays}}
+}
+\concept{Dataset}
+\keyword{datasets}
diff --git a/man/fit_spend_exposure.Rd b/man/fit_spend_exposure.Rd
index 100e153..16b9d0b 100644
--- a/man/fit_spend_exposure.Rd
+++ b/man/fit_spend_exposure.Rd
@@ -1,29 +1,29 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/inputs.R
-\name{fit_spend_exposure}
-\alias{fit_spend_exposure}
-\title{Fit a nonlinear model for media spend and exposure}
-\usage{
-fit_spend_exposure(dt_spendModInput, mediaCostFactor, paid_media_var)
-}
-\arguments{
-\item{dt_spendModInput}{data.frame. Containing channel spends and
-exposure data.}
-
-\item{mediaCostFactor}{Numeric vector. The ratio between raw media
-exposure and spend metrics.}
-
-\item{paid_media_var}{Character. Paid media variable.}
-}
-\value{
-List. Containing the all spend-exposure model results.
-}
-\description{
-This function is called in \code{robyn_engineering()}. It uses
-the Michaelis-Menten function to fit the nonlinear model. Fallback
-model is the simple linear model \code{lm()} in case the nonlinear
-model is fitting worse. A bad fit here might result in unreasonable
-model results. Two options are recommended: Either splitting the
-channel into sub-channels to achieve better fit, or just use
-spend as \code{paid_media_vars}
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/inputs.R
+\name{fit_spend_exposure}
+\alias{fit_spend_exposure}
+\title{Fit a nonlinear model for media spend and exposure}
+\usage{
+fit_spend_exposure(dt_spendModInput, mediaCostFactor, paid_media_var)
+}
+\arguments{
+\item{dt_spendModInput}{data.frame. Containing channel spends and
+exposure data.}
+
+\item{mediaCostFactor}{Numeric vector. The ratio between raw media
+exposure and spend metrics.}
+
+\item{paid_media_var}{Character. Paid media variable.}
+}
+\value{
+List. Containing all spend-exposure model results.
+}
+\description{
+This function is called in \code{robyn_engineering()}. It uses
+the Michaelis-Menten function to fit the nonlinear model. The fallback
+model is the simple linear model \code{lm()}, used in case the nonlinear
+model fits worse. A bad fit here might result in unreasonable
+model results. Two options are recommended: either splitting the
+channel into sub-channels to achieve a better fit, or just using
+spend as \code{paid_media_vars}.
+}
diff --git a/man/hyper_limits.Rd b/man/hyper_limits.Rd
index abbe3f9..406c3e7 100644
--- a/man/hyper_limits.Rd
+++ b/man/hyper_limits.Rd
@@ -1,18 +1,18 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/inputs.R
-\name{hyper_limits}
-\alias{hyper_limits}
-\title{Check hyperparameter limits}
-\usage{
-hyper_limits()
-}
-\value{
-Dataframe. Contains upper and lower bounds for each hyperparameter.
-}
-\description{
-Reference data.frame that shows the upper and lower bounds valid
-for each hyperparameter.
-}
-\examples{
-hyper_limits()
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/inputs.R
+\name{hyper_limits}
+\alias{hyper_limits}
+\title{Check hyperparameter limits}
+\usage{
+hyper_limits()
+}
+\value{
+Dataframe. Contains upper and lower bounds for each hyperparameter.
+}
+\description{
+Reference data.frame that shows the upper and lower bounds valid
+for each hyperparameter.
+}
+\examples{
+hyper_limits()
+}
diff --git a/man/hyper_names.Rd b/man/hyper_names.Rd
index 63738ce..3cb0442 100644
--- a/man/hyper_names.Rd
+++ b/man/hyper_names.Rd
@@ -1,101 +1,101 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/inputs.R
-\name{hyper_names}
-\alias{hyper_names}
-\title{Get correct hyperparameter names}
-\usage{
-hyper_names(adstock, all_media, all_vars = NULL)
-}
-\arguments{
-\item{adstock}{Character. Default to \code{InputCollect$adstock}.
-Accepts "geometric", "weibull_cdf" or "weibull_pdf"}
-
-\item{all_media}{Character vector. Default to \code{InputCollect$all_media}.
-Includes \code{InputCollect$paid_media_spends} and \code{InputCollect$organic_vars}.}
-
-\item{all_vars}{Used to check the penalties inputs, especially for refreshing models.}
-}
-\value{
-Character vector. Names of hyper-parameters that should be defined.
-}
-\description{
-Output all hyperparameter names and help specifying the list of
-hyperparameters that is inserted into \code{robyn_inputs(hyperparameters = ...)}
-}
-\section{Guide to setup hyperparameters}{

-
- \enumerate{
-  \item Get correct hyperparameter names:
-  All variables in \code{paid_media_vars} or \code{organic_vars} require hyperprameters
-  and will be transformed by adstock & saturation. Difference between \code{paid_media_vars}
-  and \code{organic_vars} is that \code{paid_media_vars} has spend that
-  needs to be specified in \code{paid_media_spends} specifically. Run \code{hyper_names()}
-  to get correct hyperparameter names. All names in hyperparameters must
-  equal names from \code{hyper_names()}, case sensitive.
-  \item Get guidance for setting hyperparameter bounds:
-  For geometric adstock, use theta, alpha & gamma. For both weibull adstock options,
-  use shape, scale, alpha, gamma.
-  \itemize{
-    \item Theta: In geometric adstock, theta is decay rate. guideline for usual media genre:
-    TV c(0.3, 0.8), OOH/Print/Radio c(0.1, 0.4), digital c(0, 0.3)
-    \item Shape: In weibull adstock, shape controls the decay shape. Recommended c(0.0001, 2).
-    The larger, the more S-shape. The smaller, the more L-shape. Channel-type specific
-    values still to be investigated
-    \item Scale: In weibull adstock, scale controls the decay inflexion point. Very conservative
-    recommended bounce c(0, 0.1), because scale can increase adstocking half-life greatly.
-    Channel-type specific values still to be investigated
-    \item Gamma: In s-curve transformation with hill function, gamma controls the inflexion point.
-    Recommended bounce c(0.3, 1). The larger the gamma, the later the inflection point
-    in the response curve
-  }
-  \item Set each hyperparameter bounds. They either contains two values e.g. c(0, 0.5),
-  or only one value (in which case you've "fixed" that hyperparameter)
-}
-}
-
-\section{Helper plots}{
-
-\describe{
-  \item{plot_adstock}{Get adstock transformation example plot,
-helping you understand geometric/theta and weibull/shape/scale transformation}
-  \item{plot_saturation}{Get saturation curve transformation example plot,
-helping you understand hill/alpha/gamma transformation}
-}
-}
-
-\examples{
-\donttest{
-media <- c("facebook_S", "print_S", "tv_S")
-hyper_names(adstock = "geometric", all_media = media)
-
-hyperparameters <- list(
-  facebook_S_alphas = c(0.5, 3), # example bounds for alpha
-  facebook_S_gammas = c(0.3, 1), # example bounds for gamma
-  facebook_S_thetas = c(0, 0.3), # example bounds for theta
-  print_S_alphas = c(0.5, 3),
-  print_S_gammas = c(0.3, 1),
-  print_S_thetas = c(0.1, 0.4),
-  tv_S_alphas = c(0.5, 3),
-  tv_S_gammas = c(0.3, 1),
-  tv_S_thetas = c(0.3, 0.8)
-)
-
-# Define hyper_names for weibull adstock
-hyper_names(adstock = "weibull", all_media = media)
-
-hyperparameters <- list(
-  facebook_S_alphas = c(0.5, 3), # example bounds for alpha
-  facebook_S_gammas = c(0.3, 1), # example bounds for gamma
-  facebook_S_shapes = c(0.0001, 2), # example bounds for shape
-  facebook_S_scales = c(0, 0.1), # example bounds for scale
-  print_S_alphas = c(0.5, 3),
-  print_S_gammas = c(0.3, 1),
-  print_S_shapes = c(0.0001, 2),
-  print_S_scales = c(0, 0.1),
-  tv_S_alphas = c(0.5, 3),
-  tv_S_gammas = c(0.3, 1),
-  tv_S_shapes = c(0.0001, 2),
-  tv_S_scales = c(0, 0.1)
-)
-}
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/inputs.R
+\name{hyper_names}
+\alias{hyper_names}
+\title{Get correct hyperparameter names}
+\usage{
+hyper_names(adstock, all_media, all_vars = NULL)
+}
+\arguments{
+\item{adstock}{Character. Default to \code{InputCollect$adstock}.
+Accepts "geometric", "weibull_cdf" or "weibull_pdf"}
+
+\item{all_media}{Character vector. Default to \code{InputCollect$all_media}.
+Includes \code{InputCollect$paid_media_spends} and \code{InputCollect$organic_vars}.}
+
+\item{all_vars}{Used to check the penalties inputs, especially for refreshing models.}
+}
+\value{
+Character vector. Names of hyper-parameters that should be defined.
+}
+\description{
+Outputs all hyperparameter names and helps specify the list of
+hyperparameters that is inserted into \code{robyn_inputs(hyperparameters = ...)}
+}
+\section{Guide to set up hyperparameters}{

+
+ \enumerate{
+  \item Get correct hyperparameter names:
+  All variables in \code{paid_media_vars} or \code{organic_vars} require hyperparameters
+  and will be transformed by adstock & saturation. Difference between \code{paid_media_vars}
+  and \code{organic_vars} is that \code{paid_media_vars} has spend that
+  needs to be specified in \code{paid_media_spends} specifically. Run \code{hyper_names()}
+  to get correct hyperparameter names. All names in hyperparameters must
+  equal names from \code{hyper_names()}, case sensitive.
+  \item Get guidance for setting hyperparameter bounds:
+  For geometric adstock, use theta, alpha & gamma. For both weibull adstock options,
+  use shape, scale, alpha, gamma.
+  \itemize{
+    \item Theta: In geometric adstock, theta is the decay rate. Guideline for usual media genres:
+    TV c(0.3, 0.8), OOH/Print/Radio c(0.1, 0.4), digital c(0, 0.3)
+    \item Shape: In weibull adstock, shape controls the decay shape. Recommended c(0.0001, 2).
+    The larger, the more S-shape. The smaller, the more L-shape. Channel-type specific
+    values still to be investigated
+    \item Scale: In weibull adstock, scale controls the decay inflexion point. Very conservative
+    recommended bounds of c(0, 0.1), because scale can increase adstocking half-life greatly.
+    Channel-type specific values still to be investigated
+    \item Gamma: In s-curve transformation with hill function, gamma controls the inflexion point.
+    Recommended bounds of c(0.3, 1). The larger the gamma, the later the inflection point
+    in the response curve
+  }
+  \item Set each hyperparameter's bounds. They either contain two values, e.g. c(0, 0.5),
+  or only one value (in which case you've "fixed" that hyperparameter)
+}
+}
+
+\section{Helper plots}{
+
+\describe{
+  \item{plot_adstock}{Get adstock transformation example plot,
+helping you understand geometric/theta and weibull/shape/scale transformation}
+  \item{plot_saturation}{Get saturation curve transformation example plot,
+helping you understand hill/alpha/gamma transformation}
+}
+}
+
+\examples{
+\donttest{
+media <- c("facebook_S", "print_S", "tv_S")
+hyper_names(adstock = "geometric", all_media = media)
+
+hyperparameters <- list(
+  facebook_S_alphas = c(0.5, 3), # example bounds for alpha
+  facebook_S_gammas = c(0.3, 1), # example bounds for gamma
+  facebook_S_thetas = c(0, 0.3), # example bounds for theta
+  print_S_alphas = c(0.5, 3),
+  print_S_gammas = c(0.3, 1),
+  print_S_thetas = c(0.1, 0.4),
+  tv_S_alphas = c(0.5, 3),
+  tv_S_gammas = c(0.3, 1),
+  tv_S_thetas = c(0.3, 0.8)
+)
+
+# Define hyper_names for weibull adstock
+hyper_names(adstock = "weibull", all_media = media)
+
+hyperparameters <- list(
+  facebook_S_alphas = c(0.5, 3), # example bounds for alpha
+  facebook_S_gammas = c(0.3, 1), # example bounds for gamma
+  facebook_S_shapes = c(0.0001, 2), # example bounds for shape
+  facebook_S_scales = c(0, 0.1), # example bounds for scale
+  print_S_alphas = c(0.5, 3),
+  print_S_gammas = c(0.3, 1),
+  print_S_shapes = c(0.0001, 2),
+  print_S_scales = c(0, 0.1),
+  tv_S_alphas = c(0.5, 3),
+  tv_S_gammas = c(0.3, 1),
+  tv_S_shapes = c(0.0001, 2),
+  tv_S_scales = c(0, 0.1)
+)
+}
+}
diff --git a/man/mic_men.Rd b/man/mic_men.Rd
index 2c5793c..01501c4 100644
--- a/man/mic_men.Rd
+++ b/man/mic_men.Rd
@@ -1,38 +1,38 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/transformation.R
-\name{mic_men}
-\alias{mic_men}
-\title{Michaelis-Menten Transformation}
-\usage{
-mic_men(x, Vmax, Km, reverse = FALSE)
-}
-\arguments{
-\item{x}{Numeric value or vector. Input media spend when
-\code{reverse = FALSE}. Input media exposure metrics (impression, clicks,
-GRPs, etc.) when \code{reverse = TRUE}.}
-
-\item{Vmax}{Numeric Indicates maximum rate achieved by the system.}
-
-\item{Km}{Numeric. The Michaelis constant.}
-
-\item{reverse}{Boolean. Input media spend when \code{reverse = FALSE}.
-Input media exposure metrics (impression, clicks, GRPs etc.) when \code{reverse = TRUE}.}
-}
-\value{
-Numeric values. Transformed values.
-}
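mic_men() fits the standard Michaelis-Menten curve, exposure = Vmax * spend / (Km + spend). Assuming reverse = TRUE is its algebraic inverse (an assumption, not confirmed by this patch), a round-trip should recover the original spend:

    library(Robyn)

    spend <- c(10, 50, 100)
    exposure <- mic_men(spend, Vmax = 5, Km = 0.5)          # spend -> exposure
    mic_men(exposure, Vmax = 5, Km = 0.5, reverse = TRUE)   # should return 10 50 100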
-\description{
-The Michaelis-Menten \code{mic_men()} function is used to fit the spend
-exposure relationship for paid media variables, when exposure metrics like
-impressions, clicks or GRPs are provided in \code{paid_media_vars} instead
-of spend metric.
-}
-\examples{
-mic_men(x = 5:10, Vmax = 5, Km = 0.5)
-}
-\seealso{
-Other Transformations:
-\code{\link{adstock_geometric}()},
-\code{\link{saturation_hill}()}
-}
-\concept{Transformations}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transformation.R
+\name{mic_men}
+\alias{mic_men}
+\title{Michaelis-Menten Transformation}
+\usage{
+mic_men(x, Vmax, Km, reverse = FALSE)
+}
+\arguments{
+\item{x}{Numeric value or vector. Input media spend when
+\code{reverse = FALSE}. Input media exposure metrics (impression, clicks,
+GRPs, etc.) when \code{reverse = TRUE}.}
+
+\item{Vmax}{Numeric. Indicates the maximum rate achieved by the system.}
+
+\item{Km}{Numeric. The Michaelis constant.}
+
+\item{reverse}{Boolean. Input media spend when \code{reverse = FALSE}.
+Input media exposure metrics (impression, clicks, GRPs etc.) when \code{reverse = TRUE}.}
+}
+\value{
+Numeric values. Transformed values.
+}
+\description{
+The Michaelis-Menten \code{mic_men()} function is used to fit the spend
+exposure relationship for paid media variables, when exposure metrics like
+impressions, clicks or GRPs are provided in \code{paid_media_vars} instead
+of the spend metric.
+}
+\examples{
+mic_men(x = 5:10, Vmax = 5, Km = 0.5)
+}
+\seealso{
+Other Transformations:
+\code{\link{adstock_geometric}()},
+\code{\link{saturation_hill}()}
+}
+\concept{Transformations}
diff --git a/man/prophet_decomp.Rd b/man/prophet_decomp.Rd
index 8567b70..b60bae4 100644
--- a/man/prophet_decomp.Rd
+++ b/man/prophet_decomp.Rd
@@ -1,45 +1,45 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/inputs.R
-\name{prophet_decomp}
-\alias{prophet_decomp}
-\title{Conduct prophet decomposition}
-\usage{
-prophet_decomp(
-  dt_transform,
-  dt_holidays,
-  prophet_country,
-  prophet_vars,
-  prophet_signs,
-  factor_vars,
-  context_vars,
-  organic_vars,
-  paid_media_spends,
-  intervalType,
-  dayInterval,
-  custom_params
-)
-}
-\arguments{
-\item{dt_transform}{A data.frame with all model features.
-Must contain \code{ds} column for time variable values and
-\code{dep_var} column for dependent variable values.}
-
-\item{dt_holidays}{data.frame. Raw input holiday data. Load standard
-Prophet holidays using \code{data("dt_prophet_holidays")}}
-
-\item{context_vars, paid_media_spends, intervalType, dayInterval, prophet_country, prophet_vars, prophet_signs, factor_vars}{As included in \code{InputCollect}}
-
-\item{organic_vars}{Character vector. Typically newsletter sendings,
-push-notifications, social media posts etc. Compared to \code{paid_media_vars}
-\code{organic_vars} are often marketing activities without clear spends.}
-
-\item{custom_params}{List. Custom parameters passed to \code{prophet()}}
-}
-\value{
-A list containing all prophet decomposition output.
-}
-\description{
-When \code{prophet_vars} in \code{robyn_inputs()} is specified, this
-function decomposes trend, season, holiday and weekday from the
-dependent variable.
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/inputs.R
+\name{prophet_decomp}
+\alias{prophet_decomp}
+\title{Conduct prophet decomposition}
+\usage{
+prophet_decomp(
+  dt_transform,
+  dt_holidays,
+  prophet_country,
+  prophet_vars,
+  prophet_signs,
+  factor_vars,
+  context_vars,
+  organic_vars,
+  paid_media_spends,
+  intervalType,
+  dayInterval,
+  custom_params
+)
+}
+\arguments{
+\item{dt_transform}{A data.frame with all model features.
+Must contain \code{ds} column for time variable values and
+\code{dep_var} column for dependent variable values.}
+
+\item{dt_holidays}{data.frame. Raw input holiday data. Load standard
+Prophet holidays using \code{data("dt_prophet_holidays")}}
+
+\item{context_vars, paid_media_spends, intervalType, dayInterval, prophet_country, prophet_vars, prophet_signs, factor_vars}{As included in \code{InputCollect}}
+
+\item{organic_vars}{Character vector. Typically newsletter sendings,
+push-notifications, social media posts etc. Compared to \code{paid_media_vars}
+\code{organic_vars} are often marketing activities without clear spends.}
+
+\item{custom_params}{List. Custom parameters passed to \code{prophet()}}
+}
+\value{
+A list containing all prophet decomposition output.
+}
+\description{
+When \code{prophet_vars} in \code{robyn_inputs()} is specified, this
+function decomposes trend, season, holiday and weekday from the
+dependent variable.
+}
diff --git a/man/robyn_allocator.Rd b/man/robyn_allocator.Rd
index 4a44cb6..df2c9aa 100644
--- a/man/robyn_allocator.Rd
+++ b/man/robyn_allocator.Rd
@@ -1,156 +1,156 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/allocator.R
-\name{robyn_allocator}
-\alias{robyn_allocator}
-\alias{print.robyn_allocator}
-\alias{plot.robyn_allocator}
-\title{Budget Allocator}
-\usage{
-robyn_allocator(
-  robyn_object = NULL,
-  select_build = 0,
-  InputCollect = NULL,
-  OutputCollect = NULL,
-  select_model = NULL,
-  json_file = NULL,
-  scenario = "max_response",
-  total_budget = NULL,
-  target_value = NULL,
-  date_range = "all",
-  channel_constr_low = NULL,
-  channel_constr_up = NULL,
-  channel_constr_multiplier = 3,
-  optim_algo = "SLSQP_AUGLAG",
-  maxeval = 1e+05,
-  constr_mode = "eq",
-  plots = TRUE,
-  plot_folder = NULL,
-  plot_folder_sub = NULL,
-  export = TRUE,
-  quiet = FALSE,
-  ui = FALSE,
-  ...
-)
-
-\method{print}{robyn_allocator}(x, ...)
-
-\method{plot}{robyn_allocator}(x, ...)
-}
-\arguments{
-\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object
-that contains all previous modeling information or the imported list.}
-
-\item{select_build}{Integer. Default to the latest model build. \code{select_build = 0}
-selects the initial model. \code{select_build = 1} selects the first refresh model.}
-
-\item{InputCollect}{List. Contains all input parameters for the model.
-Required when \code{robyn_object} is not provided.}
-
-\item{OutputCollect}{List. Containing all model result.
-Required when \code{robyn_object} is not provided.}
-
-\item{select_model}{Character. A model \code{SolID}. When \code{robyn_object}
-is provided, \code{select_model} defaults to the already selected \code{SolID}. When
-\code{robyn_object} is not provided, \code{select_model} must be provided with
-\code{InputCollect} and \code{OutputCollect}, and must be one of
-\code{OutputCollect$allSolutions}.}
-
-\item{json_file}{Character. JSON file to import previously exported inputs or
-recreate a model. To generate this file, use \code{robyn_write()}.
-If you didn't export your data in the json file as "raw_data",
-\code{dt_input} must be provided; \code{dt_holidays} input is optional.}
-
-\item{scenario}{Character. Accepted options are: \code{"max_response"}, \code{"target_efficiency"}.
-Scenario \code{"max_response"} answers the question:
-"What's the potential revenue/conversions lift with the same (or custom) spend level
-in \code{date_range} and what is the allocation and expected response mix?"
-Scenario \code{"target_efficiency"} optimizes ROAS or CPA and answers the question:
-"What's the potential revenue/conversions lift and spend levels based on a
-\code{target_value} for CPA/ROAS and what is the allocation and expected response mix?"
-Deprecated scenario: \code{"max_response_expected_spend"}.}
-
-\item{total_budget}{Numeric. Total marketing budget for all paid channels for the
-period in \code{date_range}.}
-
-\item{target_value}{Numeric. When using the scenario \code{"target_efficiency"},
-target_value is the desired ROAS or CPA with no upper spend limit. Default is set to 80\% of
-initial ROAS or 120\% of initial CPA, when \code{"target_value = NULL"}.}
-
-\item{date_range}{Character. Date(s) to apply adstocked transformations and pick mean spends
-per channel. Set one of: "all", "last", or "last_n" (where
-n is the last N dates available), date (i.e. "2022-03-27"), or date range
-(i.e. \code{c("2022-01-01", "2022-12-31")}). Default to "all".}
-
-\item{channel_constr_low, channel_constr_up}{Numeric vectors. The lower and upper bounds
-for each paid media variable when maximizing total media response. For example,
-\code{channel_constr_low = 0.7} means minimum spend of the variable is 70% of historical
-average, using non-zero spend values, within \code{date_min} and \code{date_max} date range.
-Both constrains must be length 1 (same for all values) OR same length and order as
-\code{paid_media_spends}. It's not recommended to 'exaggerate' upper bounds, especially
-if the new level is way higher than historical level. Lower bound must be >=0.01,
-and upper bound should be < 5.}
-
-\item{channel_constr_multiplier}{Numeric. Default to 3. For example, if
-\code{channel_constr_low} and \code{channel_constr_up} are 0.8 to 1.2, the range is 0.4.
-The allocator will also show the optimum solution for a larger constraint range of
-0.4 x 3 = 1.2, or 0.4 to 1.6, to show the optimization potential to support allocation
-interpretation and decision.}
-
-\item{optim_algo}{Character. Default to \code{"SLSQP_AUGLAG"}, short for "Sequential Least-Squares
-Quadratic Programming" and "Augmented Lagrangian". Alternatively, "\code{"MMA_AUGLAG"},
-short for "Methods of Moving Asymptotes". More details see the documentation of
-NLopt \href{https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/}{here}.}
-
-\item{maxeval}{Integer. The maximum iteration of the global optimization algorithm.
-Defaults to 100000.}
-
-\item{constr_mode}{Character. Options are \code{"eq"} or \code{"ineq"},
-indicating constraints with equality or inequality.}
-
-\item{plots}{Boolean. Generate plots?}
-
-\item{plot_folder}{Character. Path for saving plots and files. Default
-to \code{robyn_object} and saves plot in the same directory as \code{robyn_object}.}
-
-\item{plot_folder_sub}{Character. Sub path for saving plots. Will overwrite the
-default path with timestamp or, for refresh and allocator, simply overwrite files.}
-
-\item{export}{Boolean. Export outcomes into local files?}
-
-\item{quiet}{Boolean. Keep messages off?}
-
-\item{ui}{Boolean. Save additional outputs for UI usage. List outcome.}
-
-\item{...}{Additional parameters passed to \code{robyn_outputs()}.}
-
-\item{x}{\code{robyn_allocator()} output.}
-}
-\value{
-A list object containing allocator result.
-
-List. Contains optimized allocation results and plots.
-}
-\description{
-\code{robyn_allocator()} function returns a new split of media
-variable spends that maximizes the total media response.
-}
-\examples{
-\dontrun{
-# Having InputCollect and OutputCollect results
-AllocatorCollect <- robyn_allocator(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = "1_2_3",
-  scenario = "max_response",
-  channel_constr_low = 0.7,
-  channel_constr_up = c(1.2, 1.5, 1.5, 1.5, 1.5),
-  channel_constr_multiplier = 4,
-  date_range = "last_26",
-  export = FALSE
-)
-# Print a summary
-print(AllocatorCollect)
-# Plot the allocator one-pager
-plot(AllocatorCollect)
-}
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/allocator.R
+\name{robyn_allocator}
+\alias{robyn_allocator}
+\alias{print.robyn_allocator}
+\alias{plot.robyn_allocator}
+\title{Budget Allocator}
+\usage{
+robyn_allocator(
+  robyn_object = NULL,
+  select_build = 0,
+  InputCollect = NULL,
+  OutputCollect = NULL,
+  select_model = NULL,
+  json_file = NULL,
+  scenario = "max_response",
+  total_budget = NULL,
+  target_value = NULL,
+  date_range = "all",
+  channel_constr_low = NULL,
+  channel_constr_up = NULL,
+  channel_constr_multiplier = 3,
+  optim_algo = "SLSQP_AUGLAG",
+  maxeval = 1e+05,
+  constr_mode = "eq",
+  plots = TRUE,
+  plot_folder = NULL,
+  plot_folder_sub = NULL,
+  export = TRUE,
+  quiet = FALSE,
+  ui = FALSE,
+  ...
+)
+
+\method{print}{robyn_allocator}(x, ...)
+
+\method{plot}{robyn_allocator}(x, ...)
+}
+\arguments{
+\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object
+that contains all previous modeling information or the imported list.}
+
+\item{select_build}{Integer. Default to the latest model build. \code{select_build = 0}
+selects the initial model. \code{select_build = 1} selects the first refresh model.}
+
+\item{InputCollect}{List. Contains all input parameters for the model.
+Required when \code{robyn_object} is not provided.}
+
+\item{OutputCollect}{List. Containing all model results.
+Required when \code{robyn_object} is not provided.}
+
+\item{select_model}{Character. A model \code{SolID}. When \code{robyn_object}
+is provided, \code{select_model} defaults to the already selected \code{SolID}. When
+\code{robyn_object} is not provided, \code{select_model} must be provided with
+\code{InputCollect} and \code{OutputCollect}, and must be one of
+\code{OutputCollect$allSolutions}.}
+
+\item{json_file}{Character. JSON file to import previously exported inputs or
+recreate a model. To generate this file, use \code{robyn_write()}.
+If you didn't export your data in the json file as "raw_data",
+\code{dt_input} must be provided; \code{dt_holidays} input is optional.}
+
+\item{scenario}{Character. Accepted options are: \code{"max_response"}, \code{"target_efficiency"}.
+Scenario \code{"max_response"} answers the question:
+"What's the potential revenue/conversions lift with the same (or custom) spend level
+in \code{date_range} and what is the allocation and expected response mix?"
+Scenario \code{"target_efficiency"} optimizes ROAS or CPA and answers the question: +"What's the potential revenue/conversions lift and spend levels based on a +\code{target_value} for CPA/ROAS and what is the allocation and expected response mix?" +Deprecated scenario: \code{"max_response_expected_spend"}.} + +\item{total_budget}{Numeric. Total marketing budget for all paid channels for the +period in \code{date_range}.} + +\item{target_value}{Numeric. When using the scenario \code{"target_efficiency"}, +target_value is the desired ROAS or CPA with no upper spend limit. Default is set to 80\% of +initial ROAS or 120\% of initial CPA, when \code{"target_value = NULL"}.} + +\item{date_range}{Character. Date(s) to apply adstocked transformations and pick mean spends +per channel. Set one of: "all", "last", or "last_n" (where +n is the last N dates available), date (i.e. "2022-03-27"), or date range +(i.e. \code{c("2022-01-01", "2022-12-31")}). Default to "all".} + +\item{channel_constr_low, channel_constr_up}{Numeric vectors. The lower and upper bounds +for each paid media variable when maximizing total media response. For example, +\code{channel_constr_low = 0.7} means minimum spend of the variable is 70% of historical +average, using non-zero spend values, within \code{date_min} and \code{date_max} date range. +Both constrains must be length 1 (same for all values) OR same length and order as +\code{paid_media_spends}. It's not recommended to 'exaggerate' upper bounds, especially +if the new level is way higher than historical level. Lower bound must be >=0.01, +and upper bound should be < 5.} + +\item{channel_constr_multiplier}{Numeric. Default to 3. For example, if +\code{channel_constr_low} and \code{channel_constr_up} are 0.8 to 1.2, the range is 0.4. +The allocator will also show the optimum solution for a larger constraint range of +0.4 x 3 = 1.2, or 0.4 to 1.6, to show the optimization potential to support allocation +interpretation and decision.} + +\item{optim_algo}{Character. Default to \code{"SLSQP_AUGLAG"}, short for "Sequential Least-Squares +Quadratic Programming" and "Augmented Lagrangian". Alternatively, "\code{"MMA_AUGLAG"}, +short for "Methods of Moving Asymptotes". More details see the documentation of +NLopt \href{https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/}{here}.} + +\item{maxeval}{Integer. The maximum iteration of the global optimization algorithm. +Defaults to 100000.} + +\item{constr_mode}{Character. Options are \code{"eq"} or \code{"ineq"}, +indicating constraints with equality or inequality.} + +\item{plots}{Boolean. Generate plots?} + +\item{plot_folder}{Character. Path for saving plots and files. Default +to \code{robyn_object} and saves plot in the same directory as \code{robyn_object}.} + +\item{plot_folder_sub}{Character. Sub path for saving plots. Will overwrite the +default path with timestamp or, for refresh and allocator, simply overwrite files.} + +\item{export}{Boolean. Export outcomes into local files?} + +\item{quiet}{Boolean. Keep messages off?} + +\item{ui}{Boolean. Save additional outputs for UI usage. List outcome.} + +\item{...}{Additional parameters passed to \code{robyn_outputs()}.} + +\item{x}{\code{robyn_allocator()} output.} +} +\value{ +A list object containing allocator result. + +List. Contains optimized allocation results and plots. +} +\description{ +\code{robyn_allocator()} function returns a new split of media +variable spends that maximizes the total media response. 
+\examples{
+\dontrun{
+# Having InputCollect and OutputCollect results
+AllocatorCollect <- robyn_allocator(
+  InputCollect = InputCollect,
+  OutputCollect = OutputCollect,
+  select_model = "1_2_3",
+  scenario = "max_response",
+  channel_constr_low = 0.7,
+  channel_constr_up = c(1.2, 1.5, 1.5, 1.5, 1.5),
+  channel_constr_multiplier = 4,
+  date_range = "last_26",
+  export = FALSE
+)
+# Print a summary
+print(AllocatorCollect)
+# Plot the allocator one-pager
+plot(AllocatorCollect)
+}
+}
diff --git a/man/robyn_clusters.Rd b/man/robyn_clusters.Rd
index 75eb02d..9580232 100644
--- a/man/robyn_clusters.Rd
+++ b/man/robyn_clusters.Rd
@@ -1,84 +1,84 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/clusters.R
-\name{robyn_clusters}
-\alias{robyn_clusters}
-\title{Clustering to Reduce Number of Models based on ROI and Errors}
-\usage{
-robyn_clusters(
-  input,
-  dep_var_type,
-  cluster_by = "hyperparameters",
-  all_media = NULL,
-  k = "auto",
-  wss_var = 0.06,
-  max_clusters = 10,
-  limit = 1,
-  weights = rep(1, 3),
-  dim_red = "PCA",
-  quiet = FALSE,
-  export = FALSE,
-  seed = 123,
-  ...
-)
-}
-\arguments{
-\item{input}{\code{robyn_export()}'s output or \code{pareto_aggregated.csv} results.}
-
-\item{dep_var_type}{Character. For dep_var_type 'revenue', ROI is used for clustering.
-For conversion', CPA is used for clustering.}
-
-\item{cluster_by}{Character. Any of: "performance" or "hyperparameters".}
-
-\item{all_media}{Character vector. Default to \code{InputCollect$all_media}.
-Includes \code{InputCollect$paid_media_spends} and \code{InputCollect$organic_vars}.}
-
-\item{k}{Integer. Number of clusters}
-
-\item{wss_var}{Numeric. Used to pick automatic \code{k} value,
-when \code{k} is \code{NULL} based on WSS variance while considering
-\code{limit} clusters. Values between (0, 1). Default value could be
-0.05 to consider convergence.}
-
-\item{max_clusters}{Integer. Maximum number of clusters.}
-
-\item{limit}{Integer. Top N results per cluster. If kept in "auto", will select k
-as the cluster in which the WSS variance was less than 5\%.}
-
-\item{weights}{Vector, size 3. How much should each error weight?
-Order: nrmse, decomp.rssd, mape. The highest the value, the closer it will be scaled
-to origin. Each value will be normalized so they all sum 1.}
-
-\item{dim_red}{Character. Select dimensionality reduction technique.
-Pass any of: \code{c("PCA", "tSNE", "all", "none")}.}
-
-\item{quiet}{Boolean. Keep quiet? If not, print messages.}
-
-\item{export}{Export plots into local files?}
-
-\item{seed}{Numeric. Seed for reproducibility}
-
-\item{...}{Additional parameters passed to \code{lares::clusterKmeans()}.}
-}
-\value{
-List. Clustering results as labeled data.frames and plots.
-}
-\description{
-\code{robyn_clusters()} uses output from \code{robyn_run()},
-to reduce the number of models and create bootstrapped confidence
-interval and help the user pick up the best (lowest combined error)
-of the most different kinds (clusters) of models.
-}
-\examples{
-\dontrun{
-# Having InputCollect and OutputCollect results
-cls <- robyn_clusters(
-  input = OutputCollect,
-  all_media = InputCollect$all_media,
-  k = 3, limit = 2,
-  weights = c(1, 1, 1.5)
-)
-}
-}
-\author{
-Bernardo Lares (bernardolares@meta.com)
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/clusters.R
+\name{robyn_clusters}
+\alias{robyn_clusters}
+\title{Clustering to Reduce Number of Models based on ROI and Errors}
+\usage{
+robyn_clusters(
+  input,
+  dep_var_type,
+  cluster_by = "hyperparameters",
+  all_media = NULL,
+  k = "auto",
+  wss_var = 0.06,
+  max_clusters = 10,
+  limit = 1,
+  weights = rep(1, 3),
+  dim_red = "PCA",
+  quiet = FALSE,
+  export = FALSE,
+  seed = 123,
+  ...
+)
+}
+\arguments{
+\item{input}{\code{robyn_export()}'s output or \code{pareto_aggregated.csv} results.}
+
+\item{dep_var_type}{Character. For dep_var_type 'revenue', ROI is used for clustering.
+For 'conversion', CPA is used for clustering.}
+
+\item{cluster_by}{Character. Any of: "performance" or "hyperparameters".}
+
+\item{all_media}{Character vector. Default to \code{InputCollect$all_media}.
+Includes \code{InputCollect$paid_media_spends} and \code{InputCollect$organic_vars}.}
+
+\item{k}{Integer. Number of clusters}
+
+\item{wss_var}{Numeric. Used to pick automatic \code{k} value,
+when \code{k} is \code{NULL} based on WSS variance while considering
+\code{limit} clusters. Values between (0, 1). Default value could be
+0.05 to consider convergence.}
+
+\item{max_clusters}{Integer. Maximum number of clusters.}
+
+\item{limit}{Integer. Top N results per cluster. If kept in "auto", will select k
+as the cluster in which the WSS variance was less than 5\%.}
+
+\item{weights}{Vector, size 3. How much should each error weigh?
+Order: nrmse, decomp.rssd, mape. The higher the value, the closer it will be scaled
+to the origin. Each value will be normalized so they all sum to 1.}
+
+\item{dim_red}{Character. Select dimensionality reduction technique.
+Pass any of: \code{c("PCA", "tSNE", "all", "none")}.}
+
+\item{quiet}{Boolean. Keep quiet? If not, print messages.}
+
+\item{export}{Export plots into local files?}
+
+\item{seed}{Numeric. Seed for reproducibility}
+
+\item{...}{Additional parameters passed to \code{lares::clusterKmeans()}.}
+}
+\value{
+List. Clustering results as labeled data.frames and plots.
+}
+\description{
+\code{robyn_clusters()} uses output from \code{robyn_run()},
+to reduce the number of models and create bootstrapped confidence
+intervals and help the user pick the best (lowest combined error)
+of the most different kinds (clusters) of models.
+}
+\examples{
+\dontrun{
+# Having InputCollect and OutputCollect results
+cls <- robyn_clusters(
+  input = OutputCollect,
+  all_media = InputCollect$all_media,
+  k = 3, limit = 2,
+  weights = c(1, 1, 1.5)
+)
+}
+}
+\author{
+Bernardo Lares (bernardolares@meta.com)
+}
diff --git a/man/robyn_converge.Rd b/man/robyn_converge.Rd
index d6280a8..d8c4636 100644
--- a/man/robyn_converge.Rd
+++ b/man/robyn_converge.Rd
@@ -1,59 +1,59 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/convergence.R
-\name{robyn_converge}
-\alias{robyn_converge}
-\title{Check Models Convergence}
-\usage{
-robyn_converge(
-  OutputModels,
-  n_cuts = 20,
-  sd_qtref = 3,
-  med_lowb = 2,
-  nrmse_win = c(0, 0.998),
-  ...
-)
-}
-\arguments{
-\item{OutputModels}{List. Output from \code{robyn_run()}.}
-
-\item{n_cuts}{Integer. Default to 20 (5\% cuts each).}
-
-\item{sd_qtref}{Integer. Reference quantile of the error convergence rule
-for standard deviation (Criteria #1). Defaults to 3.}
-
-\item{med_lowb}{Integer. Lower bound distance of the error convergence rule
-for median. (Criteria #2). Default to 3.}
-
-\item{nrmse_win}{Numeric vector. Lower and upper quantiles thresholds to
-winsorize NRMSE. Set values within [0,1]; default: c(0, 0.998) which is 1/500.}
-
-\item{...}{Additional parameters}
-}
-\value{
-List. Plots and MOO convergence results.
-}
-\description{
-\code{robyn_converge()} consumes \code{robyn_run()} outputs
-and calculate convergence status and builds convergence plots.
-Convergence is calculated by default using the following criteria
-(having kept the default parameters: sd_qtref = 3 and med_lowb = 2):
-\describe{
-  \item{Criteria #1:}{Last quantile's standard deviation < first 3
-  quantiles' mean standard deviation}
-  \item{Criteria #2:}{Last quantile's absolute median < absolute first
-  quantile's absolute median - 2 * first 3 quantiles' mean standard
-  deviation}
-}
-Both mentioned criteria have to be satisfied to consider MOO convergence.
-}
-\examples{
-\dontrun{
-# Having OutputModels results
-MOO <- robyn_converge(
-  OutputModels,
-  n_cuts = 10,
-  sd_qtref = 3,
-  med_lowb = 3
-)
-}
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/convergence.R
+\name{robyn_converge}
+\alias{robyn_converge}
+\title{Check Models Convergence}
+\usage{
+robyn_converge(
+  OutputModels,
+  n_cuts = 20,
+  sd_qtref = 3,
+  med_lowb = 2,
+  nrmse_win = c(0, 0.998),
+  ...
+)
+}
+\arguments{
+\item{OutputModels}{List. Output from \code{robyn_run()}.}
+
+\item{n_cuts}{Integer. Default to 20 (5\% cuts each).}
+
+\item{sd_qtref}{Integer. Reference quantile of the error convergence rule
+for standard deviation (Criteria #1). Defaults to 3.}
+
+\item{med_lowb}{Integer. Lower bound distance of the error convergence rule
+for median (Criteria #2). Defaults to 2.}
+
+\item{nrmse_win}{Numeric vector. Lower and upper quantile thresholds to
+winsorize NRMSE. Set values within [0,1]; default: c(0, 0.998) which is 1/500.}
+
+\item{...}{Additional parameters}
+}
+\value{
+List. Plots and MOO convergence results.
+}
+\description{
+\code{robyn_converge()} consumes \code{robyn_run()} outputs,
+calculates convergence status and builds convergence plots.
+Convergence is calculated by default using the following criteria
+(having kept the default parameters: sd_qtref = 3 and med_lowb = 2):
+\describe{
+  \item{Criteria #1:}{Last quantile's standard deviation < first 3
+  quantiles' mean standard deviation}
+  \item{Criteria #2:}{Last quantile's absolute median < absolute first
+  quantile's absolute median - 2 * first 3 quantiles' mean standard
+  deviation}
+}
+Both mentioned criteria have to be satisfied to consider MOO convergence.
+}
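A literal reading of the two convergence criteria, sketched on simulated errors (hypothetical data; the package's internal implementation may differ in detail):

    set.seed(1)
    errors <- sort(rnorm(2000, mean = 1, sd = 0.5), decreasing = TRUE) # mock NRMSE path
    n_cuts <- 20
    sd_qtref <- 3
    med_lowb <- 2
    cuts <- split(errors, cut(seq_along(errors), n_cuts, labels = FALSE))

    ref_sd <- mean(sapply(cuts[seq_len(sd_qtref)], sd)) # first 3 quantiles' mean sd
    crit1 <- sd(cuts[[n_cuts]]) < ref_sd
    crit2 <- abs(median(cuts[[n_cuts]])) < abs(median(cuts[[1]])) - med_lowb * ref_sd
    crit1 && crit2 # both must hold to flag MOO convergence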
+} +\examples{ +\dontrun{ +# Having OutputModels results +MOO <- robyn_converge( + OutputModels, + n_cuts = 10, + sd_qtref = 3, + med_lowb = 3 +) +} +} diff --git a/man/robyn_inputs.Rd b/man/robyn_inputs.Rd index 421bc85..399d0ec 100644 --- a/man/robyn_inputs.Rd +++ b/man/robyn_inputs.Rd @@ -1,213 +1,213 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inputs.R -\name{robyn_inputs} -\alias{robyn_inputs} -\alias{print.robyn_inputs} -\title{Input Data Check & Transformation} -\usage{ -robyn_inputs( - dt_input = NULL, - dep_var = NULL, - dep_var_type = NULL, - date_var = "auto", - paid_media_spends = NULL, - paid_media_vars = NULL, - paid_media_signs = NULL, - organic_vars = NULL, - organic_signs = NULL, - context_vars = NULL, - context_signs = NULL, - factor_vars = NULL, - dt_holidays = Robyn::dt_prophet_holidays, - prophet_vars = NULL, - prophet_signs = NULL, - prophet_country = NULL, - adstock = NULL, - hyperparameters = NULL, - window_start = NULL, - window_end = NULL, - calibration_input = NULL, - json_file = NULL, - InputCollect = NULL, - ... -) - -\method{print}{robyn_inputs}(x, ...) -} -\arguments{ -\item{dt_input}{data.frame. Raw input data. Load simulated -dataset using \code{data("dt_simulated_weekly")}} - -\item{dep_var}{Character. Name of dependent variable. Only one allowed} - -\item{dep_var_type}{Character. Type of dependent variable -as "revenue" or "conversion". Will be used to calculate ROI or CPI, -respectively. Only one allowed and case sensitive.} - -\item{date_var}{Character. Name of date variable. Daily, weekly -and monthly data supported. -\code{date_var} must have format "2020-01-01" (YYY-MM-DD). -Default to automatic date detection.} - -\item{paid_media_spends}{Character vector. Names of the paid media variables. -The values on each of these variables must be numeric. Also, -\code{paid_media_spends} must have same order and length as -\code{paid_media_vars} respectively.} - -\item{paid_media_vars}{Character vector. Names of the paid media variables' -exposure level metrics (impressions, clicks, GRP etc) other than spend. -The values on each of these variables must be numeric. These variables are not -being used to train the model but to check relationship and recommend to -split media channels into sub-channels (e.g. fb_retargeting, fb_prospecting, -etc.) to gain more variance. \code{paid_media_vars} must have same -order and length as \code{paid_media_spends} respectively and is not required.} - -\item{paid_media_signs}{Character vector. Choose any of -\code{c("default", "positive", "negative")}. Control -the signs of coefficients for \code{paid_media_vars}. Must have same -order and same length as \code{paid_media_vars}. By default, all values are -set to 'positive'.} - -\item{organic_vars}{Character vector. Typically newsletter sendings, -push-notifications, social media posts etc. Compared to \code{paid_media_vars} -\code{organic_vars} are often marketing activities without clear spends.} - -\item{organic_signs}{Character vector. Choose any of -"default", "positive", "negative". Control -the signs of coefficients for \code{organic_vars} Must have same -order and same length as \code{organic_vars}. By default, all values are -set to "positive".} - -\item{context_vars}{Character vector. Typically competitors, -price & promotion, temperature, unemployment rate, etc.} - -\item{context_signs}{Character vector. Choose any of -\code{c("default", "positive", "negative")}. Control -the signs of coefficients for context_vars. 
Must have same -order and same length as \code{context_vars}. By default it's -set to 'defualt'.} - -\item{factor_vars}{Character vector. Specify which of the provided -variables in organic_vars or context_vars should be forced as a factor.} - -\item{dt_holidays}{data.frame. Raw input holiday data. Load standard -Prophet holidays using \code{data("dt_prophet_holidays")}} - -\item{prophet_vars}{Character vector. Include any of "trend", -"season", "weekday", "monthly", "holiday" or NULL. Highly recommended -to use all for daily data and "trend", "season", "holiday" for -weekly and above cadence. Set to NULL to skip prophet's functionality.} - -\item{prophet_signs}{Character vector. Choose any of -"default", "positive", "negative". Control -the signs of coefficients for \code{prophet_vars}. Must have same -order and same length as \code{prophet_vars}. By default, all values are -set to "default".} - -\item{prophet_country}{Character. Only one country allowed. -Includes national holidays for all countries, whose list can -be found loading \code{data("dt_prophet_holidays")}.} - -\item{adstock}{Character. Choose any of "geometric", "weibull_cdf", -"weibull_pdf". Weibull adstock is a two-parametric function and thus more -flexible, but takes longer time than the traditional geometric one-parametric -function. CDF, or cumulative density function of the Weibull function allows -changing decay rate over time in both C and S shape, while the peak value will -always stay at the first period, meaning no lagged effect. PDF, or the -probability density function, enables peak value occurring after the first -period when shape >=1, allowing lagged effect. Run \code{plot_adstock()} to -see the difference visually. Time estimation: with geometric adstock, 2000 -iterations * 5 trials on 8 cores, it takes less than 30 minutes. Both Weibull -options take up to twice as much time.} - -\item{hyperparameters}{List. Contains hyperparameter lower and upper bounds. -Names of elements in list must be identical to output of \code{hyper_names()}. -To fix hyperparameter values, provide only one value.} - -\item{window_start, window_end}{Character. Set start and end dates of modelling -period. Recommended to not start in the first date in dataset to gain adstock -effect from previous periods. Also, columns to rows ratio in the input data -to be >=10:1, or in other words at least 10 observations to 1 independent variable. -This window will determine the date range of the data period within your dataset -you will be using to specifically regress the effects of media, organic and -context variables on your dependent variable. We recommend using a full -\code{dt_input} dataset with a minimum of 1 year of history, as it will be used -in full for the model calculation of trend, seasonality and holidays effects. -Whereas the window period will determine how much of the full data set will be -used for media, organic and context variables.} - -\item{calibration_input}{data.frame. Optional. Provide experimental results to -calibrate. Your input should include the following values for each experiment: -channel, liftStartDate, liftEndDate, liftAbs, spend, confidence, metric. -You can calibrate any spend or organic variable with a well designed experiment. -You can also use experimental results from multiple channels; to do so, -provide concatenated channel value, i.e. "channel_A+channel_B". -Check "Guide for calibration source" section.} - -\item{json_file}{Character. 
JSON file to import previously exported inputs or -recreate a model. To generate this file, use \code{robyn_write()}. -If you didn't export your data in the json file as "raw_data", -\code{dt_input} must be provided; \code{dt_holidays} input is optional.} - -\item{InputCollect}{Default to NULL. \code{robyn_inputs}'s output when -\code{hyperparameters} are not yet set.} - -\item{...}{Additional parameters passed to \code{prophet} functions.} - -\item{x}{\code{robyn_inputs()} output.} -} -\value{ -List. Contains all input parameters and modified results -using \code{Robyn:::robyn_engineering()}. This list is ready to be -used on other functions like \code{robyn_run()} and \code{print()}. -Class: \code{robyn_inputs}. -} -\description{ -\code{robyn_inputs()} is the function to input all model parameters and -check input correctness for the initial model build. It includes the -engineering process results that conducts trend, season, -holiday & weekday decomposition using Facebook's time-series forecasting -library \code{prophet} and fit a nonlinear model to spend and exposure -metrics in case exposure metrics are used in \code{paid_media_vars}. -} -\section{Guide for calibration source}{ - - \enumerate{ - \item We strongly recommend to use experimental and causal results - that are considered ground truth to calibrate MMM. Usual experiment - types are people-based (e.g. Facebook conversion lift) and - geo-based (e.g. Facebook GeoLift). - \item Currently, Robyn only accepts point-estimate as calibration - input. For example, if 10k$ spend is tested against a hold-out - for channel A, then input the incremental return as point-estimate - as the example below. - \item The point-estimate has to always match the spend in the variable. - For example, if channel A usually has 100k$ weekly spend and the - experimental HO is 70%, input the point-estimate for the 30k$, not the 70k$. -} -} - -\examples{ -# Using dummy simulated data -InputCollect <- robyn_inputs( - dt_input = Robyn::dt_simulated_weekly, - dt_holidays = Robyn::dt_prophet_holidays, - date_var = "DATE", - dep_var = "revenue", - dep_var_type = "revenue", - prophet_vars = c("trend", "season", "holiday"), - prophet_country = "DE", - context_vars = c("competitor_sales_B", "events"), - paid_media_spends = c("tv_S", "ooh_S", "print_S", "facebook_S", "search_S"), - paid_media_vars = c("tv_S", "ooh_S", "print_S", "facebook_I", "search_clicks_P"), - organic_vars = "newsletter", - factor_vars = "events", - window_start = "2016-11-23", - window_end = "2018-08-22", - adstock = "geometric", - # To be defined separately - hyperparameters = NULL, - calibration_input = NULL -) -print(InputCollect) -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inputs.R +\name{robyn_inputs} +\alias{robyn_inputs} +\alias{print.robyn_inputs} +\title{Input Data Check & Transformation} +\usage{ +robyn_inputs( + dt_input = NULL, + dep_var = NULL, + dep_var_type = NULL, + date_var = "auto", + paid_media_spends = NULL, + paid_media_vars = NULL, + paid_media_signs = NULL, + organic_vars = NULL, + organic_signs = NULL, + context_vars = NULL, + context_signs = NULL, + factor_vars = NULL, + dt_holidays = Robyn::dt_prophet_holidays, + prophet_vars = NULL, + prophet_signs = NULL, + prophet_country = NULL, + adstock = NULL, + hyperparameters = NULL, + window_start = NULL, + window_end = NULL, + calibration_input = NULL, + json_file = NULL, + InputCollect = NULL, + ... +) + +\method{print}{robyn_inputs}(x, ...) 
+}
+\arguments{
+\item{dt_input}{data.frame. Raw input data. Load simulated
+dataset using \code{data("dt_simulated_weekly")}.}
+
+\item{dep_var}{Character. Name of dependent variable. Only one allowed.}
+
+\item{dep_var_type}{Character. Type of dependent variable
+as "revenue" or "conversion". Will be used to calculate ROI or CPI,
+respectively. Only one allowed and case sensitive.}
+
+\item{date_var}{Character. Name of date variable. Daily, weekly
+and monthly data supported.
+\code{date_var} must have format "2020-01-01" (YYYY-MM-DD).
+Default to automatic date detection.}
+
+\item{paid_media_spends}{Character vector. Names of the paid media variables.
+The values on each of these variables must be numeric. Also,
+\code{paid_media_spends} must have same order and length as
+\code{paid_media_vars} respectively.}
+
+\item{paid_media_vars}{Character vector. Names of the paid media variables'
+exposure level metrics (impressions, clicks, GRP, etc.) other than spend.
+The values on each of these variables must be numeric. These variables are not
+used to train the model but to check relationships and recommend splitting
+media channels into sub-channels (e.g. fb_retargeting, fb_prospecting,
+etc.) to gain more variance. \code{paid_media_vars} must have same
+order and length as \code{paid_media_spends} respectively and is not required.}
+
+\item{paid_media_signs}{Character vector. Choose any of
+\code{c("default", "positive", "negative")}. Control
+the signs of coefficients for \code{paid_media_vars}. Must have same
+order and same length as \code{paid_media_vars}. By default, all values are
+set to 'positive'.}
+
+\item{organic_vars}{Character vector. Typically newsletter sendings,
+push-notifications, social media posts etc. Compared to \code{paid_media_vars},
+\code{organic_vars} are often marketing activities without clear spends.}
+
+\item{organic_signs}{Character vector. Choose any of
+"default", "positive", "negative". Control
+the signs of coefficients for \code{organic_vars}. Must have same
+order and same length as \code{organic_vars}. By default, all values are
+set to "positive".}
+
+\item{context_vars}{Character vector. Typically competitors,
+price & promotion, temperature, unemployment rate, etc.}
+
+\item{context_signs}{Character vector. Choose any of
+\code{c("default", "positive", "negative")}. Control
+the signs of coefficients for context_vars. Must have same
+order and same length as \code{context_vars}. By default it's
+set to 'default'.}
+
+\item{factor_vars}{Character vector. Specify which of the provided
+variables in organic_vars or context_vars should be forced as a factor.}
+
+\item{dt_holidays}{data.frame. Raw input holiday data. Load standard
+Prophet holidays using \code{data("dt_prophet_holidays")}.}
+
+\item{prophet_vars}{Character vector. Include any of "trend",
+"season", "weekday", "monthly", "holiday" or NULL. Highly recommended
+to use all for daily data and "trend", "season", "holiday" for
+weekly and above cadence. Set to NULL to skip prophet's functionality.}
+
+\item{prophet_signs}{Character vector. Choose any of
+"default", "positive", "negative". Control
+the signs of coefficients for \code{prophet_vars}. Must have same
+order and same length as \code{prophet_vars}. By default, all values are
+set to "default".}
+
+\item{prophet_country}{Character. Only one country allowed.
+Includes national holidays for all countries, whose list can
+be found loading \code{data("dt_prophet_holidays")}.}
+
+\item{adstock}{Character. 
Choose any of "geometric", "weibull_cdf",
+"weibull_pdf". Weibull adstock is a two-parametric function and thus more
+flexible, but takes longer than the traditional geometric one-parametric
+function. CDF, or cumulative distribution function of the Weibull function,
+allows changing decay rate over time in both C and S shape, while the peak
+value will always stay at the first period, meaning no lagged effect. PDF, or
+the probability density function, enables the peak value to occur after the
+first period when shape >= 1, allowing a lagged effect. Run \code{plot_adstock()}
+to see the difference visually. Time estimation: with geometric adstock, 2000
+iterations * 5 trials on 8 cores takes less than 30 minutes. Both Weibull
+options take up to twice as much time.}
+
+\item{hyperparameters}{List. Contains hyperparameter lower and upper bounds.
+Names of elements in list must be identical to output of \code{hyper_names()}.
+To fix hyperparameter values, provide only one value.}
+
+\item{window_start, window_end}{Character. Set start and end dates of modelling
+period. Recommended to not start at the first date of the dataset, to gain
+adstock effect from previous periods. Also, the rows to columns ratio in the
+input data should be >= 10:1, or in other words at least 10 observations per
+independent variable.
+This window will determine the date range of the data period within your dataset
+you will be using to specifically regress the effects of media, organic and
+context variables on your dependent variable. We recommend using a full
+\code{dt_input} dataset with a minimum of 1 year of history, as it will be used
+in full for the model calculation of trend, seasonality and holidays effects.
+Whereas the window period will determine how much of the full data set will be
+used for media, organic and context variables.}
+
+\item{calibration_input}{data.frame. Optional. Provide experimental results to
+calibrate. Your input should include the following values for each experiment:
+channel, liftStartDate, liftEndDate, liftAbs, spend, confidence, metric.
+You can calibrate any spend or organic variable with a well designed experiment.
+You can also use experimental results from multiple channels; to do so,
+provide concatenated channel value, i.e. "channel_A+channel_B".
+Check "Guide for calibration source" section.}
+
+\item{json_file}{Character. JSON file to import previously exported inputs or
+recreate a model. To generate this file, use \code{robyn_write()}.
+If you didn't export your data in the json file as "raw_data",
+\code{dt_input} must be provided; \code{dt_holidays} input is optional.}
+
+\item{InputCollect}{Default to NULL. \code{robyn_inputs}'s output when
+\code{hyperparameters} are not yet set.}
+
+\item{...}{Additional parameters passed to \code{prophet} functions.}
+
+\item{x}{\code{robyn_inputs()} output.}
+}
+\value{
+List. Contains all input parameters and modified results
+using \code{Robyn:::robyn_engineering()}. This list is ready to be
+used on other functions like \code{robyn_run()} and \code{print()}.
+Class: \code{robyn_inputs}.
+}
+\description{
+\code{robyn_inputs()} is the function to input all model parameters and
+check input correctness for the initial model build. It includes the
+feature engineering process that conducts trend, season,
+holiday & weekday decomposition using Facebook's time-series forecasting
+library \code{prophet}, and fits a nonlinear model to spend and exposure
+metrics when exposure metrics are used in \code{paid_media_vars}. 
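For orientation, the two-step flow implied by the hyperparameters and
InputCollect arguments above usually looks like the following sketch; the
bounds are illustrative only, and the exact names required for your media set
come from hyper_names():

# First call robyn_inputs() without hyperparameters, then:
hyper_names(adstock = InputCollect$adstock, all_media = InputCollect$all_media)
hyperparameters <- list( # illustrative bounds, one set per media variable
  facebook_S_alphas = c(0.5, 3),
  facebook_S_gammas = c(0.3, 1),
  facebook_S_thetas = c(0, 0.3), # thetas apply to geometric adstock
  train_size = c(0.5, 0.8) # optional, used with ts_validation
)
InputCollect <- robyn_inputs(
  InputCollect = InputCollect,
  hyperparameters = hyperparameters
)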
+}
+\section{Guide for calibration source}{
+
+  \enumerate{
+  \item We strongly recommend using experimental and causal results
+  that are considered ground truth to calibrate MMM. Usual experiment
+  types are people-based (e.g. Facebook conversion lift) and
+  geo-based (e.g. Facebook GeoLift).
+  \item Currently, Robyn only accepts point-estimates as calibration
+  input. For example, if 10k$ spend is tested against a hold-out
+  for channel A, then input the incremental return as a point-estimate,
+  as in the example below.
+  \item The point-estimate must always match the spend in the variable.
+  For example, if channel A usually has 100k$ weekly spend and the
+  experimental holdout is 70\%, input the point-estimate for the 30k$,
+  not the 70k$.
+}
+}
+
+\examples{
+# Using dummy simulated data
+InputCollect <- robyn_inputs(
+  dt_input = Robyn::dt_simulated_weekly,
+  dt_holidays = Robyn::dt_prophet_holidays,
+  date_var = "DATE",
+  dep_var = "revenue",
+  dep_var_type = "revenue",
+  prophet_vars = c("trend", "season", "holiday"),
+  prophet_country = "DE",
+  context_vars = c("competitor_sales_B", "events"),
+  paid_media_spends = c("tv_S", "ooh_S", "print_S", "facebook_S", "search_S"),
+  paid_media_vars = c("tv_S", "ooh_S", "print_S", "facebook_I", "search_clicks_P"),
+  organic_vars = "newsletter",
+  factor_vars = "events",
+  window_start = "2016-11-23",
+  window_end = "2018-08-22",
+  adstock = "geometric",
+  # To be defined separately
+  hyperparameters = NULL,
+  calibration_input = NULL
+)
+print(InputCollect)
+}
diff --git a/man/robyn_mmm.Rd b/man/robyn_mmm.Rd
index be3083b..381946d 100644
--- a/man/robyn_mmm.Rd
+++ b/man/robyn_mmm.Rd
@@ -1,101 +1,101 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/model.R
-\name{robyn_mmm}
-\alias{robyn_mmm}
-\title{Core MMM Function}
-\usage{
-robyn_mmm(
-  InputCollect,
-  hyper_collect,
-  iterations,
-  cores,
-  nevergrad_algo,
-  intercept = TRUE,
-  intercept_sign,
-  ts_validation = TRUE,
-  add_penalty_factor = FALSE,
-  objective_weights = NULL,
-  dt_hyper_fixed = NULL,
-  rssd_zero_penalty = TRUE,
-  refresh = FALSE,
-  trial = 1L,
-  seed = 123L,
-  quiet = FALSE,
-  ...
-)
-}
-\arguments{
-\item{InputCollect}{List. Contains all input parameters for the model.
-Required when \code{robyn_object} is not provided.}
-
-\item{hyper_collect}{List. Containing hyperparameter bounds. Defaults to
-\code{InputCollect$hyperparameters}.}
-
-\item{iterations}{Integer. Number of iterations to run.}
-
-\item{cores}{Integer. Default to \code{parallel::detectCores() - 1} (all cores
-except one). Set to 1 if you want to turn parallel computing off.}
-
-\item{nevergrad_algo}{Character. Default to "TwoPointsDE". Options are
-\code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne",
-"DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA",
-"cGA", "RandomSearch")}.}
-
-\item{intercept}{Boolean. Should intercept(s) be fitted (default=TRUE) or
-set to zero (FALSE).}
-
-\item{intercept_sign}{Character. Choose one of "non_negative" (default) or
-"unconstrained". By default, if intercept is negative, Robyn will drop intercept
-and refit the model. Consider changing intercept_sign to "unconstrained" when
-there are \code{context_vars} with large positive values.}
-
-\item{ts_validation}{Boolean. When set to \code{TRUE}, Robyn will split data
-by test, train, and validation partitions to validate the time series. 
By -default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be -customized or set to a fixed value using the hyperparameters input. For example, -if \code{train_size = 0.7}, validation size and test size will both be 0.15 -and 0.15. When \code{ts_validation = FALSE}, nrmse_train is the -objective function; when \code{ts_validation = TRUE}, nrmse_val is the objective -function.} - -\item{add_penalty_factor}{Boolean. Add penalty factor hyperparameters to -glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because -this feature might add too much hyperparameter space and probably requires -more iterations to converge.} - -\item{objective_weights}{Numeric vector. Default to NULL to give equal weights -to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration -data is provided). When you are not calibrating, only the first 2 values for -\code{objective_weights} must be defined, i.e. set c(2, 1) to give double weight -to the 1st (NRMSE). This is an experimental feature. There's no research on -optimal weight setting. Subjective weights might strongly bias modeling results.} - -\item{dt_hyper_fixed}{data.frame or named list. Only provide when loading -old model results. It consumes hyperparameters from saved csv -\code{pareto_hyperparameters.csv} or JSON file to replicate a model.} - -\item{rssd_zero_penalty}{Boolean. When TRUE, the objective function -DECOMP.RSSD will penalize models with more 0 media effects additionally. -In other words, given the same DECOMP.RSSD score, a model with 50\% 0-coef -variables will get penalized by DECOMP.RSSD * 1.5 (larger error), while -another model with no 0-coef variables gets un-penalized with DECOMP.RSSD * 1.} - -\item{refresh}{Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}.} - -\item{trial}{Integer. Which trial are we running? Used to ID each model.} - -\item{seed}{Integer. For reproducible results when running nevergrad.} - -\item{quiet}{Boolean. Keep messages off?} - -\item{...}{Additional parameters passed to \code{robyn_outputs()}.} -} -\value{ -List. MMM results with hyperparameters values. -} -\description{ -\code{robyn_mmm()} function activates Nevergrad to generate samples of -hyperparameters, conducts media transformation within each loop, fits the -Ridge regression, calibrates the model optionally, decomposes responses -and collects the result. It's an inner function within \code{robyn_run()}. -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model.R +\name{robyn_mmm} +\alias{robyn_mmm} +\title{Core MMM Function} +\usage{ +robyn_mmm( + InputCollect, + hyper_collect, + iterations, + cores, + nevergrad_algo, + intercept = TRUE, + intercept_sign, + ts_validation = TRUE, + add_penalty_factor = FALSE, + objective_weights = NULL, + dt_hyper_fixed = NULL, + rssd_zero_penalty = TRUE, + refresh = FALSE, + trial = 1L, + seed = 123L, + quiet = FALSE, + ... +) +} +\arguments{ +\item{InputCollect}{List. Contains all input parameters for the model. +Required when \code{robyn_object} is not provided.} + +\item{hyper_collect}{List. Containing hyperparameter bounds. Defaults to +\code{InputCollect$hyperparameters}.} + +\item{iterations}{Integer. Number of iterations to run.} + +\item{cores}{Integer. Default to \code{parallel::detectCores() - 1} (all cores +except one). Set to 1 if you want to turn parallel computing off.} + +\item{nevergrad_algo}{Character. Default to "TwoPointsDE". 
Options are
+\code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne",
+"DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA",
+"cGA", "RandomSearch")}.}
+
+\item{intercept}{Boolean. Should intercept(s) be fitted (default=TRUE) or
+set to zero (FALSE).}
+
+\item{intercept_sign}{Character. Choose one of "non_negative" (default) or
+"unconstrained". By default, if intercept is negative, Robyn will drop intercept
+and refit the model. Consider changing intercept_sign to "unconstrained" when
+there are \code{context_vars} with large positive values.}
+
+\item{ts_validation}{Boolean. When set to \code{TRUE}, Robyn will split data
+by test, train, and validation partitions to validate the time series. By
+default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be
+customized or set to a fixed value using the hyperparameters input. For example,
+if \code{train_size = 0.7}, validation size and test size will be 0.15 each.
+When \code{ts_validation = FALSE}, nrmse_train is the
+objective function; when \code{ts_validation = TRUE}, nrmse_val is the objective
+function.}
+
+\item{add_penalty_factor}{Boolean. Add penalty factor hyperparameters to
+glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because
+this feature might add too much hyperparameter space and probably requires
+more iterations to converge.}
+
+\item{objective_weights}{Numeric vector. Default to NULL to give equal weights
+to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration
+data is provided). When you are not calibrating, only the first 2 values for
+\code{objective_weights} must be defined, i.e. set c(2, 1) to give double weight
+to the 1st (NRMSE). This is an experimental feature. There's no research on
+optimal weight setting. Subjective weights might strongly bias modeling results.}
+
+\item{dt_hyper_fixed}{data.frame or named list. Only provide when loading
+old model results. It consumes hyperparameters from saved csv
+\code{pareto_hyperparameters.csv} or JSON file to replicate a model.}
+
+\item{rssd_zero_penalty}{Boolean. When TRUE, the objective function
+DECOMP.RSSD will additionally penalize models with more zero media effects.
+In other words, given the same DECOMP.RSSD score, a model with 50\% 0-coef
+variables will get penalized by DECOMP.RSSD * 1.5 (larger error), while
+another model with no 0-coef variables remains unpenalized (DECOMP.RSSD * 1).}
+
+\item{refresh}{Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}.}
+
+\item{trial}{Integer. Which trial are we running? Used to ID each model.}
+
+\item{seed}{Integer. For reproducible results when running nevergrad.}
+
+\item{quiet}{Boolean. Keep messages off?}
+
+\item{...}{Additional parameters passed to \code{robyn_outputs()}.}
+}
+\value{
+List. MMM results with hyperparameter values.
+}
+\description{
+The \code{robyn_mmm()} function activates Nevergrad to generate samples of
+hyperparameters, conducts media transformation within each loop, fits the
+Ridge regression, optionally calibrates the model, decomposes responses,
+and collects the result. It's an inner function within \code{robyn_run()}. 
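In practice these arguments are not called directly; they are reached through
robyn_run(), which forwards them to robyn_mmm(). A sketch of a typical call
giving NRMSE double weight while validating on a time-series split (values
are illustrative):

OutputModels <- robyn_run(
  InputCollect = InputCollect,
  iterations = 2000, # recommended for the default TwoPointsDE
  trials = 5,
  ts_validation = TRUE, # nrmse_val becomes the objective
  objective_weights = c(2, 1) # NRMSE, DECOMP.RSSD; add a 3rd value when calibrating
)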
+} diff --git a/man/robyn_outputs.Rd b/man/robyn_outputs.Rd index 91fca92..652854d 100644 --- a/man/robyn_outputs.Rd +++ b/man/robyn_outputs.Rd @@ -1,152 +1,152 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/outputs.R, R/plots.R -\name{robyn_outputs} -\alias{robyn_outputs} -\alias{print.robyn_outputs} -\alias{robyn_csv} -\alias{robyn_plots} -\alias{robyn_onepagers} -\alias{ts_validation} -\alias{decomp_plot} -\title{Evaluate Models and Output Results into Local Files} -\usage{ -robyn_outputs( - InputCollect, - OutputModels, - pareto_fronts = "auto", - calibration_constraint = 0.1, - plot_folder = NULL, - plot_folder_sub = NULL, - plot_pareto = TRUE, - csv_out = "pareto", - clusters = TRUE, - select_model = "clusters", - ui = FALSE, - export = TRUE, - all_sol_json = FALSE, - quiet = FALSE, - refresh = FALSE, - ... -) - -\method{print}{robyn_outputs}(x, ...) - -robyn_csv( - InputCollect, - OutputCollect, - csv_out = NULL, - export = TRUE, - calibrated = FALSE -) - -robyn_plots( - InputCollect, - OutputCollect, - export = TRUE, - plot_folder = OutputCollect$plot_folder, - ... -) - -robyn_onepagers( - InputCollect, - OutputCollect, - select_model = NULL, - quiet = FALSE, - export = TRUE, - plot_folder = OutputCollect$plot_folder, - baseline_level = 0, - ... -) - -ts_validation(OutputModels, quiet = FALSE, ...) - -decomp_plot( - InputCollect, - OutputCollect, - solID = NULL, - exclude = NULL, - baseline_level = 0 -) -} -\arguments{ -\item{InputCollect, OutputModels}{\code{robyn_inputs()} and \code{robyn_run()} -outcomes.} - -\item{pareto_fronts}{Integer. Number of Pareto fronts for the output. -\code{pareto_fronts = 1} returns the best models trading off \code{NRMSE} & -\code{DECOMP.RSSD}. Increase \code{pareto_fronts} to get more model choices. -\code{pareto_fronts = "auto"} selects the min fronts that include at least 100 -candidates. To customize this threshold, set value with \code{min_candidates}.} - -\item{calibration_constraint}{Numeric. Default to 0.1 and allows 0.01-0.1. When -calibrating, 0.1 means top 10% calibrated models are used for pareto-optimal -selection. Lower \code{calibration_constraint} increases calibration accuracy.} - -\item{plot_folder}{Character. Path for saving plots and files. Default -to \code{robyn_object} and saves plot in the same directory as \code{robyn_object}.} - -\item{plot_folder_sub}{Character. Sub path for saving plots. Will overwrite the -default path with timestamp or, for refresh and allocator, simply overwrite files.} - -\item{plot_pareto}{Boolean. Set to \code{FALSE} to deactivate plotting -and saving model one-pagers. Used when testing models.} - -\item{csv_out}{Character. Accepts "pareto" or "all". Default to "pareto". Set -to "all" will output all iterations as csv. Set NULL to skip exports into CSVs.} - -\item{clusters}{Boolean. Apply \code{robyn_clusters()} to output models?} - -\item{select_model}{Character vector. Which models (by \code{solID}) do you -wish to plot the one-pagers and export? Default will take top -\code{robyn_clusters()} results.} - -\item{ui}{Boolean. Save additional outputs for UI usage. List outcome.} - -\item{export}{Boolean. Export outcomes into local files?} - -\item{all_sol_json}{Logical. Add all pareto solutions to json export?} - -\item{quiet}{Boolean. Keep messages off?} - -\item{refresh}{Boolean. 
Refresh mode} - -\item{...}{Additional parameters passed to \code{robyn_clusters()}} - -\item{x}{\code{robyn_outputs()} output.} - -\item{OutputCollect}{\code{robyn_run(..., export = FALSE)} output.} - -\item{calibrated}{Logical} - -\item{baseline_level}{Integer, from 0 to 5. Aggregate baseline variables, -depending on the level of aggregation you need. Default is 0 for no -aggregation. 1 for Intercept only. 2 adding trend. 3 adding all prophet -decomposition variables. 4. Adding contextual variables. 5 Adding organic -variables. Results will be reflected on the waterfall chart.} - -\item{solID}{Character vector. Model IDs to plot.} - -\item{exclude}{Character vector. Manually exclude variables from plot.} -} -\value{ -(Invisible) list. Class: \code{robyn_outputs}. Contains processed -results based on \code{robyn_run()} results. - -Invisible \code{NULL}. - -Invisible list with \code{ggplot} plots. - -Invisible list with \code{patchwork} plot(s). - -Invisible list with \code{ggplot} plots. -} -\description{ -Pack \code{robyn_plots()}, \code{robyn_csv()}, and \code{robyn_clusters()} -outcomes on \code{robyn_run()} results. When \code{UI=TRUE}, enriched -\code{OutputModels} results with additional plots and objects. - -Create a plot to visualize the convergence for each of the datasets -when running \code{robyn_run()}, especially useful for when using ts_validation. -As a reference, the closer the test and validation convergence points are, -the better, given the time-series wasn't overfitted. -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/outputs.R, R/plots.R +\name{robyn_outputs} +\alias{robyn_outputs} +\alias{print.robyn_outputs} +\alias{robyn_csv} +\alias{robyn_plots} +\alias{robyn_onepagers} +\alias{ts_validation} +\alias{decomp_plot} +\title{Evaluate Models and Output Results into Local Files} +\usage{ +robyn_outputs( + InputCollect, + OutputModels, + pareto_fronts = "auto", + calibration_constraint = 0.1, + plot_folder = NULL, + plot_folder_sub = NULL, + plot_pareto = TRUE, + csv_out = "pareto", + clusters = TRUE, + select_model = "clusters", + ui = FALSE, + export = TRUE, + all_sol_json = FALSE, + quiet = FALSE, + refresh = FALSE, + ... +) + +\method{print}{robyn_outputs}(x, ...) + +robyn_csv( + InputCollect, + OutputCollect, + csv_out = NULL, + export = TRUE, + calibrated = FALSE +) + +robyn_plots( + InputCollect, + OutputCollect, + export = TRUE, + plot_folder = OutputCollect$plot_folder, + ... +) + +robyn_onepagers( + InputCollect, + OutputCollect, + select_model = NULL, + quiet = FALSE, + export = TRUE, + plot_folder = OutputCollect$plot_folder, + baseline_level = 0, + ... +) + +ts_validation(OutputModels, quiet = FALSE, ...) + +decomp_plot( + InputCollect, + OutputCollect, + solID = NULL, + exclude = NULL, + baseline_level = 0 +) +} +\arguments{ +\item{InputCollect, OutputModels}{\code{robyn_inputs()} and \code{robyn_run()} +outcomes.} + +\item{pareto_fronts}{Integer. Number of Pareto fronts for the output. +\code{pareto_fronts = 1} returns the best models trading off \code{NRMSE} & +\code{DECOMP.RSSD}. Increase \code{pareto_fronts} to get more model choices. +\code{pareto_fronts = "auto"} selects the min fronts that include at least 100 +candidates. To customize this threshold, set value with \code{min_candidates}.} + +\item{calibration_constraint}{Numeric. Default to 0.1 and allows 0.01-0.1. When +calibrating, 0.1 means top 10% calibrated models are used for pareto-optimal +selection. 
Lower \code{calibration_constraint} increases calibration accuracy.}
+
+\item{plot_folder}{Character. Path for saving plots and files. Default
+to \code{robyn_object} and saves plot in the same directory as \code{robyn_object}.}
+
+\item{plot_folder_sub}{Character. Sub path for saving plots. Will overwrite the
+default path with timestamp or, for refresh and allocator, simply overwrite files.}
+
+\item{plot_pareto}{Boolean. Set to \code{FALSE} to deactivate plotting
+and saving model one-pagers. Used when testing models.}
+
+\item{csv_out}{Character. Accepts "pareto" or "all". Default to "pareto". Setting it
+to "all" will output all iterations as csv. Set NULL to skip exports into CSVs.}
+
+\item{clusters}{Boolean. Apply \code{robyn_clusters()} to output models?}
+
+\item{select_model}{Character vector. Which models (by \code{solID}) do you
+wish to plot the one-pagers and export? Default will take top
+\code{robyn_clusters()} results.}
+
+\item{ui}{Boolean. Save additional outputs for UI usage. List outcome.}
+
+\item{export}{Boolean. Export outcomes into local files?}
+
+\item{all_sol_json}{Logical. Add all pareto solutions to json export?}
+
+\item{quiet}{Boolean. Keep messages off?}
+
+\item{refresh}{Boolean. Refresh mode.}
+
+\item{...}{Additional parameters passed to \code{robyn_clusters()}}
+
+\item{x}{\code{robyn_outputs()} output.}
+
+\item{OutputCollect}{\code{robyn_run(..., export = FALSE)} output.}
+
+\item{calibrated}{Logical}
+
+\item{baseline_level}{Integer, from 0 to 5. Aggregate baseline variables,
+depending on the level of aggregation you need. Default is 0 for no
+aggregation; 1 adds the intercept only; 2 adds trend; 3 adds all prophet
+decomposition variables; 4 adds contextual variables; 5 adds organic
+variables. Results will be reflected on the waterfall chart.}
+
+\item{solID}{Character vector. Model IDs to plot.}
+
+\item{exclude}{Character vector. Manually exclude variables from plot.}
+}
+\value{
+(Invisible) list. Class: \code{robyn_outputs}. Contains processed
+results based on \code{robyn_run()} results.
+
+Invisible \code{NULL}.
+
+Invisible list with \code{ggplot} plots.
+
+Invisible list with \code{patchwork} plot(s).
+
+Invisible list with \code{ggplot} plots.
+}
+\description{
+Pack \code{robyn_plots()}, \code{robyn_csv()}, and \code{robyn_clusters()}
+outcomes on \code{robyn_run()} results. When \code{ui = TRUE}, enriches
+\code{OutputModels} results with additional plots and objects.
+
+Create a plot to visualize the convergence for each of the datasets
+when running \code{robyn_run()}, especially useful when using ts_validation.
+As a reference, the closer the test and validation convergence points are,
+the better, as it indicates the time series wasn't overfitted.
+}
diff --git a/man/robyn_refresh.Rd b/man/robyn_refresh.Rd
index 042f831..2ab75f0 100644
--- a/man/robyn_refresh.Rd
+++ b/man/robyn_refresh.Rd
@@ -1,167 +1,167 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/refresh.R
-\name{robyn_refresh}
-\alias{robyn_refresh}
-\alias{print.robyn_refresh}
-\alias{plot.robyn_refresh}
-\title{Build Refresh Model}
-\usage{
-robyn_refresh(
-  json_file = NULL,
-  robyn_object = NULL,
-  dt_input = NULL,
-  dt_holidays = Robyn::dt_prophet_holidays,
-  refresh_steps = 4,
-  refresh_mode = "manual",
-  refresh_iters = 1000,
-  refresh_trials = 3,
-  bounds_freedom = NULL,
-  plot_folder = NULL,
-  plot_pareto = TRUE,
-  version_prompt = FALSE,
-  export = TRUE,
-  calibration_input = NULL,
-  objective_weights = NULL,
-  ... 
-) - -\method{print}{robyn_refresh}(x, ...) - -\method{plot}{robyn_refresh}(x, ...) -} -\arguments{ -\item{json_file}{Character. JSON file to import previously exported inputs or -recreate a model. To generate this file, use \code{robyn_write()}. -If you didn't export your data in the json file as "raw_data", -\code{dt_input} must be provided; \code{dt_holidays} input is optional.} - -\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object -that contains all previous modeling information or the imported list.} - -\item{dt_input}{data.frame. Should include all previous data and newly added -data for the refresh.} - -\item{dt_holidays}{data.frame. Raw input holiday data. Load standard -Prophet holidays using \code{data("dt_prophet_holidays")}.} - -\item{refresh_steps}{Integer. It controls how many time units the refresh -model build move forward. For example, \code{refresh_steps = 4} on weekly data -means the \code{InputCollect$window_start} & \code{InputCollect$window_end} -move forward 4 weeks. If \code{refresh_steps} is smaller than the number of -newly provided data points, then Robyn would only use the first N steps of the -new data.} - -\item{refresh_mode}{Character. Options are "auto" and "manual". In auto mode, -the \code{robyn_refresh()} function builds refresh models with given -\code{refresh_steps} repeatedly until there's no more data available. I -manual mode, the \code{robyn_refresh()} only moves forward \code{refresh_steps} -only once. "auto" mode has been deprecated when using \code{json_file} input.} - -\item{refresh_iters}{Integer. Iterations per refresh. Rule of thumb is, the -more new data added, the more iterations needed. More reliable recommendation -still needs to be investigated.} - -\item{refresh_trials}{Integer. Trials per refresh. Defaults to 5 trials. -More reliable recommendation still needs to be investigated.} - -\item{bounds_freedom}{Numeric. Percentage of freedom we'd like to allow for the -new hyperparameters values compared with the model to be refreshed. -If set to NULL (default) the value will be calculated as -refresh_steps / rollingWindowLength. Applies to all hyperparameters.} - -\item{plot_folder}{Character. Path for saving plots and files. Default -to \code{robyn_object} and saves plot in the same directory as \code{robyn_object}.} - -\item{plot_pareto}{Boolean. Set to \code{FALSE} to deactivate plotting -and saving model one-pagers. Used when testing models.} - -\item{version_prompt}{Logical. If FALSE, the model refresh version will be -selected based on the smallest combined error of normalized NRMSE, DECOMP.RSSD, MAPE. -If \code{TRUE}, a prompt will be presented to the user to select one of the refreshed -models (one-pagers and Pareto CSV files will already be generated).} - -\item{export}{Boolean. Export outcomes into local files?} - -\item{calibration_input}{data.frame. Optional. Provide experimental results to -calibrate. Your input should include the following values for each experiment: -channel, liftStartDate, liftEndDate, liftAbs, spend, confidence, metric. -You can calibrate any spend or organic variable with a well designed experiment. -You can also use experimental results from multiple channels; to do so, -provide concatenated channel value, i.e. "channel_A+channel_B". -Check "Guide for calibration source" section.} - -\item{objective_weights}{Numeric vector. Default to NULL to give equal weights -to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration -data is provided). 
When you are not calibrating, only the first 2 values for -\code{objective_weights} must be defined, i.e. set c(2, 1) to give double weight -to the 1st (NRMSE). This is an experimental feature. There's no research on -optimal weight setting. Subjective weights might strongly bias modeling results.} - -\item{...}{Additional parameters to overwrite original custom parameters -passed into initial model.} - -\item{x}{\code{robyn_refresh()} output.} -} -\value{ -List. The Robyn object, class \code{robyn_refresh}. - -List. Same as \code{robyn_run()} but with refreshed models. -} -\description{ -\code{robyn_refresh()} builds updated models based on -the previously built models saved in the \code{Robyn.RDS} object specified -in \code{robyn_object}. For example, when updating the initial build with 4 -weeks of new data, \code{robyn_refresh()} consumes the selected model of -the initial build, sets lower and upper bounds of hyperparameters for the -new build around the selected hyperparameters of the previous build, -stabilizes the effect of baseline variables across old and new builds, and -regulates the new effect share of media variables towards the latest -spend level. It returns the aggregated results with all previous builds for -reporting purposes and produces reporting plots. - -You must run \code{robyn_save()} to select and save an initial model first, -before refreshing. - -\strong{When should \code{robyn_refresh()} NOT be used:} -The \code{robyn_refresh()} function is suitable for -updating within "reasonable periods". Two situations are considered better -to rebuild model instead of refreshing: - -1. Most data is new: If initial model was trained with 100 weeks worth of -data but we add +50 weeks of new data. - -2. New variables are added: If initial model had less variables than the ones -we want to start using on new refresh model. -} -\examples{ -\dontrun{ -# Loading dummy data -data("dt_simulated_weekly") -data("dt_prophet_holidays") -# Set the (pre-trained and exported) Robyn model JSON file -json_file <- "~/Robyn_202208081444_init/RobynModel-2_55_4.json" - -# Run \code{robyn_refresh()} with 13 weeks cadence in auto mode -Robyn <- robyn_refresh( - json_file = json_file, - dt_input = dt_simulated_weekly, - dt_holidays = Robyn::dt_prophet_holidays, - refresh_steps = 13, - refresh_mode = "auto", - refresh_iters = 200, - refresh_trials = 5 -) - -# Run \code{robyn_refresh()} with 4 weeks cadence in manual mode -json_file2 <- "~/Robyn_202208081444_init/Robyn_202208090847_rf/RobynModel-1_2_3.json" -Robyn <- robyn_refresh( - json_file = json_file2, - dt_input = dt_simulated_weekly, - dt_holidays = Robyn::dt_prophet_holidays, - refresh_steps = 4, - refresh_mode = "manual", - refresh_iters = 200, - refresh_trials = 5 -) -} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/refresh.R +\name{robyn_refresh} +\alias{robyn_refresh} +\alias{print.robyn_refresh} +\alias{plot.robyn_refresh} +\title{Build Refresh Model} +\usage{ +robyn_refresh( + json_file = NULL, + robyn_object = NULL, + dt_input = NULL, + dt_holidays = Robyn::dt_prophet_holidays, + refresh_steps = 4, + refresh_mode = "manual", + refresh_iters = 1000, + refresh_trials = 3, + bounds_freedom = NULL, + plot_folder = NULL, + plot_pareto = TRUE, + version_prompt = FALSE, + export = TRUE, + calibration_input = NULL, + objective_weights = NULL, + ... +) + +\method{print}{robyn_refresh}(x, ...) + +\method{plot}{robyn_refresh}(x, ...) +} +\arguments{ +\item{json_file}{Character. 
JSON file to import previously exported inputs or
+recreate a model. To generate this file, use \code{robyn_write()}.
+If you didn't export your data in the json file as "raw_data",
+\code{dt_input} must be provided; \code{dt_holidays} input is optional.}
+
+\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object
+that contains all previous modeling information or the imported list.}
+
+\item{dt_input}{data.frame. Should include all previous data and newly added
+data for the refresh.}
+
+\item{dt_holidays}{data.frame. Raw input holiday data. Load standard
+Prophet holidays using \code{data("dt_prophet_holidays")}.}
+
+\item{refresh_steps}{Integer. It controls how many time units the refresh
+model build moves forward. For example, \code{refresh_steps = 4} on weekly data
+means the \code{InputCollect$window_start} & \code{InputCollect$window_end}
+move forward 4 weeks. If \code{refresh_steps} is smaller than the number of
+newly provided data points, then Robyn will only use the first N steps of the
+new data.}
+
+\item{refresh_mode}{Character. Options are "auto" and "manual". In auto mode,
+the \code{robyn_refresh()} function builds refresh models with given
+\code{refresh_steps} repeatedly until there's no more data available. In
+manual mode, \code{robyn_refresh()} moves forward \code{refresh_steps}
+only once. "auto" mode has been deprecated when using \code{json_file} input.}
+
+\item{refresh_iters}{Integer. Iterations per refresh. Rule of thumb: the
+more new data is added, the more iterations are needed. A more reliable
+recommendation still needs to be investigated.}
+
+\item{refresh_trials}{Integer. Trials per refresh. Defaults to 3 trials.
+A more reliable recommendation still needs to be investigated.}
+
+\item{bounds_freedom}{Numeric. Percentage of freedom we'd like to allow for the
+new hyperparameter values compared with the model to be refreshed.
+If set to NULL (default) the value will be calculated as
+refresh_steps / rollingWindowLength. Applies to all hyperparameters.}
+
+\item{plot_folder}{Character. Path for saving plots and files. Default
+to \code{robyn_object} and saves plot in the same directory as \code{robyn_object}.}
+
+\item{plot_pareto}{Boolean. Set to \code{FALSE} to deactivate plotting
+and saving model one-pagers. Used when testing models.}
+
+\item{version_prompt}{Logical. If FALSE, the model refresh version will be
+selected based on the smallest combined error of normalized NRMSE, DECOMP.RSSD, MAPE.
+If \code{TRUE}, a prompt will be presented to the user to select one of the refreshed
+models (one-pagers and Pareto CSV files will already be generated).}
+
+\item{export}{Boolean. Export outcomes into local files?}
+
+\item{calibration_input}{data.frame. Optional. Provide experimental results to
+calibrate. Your input should include the following values for each experiment:
+channel, liftStartDate, liftEndDate, liftAbs, spend, confidence, metric.
+You can calibrate any spend or organic variable with a well designed experiment.
+You can also use experimental results from multiple channels; to do so,
+provide concatenated channel value, i.e. "channel_A+channel_B".
+Check "Guide for calibration source" section.}
+
+\item{objective_weights}{Numeric vector. Default to NULL to give equal weights
+to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration
+data is provided). When you are not calibrating, only the first 2 values for
+\code{objective_weights} must be defined, i.e. 
set c(2, 1) to give double weight
+to the 1st (NRMSE). This is an experimental feature. There's no research on
+optimal weight setting. Subjective weights might strongly bias modeling results.}
+
+\item{...}{Additional parameters to overwrite original custom parameters
+passed into initial model.}
+
+\item{x}{\code{robyn_refresh()} output.}
+}
+\value{
+List. The Robyn object, class \code{robyn_refresh}.
+
+List. Same as \code{robyn_run()} but with refreshed models.
+}
+\description{
+\code{robyn_refresh()} builds updated models based on
+the previously built models saved in the \code{Robyn.RDS} object specified
+in \code{robyn_object}. For example, when updating the initial build with 4
+weeks of new data, \code{robyn_refresh()} consumes the selected model of
+the initial build, sets lower and upper bounds of hyperparameters for the
+new build around the selected hyperparameters of the previous build,
+stabilizes the effect of baseline variables across old and new builds, and
+regulates the new effect share of media variables towards the latest
+spend level. It returns the aggregated results with all previous builds for
+reporting purposes and produces reporting plots.
+
+You must run \code{robyn_save()} to select and save an initial model first,
+before refreshing.
+
+\strong{When should \code{robyn_refresh()} NOT be used:}
+The \code{robyn_refresh()} function is suitable for
+updating within "reasonable periods". In two situations it is better
+to rebuild the model instead of refreshing:
+
+1. Most data is new: if the initial model was trained with 100 weeks worth of
+data but we add +50 weeks of new data.
+
+2. New variables are added: if the initial model had fewer variables than the
+ones we want to start using in the new refresh model.
+}
+\examples{
+\dontrun{
+# Loading dummy data
+data("dt_simulated_weekly")
+data("dt_prophet_holidays")
+# Set the (pre-trained and exported) Robyn model JSON file
+json_file <- "~/Robyn_202208081444_init/RobynModel-2_55_4.json"
+
+# Run \code{robyn_refresh()} with 13 weeks cadence in auto mode
+Robyn <- robyn_refresh(
+  json_file = json_file,
+  dt_input = dt_simulated_weekly,
+  dt_holidays = Robyn::dt_prophet_holidays,
+  refresh_steps = 13,
+  refresh_mode = "auto",
+  refresh_iters = 200,
+  refresh_trials = 5
+)
+
+# Run \code{robyn_refresh()} with 4 weeks cadence in manual mode
+json_file2 <- "~/Robyn_202208081444_init/Robyn_202208090847_rf/RobynModel-1_2_3.json"
+Robyn <- robyn_refresh(
+  json_file = json_file2,
+  dt_input = dt_simulated_weekly,
+  dt_holidays = Robyn::dt_prophet_holidays,
+  refresh_steps = 4,
+  refresh_mode = "manual",
+  refresh_iters = 200,
+  refresh_trials = 5
+)
+}
+}
diff --git a/man/robyn_response.Rd b/man/robyn_response.Rd
index 4b68ecf..8bbc33a 100644
--- a/man/robyn_response.Rd
+++ b/man/robyn_response.Rd
@@ -1,158 +1,158 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/response.R
-\name{robyn_response}
-\alias{robyn_response}
-\title{Response and Saturation Curves}
-\usage{
-robyn_response(
-  InputCollect = NULL,
-  OutputCollect = NULL,
-  json_file = NULL,
-  robyn_object = NULL,
-  select_build = NULL,
-  select_model = NULL,
-  metric_name = NULL,
-  metric_value = NULL,
-  date_range = NULL,
-  dt_hyppar = NULL,
-  dt_coef = NULL,
-  quiet = FALSE,
-  ...
-)
-}
-\arguments{
-\item{InputCollect}{List. Contains all input parameters for the model.
-Required when \code{robyn_object} is not provided.}
-
-\item{OutputCollect}{List. Containing all model result. 
-Required when \code{robyn_object} is not provided.} - -\item{json_file}{Character. JSON file to import previously exported inputs or -recreate a model. To generate this file, use \code{robyn_write()}. -If you didn't export your data in the json file as "raw_data", -\code{dt_input} must be provided; \code{dt_holidays} input is optional.} - -\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object -that contains all previous modeling information or the imported list.} - -\item{select_build}{Integer. Default to the latest model build. \code{select_build = 0} -selects the initial model. \code{select_build = 1} selects the first refresh model.} - -\item{select_model}{Character. A model \code{SolID}. When \code{robyn_object} -is provided, \code{select_model} defaults to the already selected \code{SolID}. When -\code{robyn_object} is not provided, \code{select_model} must be provided with -\code{InputCollect} and \code{OutputCollect}, and must be one of -\code{OutputCollect$allSolutions}.} - -\item{metric_name}{A character. Selected media variable for the response. -Must be one value from paid_media_spends, paid_media_vars or organic_vars} - -\item{metric_value}{Numeric. Desired metric value to return a response for.} - -\item{date_range}{Character. Date(s) to apply adstocked transformations and pick mean spends -per channel. Set one of: "all", "last", or "last_n" (where -n is the last N dates available), date (i.e. "2022-03-27"), or date range -(i.e. \code{c("2022-01-01", "2022-12-31")}). Default to "all".} - -\item{dt_hyppar}{A data.frame. When \code{robyn_object} is not provided, use -\code{dt_hyppar = OutputCollect$resultHypParam}. It must be provided along -\code{select_model}, \code{dt_coef} and \code{InputCollect}.} - -\item{dt_coef}{A data.frame. When \code{robyn_object} is not provided, use -\code{dt_coef = OutputCollect$xDecompAgg}. It must be provided along -\code{select_model}, \code{dt_hyppar} and \code{InputCollect}.} - -\item{quiet}{Boolean. Keep messages off?} - -\item{...}{Additional parameters passed to \code{robyn_outputs()}.} -} -\value{ -List. Response value and plot. Class: \code{robyn_response}. -} -\description{ -\code{robyn_response()} returns the response for a given -spend level of a given \code{paid_media_vars} from a selected model -result and selected model build (initial model, refresh model, etc.). 
-}
-\examples{
-\dontrun{
-# Having InputCollect and OutputCollect objects
-## Recreate original saturation curve
-Response <- robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = "facebook_S"
-)
-Response$plot
-
-## Or you can call a JSON file directly (a bit slower)
-# Response <- robyn_response(
-#   json_file = "your_json_path.json",
-#   dt_input = dt_simulated_weekly,
-#   dt_holidays = dt_prophet_holidays,
-#   metric_name = "facebook_S"
-# )
-
-## Get the "next 100 dollar" marginal response on Spend1
-Spend1 <- 20000
-Response1 <- robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = "facebook_S",
-  metric_value = Spend1, # total budget for date_range
-  date_range = "last_1" # last two periods
-)
-Response1$plot
-
-Spend2 <- Spend1 + 100
-Response2 <- robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = "facebook_S",
-  metric_value = Spend2,
-  date_range = "last_1"
-)
-# ROAS for the 100$ from Spend1 level
-(Response2$response_total - Response1$response_total) / (Spend2 - Spend1)
-
-## Get response from for a given budget and date_range
-Spend3 <- 100000
-Response3 <- robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = "facebook_S",
-  metric_value = Spend3, # total budget for date_range
-  date_range = "last_5" # last 5 periods
-)
-Response3$plot
-
-## Example of getting paid media exposure response curves
-imps <- 10000000
-response_imps <- robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = "facebook_I",
-  metric_value = imps
-)
-response_imps$response_total / imps * 1000
-response_imps$plot
-
-## Example of getting organic media exposure response curves
-sendings <- 30000
-response_sending <- robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = "newsletter",
-  metric_value = sendings
-)
-# response per 1000 sendings
-response_sending$response_total / sendings * 1000
-response_sending$plot
-}
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/response.R
+\name{robyn_response}
+\alias{robyn_response}
+\title{Response and Saturation Curves}
+\usage{
+robyn_response(
+  InputCollect = NULL,
+  OutputCollect = NULL,
+  json_file = NULL,
+  robyn_object = NULL,
+  select_build = NULL,
+  select_model = NULL,
+  metric_name = NULL,
+  metric_value = NULL,
+  date_range = NULL,
+  dt_hyppar = NULL,
+  dt_coef = NULL,
+  quiet = FALSE,
+  ...
+)
+}
+\arguments{
+\item{InputCollect}{List. Contains all input parameters for the model.
+Required when \code{robyn_object} is not provided.}
+
+\item{OutputCollect}{List. Contains all model results.
+Required when \code{robyn_object} is not provided.}
+
+\item{json_file}{Character. JSON file to import previously exported inputs or
+recreate a model. To generate this file, use \code{robyn_write()}.
+If you didn't export your data in the json file as "raw_data",
+\code{dt_input} must be provided; \code{dt_holidays} input is optional.}
+
+\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object
+that contains all previous modeling information or the imported list.}
+
+\item{select_build}{Integer. Default to the latest model build. \code{select_build = 0}
+selects the initial model. 
+
+\item{select_model}{Character. A model \code{SolID}. When \code{robyn_object}
+is provided, \code{select_model} defaults to the already selected \code{SolID}. When
+\code{robyn_object} is not provided, \code{select_model} must be provided with
+\code{InputCollect} and \code{OutputCollect}, and must be one of
+\code{OutputCollect$allSolutions}.}
+
+\item{metric_name}{A character. Selected media variable for the response.
+Must be one value from paid_media_spends, paid_media_vars or organic_vars.}
+
+\item{metric_value}{Numeric. Desired metric value to return a response for.}
+
+\item{date_range}{Character. Date(s) to apply adstocked transformations and pick mean spends
+per channel. Set one of: "all", "last", or "last_n" (where
+n is the number of last periods to use), a date (e.g. "2022-03-27"), or a date range
+(e.g. \code{c("2022-01-01", "2022-12-31")}). Default to "all".}
+
+\item{dt_hyppar}{A data.frame. When \code{robyn_object} is not provided, use
+\code{dt_hyppar = OutputCollect$resultHypParam}. It must be provided along
+\code{select_model}, \code{dt_coef} and \code{InputCollect}.}
+
+\item{dt_coef}{A data.frame. When \code{robyn_object} is not provided, use
+\code{dt_coef = OutputCollect$xDecompAgg}. It must be provided along
+\code{select_model}, \code{dt_hyppar} and \code{InputCollect}.}
+
+\item{quiet}{Boolean. Keep messages off?}
+
+\item{...}{Additional parameters passed to \code{robyn_outputs()}.}
+}
+\value{
+List. Response value and plot. Class: \code{robyn_response}.
+}
+\description{
+\code{robyn_response()} returns the response for a given
+spend level of a given \code{paid_media_vars} from a selected model
+result and selected model build (initial model, refresh model, etc.).
+}
+\examples{
+\dontrun{
+# Having InputCollect and OutputCollect objects
+## Recreate original saturation curve
+Response <- robyn_response(
+  InputCollect = InputCollect,
+  OutputCollect = OutputCollect,
+  select_model = select_model,
+  metric_name = "facebook_S"
+)
+Response$plot
+
+## Or you can call a JSON file directly (a bit slower)
+# Response <- robyn_response(
+#   json_file = "your_json_path.json",
+#   dt_input = dt_simulated_weekly,
+#   dt_holidays = dt_prophet_holidays,
+#   metric_name = "facebook_S"
+# )
+
+## Get the "next 100 dollar" marginal response on Spend1
+Spend1 <- 20000
+Response1 <- robyn_response(
+  InputCollect = InputCollect,
+  OutputCollect = OutputCollect,
+  select_model = select_model,
+  metric_name = "facebook_S",
+  metric_value = Spend1, # total budget for date_range
+  date_range = "last_1" # last period
+)
+Response1$plot
+
+Spend2 <- Spend1 + 100
+Response2 <- robyn_response(
+  InputCollect = InputCollect,
+  OutputCollect = OutputCollect,
+  select_model = select_model,
+  metric_name = "facebook_S",
+  metric_value = Spend2,
+  date_range = "last_1"
+)
+# ROAS for the additional 100$ on top of Spend1
+(Response2$response_total - Response1$response_total) / (Spend2 - Spend1)
+
+## Get response for a given budget and date_range
+Spend3 <- 100000
+Response3 <- robyn_response(
+  InputCollect = InputCollect,
+  OutputCollect = OutputCollect,
+  select_model = select_model,
+  metric_name = "facebook_S",
+  metric_value = Spend3, # total budget for date_range
+  date_range = "last_5" # last 5 periods
+)
+Response3$plot
+
+## Example of getting paid media exposure response curves
+imps <- 10000000
+response_imps <- robyn_response(
+  InputCollect = InputCollect,
+  OutputCollect = OutputCollect,
+  select_model = select_model,
+  
metric_name = "facebook_I", + metric_value = imps +) +response_imps$response_total / imps * 1000 +response_imps$plot + +## Example of getting organic media exposure response curves +sendings <- 30000 +response_sending <- robyn_response( + InputCollect = InputCollect, + OutputCollect = OutputCollect, + select_model = select_model, + metric_name = "newsletter", + metric_value = sendings +) +# response per 1000 sendings +response_sending$response_total / sendings * 1000 +response_sending$plot +} +} diff --git a/man/robyn_run.Rd b/man/robyn_run.Rd index af66580..9bd0be9 100644 --- a/man/robyn_run.Rd +++ b/man/robyn_run.Rd @@ -1,130 +1,130 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/model.R -\name{robyn_run} -\alias{robyn_run} -\alias{print.robyn_models} -\title{Robyn Modelling Function} -\usage{ -robyn_run( - InputCollect = NULL, - dt_hyper_fixed = NULL, - json_file = NULL, - ts_validation = FALSE, - add_penalty_factor = FALSE, - refresh = FALSE, - seed = 123L, - quiet = FALSE, - cores = NULL, - trials = 5, - iterations = 2000, - rssd_zero_penalty = TRUE, - objective_weights = NULL, - nevergrad_algo = "TwoPointsDE", - intercept = TRUE, - intercept_sign = "non_negative", - lambda_control = NULL, - outputs = FALSE, - ... -) - -\method{print}{robyn_models}(x, ...) -} -\arguments{ -\item{InputCollect}{List. Contains all input parameters for the model. -Required when \code{robyn_object} is not provided.} - -\item{dt_hyper_fixed}{data.frame or named list. Only provide when loading -old model results. It consumes hyperparameters from saved csv -\code{pareto_hyperparameters.csv} or JSON file to replicate a model.} - -\item{json_file}{Character. JSON file to import previously exported inputs or -recreate a model. To generate this file, use \code{robyn_write()}. -If you didn't export your data in the json file as "raw_data", -\code{dt_input} must be provided; \code{dt_holidays} input is optional.} - -\item{ts_validation}{Boolean. When set to \code{TRUE}, Robyn will split data -by test, train, and validation partitions to validate the time series. By -default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be -customized or set to a fixed value using the hyperparameters input. For example, -if \code{train_size = 0.7}, validation size and test size will both be 0.15 -and 0.15. When \code{ts_validation = FALSE}, nrmse_train is the -objective function; when \code{ts_validation = TRUE}, nrmse_val is the objective -function.} - -\item{add_penalty_factor}{Boolean. Add penalty factor hyperparameters to -glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because -this feature might add too much hyperparameter space and probably requires -more iterations to converge.} - -\item{refresh}{Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}.} - -\item{seed}{Integer. For reproducible results when running nevergrad.} - -\item{quiet}{Boolean. Keep messages off?} - -\item{cores}{Integer. Default to \code{parallel::detectCores() - 1} (all cores -except one). Set to 1 if you want to turn parallel computing off.} - -\item{trials}{Integer. Recommended 5 for default -\code{nevergrad_algo = "TwoPointsDE"}.} - -\item{iterations}{Integer. Recommended 2000 for default when using -\code{nevergrad_algo = "TwoPointsDE"}.} - -\item{rssd_zero_penalty}{Boolean. When TRUE, the objective function -DECOMP.RSSD will penalize models with more 0 media effects additionally. 
-In other words, given the same DECOMP.RSSD score, a model with 50\% 0-coef -variables will get penalized by DECOMP.RSSD * 1.5 (larger error), while -another model with no 0-coef variables gets un-penalized with DECOMP.RSSD * 1.} - -\item{objective_weights}{Numeric vector. Default to NULL to give equal weights -to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration -data is provided). When you are not calibrating, only the first 2 values for -\code{objective_weights} must be defined, i.e. set c(2, 1) to give double weight -to the 1st (NRMSE). This is an experimental feature. There's no research on -optimal weight setting. Subjective weights might strongly bias modeling results.} - -\item{nevergrad_algo}{Character. Default to "TwoPointsDE". Options are -\code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne", -"DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA", -"cGA", "RandomSearch")}.} - -\item{intercept}{Boolean. Should intercept(s) be fitted (default=TRUE) or -set to zero (FALSE).} - -\item{intercept_sign}{Character. Choose one of "non_negative" (default) or -"unconstrained". By default, if intercept is negative, Robyn will drop intercept -and refit the model. Consider changing intercept_sign to "unconstrained" when -there are \code{context_vars} with large positive values.} - -\item{lambda_control}{Deprecated in v3.6.0.} - -\item{outputs}{Boolean. If set to TRUE, will run \code{robyn_run()} and -\code{robyn_outputs()}, returning a list with OutputModels and -OutputCollect results.} - -\item{...}{Additional parameters passed to \code{robyn_outputs()}.} - -\item{x}{\code{robyn_models()} output.} -} -\value{ -List. Class: \code{robyn_models}. Contains the results of all trials -and iterations modeled. - -List. Contains all trained models. Class: \code{robyn_models}. -} -\description{ -\code{robyn_run()} consumes \code{robyn_input()} outputs, -runs \code{robyn_mmm()}, and collects all modeling results. -} -\examples{ -\dontrun{ -# Having InputCollect results -OutputModels <- robyn_run( - InputCollect = InputCollect, - cores = 2, - iterations = 200, - trials = 1 -) -} -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model.R +\name{robyn_run} +\alias{robyn_run} +\alias{print.robyn_models} +\title{Robyn Modelling Function} +\usage{ +robyn_run( + InputCollect = NULL, + dt_hyper_fixed = NULL, + json_file = NULL, + ts_validation = FALSE, + add_penalty_factor = FALSE, + refresh = FALSE, + seed = 123L, + quiet = FALSE, + cores = NULL, + trials = 5, + iterations = 2000, + rssd_zero_penalty = TRUE, + objective_weights = NULL, + nevergrad_algo = "TwoPointsDE", + intercept = TRUE, + intercept_sign = "non_negative", + lambda_control = NULL, + outputs = FALSE, + ... +) + +\method{print}{robyn_models}(x, ...) +} +\arguments{ +\item{InputCollect}{List. Contains all input parameters for the model. +Required when \code{robyn_object} is not provided.} + +\item{dt_hyper_fixed}{data.frame or named list. Only provide when loading +old model results. It consumes hyperparameters from saved csv +\code{pareto_hyperparameters.csv} or JSON file to replicate a model.} + +\item{json_file}{Character. JSON file to import previously exported inputs or +recreate a model. To generate this file, use \code{robyn_write()}. +If you didn't export your data in the json file as "raw_data", +\code{dt_input} must be provided; \code{dt_holidays} input is optional.} + +\item{ts_validation}{Boolean. 
When set to \code{TRUE}, Robyn will split the data
+into train, validation, and test partitions to validate the time series. By
+default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be
+customized or set to a fixed value using the hyperparameters input. For example,
+if \code{train_size = 0.7}, validation and test sizes will each be 0.15.
+When \code{ts_validation = FALSE}, nrmse_train is the
+objective function; when \code{ts_validation = TRUE}, nrmse_val is the objective
+function.}
+
+\item{add_penalty_factor}{Boolean. Add penalty factor hyperparameters to
+glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because
+this feature greatly enlarges the hyperparameter space and will likely require
+more iterations to converge.}
+
+\item{refresh}{Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}.}
+
+\item{seed}{Integer. For reproducible results when running nevergrad.}
+
+\item{quiet}{Boolean. Keep messages off?}
+
+\item{cores}{Integer. Default to \code{parallel::detectCores() - 1} (all cores
+except one). Set to 1 if you want to turn parallel computing off.}
+
+\item{trials}{Integer. Recommended 5 for default
+\code{nevergrad_algo = "TwoPointsDE"}.}
+
+\item{iterations}{Integer. Recommended 2000 for default when using
+\code{nevergrad_algo = "TwoPointsDE"}.}
+
+\item{rssd_zero_penalty}{Boolean. When TRUE, the objective function
+DECOMP.RSSD will additionally penalize models with more zero-coefficient media
+variables. In other words, given the same DECOMP.RSSD score, a model with 50\%
+zero-coefficient variables is penalized to DECOMP.RSSD * 1.5 (larger error),
+while a model with no zero-coefficient variables keeps DECOMP.RSSD * 1 (no penalty).}
+
+\item{objective_weights}{Numeric vector. Default to NULL to give equal weights
+to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration
+data is provided). When you are not calibrating, only the first 2 values for
+\code{objective_weights} must be defined, e.g. set \code{c(2, 1)} to give double weight
+to the 1st (NRMSE). This is an experimental feature. There's no research on
+optimal weight setting. Subjective weights might strongly bias modeling results.}
+
+\item{nevergrad_algo}{Character. Default to "TwoPointsDE". Options are
+\code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne",
+"DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA",
+"cGA", "RandomSearch")}.}
+
+\item{intercept}{Boolean. Should intercept(s) be fitted (default=TRUE) or
+set to zero (FALSE).}
+
+\item{intercept_sign}{Character. Choose one of "non_negative" (default) or
+"unconstrained". By default, if the intercept is negative, Robyn will drop the
+intercept and refit the model. Consider changing intercept_sign to "unconstrained"
+when there are \code{context_vars} with large positive values.}
+
+\item{lambda_control}{Deprecated in v3.6.0.}
+
+\item{outputs}{Boolean. If set to TRUE, will run \code{robyn_run()} and
+\code{robyn_outputs()}, returning a list with OutputModels and
+OutputCollect results.}
+
+\item{...}{Additional parameters passed to \code{robyn_outputs()}.}
+
+\item{x}{\code{robyn_run()} output (an object of class \code{robyn_models}).}
+}
+\value{
+List. Class: \code{robyn_models}. Contains the results of all trials
+and iterations modeled.
+
+List. Contains all trained models. Class: \code{robyn_models}.
+}
+\description{
+\code{robyn_run()} consumes \code{robyn_inputs()} outputs,
+runs \code{robyn_mmm()}, and collects all modeling results.
+}
+\examples{
+\dontrun{
+# Having InputCollect results
+OutputModels <- robyn_run(
+  InputCollect = InputCollect,
+  cores = 2,
+  iterations = 200,
+  trials = 1
+)
+}
+}
diff --git a/man/robyn_save.Rd b/man/robyn_save.Rd
index b792b66..214a2e2 100644
--- a/man/robyn_save.Rd
+++ b/man/robyn_save.Rd
@@ -1,60 +1,60 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/exports.R
-\name{robyn_save}
-\alias{robyn_save}
-\alias{print.robyn_save}
-\alias{plot.robyn_save}
-\alias{robyn_load}
-\title{Export Robyn Model to Local File [DEPRECATED]}
-\usage{
-robyn_save(
-  InputCollect,
-  OutputCollect,
-  robyn_object = NULL,
-  select_model = NULL,
-  dir = OutputCollect$plot_folder,
-  quiet = FALSE,
-  ...
-)
-
-\method{print}{robyn_save}(x, ...)
-
-\method{plot}{robyn_save}(x, ...)
-
-robyn_load(robyn_object, select_build = NULL, quiet = FALSE)
-}
-\arguments{
-\item{InputCollect}{List. Contains all input parameters for the model.
-Required when \code{robyn_object} is not provided.}
-
-\item{OutputCollect}{List. Containing all model result.
-Required when \code{robyn_object} is not provided.}
-
-\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object
-that contains all previous modeling information or the imported list.}
-
-\item{select_model}{Character. A model \code{SolID}. When \code{robyn_object}
-is provided, \code{select_model} defaults to the already selected \code{SolID}. When
-\code{robyn_object} is not provided, \code{select_model} must be provided with
-\code{InputCollect} and \code{OutputCollect}, and must be one of
-\code{OutputCollect$allSolutions}.}
-
-\item{dir}{Character. Existing directory to export JSON file to.}
-
-\item{quiet}{Boolean. Keep messages off?}
-
-\item{...}{Additional parameters passed to \code{robyn_outputs()}.}
-
-\item{x}{\code{robyn_save()} output.}
-
-\item{select_build}{Integer. Default to the latest model build. \code{select_build = 0}
-selects the initial model. \code{select_build = 1} selects the first refresh model.}
-}
-\value{
-(Invisible) list with filename and summary. Class: \code{robyn_save}.
-
-(Invisible) list with imported results
-}
-\description{
-Use \code{robyn_save()} to select and save as .RDS file the initial model.
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/exports.R
+\name{robyn_save}
+\alias{robyn_save}
+\alias{print.robyn_save}
+\alias{plot.robyn_save}
+\alias{robyn_load}
+\title{Export Robyn Model to Local File [DEPRECATED]}
+\usage{
+robyn_save(
+  InputCollect,
+  OutputCollect,
+  robyn_object = NULL,
+  select_model = NULL,
+  dir = OutputCollect$plot_folder,
+  quiet = FALSE,
+  ...
+)
+
+\method{print}{robyn_save}(x, ...)
+
+\method{plot}{robyn_save}(x, ...)
+
+robyn_load(robyn_object, select_build = NULL, quiet = FALSE)
+}
+\arguments{
+\item{InputCollect}{List. Contains all input parameters for the model.
+Required when \code{robyn_object} is not provided.}
+
+\item{OutputCollect}{List. Containing all model results.
+Required when \code{robyn_object} is not provided.}
+
+\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object
+that contains all previous modeling information or the imported list.}
+
+\item{select_model}{Character. A model \code{SolID}. When \code{robyn_object}
+is provided, \code{select_model} defaults to the already selected \code{SolID}. When
+\code{robyn_object} is not provided, \code{select_model} must be provided with
+\code{InputCollect} and \code{OutputCollect}, and must be one of
+\code{OutputCollect$allSolutions}.}
+
+\item{dir}{Character. Existing directory to export JSON file to.}
+
+\item{quiet}{Boolean. Keep messages off?}
+
+\item{...}{Additional parameters passed to \code{robyn_outputs()}.}
+
+\item{x}{\code{robyn_save()} output.}
+
+\item{select_build}{Integer. Default to the latest model build. \code{select_build = 0}
+selects the initial model. \code{select_build = 1} selects the first refresh model.}
+}
+\value{
+(Invisible) list with filename and summary. Class: \code{robyn_save}.
+
+(Invisible) list with imported results.
+}
+\description{
+Use \code{robyn_save()} to select the initial model and save it as an .RDS file.
+}
diff --git a/man/robyn_train.Rd b/man/robyn_train.Rd
index d2f2d70..f82d512 100644
--- a/man/robyn_train.Rd
+++ b/man/robyn_train.Rd
@@ -1,98 +1,98 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/model.R
-\name{robyn_train}
-\alias{robyn_train}
-\title{Train Robyn Models}
-\usage{
-robyn_train(
-  InputCollect,
-  hyper_collect,
-  cores,
-  iterations,
-  trials,
-  intercept_sign,
-  intercept,
-  nevergrad_algo,
-  dt_hyper_fixed = NULL,
-  ts_validation = TRUE,
-  add_penalty_factor = FALSE,
-  objective_weights = NULL,
-  rssd_zero_penalty = TRUE,
-  refresh = FALSE,
-  seed = 123,
-  quiet = FALSE
-)
-}
-\arguments{
-\item{InputCollect}{List. Contains all input parameters for the model.
-Required when \code{robyn_object} is not provided.}
-
-\item{hyper_collect}{List. Containing hyperparameter bounds. Defaults to
-\code{InputCollect$hyperparameters}.}
-
-\item{cores}{Integer. Default to \code{parallel::detectCores() - 1} (all cores
-except one). Set to 1 if you want to turn parallel computing off.}
-
-\item{iterations}{Integer. Recommended 2000 for default when using
-\code{nevergrad_algo = "TwoPointsDE"}.}
-
-\item{trials}{Integer. Recommended 5 for default
-\code{nevergrad_algo = "TwoPointsDE"}.}
-
-\item{intercept_sign}{Character. Choose one of "non_negative" (default) or
-"unconstrained". By default, if intercept is negative, Robyn will drop intercept
-and refit the model. Consider changing intercept_sign to "unconstrained" when
-there are \code{context_vars} with large positive values.}
-
-\item{intercept}{Boolean. Should intercept(s) be fitted (default=TRUE) or
-set to zero (FALSE).}
-
-\item{nevergrad_algo}{Character. Default to "TwoPointsDE". Options are
-\code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne",
-"DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA",
-"cGA", "RandomSearch")}.}
-
-\item{dt_hyper_fixed}{data.frame or named list. Only provide when loading
-old model results. It consumes hyperparameters from saved csv
-\code{pareto_hyperparameters.csv} or JSON file to replicate a model.}
-
-\item{ts_validation}{Boolean. When set to \code{TRUE}, Robyn will split data
-by test, train, and validation partitions to validate the time series. By
-default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be
-customized or set to a fixed value using the hyperparameters input. For example,
-if \code{train_size = 0.7}, validation size and test size will both be 0.15
-and 0.15. When \code{ts_validation = FALSE}, nrmse_train is the
-objective function; when \code{ts_validation = TRUE}, nrmse_val is the objective
-function.}
-
-\item{add_penalty_factor}{Boolean. 
Add penalty factor hyperparameters to -glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because -this feature might add too much hyperparameter space and probably requires -more iterations to converge.} - -\item{objective_weights}{Numeric vector. Default to NULL to give equal weights -to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration -data is provided). When you are not calibrating, only the first 2 values for -\code{objective_weights} must be defined, i.e. set c(2, 1) to give double weight -to the 1st (NRMSE). This is an experimental feature. There's no research on -optimal weight setting. Subjective weights might strongly bias modeling results.} - -\item{rssd_zero_penalty}{Boolean. When TRUE, the objective function -DECOMP.RSSD will penalize models with more 0 media effects additionally. -In other words, given the same DECOMP.RSSD score, a model with 50\% 0-coef -variables will get penalized by DECOMP.RSSD * 1.5 (larger error), while -another model with no 0-coef variables gets un-penalized with DECOMP.RSSD * 1.} - -\item{refresh}{Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}.} - -\item{seed}{Integer. For reproducible results when running nevergrad.} - -\item{quiet}{Boolean. Keep messages off?} -} -\value{ -List. Iteration results to include in \code{robyn_run()} results. -} -\description{ -\code{robyn_train()} consumes output from \code{robyn_input()} -and runs the \code{robyn_mmm()} on each trial. -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model.R +\name{robyn_train} +\alias{robyn_train} +\title{Train Robyn Models} +\usage{ +robyn_train( + InputCollect, + hyper_collect, + cores, + iterations, + trials, + intercept_sign, + intercept, + nevergrad_algo, + dt_hyper_fixed = NULL, + ts_validation = TRUE, + add_penalty_factor = FALSE, + objective_weights = NULL, + rssd_zero_penalty = TRUE, + refresh = FALSE, + seed = 123, + quiet = FALSE +) +} +\arguments{ +\item{InputCollect}{List. Contains all input parameters for the model. +Required when \code{robyn_object} is not provided.} + +\item{hyper_collect}{List. Containing hyperparameter bounds. Defaults to +\code{InputCollect$hyperparameters}.} + +\item{cores}{Integer. Default to \code{parallel::detectCores() - 1} (all cores +except one). Set to 1 if you want to turn parallel computing off.} + +\item{iterations}{Integer. Recommended 2000 for default when using +\code{nevergrad_algo = "TwoPointsDE"}.} + +\item{trials}{Integer. Recommended 5 for default +\code{nevergrad_algo = "TwoPointsDE"}.} + +\item{intercept_sign}{Character. Choose one of "non_negative" (default) or +"unconstrained". By default, if intercept is negative, Robyn will drop intercept +and refit the model. Consider changing intercept_sign to "unconstrained" when +there are \code{context_vars} with large positive values.} + +\item{intercept}{Boolean. Should intercept(s) be fitted (default=TRUE) or +set to zero (FALSE).} + +\item{nevergrad_algo}{Character. Default to "TwoPointsDE". Options are +\code{c("DE","TwoPointsDE", "OnePlusOne", "DoubleFastGADiscreteOnePlusOne", +"DiscreteOnePlusOne", "PortfolioDiscreteOnePlusOne", "NaiveTBPSA", +"cGA", "RandomSearch")}.} + +\item{dt_hyper_fixed}{data.frame or named list. Only provide when loading +old model results. It consumes hyperparameters from saved csv +\code{pareto_hyperparameters.csv} or JSON file to replicate a model.} + +\item{ts_validation}{Boolean. 
When set to \code{TRUE}, Robyn will split the data
+into train, validation, and test partitions to validate the time series. By
+default the "train_size" range is set to \code{c(0.5, 0.8)}, but it can be
+customized or set to a fixed value using the hyperparameters input. For example,
+if \code{train_size = 0.7}, validation and test sizes will each be 0.15.
+When \code{ts_validation = FALSE}, nrmse_train is the
+objective function; when \code{ts_validation = TRUE}, nrmse_val is the objective
+function.}
+
+\item{add_penalty_factor}{Boolean. Add penalty factor hyperparameters to
+glmnet's penalty.factor to be optimized by nevergrad. Use with caution, because
+this feature greatly enlarges the hyperparameter space and will likely require
+more iterations to converge.}
+
+\item{objective_weights}{Numeric vector. Default to NULL to give equal weights
+to all objective functions. Order: NRMSE, DECOMP.RSSD, MAPE (when calibration
+data is provided). When you are not calibrating, only the first 2 values for
+\code{objective_weights} must be defined, e.g. set \code{c(2, 1)} to give double weight
+to the 1st (NRMSE). This is an experimental feature. There's no research on
+optimal weight setting. Subjective weights might strongly bias modeling results.}
+
+\item{rssd_zero_penalty}{Boolean. When TRUE, the objective function
+DECOMP.RSSD will additionally penalize models with more zero-coefficient media
+variables. In other words, given the same DECOMP.RSSD score, a model with 50\%
+zero-coefficient variables is penalized to DECOMP.RSSD * 1.5 (larger error),
+while a model with no zero-coefficient variables keeps DECOMP.RSSD * 1 (no penalty).}
+
+\item{refresh}{Boolean. Set to \code{TRUE} when used in \code{robyn_refresh()}.}
+
+\item{seed}{Integer. For reproducible results when running nevergrad.}
+
+\item{quiet}{Boolean. Keep messages off?}
+}
+\value{
+List. Iteration results to include in \code{robyn_run()} results.
+}
+\description{
+\code{robyn_train()} consumes output from \code{robyn_inputs()}
+and runs \code{robyn_mmm()} on each trial.
+}
diff --git a/man/robyn_update.Rd b/man/robyn_update.Rd
index 7f48caf..5ccf14f 100644
--- a/man/robyn_update.Rd
+++ b/man/robyn_update.Rd
@@ -1,24 +1,24 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/auxiliary.R
-\name{robyn_update}
-\alias{robyn_update}
-\title{Update Robyn Version}
-\usage{
-robyn_update(dev = TRUE, ...)
-}
-\arguments{
-\item{dev}{Boolean. Dev version? If not, CRAN version.}
-
-\item{...}{Parameters to pass to \code{remotes::install_github}
-or \code{utils::install.packages}, depending on \code{dev} parameter.}
-}
-\value{
-Invisible \code{NULL}.
-}
-\description{
-Update Robyn version from
-\href{https://github.com/facebookexperimental/Robyn}{Github repository}
-for latest "dev" version or from
-\href{https://CRAN.R-project.org/package=Robyn}{CRAN}
-for latest "stable" version.
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/auxiliary.R
+\name{robyn_update}
+\alias{robyn_update}
+\title{Update Robyn Version}
+\usage{
+robyn_update(dev = TRUE, ...)
+}
+\arguments{
+\item{dev}{Boolean. Dev version? If not, CRAN version.}
+
+\item{...}{Parameters to pass to \code{remotes::install_github}
+or \code{utils::install.packages}, depending on \code{dev} parameter.}
+}
+\value{
+Invisible \code{NULL}.
+}
+\description{
+Update the Robyn version from the
+\href{https://github.com/facebookexperimental/Robyn}{GitHub repository}
+for the latest "dev" version, or from
+\href{https://CRAN.R-project.org/package=Robyn}{CRAN}
+for the latest "stable" version.
+}
diff --git a/man/robyn_write.Rd b/man/robyn_write.Rd
index 4631147..d877e1c 100644
--- a/man/robyn_write.Rd
+++ b/man/robyn_write.Rd
@@ -1,78 +1,78 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/json.R
-\name{robyn_write}
-\alias{robyn_write}
-\alias{print.robyn_write}
-\alias{robyn_read}
-\alias{print.robyn_read}
-\alias{robyn_recreate}
-\title{Import and Export Robyn JSON files}
-\usage{
-robyn_write(
-  InputCollect,
-  OutputCollect = NULL,
-  select_model = NULL,
-  dir = OutputCollect$plot_folder,
-  add_data = TRUE,
-  export = TRUE,
-  quiet = FALSE,
-  pareto_df = NULL,
-  ...
-)
-
-\method{print}{robyn_write}(x, ...)
-
-robyn_read(json_file = NULL, step = 1, quiet = FALSE, ...)
-
-\method{print}{robyn_read}(x, ...)
-
-robyn_recreate(json_file, quiet = FALSE, ...)
-}
-\arguments{
-\item{InputCollect}{\code{robyn_inputs()} output.}
-
-\item{OutputCollect}{\code{robyn_run(..., export = FALSE)} output.}
-
-\item{select_model}{Character. Which model ID do you want to export
-into the JSON file?}
-
-\item{dir}{Character. Existing directory to export JSON file to.}
-
-\item{add_data}{Boolean. Include raw dataset. Useful to recreate models
-with a single file containing all the required information (no need of CSV).}
-
-\item{export}{Boolean. Export outcomes into local files?}
-
-\item{quiet}{Boolean. Keep messages off?}
-
-\item{pareto_df}{Dataframe. Save all pareto solutions to json file.}
-
-\item{...}{Additional parameters to export into a custom Extras element.}
-
-\item{x}{\code{robyn_read()} or \code{robyn_write()} output.}
-
-\item{json_file}{Character. JSON file name to read and import.}
-
-\item{step}{Integer. 1 for import only and 2 for import and output.}
-}
-\value{
-(invisible) List. Contains all inputs and outputs of exported model.
-Class: \code{robyn_write}.
-}
-\description{
-\code{robyn_write()} generates light JSON files with all the information
-required to replicate Robyn models. Depending on user inputs, there are
-3 use cases: only the inputs data, input data + modeling results data,
-and input data, modeling results + specifics of a single selected model.
-To replicate a model, you must provide InputCollect, OutputCollect, and,
-if OutputCollect contains more than one model, the select_model.
-}
-\examples{
-\dontrun{
-InputCollectJSON <- robyn_inputs(
-  dt_input = Robyn::dt_simulated_weekly,
-  json_file = "~/Desktop/RobynModel-1_29_12.json"
-)
-print(InputCollectJSON)
-}
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/json.R
+\name{robyn_write}
+\alias{robyn_write}
+\alias{print.robyn_write}
+\alias{robyn_read}
+\alias{print.robyn_read}
+\alias{robyn_recreate}
+\title{Import and Export Robyn JSON files}
+\usage{
+robyn_write(
+  InputCollect,
+  OutputCollect = NULL,
+  select_model = NULL,
+  dir = OutputCollect$plot_folder,
+  add_data = TRUE,
+  export = TRUE,
+  quiet = FALSE,
+  pareto_df = NULL,
+  ...
+)
+
+\method{print}{robyn_write}(x, ...)
+
+robyn_read(json_file = NULL, step = 1, quiet = FALSE, ...)
+
+\method{print}{robyn_read}(x, ...)
+
+robyn_recreate(json_file, quiet = FALSE, ...)
+}
+\arguments{
+\item{InputCollect}{\code{robyn_inputs()} output.}
+
+\item{OutputCollect}{\code{robyn_run(..., export = FALSE)} output.}
+
+\item{select_model}{Character. Which model ID do you want to export
+into the JSON file?}
+
+\item{dir}{Character. Existing directory to export JSON file to.}
+
+\item{add_data}{Boolean. Include raw dataset. Useful to recreate models
+with a single file containing all the required information (no CSV needed).}
+
+\item{export}{Boolean. Export outcomes into local files?}
+
+\item{quiet}{Boolean. Keep messages off?}
+
+\item{pareto_df}{Data.frame. Save all Pareto solutions to the JSON file.}
+
+\item{...}{Additional parameters to export into a custom Extras element.}
+
+\item{x}{\code{robyn_read()} or \code{robyn_write()} output.}
+
+\item{json_file}{Character. JSON file name to read and import.}
+
+\item{step}{Integer. 1 for import only and 2 for import and output.}
+}
+\value{
+(invisible) List. Contains all inputs and outputs of the exported model.
+Class: \code{robyn_write}.
+}
+\description{
+\code{robyn_write()} generates light JSON files with all the information
+required to replicate Robyn models. Depending on user inputs, there are
+three use cases: the input data only; the input data plus modeling results;
+and the input data, modeling results, and specifics of a single selected model.
+To replicate a model, you must provide InputCollect, OutputCollect, and,
+if OutputCollect contains more than one model, the select_model.
+}
+\examples{
+\dontrun{
+InputCollectJSON <- robyn_inputs(
+  dt_input = Robyn::dt_simulated_weekly,
+  json_file = "~/Desktop/RobynModel-1_29_12.json"
+)
+print(InputCollectJSON)
+}
+}
diff --git a/man/saturation_hill.Rd b/man/saturation_hill.Rd
index f4c6d63..8411635 100644
--- a/man/saturation_hill.Rd
+++ b/man/saturation_hill.Rd
@@ -1,43 +1,43 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/transformation.R
-\name{saturation_hill}
-\alias{saturation_hill}
-\alias{plot_saturation}
-\title{Hill Saturation Transformation}
-\usage{
-saturation_hill(x, alpha, gamma, x_marginal = NULL)
-
-plot_saturation(plot = TRUE)
-}
-\arguments{
-\item{x}{Numeric vector.}
-
-\item{alpha}{Numeric. Alpha controls the shape of the saturation curve.
-The larger the alpha, the more S-shape. The smaller, the more C-shape.}
-
-\item{gamma}{Numeric. Gamma controls the inflexion point of the
-saturation curve. The larger the gamma, the later the inflexion point occurs.}
-
-\item{x_marginal}{Numeric. When provided, the function returns the
-Hill-transformed value of the x_marginal input.}
-
-\item{plot}{Boolean. Do you wish to return the plot?}
-}
-\value{
-Numeric values. Transformed values.
-}
-\description{
-\code{saturation_hill} is a two-parametric version of the Hill
-function that allows the saturation curve to flip between S and C shape.
-
-Produce example plots for the Hill saturation curve.
-}
-\examples{
-saturation_hill(c(100, 150, 170, 190, 200), alpha = 3, gamma = 0.5)
-}
-\seealso{
-Other Transformations:
-\code{\link{adstock_geometric}()},
-\code{\link{mic_men}()}
-}
-\concept{Transformations}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transformation.R
+\name{saturation_hill}
+\alias{saturation_hill}
+\alias{plot_saturation}
+\title{Hill Saturation Transformation}
+\usage{
+saturation_hill(x, alpha, gamma, x_marginal = NULL)
+
+plot_saturation(plot = TRUE)
+}
+\arguments{
+\item{x}{Numeric vector.}
+
+\item{alpha}{Numeric. Alpha controls the shape of the saturation curve.
+The larger the alpha, the more S-shape. The smaller, the more C-shape.}
+
+\item{gamma}{Numeric. Gamma controls the inflexion point of the
+saturation curve. The larger the gamma, the later the inflexion point occurs.}
+
+\item{x_marginal}{Numeric. When provided, the function returns the
+Hill-transformed value of the x_marginal input.}
+
+\item{plot}{Boolean. Do you wish to return the plot?}
+}
+\value{
+Numeric values. Transformed values.
+}
+\description{
+\code{saturation_hill} is a two-parametric version of the Hill
+function that allows the saturation curve to flip between S and C shape.
+
+Produce example plots for the Hill saturation curve.
+}
+\examples{
+saturation_hill(c(100, 150, 170, 190, 200), alpha = 3, gamma = 0.5)
+}
+\seealso{
+Other Transformations:
+\code{\link{adstock_geometric}()},
+\code{\link{mic_men}()}
+}
+\concept{Transformations}
diff --git a/man/set_holidays.Rd b/man/set_holidays.Rd
index 076283a..458b195 100644
--- a/man/set_holidays.Rd
+++ b/man/set_holidays.Rd
@@ -1,23 +1,23 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/inputs.R
-\name{set_holidays}
-\alias{set_holidays}
-\title{Detect and set date variable interval}
-\usage{
-set_holidays(dt_transform, dt_holidays, intervalType)
-}
-\arguments{
-\item{dt_transform}{A data.frame. Transformed input data.}
-
-\item{dt_holidays}{A data.frame. Raw input holiday data.}
-
-\item{intervalType}{A character. Accepts one of the values:
-\code{c("day","week","month")}}
-}
-\value{
-List. Containing the all spend-exposure model results.
-}
-\description{
-Robyn only accepts daily, weekly and monthly data. This function
-is only called in \code{robyn_engineering()}.
-}
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/inputs.R
+\name{set_holidays}
+\alias{set_holidays}
+\title{Detect and set date variable interval}
+\usage{
+set_holidays(dt_transform, dt_holidays, intervalType)
+}
+\arguments{
+\item{dt_transform}{A data.frame. Transformed input data.}
+
+\item{dt_holidays}{A data.frame. Raw input holiday data.}
+
+\item{intervalType}{A character. Accepts one of the values:
+\code{c("day","week","month")}}
+}
+\value{
+List. Containing all spend-exposure model results.
+}
+\description{
+Robyn only accepts daily, weekly and monthly data. This function
+is only called in \code{robyn_engineering()}.
+}
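
The ts_validation behavior documented above for robyn_run() and robyn_train() implies a simple split rule: whatever share of the data train_size does not claim is divided equally between the validation and test partitions. A minimal R sketch of that arithmetic follows; the helper name is hypothetical, not part of the package.

# Hypothetical helper illustrating the documented split rule:
# the remainder after train_size is shared equally by validation and test.
ts_split_sizes <- function(train_size) {
  c(train = train_size,
    validation = (1 - train_size) / 2,
    test = (1 - train_size) / 2)
}
ts_split_sizes(0.7) # train 0.70, validation 0.15, test 0.15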
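Likewise, the rssd_zero_penalty argument describes a proportional penalty: DECOMP.RSSD is scaled by one plus the share of zero-coefficient media variables. A sketch of that arithmetic, assuming the scaling is exactly as the argument text states (the helper name is hypothetical):

# DECOMP.RSSD scaled by (1 + share of zero coefficients), per the docs above.
rssd_with_zero_penalty <- function(decomp_rssd, coefs) {
  decomp_rssd * (1 + mean(coefs == 0))
}
rssd_with_zero_penalty(0.2, c(0.8, 0, 1.2, 0)) # 50% zeros: 0.2 * 1.5 = 0.3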
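The robyn_write()/robyn_recreate() signatures documented above support a JSON round trip: export a selected model to a file, then rebuild it from that file. This sketch uses only the documented arguments; the directory, file name, and model ID are hypothetical placeholders, and InputCollect/OutputCollect are assumed to come from robyn_inputs() and robyn_run().

# Export one selected model to JSON (the file lands in `dir`)...
robyn_write(InputCollect, OutputCollect, select_model = "1_29_12", dir = "~/Desktop")
# ...then recreate the model from the exported file.
RobynRecreated <- robyn_recreate(
  json_file = "~/Desktop/RobynModel-1_29_12.json",
  quiet = FALSE
)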
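Finally, the saturation_hill() entry documents a two-parametric Hill transform in which alpha sets the S- versus C-shape and gamma shifts the inflexion point. Below is a standalone sketch of such a curve; the mapping of gamma to an inflexion point interpolated between min(x) and max(x) is an assumption for illustration, and the packaged implementation in R/transformation.R remains authoritative.

# Two-parametric Hill: gamma interpolates the inflexion point within range(x).
saturation_hill_sketch <- function(x, alpha, gamma) {
  inflexion <- c(range(x) %*% c(1 - gamma, gamma)) # gamma-weighted point
  x^alpha / (x^alpha + inflexion^alpha) # larger alpha -> stronger S-shape
}
saturation_hill_sketch(c(100, 150, 170, 190, 200), alpha = 3, gamma = 0.5)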