Skip to content

Commit

Permalink
Merge pull request #283 from facebookexperimental/lares
Browse files Browse the repository at this point in the history
Clustering and functions split for exporting results
  • Loading branch information
laresbernardo authored Feb 1, 2022
2 parents 86202d5 + 20981cb commit 4b3d829
Show file tree
Hide file tree
Showing 24 changed files with 1,692 additions and 1,043 deletions.
9 changes: 6 additions & 3 deletions R/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
Package: Robyn
Type: Package
Title: Automated Marketing Mix Modeling (MMM) Open Source Beta Project from Facebook Marketing Science
Version: 3.4.12
Version: 3.5.0
Authors@R: c(
person("Gufeng", "Zhou", , "gufeng@fb.com", c("aut")),
person("Leonel", "Sentana", , "leonelsentana@fb.com", c("aut")),
person("Igor", "Skokan", , "igorskokan@fb.com", c("aut")),
person("Bernardo", "Lares", , "bernardolares@fb.com", c("cre")))
person("Bernardo", "Lares", , "bernardolares@fb.com", c("cre","aut")))
Maintainer: Gufeng Zhou <gufeng@fb.com>, Bernardo Lares <bernardolares@fb.com>
Description: Automated Marketing Mix Modeling (MMM) package that aims to reduce human bias by means of ridge regression and evolutionary algorithms, enables actionable decision making providing a budget allocator and diminishing returns curves and allows ground-truth calibration to account for causation.
Depends:
Expand All @@ -15,18 +15,21 @@ Imports:
data.table,
doParallel,
doRNG,
dplyr,
foreach,
ggplot2,
ggridges,
glmnet,
lares,
lubridate,
minpack.lm,
nloptr,
patchwork,
prophet,
reticulate,
rPref,
stringr
stringr,
tidyr
Suggests:
shiny
Config/reticulate:
Expand Down
34 changes: 34 additions & 0 deletions R/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,57 @@ export(mic_men)
export(plot_adstock)
export(plot_saturation)
export(robyn_allocator)
export(robyn_clusters)
export(robyn_csv)
export(robyn_engineering)
export(robyn_inputs)
export(robyn_mmm)
export(robyn_onepagers)
export(robyn_outputs)
export(robyn_plots)
export(robyn_refresh)
export(robyn_response)
export(robyn_run)
export(robyn_save)
export(robyn_train)
export(saturation_hill)
import(data.table)
import(ggplot2)
importFrom(doParallel,registerDoParallel)
importFrom(doParallel,stopImplicitCluster)
importFrom(doRNG,"%dorng%")
importFrom(dplyr,any_of)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
importFrom(dplyr,bind_rows)
importFrom(dplyr,contains)
importFrom(dplyr,desc)
importFrom(dplyr,distinct)
importFrom(dplyr,everything)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,lag)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,pull)
importFrom(dplyr,row_number)
importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(dplyr,ungroup)
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(foreach,getDoParWorkers)
importFrom(foreach,registerDoSEQ)
importFrom(ggridges,geom_density_ridges)
importFrom(glmnet,cv.glmnet)
importFrom(glmnet,glmnet)
importFrom(lares,`%>%`)
importFrom(lares,check_opts)
importFrom(lares,clusterKmeans)
importFrom(lares,formatNum)
importFrom(lares,freqs)
importFrom(lares,removenacols)
importFrom(lares,theme_lares)
importFrom(lubridate,day)
importFrom(lubridate,floor_date)
importFrom(lubridate,is.Date)
Expand Down Expand Up @@ -71,7 +102,10 @@ importFrom(stringr,str_extract)
importFrom(stringr,str_remove)
importFrom(stringr,str_replace)
importFrom(stringr,str_which)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(utils,askYesNo)
importFrom(utils,flush.console)
importFrom(utils,head)
importFrom(utils,setTxtProgressBar)
importFrom(utils,txtProgressBar)
4 changes: 1 addition & 3 deletions R/R/allocator.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#' variable spends that maximizes the total media response.
#'
#' @inheritParams robyn_run
#' @inheritParams robyn_outputs
#' @param robyn_object Character. Path of the \code{Robyn.RDS} object
#' that contains all previous modeling information.
#' @param select_build Integer. Default to the latest model build. \code{select_buil = 0}
Expand Down Expand Up @@ -480,8 +481,6 @@ robyn_allocator <- function(robyn_object = NULL,
)
}

# print(nlsMod)

## collect output

dt_bestModel <- dt_bestCoef[, .(rn, mean_spend, xDecompAgg, roi_total, roi_mean)][order(rank(rn))]
Expand Down Expand Up @@ -511,7 +510,6 @@ robyn_allocator <- function(robyn_object = NULL,
)

dt_optimOut[, optmResponseUnitTotalLift := (optmResponseUnitTotal / initResponseUnitTotal) - 1]
# print(dt_optimOut)

## plot allocator results

Expand Down
49 changes: 49 additions & 0 deletions R/R/checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -468,3 +468,52 @@ check_calibconstr <- function(calibration_constraint, iterations, trials, calibr
}
return(calibration_constraint)
}

check_hyper_fixed <- function(InputCollect, dt_hyper_fixed) {
hyper_fixed <- all(length(InputCollect$hyperparameters) == 1)
if (hyper_fixed & is.null(dt_hyper_fixed)) {
stop(paste("hyperparameters can't be all fixed for hyperparameter optimisation.",
"If you want to get old model result, please provide only 1 model / 1 row from",
"OutputCollect$resultHypParam or pareto_hyperparameters.csv from previous runs"))
}
if (!is.null(dt_hyper_fixed)) {
## Run robyn_mmm if using old model result tables
dt_hyper_fixed <- as.data.table(dt_hyper_fixed)
if (nrow(dt_hyper_fixed) != 1) {
stop(paste("Provide only 1 model / 1 row from OutputCollect$resultHypParam or",
"pareto_hyperparameters.csv from previous runs"))
}
hypParamSamName <- hyper_names(adstock = InputCollect$adstock, all_media = InputCollect$all_media)
if (!all(c(hypParamSamName, "lambda") %in% names(dt_hyper_fixed))) {
stop(paste("dt_hyper_fixed is provided with wrong input.",
"Please provide the table OutputCollect$resultHypParam from previous runs or",
"pareto_hyperparameters.csv with desired model ID"))
}
}
return(hyper_fixed)
}

# Enable parallelisation of main modelling loop for MacOS and Linux only
check_parallel <- function() "unix" %in% .Platform$OS.type
# ggplot doesn't work with process forking on MacOS; however it works fine on Linux and Windows
check_parallel_plot <- function() !"Darwin" %in% Sys.info()["sysname"]

check_parallel_msg <- function(InputCollect) {
if (check_parallel()) {
message(paste(
"Using", InputCollect$adstock, "adstocking with",
length(InputCollect$hyperparameters),
"hyperparameters & 10-fold ridge x-validation on", InputCollect$cores, "cores"
))
} else {
message(paste(
"Using", InputCollect$adstock, "adstocking with",
length(InputCollect$hyperparameters),
"hyperparameters & 10-fold ridge x-validation on 1 core (Windows fallback)"
))
}
}

check_class <- function(x, object) {
if (any(!x %in% class(object))) stop(sprintf("Input object must be class %s", x))
}
204 changes: 204 additions & 0 deletions R/R/clusters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
# Copyright (c) Meta Platforms, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

####################################################################
#' Reduce number of models based on ROI clusters and minimum combined errors
#'
#' The \code{robyn_clusters()} function uses output from \code{robyn_run()},
#' to reduce the amount of models and help the user pick up the best (lowest
#' combined error) of different kinds (clusters) of models.
#'
#' @inheritParams lares::clusterKmeans
#' @inheritParams hyper_names
#' @inheritParams robyn_outputs
#' @param input \code{robyn_export()}'s output or \code{pareto_aggregated.csv} results.
#' @param limit Integer. Top N results per cluster. If kept in "auto", will select k
#' as the cluster in which the WSS variance was less than 5\%.
#' @param weights Vector, size 3. How much should each error weight?
#' Order: nrmse, decomp.rssd, mape. The highest the value, the closer it will be scaled
#' to origin. Each value will be normalized so they all sum 1.
#' @param export Export plots into local files?
#' @param ... Additional parameters passed to \code{lares::clusterKmeans()}.
#' @author Bernardo Lares (bernardolares@@fb.com)
#' @examples
#' \dontrun{
#' cls <- robyn_clusters(input = OutputCollect,
#' all_media = InputCollect$all_media,
#' k = 3, limit = 2,
#' weights = c(1, 1, 1.5))
#' }
#' @export
robyn_clusters <- function(input, all_media = NULL, k = "auto", limit = 1,
weights = rep(1, 3), dim_red = "PCA",
quiet = FALSE, export = FALSE,
...) {

if ("robyn_outputs" %in% class(input)) {
if (is.null(all_media)) {
aux <- colnames(input$mediaVecCollect)
all_media <- aux[-c(1, which(aux == "type"):length(aux))]
path <- input$plot_folder
} else path <- paste0(getwd(), "/")
# Pareto and ROI data
rois <- input$xDecompAgg
df <- .prepare_roi(rois, all_media = all_media)
} else {
if (all(c("solID", "mape", "nrmse", "decomp.rssd") %in% names(input)) & is.data.frame(input)) {
df <- .prepare_roi(input, all_media)
} else {
stop(paste(
"You must run robyn_export(..., clusters = TRUE) or",
"pass a valid data.frame (sames as pareto_aggregated.csv output)",
"in order to use robyn_clusters()"
))
}
}

ignore <- c("solID", "mape", "decomp.rssd", "nrmse", "pareto")

# Auto K selected by less than 5% WSS variance (convergence)
min_clusters <- 3
limit_clusters <- min(nrow(df) - 1, 30)
if ("auto" %in% k) {
cls <- tryCatch({
clusterKmeans(df, k = NULL, limit = limit_clusters, ignore = ignore, dim_red = dim_red, quiet = TRUE, ...)
}, error = function(err) {
message(paste("Couldn't automatically create clusters:", err))
return(NULL)
})
#if (is.null(cls)) return(NULL)
min_var <- 0.05
k <- cls$nclusters %>%
mutate(pareto = .data$wss/.data$wss[1],
dif = lag(.data$pareto) - .data$pareto) %>%
filter(.data$dif > min_var) %>% pull(.data$n) %>% max(.)
if (k < min_clusters) k <- min_clusters
if (!quiet) message(sprintf(
">> Auto selected k = %s (clusters) based on minimum WSS variance of %s%%",
k, min_var*100))
}

# Build clusters
stopifnot(k %in% min_clusters:30)
cls <- clusterKmeans(df, k, limit = limit_clusters, ignore = ignore, dim_red = dim_red, quiet = TRUE, ...)

# Select top models by minimum (weighted) distance to zero
top_sols <- .clusters_df(cls$df, weights) %>%
mutate(error = (.data$nrmse^2 + .data$decomp.rssd^2 + .data$mape^2)^-(1 / 2)) %>%
.crit_proc(limit)

output <- list(
# Data and parameters
data = mutate(cls$df, top_sol = .data$solID %in% top_sols$solID),
n_clusters = k,
errors_weights = weights,
# Within Groups Sum of Squares Plot
wss = cls$nclusters_plot,
# Grouped correlations per cluster
corrs = cls$correlations + labs(title = "ROI Top Correlations by Cluster", subtitle = NULL),
# Mean ROI per cluster
clusters_means = cls$means,
# Dim reduction clusters
clusters_PCA = cls[["PCA"]],
clusters_tSNE = cls[["tSNE"]],
# Top Clusters
models = top_sols,
plot_models_errors = .plot_topsols_errors(df, top_sols, limit, weights),
plot_models_rois = .plot_topsols_rois(top_sols, all_media, limit)
)

if (export) {
fwrite(output$data, file = paste0(path, "pareto_clusters.csv"))
ggsave(paste0(path, "pareto_clusters_wss.png"), plot = output$wss, dpi = 500, width = 5, height = 4)
ggsave(paste0(path, "pareto_clusters_corr.png"), plot = output$corrs, dpi = 500, width = 7, height = 5)
db <- wrap_plots(output$plot_models_rois, output$plot_models_errors)
ggsave(paste0(path, "pareto_clusters_detail.png"), plot = db, dpi = 600, width = 12, height = 9)
}

return(output)

}


# ROIs data.frame for clustering (from xDecompAgg or pareto_aggregated.csv)
.prepare_roi <- function(x, all_media) {
check_opts(all_media, unique(x$rn))
rois <- pivot_wider(x, id_cols = "solID", names_from = "rn", values_from = "roi_total")
rois <- removenacols(rois, all = FALSE)
rois <- select(rois, any_of(c("solID", all_media)))
errors <- distinct(x, .data$solID, .data$nrmse, .data$decomp.rssd, .data$mape)
rois <- left_join(rois, errors, "solID") %>% ungroup()
return(rois)
}

.min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))

.clusters_df <- function(df, balance = rep(1, 3)) {
stopifnot(length(balance) == 3)
balance <- balance / sum(balance)
crit_df <- df %>%
# Force normalized values so they can be comparable
mutate(
nrmse = .min_max_norm(.data$nrmse),
decomp.rssd = .min_max_norm(.data$decomp.rssd),
mape = .min_max_norm(.data$mape)
) %>%
# Balance to give more or less importance to each error
mutate(
nrmse = balance[1] / .data$nrmse,
decomp.rssd = balance[2] / .data$decomp.rssd,
mape = balance[3] / .data$mape
) %>%
replace(., is.na(.), 0) %>%
group_by(.data$cluster)
return(crit_df)
}

.crit_proc <- function(df, limit) {
arrange(df, .data$cluster, desc(.data$error)) %>%
slice(1:limit) %>%
mutate(rank = row_number()) %>%
select(.data$cluster, .data$rank, everything())
}

.plot_topsols_errors <- function(df, top_sols, limit = 1, balance = rep(1, 3)) {
balance <- balance / sum(balance)
left_join(df, select(top_sols, 1:3), "solID") %>%
mutate(
alpha = ifelse(is.na(.data$cluster), 0.5, 1),
label = ifelse(!is.na(.data$cluster), sprintf(
"[%s.%s]", .data$cluster, .data$rank
), NA)
) %>%
ggplot(aes(x = .data$nrmse, y = .data$decomp.rssd)) +
geom_point(aes(colour = .data$cluster, alpha = .data$alpha)) +
geom_text(aes(label = .data$label), na.rm = TRUE, hjust = -0.3) +
guides(alpha = "none", colour = "none") +
labs(
title = paste("Selecting Top", limit, "Performing Models by Cluster"),
subtitle = "Based on minimum (weighted) distance to origin",
x = "NRMSE", y = "DECOMP.RSSD",
caption = sprintf(
"Weights: NRMSE %s%%, DECOMP.RSSD %s%%, MAPE %s%%",
round(100 * balance[1]), round(100 * balance[2]), round(100 * balance[3])
)
) +
theme_lares()
}

.plot_topsols_rois <- function(top_sols, all_media, limit = 1) {
top_sols %>%
mutate(label = sprintf("[%s.%s]\n%s", .data$cluster, .data$rank, .data$solID)) %>%
tidyr::gather("media", "roi", contains(all_media)) %>%
ggplot(aes(x = .data$media, y = .data$roi)) +
facet_grid(.data$label ~ .) +
geom_col() +
coord_flip() +
labs(
title = paste("ROIs on Top", limit, "Performing Models"),
x = NULL, y = "ROI per Media"
) +
theme_lares()
}
Loading

0 comments on commit 4b3d829

Please sign in to comment.