In [1]:
#EMPIRICS - MARGINAL_EFFECT (Readme)
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Module conducts the following analysis:

# ---- Marginal effects calculation
# ---- Figure 2: Average marginal effects of spatial spillovers and cost competitiveness on adoption probability
# ---- Supplementary Figure 4: Average marginal effects of cost competitiveness and spatial spillovers on adoption using wind as historical analogue (uses Figure 2 code, produces SF 4 when EMPIRICS - MODEL Module is run for Wind before)

# Module is input for:

# ---- This module is not required as input

In [2]:
#SET-UP
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Point the PROJ_LIB environment variable at the conda PROJ directory, then
# echo the value so the active setting shows up in the cell output.
Sys.setenv("PROJ_LIB" = "/opt/conda/share/proj")
Sys.getenv(x = "PROJ_LIB")

# Make sure every package in `packages` is installed and attached.
#
# packages: character vector of package names.
#
# A package whose namespace cannot be loaded is installed from the CRAN cloud
# mirror (with dependencies) first. Packages that are already attached are
# left alone; the rest are attached with start-up messages suppressed.
# Called for its side effects; returns NULL invisibly.
check_and_load <- function(packages) {
  for (pkg in packages) {
    is_installed <- requireNamespace(pkg, quietly = TRUE)
    if (!is_installed) {
      message(paste("Installing missing package:", pkg))
      install.packages(pkg, dependencies = TRUE, repos = "https://cloud.r-project.org")
    }

    already_attached <- pkg %in% .packages()
    if (already_attached) next

    suppressPackageStartupMessages(library(pkg, character.only = TRUE))
  }
}


# Third-party packages attached by this module; check_and_load() installs
# whichever ones are missing before attaching them.
required_packages <- c(
  "dplyr",     # data wrangling verbs (mutate, group_by, summarise, joins, ...)
  "purrr",     # functional helpers for lists and vectors
  "tibble",    # tibble construction
  "ggplot2",   # core plotting system
  "viridis",   # colour scales
  "patchwork", # composes several ggplot2 objects into one figure
  "ggsci"      # colour palettes
)
check_and_load(packages = required_packages)

# Module inputs: `train_data` supplies the predictor columns and `second_pass`
# is the model object handed to predict() further down.
train_data  <- readRDS(file = "train_data.rds")
second_pass <- readRDS(file = "second_pass.rds")

In [None]:
#DATAFILES
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Read the two module inputs (the same files are also read at the end of the
# set-up cell): the data set and the model object used by predict().
train_data  <- readRDS(file = "train_data.rds")
second_pass <- readRDS(file = "second_pass.rds")

In [None]:
#INPUTS AND SETTINGS
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# standardize_input is passed as `standardize` to make_grid(); when TRUE the
# grid marginal effects are multiplied by the variable's SD (times sd_scale).
standardize_input <- TRUE   #Standardize marginal effects to allow for comparison of size effect
# sd_scale multiplies the SD used for the standardized effects below.
sd_scale <- 1               #1= calculated average marginal effect for 1 SD, 0.1 = for 0.1 SDs etc.

In [None]:
#MARGINAL EFFECTS CALCULATION
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Marginal effects of one predictor on the probability predicted by
# `second_pass`.
#
# var:    name of the predictor to perturb; must be one of the four model
#         predictors listed in `df_base` below.
# data:   data frame holding the predictors and the `adopted` outcome.
# h_unit: step used both for the raw-unit shift at the mean and for the
#         central finite difference behind the AME.
#
# Returns a one-row tibble with
#   * MEM_*: change in predicted probability (x100) when `var` is shifted
#     from the all-predictors-at-their-mean profile by 1% of its range, by
#     `sd_scale` SDs, and by `h_unit`;
#   * AME_raw: mean central-difference slope over all rows of `data`;
#   * AME_standardized: AME_raw scaled by `sd_scale` SDs of `var`;
#   * AME_relative_pct: AME_standardized relative to mean(adopted), in %.
#
# Reads the globals `second_pass` and `sd_scale`.
compute_full_marginal_effects <- function(var, data = train_data, h_unit = 0.01) {
  # Reference profile: every predictor at its sample mean
  df_base <- data.frame(
    cost_proxy_scaled = mean(data$cost_proxy_scaled, na.rm = TRUE),
    spatial_influence_detrended = mean(data$spatial_influence_detrended, na.rm = TRUE),
    distance_to_waterway = mean(data$distance_to_waterway, na.rm = TRUE),
    distance_to_pipeline = mean(data$distance_to_pipeline, na.rm = TRUE)
  )

  if (!(var %in% names(df_base))) {
    stop(
      "`var` must be one of: ", paste(names(df_base), collapse = ", "),
      call. = FALSE
    )
  }

  # Computed once and reused below
  sd_var    <- sd(data[[var]], na.rm = TRUE)
  mean_prob <- mean(data$adopted, na.rm = TRUE)

  # Deltas
  delta_range <- 0.01 * diff(range(data[[var]], na.rm = TRUE))   # 1% of range
  delta_sd    <- sd_scale * sd_var                               # sd_scale SDs
  # NOTE(review): the resulting column is named "..._1Unit" but the shift is
  # h_unit (0.01 by default), not 1 -- confirm which one is intended.
  delta_unit  <- h_unit                                          # raw unit change

  # At Mean: prediction for the reference profile with `var` shifted by delta
  predict_shifted <- function(delta) {
    df_shift <- df_base
    df_shift[[var]] <- df_shift[[var]] + delta
    predict(second_pass, newdata = df_shift, type = "response")
  }

  p_base        <- predict(second_pass, newdata = df_base, type = "response")
  p_shift_range <- predict_shifted(delta_range)
  p_shift_sd    <- predict_shifted(delta_sd)
  p_shift_unit  <- predict_shifted(delta_unit)

  # AME: central finite difference evaluated at every observation
  data_plus  <- data; data_plus[[var]]  <- data_plus[[var]]  + h_unit
  data_minus <- data; data_minus[[var]] <- data_minus[[var]] - h_unit

  prob_plus  <- predict(second_pass, newdata = data_plus,  type = "response")
  prob_minus <- predict(second_pass, newdata = data_minus, type = "response")

  raw_AME <- mean((prob_plus - prob_minus) / (2 * h_unit), na.rm = TRUE)
  # Apply sd_scale here as well, so this matches delta_sd above and
  # compute_average_effects(); identical to the old value when sd_scale == 1.
  std_AME <- raw_AME * sd_var * sd_scale

  # Results
  tibble(
    variable = var,
    mean_prob = mean_prob,
    MEM_abs_change_1pctRange = (p_shift_range - p_base) * 100,
    MEM_abs_change_1SD        = (p_shift_sd    - p_base) * 100,
    MEM_abs_change_1Unit      = (p_shift_unit  - p_base) * 100,
    AME_raw                   = raw_AME,
    AME_standardized          = std_AME,
    AME_relative_pct          = (std_AME / mean_prob) * 100
  )
}

# Run for all variables: one result row per predictor, stacked in this order
results <- bind_rows(lapply(
  c(
    "cost_proxy_scaled",
    "spatial_influence_detrended",
    "distance_to_waterway",
    "distance_to_pipeline"
  ),
  compute_full_marginal_effects
))

results

In [None]:
# FIGURE 2: AVERAGE MARGINAL EFFECTS IN DIFFUSION MODEL
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Plot theme
        # Palette shared by the bar chart and the panels
        cost_col    <- "#2B4F81"   # blue:  cost competitiveness
        spatial_col <- "#8AA54A"   # green: spatial spillovers
        other_col   <- "#6E6E6E"   # grey:  distance variables

        # theme_minimal() variant with a boxed panel, black tick marks on the
        # bottom/left axes, no grid lines, and the duplicated top/right axes
        # stripped of text, titles and ticks.
        #
        # base_size: base font size forwarded to theme_minimal().
        theme_clean_axes <- function(base_size = 26) {
          hidden    <- element_blank()
          tick_mark <- element_line(colour = "black", linewidth = 1.2)
          panel_box <- element_rect(colour = "black", fill = NA, linewidth = 1.4)

          theme_minimal(base_size = base_size) +
            theme(
              # frame, grid and axis lines
              panel.border     = panel_box,
              panel.grid.major = hidden,
              panel.grid.minor = hidden,
              axis.line        = hidden,

              # visible ticks on the primary axes
              axis.ticks.x.bottom = tick_mark,
              axis.ticks.y.left   = tick_mark,

              # secondary (top/right) axes carry no annotation
              axis.text.x.top    = hidden,
              axis.title.x.top   = hidden,
              axis.ticks.x.top   = hidden,
              axis.text.y.right  = hidden,
              axis.title.y.right = hidden,
              axis.ticks.y.right = hidden
            )
        }

# Average marginal effects 
        # Standardized average marginal effect (AME) of one predictor.
        #
        # var:    name of the column in `data` to perturb.
        # data:   data frame passed to predict() as `newdata`.
        # h_unit: half-width of the central finite difference.
        #
        # Returns a one-row tibble (variable, AME_standardized), where
        # AME_standardized is the mean finite-difference slope multiplied by
        # sd(var) * sd_scale. Reads the globals `second_pass` and `sd_scale`.
        compute_average_effects <- function(var, data = train_data, h_unit = 0.01) {
          ddp <- data; ddp[[var]] <- ddp[[var]] + h_unit
          ddm <- data; ddm[[var]] <- ddm[[var]] - h_unit

          prp <- predict(second_pass, newdata = ddp, type = "response")
          prm <- predict(second_pass, newdata = ddm, type = "response")

          # na.rm = TRUE matches compute_full_marginal_effects(); without it a
          # single missing value would turn the whole AME into NA.
          raw <- mean((prp - prm) / (2 * h_unit), na.rm = TRUE)
          std <- raw * sd(data[[var]], na.rm = TRUE) * sd_scale

          tibble(variable = var, AME_standardized = std)
        }
        
        # Predictors shown in the bar chart
        predictor_vars <- c(
          "spatial_influence_detrended",
          "cost_proxy_scaled",
          "distance_to_pipeline",
          "distance_to_waterway"
        )

        # AME per predictor, with a display label and the effect expressed in
        # percentage points (AME_standardized x 100)
        ame_values <- mutate(
          bind_rows(lapply(predictor_vars, compute_average_effects)),
          variable_label = case_when(
            variable == "spatial_influence_detrended" ~ "Spatial spillovers",
            variable == "cost_proxy_scaled"           ~ "Cost competitiveness",
            variable == "distance_to_pipeline"        ~ "Distance to pipeline",
            variable == "distance_to_waterway"        ~ "Distance to waterway",
            TRUE ~ variable
          ),
          AME = AME_standardized * 100 #Percentage points
        )

        # AME reference lines for the two panels, keyed by the panel names
        # used in `marginal_data$variable`
        AME_lines_raw <- bind_rows(
          compute_average_effects("cost_proxy_scaled"),
          compute_average_effects("spatial_influence_detrended")
        )

        AME_lines <- tibble(
          variable = c("Spatial Influence", "Cost Proxy"),
          y_value = AME_lines_raw$AME_standardized[
            match(
              c("spatial_influence_detrended", "cost_proxy_scaled"),
              AME_lines_raw$variable
            )
          ] * 100
        )


# Bar chart: one horizontal bar per predictor, ordered by AME, with the value
# printed next to each bar
        bar_ame <- ggplot(
          data = ame_values,
          mapping = aes(
            x = reorder(variable_label, AME),
            y = AME,
            fill = variable_label
          )
        ) +
          geom_col(width = 0.45, alpha = .95) +
          geom_text(
            mapping = aes(label = sprintf("%.1f", AME)),
            hjust = -0.15, size = 11
          ) +
          scale_fill_manual(
            values = c(
              "Spatial spillovers"   = spatial_col,
              "Cost competitiveness" = cost_col,
              "Distance to pipeline" = other_col,
              "Distance to waterway" = other_col
            )
          ) +
          # extra room past the bar ends for the value labels
          scale_y_continuous(
            expand = expansion(mult = c(0, 0.30)),
            sec.axis = dup_axis(labels = NULL, breaks = NULL)
          ) +
          coord_flip() +
          labs(
            title = "Average marginal effect of diffusion drivers on green H2 adoption probability",
            x = NULL,
            y = "Average marginal effect (percentage points per SD)"
          ) +
          theme_clean_axes(base_size = 26) +
          theme(
            legend.position = "none",
            axis.text.y = element_text(hjust = 0, size = 26),
            plot.title = element_text(size = 30, face = "bold")
          )

# Marginal effect across grid
        # Row-wise marginal effect of `variable` on the predicted probability.
        #
        # data:        data frame passed to predict() as `newdata`.
        # variable:    name of the column to perturb.
        # h:           half-width of the central finite difference.
        # standardize: if TRUE, multiply the slope by `sd_ref`.
        # sd_ref:      scaling factor; when NULL the SD of `variable` in
        #              `train_data` is used.
        #
        # Returns one value per row of `data`, multiplied by 100.
        # Reads the global `second_pass` (and `train_data` for the fallback).
        compute_marginal_effect <- function(data, variable, h = 0.01,
                                            standardize = FALSE, sd_ref = NULL) {

          # Prediction after moving `variable` by `step` in every row
          predict_shifted <- function(step) {
            shifted <- data
            shifted[[variable]] <- shifted[[variable]] + step
            predict(second_pass, newdata = shifted, type = "response")
          }

          slope <- (predict_shifted(h) - predict_shifted(-h)) / (2 * h)

          if (standardize) {
            scale_by <- if (is.null(sd_ref)) {
              sd(train_data[[variable]], na.rm = TRUE)
            } else {
              sd_ref
            }
            slope <- slope * scale_by
          }

          100 * slope
        }

        # Marginal effect grid
        # Build the evaluation grid for one panel and attach marginal effects.
        #
        # var:                 predictor whose marginal effect is computed; it
        #                      takes the values in `x_seq`.
        # conditioning_values: values given to the other of the two main
        #                      predictors (cost / spatial).
        # cond_var:            name of that conditioning column; used to build
        #                      `group_raw`.
        # cond_deciles:        accepted but not used inside this function.
        # x_seq:               values of `var` along the x axis.
        # label:               panel name stored in the `variable` column.
        # standardize:         forwarded to compute_marginal_effect().
        #
        # Both distance variables are held at their `train_data` means.
        # Returns the grid with columns marginal_effect, variable, x_value and
        # group_raw (the conditioning value rounded to 6 decimals).
        make_grid <- function(var, conditioning_values, cond_var, cond_deciles,
                              x_seq, label, standardize = FALSE) {

          # x_seq for the focal predictor, conditioning values for the other
          axis_values <- function(name) {
            if (var == name) x_seq else conditioning_values
          }

          grid <- expand.grid(
            cost_proxy_scaled = axis_values("cost_proxy_scaled"),
            spatial_influence_detrended = axis_values("spatial_influence_detrended"),
            distance_to_waterway = mean(train_data$distance_to_waterway, na.rm = TRUE),
            distance_to_pipeline = mean(train_data$distance_to_pipeline, na.rm = TRUE)
          )

          grid[["marginal_effect"]] <- compute_marginal_effect(
            grid, var,
            standardize = standardize,
            sd_ref = sd(train_data[[var]], na.rm = TRUE) * sd_scale
          )

          grid[["variable"]]  <- label
          grid[["x_value"]]   <- grid[[var]]
          grid[["group_raw"]] <- round(grid[[cond_var]], 6)

          grid
        }
        
        # Conditioning sequence: 1st-99th percentiles of each main predictor
        spatial_levels_all <- quantile(
          train_data$spatial_influence_detrended,
          probs = seq(0.01, 0.99, 0.01), na.rm = TRUE
        )

        cost_levels_all <- quantile(
          train_data$cost_proxy_scaled,
          probs = seq(0.01, 0.99, 0.01), na.rm = TRUE
        )

        # Marginal effect of cost along its observed range, conditioned on
        # every spatial-influence percentile. min()/max() use na.rm = TRUE like
        # the quantiles above, so a missing value cannot break seq().
        grid_cost <- make_grid(
          "cost_proxy_scaled",
          spatial_levels_all,
          "spatial_influence_detrended",
          quantile(
            train_data$spatial_influence_detrended,
            probs = seq(0.1, 0.9, 0.1), na.rm = TRUE
          ),
          seq(
            min(train_data$cost_proxy_scaled, na.rm = TRUE),
            max(train_data$cost_proxy_scaled, na.rm = TRUE),
            length.out = 140
          ),
          "Cost Proxy",
          standardize = standardize_input
        )

        # Marginal effect of spatial influence along its observed range,
        # conditioned on every cost percentile
        grid_spatial <- make_grid(
          "spatial_influence_detrended",
          cost_levels_all,
          "cost_proxy_scaled",
          quantile(
            train_data$cost_proxy_scaled,
            probs = seq(0.1, 0.9, 0.1), na.rm = TRUE
          ),
          seq(
            min(train_data$spatial_influence_detrended, na.rm = TRUE),
            max(train_data$spatial_influence_detrended, na.rm = TRUE),
            length.out = 140
          ),
          "Spatial Influence",
          standardize = standardize_input
        )

        marginal_data <- bind_rows(grid_cost, grid_spatial)
        
        # Envelope: lowest and highest marginal effect across all conditioning
        # percentiles at each x position of each panel
        envelope_band <- summarise(
          group_by(marginal_data, variable, x_value),
          ymin = min(marginal_effect),
          ymax = max(marginal_effect),
          .groups = "drop"
        )
        envelope_band <- mutate(envelope_band, legend_label = "Percentile envelope")

        # "Target" percentiles drawn as highlighted lines, and their labels
        target_probs  <- c(0.10, 0.25, 0.50, 0.75, 0.90)
        target_labels <- paste0("P", target_probs * 100)
        
        # Extract the grid rows belonging to the target percentiles of the
        # conditioning variable for one panel.
        #
        # var_name: panel name as stored in `marginal_data$variable`
        #           ("Cost Proxy" or "Spatial Influence").
        #
        # Returns the matching rows of `marginal_data` for every entry of
        # `target_probs`, stacked, with `group_label` set to the corresponding
        # entry of `target_labels`.
        get_target_lines <- function(var_name) {

          # The "Cost Proxy" panel is conditioned on spatial influence; any
          # other panel name is conditioned on the cost proxy
          cond_var <- if (var_name != "Cost Proxy") {
            "cost_proxy_scaled"
          } else {
            "spatial_influence_detrended"
          }

          rows_at_percentile <- function(prob, label) {
            target_val <- as.numeric(
              quantile(train_data[[cond_var]], probs = prob, na.rm = TRUE)
            )

            # near() gives a tolerance-based match on the floating-point
            # conditioning values
            df <- filter(
              marginal_data,
              variable == var_name,
              near(.data[[cond_var]], target_val)
            )

            df$group_label <- label
            df
          }

          map2_dfr(target_probs, target_labels, rows_at_percentile)
        }
        
        # Highlighted target-percentile lines for both panels
        marginal_data_lines <- bind_rows(
          get_target_lines("Cost Proxy"),
          get_target_lines("Spatial Influence")
        )

        # All remaining grid rows, drawn as faint background lines
        marginal_data_other <- marginal_data %>%
          anti_join(
            marginal_data_lines %>% distinct(
              variable, x_value,
              spatial_influence_detrended, cost_proxy_scaled
            ),
            by = c(
              "variable", "x_value",
              "spatial_influence_detrended", "cost_proxy_scaled"
            )
          ) %>%
          mutate(group_label = "Other percentiles")

        # Position and text of the AME marker per panel: 2% of the x range in
        # from the right edge, at the height of the AME line. rowwise() makes
        # `variable` refer to the current row inside mutate(); the grouping is
        # dropped afterwards so later verbs see an ordinary tibble.
        AME_annot <- AME_lines %>%
          rowwise() %>%
          mutate(
            xmin = min(marginal_data$x_value[marginal_data$variable == variable]),
            xmax = max(marginal_data$x_value[marginal_data$variable == variable]),
            x = xmax - 0.02 * (xmax - xmin),
            y = y_value,
            label = sprintf("%.1f", y_value)
          ) %>%
          ungroup()
        
        # Colors
        # Linear blend of two colours in RGB space.
        #
        # col1, col2: colour specifications understood by col2rgb().
        # amount:     weight of `col2` (0 returns col1, 1 returns col2).
        #
        # Returns a single hex colour string.
        mix_colors <- function(col1, col2, amount = 0.5) {
          rgb_from <- col2rgb(col1) / 255
          rgb_to   <- col2rgb(col2) / 255
          blend <- (1 - amount) * rgb_from + amount * rgb_to
          rgb(red = blend[1], green = blend[2], blue = blend[3])
        }
        # Blend `col` towards white; `amount` is the share of white.
        lighten_color <- function(col, amount = 0.3) {
          mix_colors(col1 = col, col2 = "#FFFFFF", amount = amount)
        }

        # Blend `col` towards black; `amount` is the share of black.
        darken_color <- function(col, amount = 0.3) {
          mix_colors(col1 = col, col2 = "#000000", amount = amount)
        }
        
        
        # Named colour vector for one panel's line legend.
        #
        # var_name: panel name; "Cost Proxy" uses `cost_col` as the base
        #           colour, anything else uses `spatial_col`.
        #
        # Returns a ramp from a lightened (P10) to a darkened (P90) version of
        # the base colour, preceded by the "Other percentiles" entry.
        get_percentile_colors <- function(var_name) {
          base_col <- if (var_name == "Cost Proxy") cost_col else spatial_col

          c(
            "Other percentiles" = lighten_color(base_col, 0.50),
            "P10" = lighten_color(base_col, 0.6),
            "P25" = lighten_color(base_col, 0.30),
            "P50" = base_col,
            "P75" = darken_color(base_col, 0.30),
            "P90" = darken_color(base_col, 0.6)
          )
        }

# Panel function
        # Build one marginal-effect panel.
        #
        # var_name:     panel name used to filter the plotting data sets
        #               ("Spatial Influence" or "Cost Proxy").
        # plot_title:   panel title.
        # xlab:         x-axis label (string or expression).
        # show_legend:  if FALSE the panel's legend is suppressed. This
        #               argument was previously accepted but ignored; output
        #               for the default TRUE is unchanged.
        # drop_y_title: if TRUE the y-axis title is omitted.
        # ylims:        y-axis limits, or NULL for automatic limits.
        #
        # Layers, bottom to top: percentile envelope ribbon, faint lines for
        # the non-target percentiles, highlighted target-percentile lines, the
        # AME as a horizontal line, and a labelled marker showing its value.
        # Reads the globals envelope_band, marginal_data_other,
        # marginal_data_lines, AME_lines, AME_annot and target_labels.
        make_panel <- function(var_name, plot_title, xlab,
                               show_legend = TRUE, drop_y_title = FALSE,
                               ylims = NULL) {

          # NOTE(review): computed but not referenced below (the AME line and
          # marker are drawn in black); kept in case it is meant to be used.
          ame_color <- if (var_name == "Spatial Influence") spatial_col else cost_col
          pal <- get_percentile_colors(var_name)

          panel <- ggplot() +
            geom_ribbon(
              data = filter(envelope_band, variable == var_name),
              aes(x = x_value, ymin = ymin, ymax = ymax, fill = legend_label),
              alpha = .20
            ) +
            geom_line(
              data = filter(marginal_data_other, variable == var_name),
              aes(
                x = x_value, y = marginal_effect,
                group = group_raw, color = "Other percentiles"
              ),
              alpha = .08, linewidth = 1.3
            ) +
            geom_line(
              data = filter(marginal_data_lines, variable == var_name),
              aes(
                x = x_value, y = marginal_effect,
                group = group_raw, color = group_label
              ),
              linewidth = 2.5
            ) +
            geom_hline(
              data = filter(AME_lines, variable == var_name),
              aes(yintercept = y_value),
              colour = "black",
              linewidth = 1.25
            ) +
            geom_point(
              data = filter(AME_annot, variable == var_name),
              aes(x = x, y = y),
              shape = 21,
              fill = "white",
              colour = "black",
              stroke = 2.5,
              size = 30
            ) +
            geom_text(
              data = filter(AME_annot, variable == var_name),
              aes(x = x, y = y, label = label),
              colour = "black",
              fontface = "bold",
              size = 12
            ) +
            scale_color_manual(
              values = pal,
              breaks = c(target_labels, "Other percentiles"),
              name = "Percentile of\nconditioning variable"
            ) +
            scale_fill_manual(
              values = c("Percentile envelope" = "grey70"),
              breaks = "Percentile envelope",
              labels = "Percentile\nenvelope",
              name = ""
            ) +
            scale_x_continuous(sec.axis = dup_axis()) +
            scale_y_continuous(
              limits = ylims,
              sec.axis = dup_axis()
            ) +
            labs(
              title = plot_title,
              x = xlab,
              y = if (drop_y_title) NULL else "Marginal effect\n(percentage point per SD)"
            ) +
            guides(
              color = guide_legend(order = 1),
              fill  = guide_legend(order = 2)
            ) +
            theme_clean_axes(base_size = 26) +
            theme(
              plot.title  = element_text(size = 30, face = "bold"),
              legend.text = element_text(size = 26),
              legend.title = element_text(size = 28)
            )

          # Honour show_legend; nothing is added for TRUE so existing calls
          # produce exactly the same plot as before
          if (!show_legend) {
            panel <- panel + theme(legend.position = "none")
          }

          panel
        }


# Panels

        # Shared y limits for both panels when effects are standardized (so
        # the panels are directly comparable); automatic limits otherwise
        ylims <- if (standardize_input) {
          range(marginal_data$marginal_effect, na.rm = TRUE)
        } else {
          NULL
        }

        p1 <- make_panel(
          "Spatial Influence",
          "Spatial spillovers \nconditioned on cost competitiveness",
          "Spatial spillovers (Index -1 to 1)",
          show_legend = TRUE,
          drop_y_title = TRUE,
          ylims = ylims
        )

        p2 <- make_panel(
          "Cost Proxy",
          "Cost competitiveness \nconditioned on spatial spillovers",
          expression("Cost competitiveness (EUR " * MWh^{-1} * ")"),
          show_legend = TRUE,
          drop_y_title = FALSE,
          ylims = ylims
        )


        # Cost panel on the left (keeps the y title), spatial panel on the right
        panel_bc <- p2 + p1 + plot_layout(ncol = 2)

        # Bar chart on top, the two panels below, legends collected
        # NOTE(review): patchwork documents that plot_annotation(theme = )
        # only uses title-related elements -- confirm the tag styling below
        # takes effect; otherwise apply it with `& theme(plot.tag = ...)`.
        figure2 <- bar_ame / panel_bc +
          plot_layout(
            heights = c(0.52, 1),
            guides = "collect"
          ) +
          plot_annotation(
            tag_levels = "a",
            theme = theme(
              # `plot.tag.text` was dropped: it is not a ggplot2 theme
              # element; `plot.tag` already carries the text styling
              plot.tag = element_text(size = 30, face = "bold"),
              plot.tag.position = c(0, 1)
            )
          )

# Draw and save: size the notebook display, show the figure, then write it to
# a 30 x 20 inch PDF
options(repr.plot.width = 30, repr.plot.height = 20, repr.plot.res = 800)
figure2

ggsave(
  filename = "figure2.pdf",
  plot = figure2,
  device = pdf,
  width = 30,
  height = 20,
  units = "in",
  dpi = 800
)

In [None]:
# ADDITIONAL VALUES FOR TEXT
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Values of both main predictors at the target percentiles
cost_percentiles <- quantile(
  train_data[["cost_proxy_scaled"]],
  probs = target_probs, na.rm = TRUE
)
spatial_percentiles <- quantile(
  train_data[["spatial_influence_detrended"]],
  probs = target_probs, na.rm = TRUE
)

cat("\nCombined table:\n")
print(tibble(
  Percentile = paste0("P", target_probs * 100),
  Cost_proxy_scaled = round(cost_percentiles, 6),
  Spatial_influence_detrended = round(spatial_percentiles, 6)
))

# Largest marginal effect of spillovers anywhere on the spatial grid (for text)
max_me_spatial <- max(grid_spatial[["marginal_effect"]], na.rm = TRUE)
cat("\n--- Maximum Marginal Effect of Spatial Spillovers (pp per SD) ---\n")
sprintf("%.3f percentage points per SD", max_me_spatial)


# Marginal effect peaks by percentile
# Lookup from the rounded conditioning value (as stored in `group_raw`) back
# to the spillover percentile it represents
spillover_lookup <- tibble(
  group_raw      = round(spatial_levels_all, 6),
  spillover_prob = seq(0.01, 0.99, 0.01),
  spillover_pct  = spillover_prob * 100
)

# For every spillover percentile: the cost value at which the marginal effect
# of cost peaks, and the size of that peak
peak_cost_by_spillover_all <- marginal_data %>%
  filter(variable == "Cost Proxy") %>%
  group_by(group_raw) %>%
  slice_max(marginal_effect, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  left_join(spillover_lookup, by = "group_raw") %>%
  arrange(spillover_prob) %>%
  rename(
    spatial_spillover_value = group_raw,
    x_value_at_peak         = x_value,
    peak_marginal_effect    = marginal_effect
  ) %>%
  select(
    spillover_prob,
    spillover_pct,
    spatial_spillover_value,
    x_value_at_peak,
    peak_marginal_effect
  )

peak_cost_by_spillover_all