In [None]:
# Install the necessary packages (run this separately if not already installed)
install.packages(c("dplyr", "ggplot2", "scales", "stats", "boot", "randomForest", "glmnet", "caret", "e1071", "rpart", "gbm", "Metrics"))

# Import libraries
library(dplyr)
library(ggplot2)
library(scales)
library(stats)
library(boot)
library(randomForest)
library(glmnet)
library(caret)
library(e1071)
library(rpart)
library(gbm)
library(Metrics)
library(causalTree)
library(grf)
library(gridExtra)

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

“package ‘stats’ is a base package, and should not be updated”
also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘globals’, ‘future.apply’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘diagram’, ‘lava’, ‘prodlim’, ‘iterators’, ‘clock’, ‘gower’, ‘hardhat’, ‘ipred’, ‘timeDate’, ‘foreach’, ‘shape’, ‘Rcpp’, ‘RcppEigen’, ‘ModelMetrics’, ‘plyr’, ‘pROC’, ‘recipes’, ‘reshape2’, ‘proxy’


“installation of package ‘clock’ had non-zero exit status”
“installation of package ‘gower’ had non-zero exit status”
“installation of package ‘hardhat’ had non-zero exit status”
“installation of package ‘plyr’ had non-zero exit status”
“installation of package ‘ggplot2’ had non-zero exit status”
“installation of package ‘e1071’ had non-zero exit status”
“installation of package ‘future.apply’ had non-zero exit status”
“installation of package ‘pROC’ had non-zero exit status”
“installation of package ‘reshape2’ had no

# Descriptives

In [None]:
# Read the processed e-STI data set (CSV in the working directory)
data <- read.csv(file = "processed_esti.csv")

# Preview the leading rows to sanity-check the import
head(x = data)

This study focuses on evaluating the effectiveness of internet-accessed STI testing (e-STI testing) compared to traditional face-to-face services. The e-STI testing service, known as SH:24, offers postal self-sampling test kits for chlamydia, gonorrhoea, HIV, and syphilis, with results communicated via text message or telephone and supported by web-based safer sex health information. This randomized controlled trial aims to assess the impact of SH:24 on the uptake of STI testing, diagnosis, and treatment when delivered alongside usual care. The findings from this study could inform strategies to improve STI testing and treatment services and address public health concerns associated with STIs.

In [None]:
# Columns shown in the balance table: outcome (y), treatment indicator (w),
# then the gender dummies, the ethnic-group dummies and the other covariates
variables <- c(
    "y", "w",
    paste0("gender_", c("female", "male", "transgender")),
    paste0("ethnicgrp_", c("asian", "black", "mixed_multiple", "other", "white")),
    "partners1", "postlaunch", "age", "imd_decile"
)

# One data frame per trial arm: w == 0 is control, w == 1 is treatment
control_group <- data[data[["w"]] == 0, ]
treatment_group <- data[data[["w"]] == 1, ]

# Summarise one column for the balance table.
#
# Args:
#   variable: name (string) of a column present in `data`.
# Returns:
#   Length-5 numeric vector: control mean, control sd, treatment mean,
#   treatment sd, and the slope from regressing the column on `data$w`.
# Reads the globals `data`, `control_group` and `treatment_group`.
calculate_stats <- function(variable) {
    # Mean and standard deviation of the column within one arm, NAs ignored
    arm_summary <- function(arm) {
        values <- arm[[variable]]
        c(mean(values, na.rm = TRUE), sd(values, na.rm = TRUE))
    }

    # Slope of the column on the treatment indicator (second coefficient)
    slope_fit <- lm(data[[variable]] ~ data$w)
    diff <- slope_fit$coefficients[2]

    c(arm_summary(control_group), arm_summary(treatment_group), diff)
}

# Empty balance table: one row per variable, one column per statistic
balance_table <- data.frame(matrix(nrow = length(variables), ncol = 5))
names(balance_table) <- c('Control mean', 'Control sd', 'Treatment mean', 'Treatment sd', 'Diff')
row.names(balance_table) <- variables

# Fill the table row by row with the statistics of each variable
for (i in seq_along(variables)) {
    balance_table[i, ] <- calculate_stats(variables[i])
}

# Show the finished table
print(balance_table)

We can observe a similar composition of the treatment and control groups in all variables.

In [None]:
# Proportion of STI Testing (Outcome y) in Treated and Control Groups
# Average the outcome within each arm first. Plotting the raw rows with
# stat = "identity" draws one bar per participant on top of each other, so the
# visible bar height is not the group proportion.
testing_rate <- aggregate(y ~ w, data = data, FUN = mean)

ggplot(testing_rate, aes(x = factor(w), y = y, fill = factor(w))) +
  geom_col() +
  scale_fill_manual(values = c("blue", "red")) +
  labs(title = "Proportion of STI Testing (Outcome y) in Treated and Control Groups",
       x = "Group (0 = Control, 1 = Treated)",
       y = "Proportion of STI Testing") +
  theme_minimal()

In [None]:
# Gender Distribution in Treated and Control Groups
# The colour scale below is keyed on the level names "FALSE"/"TRUE". Coercing
# the dummy to logical (with both levels fixed) keeps those keys valid whether
# the column was read as 0/1 or as TRUE/FALSE.
ggplot(data, aes(x = factor(w),
                 fill = factor(as.logical(gender_female), levels = c(FALSE, TRUE)))) +
  geom_bar(position = "dodge") +
  labs(title = "Gender Distribution in Treated and Control Groups",
       x = "Group (0 = Control, 1 = Treated)",
       y = "Count",
       fill = "Gender Female") +
  scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
  theme_minimal()

In [None]:
# Setting up the visualisation style
theme_set(theme_minimal())

# Draw side-by-side histograms of one variable, one panel per trial arm.
#
# Args:
#   variable: name (string) of the column to plot.
#   treated:  data frame holding the treated rows.
#   control:  data frame holding the control rows.
# Draws the two panels on the current graphics device.
plot_separate_histograms <- function(variable, treated, control) {
  # Build the histogram for a single arm; only data, colour and title differ
  arm_histogram <- function(arm_data, fill_colour, arm_label) {
    ggplot(arm_data, aes(x = !!sym(variable))) +
      geom_histogram(binwidth = 0.5, fill = fill_colour, alpha = 0.6) +
      labs(title = paste(arm_label, "Group - Distribution of", variable),
           x = variable, y = "Frequency") +
      theme_minimal()
  }

  treated_plot <- arm_histogram(treated, "blue", "Treated")
  control_plot <- arm_histogram(control, "red", "Control")

  # plot_grid() belongs to cowplot, which this notebook never loads; gridExtra
  # is loaded, so arrange the panels with grid.arrange(). The panel titles
  # already name the arm, so no extra panel labels are needed.
  grid.arrange(treated_plot, control_plot, nrow = 1)
}

In [None]:
# Example usage
# The treated rows live in `treatment_group`; no `treated_group` object exists.
plot_separate_histograms("age", treatment_group, control_group)
plot_separate_histograms("imd_decile", treatment_group, control_group)

In [None]:
# Plot the value counts of one variable as dodged bars, treated vs control.
#
# Args:
#   variable: name (string) of the column to tabulate.
#   data:     unused; kept so existing calls keep working.
#   treated:  data frame holding the treated rows.
#   control:  data frame holding the control rows.
#   kind:     'bar' (default) plots the pre-computed counts as bars; any other
#             value is passed to geom_bar() as the name of a ggplot2 stat.
# Returns:
#   A ggplot object.
plot_distribution <- function(variable, data, treated, control, kind='bar') {
  # "bar" is not a ggplot2 stat. The counts are already computed below, so the
  # default maps to the "identity" stat.
  bar_stat <- if (identical(kind, "bar")) "identity" else kind

  # Tabulate one arm into a long data frame (zero rows if the arm is empty)
  count_frame <- function(group_label, values) {
    counts <- table(values)
    data.frame(group = rep(group_label, length(counts)),
               variable = names(counts),
               count = as.numeric(counts))
  }

  combined_df <- rbind(count_frame("Treated", treated[[variable]]),
                       count_frame("Control", control[[variable]]))

  # Inside aes(), `variable` is the column of combined_df; in labs() it is the
  # function argument holding the column name.
  ggplot(combined_df, aes(x = variable, y = count, fill = group)) +
    geom_bar(stat = bar_stat, position = "dodge", width = 0.5) +
    labs(title = paste("Distribution of", variable),
         x = variable, y = "Count", fill = "Group") +
    theme_minimal()
}

# Example usage
# The treated rows live in `treatment_group`; no `treated_group` object exists.
plot_distribution('gender_female', data, treatment_group, control_group)
plot_distribution('gender_male', data, treatment_group, control_group)
plot_distribution('ethnicgrp_white', data, treatment_group, control_group)

In [None]:
# Draw side-by-side pie charts of the ethnic-group composition of each arm.
#
# Args:
#   treated: data frame holding the treated rows.
#   control: data frame holding the control rows.
# Every column whose name contains 'ethnicgrp_' is summed and shown as one
# slice. Draws both pies on the current graphics device.
plot_ethnic_group_piecharts <- function(treated, control) {
  # Display labels keyed by column name, so a different column order can never
  # attach a label to the wrong slice. Unknown columns fall back to their name.
  known_labels <- c(
    ethnicgrp_asian = 'Asian',
    ethnicgrp_black = 'Black',
    ethnicgrp_mixed_multiple = 'Mixed/Multiple',
    ethnicgrp_other = 'Other',
    ethnicgrp_white = 'White'
  )

  # Sum the ethnic-group dummies of one arm into a (labels, count) data frame
  ethnic_counts <- function(group) {
    is_ethnic <- grepl('ethnicgrp_', names(group))
    # drop = FALSE keeps a data frame even if only one column matches
    counts <- colSums(group[, is_ethnic, drop = FALSE])
    labels <- unname(known_labels[names(counts)])
    unknown <- is.na(labels)
    labels[unknown] <- names(counts)[unknown]
    data.frame(labels = labels, count = as.numeric(counts))
  }

  # One pie chart: a single stacked bar wrapped into polar coordinates
  make_pie <- function(counts_df, title) {
    ggplot(counts_df, aes(x = "", y = count, fill = labels)) +
      geom_bar(stat = "identity", width = 1) +
      coord_polar("y", start = 0) +
      labs(title = title) +
      theme_void() +
      theme(legend.position = "bottom")
  }

  treated_pie <- make_pie(ethnic_counts(treated), "Treated Group - Ethnic Composition")
  control_pie <- make_pie(ethnic_counts(control), "Control Group - Ethnic Composition")

  # Combine the pie charts
  grid.arrange(treated_pie, control_pie, nrow = 1)
}

# Example usage
# The treated rows live in `treatment_group`; no `treated_group` object exists.
plot_ethnic_group_piecharts(treatment_group, control_group)

# Linear regression analysis

In [None]:
# Outcome and treatment indicator as plain vectors.
# NOTE: `T` shadows R's shorthand for TRUE for the rest of the session; the
# coefficient of model 1 is named after this variable.
Y <- data[["y"]]
T <- data[["w"]]

# Model 1: unadjusted regression of the outcome on the treatment indicator
model1 <- lm(formula = Y ~ T)

# Coefficient table and fit statistics for model 1
summary(model1)

In [None]:
# Model 2: Y ~ T + X (where X are selected covariates, e.g., 'age' and 'gender_female')
# lm() adds the intercept itself. An explicit column of ones would be perfectly
# collinear with it and show up as an NA coefficient, so only the treatment
# indicator and the covariates are passed. `Y` is taken from the calling
# environment; `.` expands to every column of X2.
X2 <- data[, c('w', 'age', 'gender_female')]
model2 <- lm(Y ~ ., data = X2)

# Print model 2 summary
summary(model2)

In [None]:
# Prepare data for Lasso regression: every column except the outcome
features <- subset(data, select = -c(y))
target <- data$y

# Fix the random fold assignment so the selected features are reproducible
set.seed(42)

# Perform Lasso regression to select important features.
# `parallel = TRUE` is deliberately not set: it needs a registered foreach
# backend, and this notebook registers none.
lasso <- cv.glmnet(as.matrix(features), target, alpha = 1, nfolds = 5,
                   type.measure = "mse", standardize = TRUE)

# Keep the features whose coefficient at lambda.min is non-zero
# (the first row of the coefficient matrix is the intercept)
selected_features <- coef(lasso, s = "lambda.min")[-1, ] != 0
selected_features <- colnames(features)[selected_features]

# Ensure that the treatment variable 'w' is included in the selected features
if (!("w" %in% selected_features)) {
    selected_features <- c("w", selected_features)
}

# Model 3: Y ~ T + selected features from Lasso
X3 <- data[selected_features]
X3$w <- as.factor(X3$w)  # Convert 'w' to a factor
model3 <- lm(Y ~ ., data = X3)

# Print model 3 summary
summary(model3)

In [None]:
# Extract coefficients and confidence intervals for the treatment effect from
# each model.
# The treatment term is named differently in each fit: `T` in model 1 (fitted
# on the vector T), `w` in model 2 (a data-frame column) and `w1` in model 3
# (w converted to a factor). Looking all of them up as 'T' fails for models 2
# and 3, so the term is located by pattern instead.
treatment_term <- function(model) {
  term_names <- names(coef(model))
  hit <- grep("^(T|w)(1|TRUE)?$", term_names, value = TRUE)
  if (length(hit) != 1L) {
    stop("Could not identify a unique treatment coefficient among: ",
         paste(term_names, collapse = ", "), call. = FALSE)
  }
  hit
}

fitted_models <- list('Model 1' = model1, 'Model 2' = model2, 'Model 3' = model3)

# Named numeric vector: one point estimate per model
coeffs <- vapply(
  fitted_models,
  function(m) coef(m)[[treatment_term(m)]],
  numeric(1)
)

# 2 x 3 matrix: row 1 = lower bound, row 2 = upper bound, one column per model
conf_intervals <- vapply(
  fitted_models,
  function(m) confint(m)[treatment_term(m), ],
  numeric(2)
)

# Plot the coefficients with confidence intervals
models <- names(coeffs)
estimates <- coeffs
ci_lower <- conf_intervals[1, ]
ci_upper <- conf_intervals[2, ]

# Plot point estimates with vertical confidence-interval bars (base graphics).
#
# Args:
#   models:    labels, one per estimate; x positions are the integer codes of
#              factor(models).
#   estimates: point estimates.
#   ci_lower:  lower interval bounds.
#   ci_upper:  upper interval bounds.
plot_ci <- function(models, estimates, ci_lower, ci_upper) {
  x_pos <- as.numeric(factor(models))
  y_limits <- range(ci_lower, ci_upper)
  x_limits <- c(0.5, length(models) + 0.5)

  # Points joined by lines; the default x axis is suppressed and drawn below
  plot(
    x = x_pos, y = estimates,
    xlim = x_limits, ylim = y_limits,
    type = 'b', pch = 19, col = 'blue',
    xaxt = 'n', xlab = 'Model', ylab = 'Coefficient for T',
    main = 'Comparison of Coefficients for T with Confidence Intervals'
  )

  # Interval bars, zero reference line, labelled axis and background grid
  segments(x0 = x_pos, y0 = ci_lower, y1 = ci_upper, col = 'blue')
  abline(h = 0, col = 'grey', lty = 2)
  axis(side = 1, at = x_pos, labels = models)
  grid()
}

# Draw the three treatment-effect estimates with their confidence intervals
plot_ci(models = models, estimates = estimates,
        ci_lower = ci_lower, ci_upper = ci_upper)

We obtained similar estimates from the three models.

# Non-Linear Methods DML

In [None]:
# Outcome, treatment indicator and covariate frame (all remaining columns)
Y <- data[["y"]]
W <- data[["w"]]
X <- data[, setdiff(names(data), c("y", "w")), drop = FALSE]

# Fixed seed so the partition below is reproducible
set.seed(42)

# 80/20 split; `index` holds the training row numbers
index <- createDataPartition(y = Y, p = 0.8, list = FALSE)

# Training rows
X_train <- X[index, ]
Y_train <- Y[index]
T_train <- W[index]

# Held-out rows
X_test <- X[-index, ]
Y_test <- Y[-index]
T_test <- W[-index]

In [None]:
# Run DML with Lasso nuisance models.
#
# Args:
#   X_train, X_test: covariates for the training / held-out rows.
#   Y_train, Y_test: outcome for the training / held-out rows.
#   T_train, T_test: treatment indicator for the training / held-out rows.
# Returns:
#   The lm fit of the held-out outcome residuals on the held-out treatment
#   residuals; its slope is the treatment-effect estimate.
dml_lasso <- function(X_train, X_test, Y_train, Y_test, T_train, T_test) {
  train_matrix <- as.matrix(X_train)
  test_matrix <- as.matrix(X_test)

  # Step 1: Learn Y and T using Lasso.
  # `parallel = TRUE` is not set: it needs a registered foreach backend, and
  # this notebook registers none.
  lasso_y <- cv.glmnet(train_matrix, Y_train, alpha = 1, nfolds = 5)
  lasso_t <- cv.glmnet(train_matrix, T_train, alpha = 1, nfolds = 5)

  # Step 2: Get residuals.
  # predict() returns a one-column matrix. Flattening it keeps the residuals as
  # plain vectors, so lm() below fits an ordinary single-response model whose
  # summary() has the usual coefficient table.
  Y_residuals <- Y_test - as.numeric(predict(lasso_y, newx = test_matrix, s = "lambda.min"))
  T_residuals <- T_test - as.numeric(predict(lasso_t, newx = test_matrix, s = "lambda.min"))

  # Step 3: Run OLS on residuals
  ols_model <- lm(Y_residuals ~ T_residuals)

  ols_model
}

# Run DML with Lasso
lasso_model <- dml_lasso(X_train, X_test, Y_train, Y_test, T_train, T_test)
summary(lasso_model)

In [None]:
# Run DML with decision-tree nuisance models.
#
# Args:
#   X_train, X_test: covariates for the training / held-out rows.
#   Y_train, Y_test: outcome for the training / held-out rows.
#   T_train, T_test: treatment indicator for the training / held-out rows.
# Returns:
#   The lm fit of the held-out outcome residuals on the held-out treatment
#   residuals.
dml_tree <- function(X_train, X_test, Y_train, Y_test, T_train, T_test) {
  # Steps 1 and 2 for one target: grow a tree on the training covariates, then
  # subtract its held-out predictions from the held-out values
  residualize <- function(target_train, target_test) {
    fitted_tree <- rpart(target_train ~ ., data = X_train)
    target_test - predict(fitted_tree, newdata = X_test)
  }

  # Outcome first, treatment second
  Y_residuals <- residualize(Y_train, Y_test)
  T_residuals <- residualize(T_train, T_test)

  # Step 3: regress outcome residuals on treatment residuals
  lm(Y_residuals ~ T_residuals)
}

# Run DML with Decision Trees
tree_model <- dml_tree(X_train, X_test, Y_train, Y_test, T_train, T_test)
summary(tree_model)

In [None]:
# Run DML with boosted-tree nuisance models.
#
# Args:
#   X_train, X_test: covariates for the training / held-out rows.
#   Y_train, Y_test: outcome for the training / held-out rows.
#   T_train, T_test: treatment indicator for the training / held-out rows.
# Returns:
#   The lm fit of the held-out outcome residuals on the held-out treatment
#   residuals.
dml_boosting <- function(X_train, X_test, Y_train, Y_test, T_train, T_test) {
  # Same ensemble size for fitting and for prediction
  n_trees <- 100

  # Step 1: one boosted model per target (outcome first, then treatment)
  boost_y <- gbm(
    Y_train ~ .,
    data = X_train,
    distribution = "gaussian",
    n.trees = n_trees,
    interaction.depth = 3
  )
  boost_t <- gbm(
    T_train ~ .,
    data = X_train,
    distribution = "gaussian",
    n.trees = n_trees,
    interaction.depth = 3
  )

  # Step 2: held-out predictions and the residuals they leave
  Y_hat <- predict(boost_y, newdata = X_test, n.trees = n_trees)
  T_hat <- predict(boost_t, newdata = X_test, n.trees = n_trees)
  Y_residuals <- Y_test - Y_hat
  T_residuals <- T_test - T_hat

  # Step 3: regress outcome residuals on treatment residuals
  lm(Y_residuals ~ T_residuals)
}

# Run DML with Boosting Trees
boosting_model <- dml_boosting(X_train, X_test, Y_train, Y_test, T_train, T_test)
summary(boosting_model)

In [None]:
# Run DML with caller-supplied nuisance models.
#
# Args:
#   X_train, X_test: covariates for the training / held-out rows.
#   Y_train, Y_test: outcome for the training / held-out rows.
#   T_train, T_test: treatment indicator for the training / held-out rows.
#   model_y: fitted outcome model whose predict() accepts the held-out
#            covariates as its second argument. If NULL, a random forest is
#            fitted on the training rows.
#   model_t: fitted treatment model, same contract as `model_y`.
# Returns:
#   The lm fit of the held-out outcome residuals on the held-out treatment
#   residuals.
dml_model <- function(X_train, X_test, Y_train, Y_test, T_train, T_test,
                      model_y = NULL, model_t = NULL) {
  # Step 1: use the supplied models; fit random forests only when none are
  # given. Refitting unconditionally would discard the caller's models and
  # train every forest twice.
  if (is.null(model_y)) {
    model_y <- randomForest(Y_train ~ ., data = X_train)
  }
  if (is.null(model_t)) {
    model_t <- randomForest(T_train ~ ., data = X_train)
  }

  # Step 2: Get residuals (predictions flattened to plain vectors)
  Y_residuals <- Y_test - as.numeric(predict(model_y, X_test))
  T_residuals <- T_test - as.numeric(predict(model_t, X_test))

  # Step 3: Run OLS on residuals
  ols_model <- lm(Y_residuals ~ T_residuals)
  ols_model
}

# Create and fit Random Forest models
model_y <- randomForest(Y_train ~ ., data = X_train)
model_t <- randomForest(T_train ~ ., data = X_train)

# Call the DML function
forest_model <- dml_model(X_train, X_test, Y_train, Y_test, T_train, T_test, model_y, model_t)

summary(forest_model)

In [None]:
# Collect the DML second-stage regressions fitted in the cells above.
# (`models` is a character vector of labels at this point, so looping over
# names(models) would never run; the fitted objects are gathered directly.)
# Listed alphabetically, which is also the order plot_ci() places them in.
results <- list(
  "Boosting" = boosting_model,
  "Decision Tree" = tree_model,
  "Lasso" = lasso_model,
  "Random Forest" = forest_model
)

# Estimate, standard error, t, p and 95% interval of the slope (second
# coefficient) of one fit. Computed from coef()/vcov()/df.residual() so it
# also works when the response was stored as a one-column matrix.
extract_effect <- function(fit) {
  estimate <- unname(coef(fit)[2])
  std_err <- unname(sqrt(diag(vcov(fit)))[2])
  resid_df <- df.residual(fit)
  t_value <- estimate / std_err
  p_value <- 2 * pt(abs(t_value), df = resid_df, lower.tail = FALSE)
  half_width <- qt(0.975, df = resid_df) * std_err
  c(estimate, std_err, t_value, p_value,
    estimate - half_width, estimate + half_width)
}

# 6 x (number of models) matrix, one column per model
effects <- vapply(results, extract_effect, numeric(6))

# Create a table with all results.
# check.names = FALSE keeps the column names exactly as written; otherwise
# `Standard Error` would be renamed to `Standard.Error`.
summary_table <- data.frame(
  Model = names(results),
  Coefficient = unname(effects[1, ]),
  `Standard Error` = unname(effects[2, ]),
  `t-value` = unname(effects[3, ]),
  `p-value` = unname(effects[4, ]),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

print(summary_table)

# Plot the coefficient estimates as points with 95% confidence intervals
plot_ci(summary_table$Model, summary_table$Coefficient,
        unname(effects[5, ]), unname(effects[6, ]))

### Recommendation: Lasso Method for DML

1. **Interpretability**: Lasso produces sparse models by forcing some coefficients to be exactly zero, aiding in variable selection and model interpretability.

2. **Variable Selection**: Lasso's ability to shrink coefficients to zero effectively performs variable selection, which can improve model generalization and reduce overfitting.

3. **Regularization**: Lasso's regularization term helps prevent overfitting by penalizing large coefficients, leading to a more robust model, especially with correlated variables.

4. **Performance**: While decision trees, boosting trees, and random forests can be powerful techniques, they might not be as suitable for DML due to complexity and overfitting risks. Lasso is simpler and more interpretable, making it advantageous for DML.

In conclusion, the Lasso method offers a balance between model complexity, interpretability, and performance, making it a suitable choice for your DML procedure.