# 1) Data Simulation (3 points)

In [1]:
# Simulate a randomized experiment: four covariates, a coin-flip treatment,
# and an outcome that is linear in D, X1, X2 and X3 (X4 does not enter Y).
# The seed fixes the random stream, so the order of the draws below matters.
set.seed(123)
n <- 1000

X1 <- rnorm(n, mean = 0, sd = 1)        # continuous, N(0, 1)
X2 <- rnorm(n, mean = 2, sd = 1)        # continuous, N(2, 1)
X3 <- rbinom(n, size = 1, prob = 0.4)   # binary, P(X3 = 1) = 0.4
X4 <- runif(n, min = -1, max = 1)       # continuous, U(-1, 1)

D <- rbinom(n, size = 1, prob = 0.5)    # treatment indicator, P(D = 1) = 0.5
eps <- rnorm(n, mean = 0, sd = 1)       # outcome noise, N(0, 1)

# Outcome equation: the coefficient on D (the true treatment effect) is 2
Y <- 2 * D + 0.5 * X1 - 0.3 * X2 + 0.2 * X3 + eps

df <- data.frame(Y = Y, D = D, X1 = X1, X2 = X2, X3 = X3, X4 = X4)

# Sanity check: the simulated data must not contain missing values
stopifnot(sum(is.na(df)) == 0)

## ------------------------------------------------------------
## (1 pt) Balance check: means and Welch t-test by covariate

# Covariates whose balance across treatment arms is checked
covs <- c("X1", "X2", "X3", "X4")

# Mean of every covariate within each treatment arm (one row per value of D)
group_means <- aggregate(df[, covs], by = list(D = df$D), FUN = mean)

# One row per covariate: arm means, their difference, and the two-sample
# t statistic and p-value. t.test() is called with its defaults, i.e. the
# Welch (unequal-variance) version.
balance <- do.call(
  rbind,
  lapply(covs, function(v) {
    treated <- df[df$D == 1, v]
    control <- df[df$D == 0, v]
    welch <- t.test(treated, control)
    mean_treated <- mean(treated)
    mean_control <- mean(control)
    data.frame(
      Covariable = v,
      Media_Tratado = mean_treated,
      Media_Control = mean_control,
      Diferencia = mean_treated - mean_control,
      t = unname(welch$statistic),
      p_valor = unname(welch$p.value)
    )
  })
)

## ------------------------------------------------------------
## Function to round only numeric columns
#' Round the numeric columns of a data frame
#'
#' Non-numeric columns are returned untouched. A data frame with no numeric
#' columns (including one with zero columns) is returned unchanged.
#'
#' @param df A data frame.
#' @param digits Number of decimal places, passed on to `round()`.
#' @return A copy of `df` with every numeric column rounded to `digits`.
round_df <- function(df, digits) {
  # vapply() always yields a logical vector; sapply() would return list()
  # for a zero-column data frame and break the subsetting below.
  num <- vapply(df, is.numeric, logical(1))
  out <- df
  if (any(num)) {
    out[num] <- lapply(df[num], round, digits = digits)
  }
  out
}

## Results
# Both tables are rounded for display only; group_means and balance
# themselves are not modified.
writeLines("== Group means ==")
print(round_df(group_means, digits = 3))

writeLines(c("", "== Balance by Welch t-test =="))
print(round_df(balance, digits = 4))


== Group means ==
  D     X1    X2    X3    X4
1 0  0.086 2.082 0.389 0.010
2 1 -0.058 2.001 0.379 0.012

== Balance by Welch t-test ==
  Covariable Media_Tratado Media_Control Diferencia       t p_valor
1         X1       -0.0580        0.0862    -0.1442 -2.3023  0.0215
2         X2        2.0011        2.0816    -0.0805 -1.2581  0.2087
3         X3        0.3786        0.3891    -0.0105 -0.3411  0.7331
4         X4        0.0119        0.0103     0.0015  0.0429  0.9658


In [2]:
# --- Export to Output ---
# Destination folder. The default is the original author's location; set the
# LASSO_OUTPUT_DIR environment variable to write somewhere else without
# editing this cell.
out_dir <- Sys.getenv(
  "LASSO_OUTPUT_DIR",
  unset = "C:/Users/User/Desktop/Lasso_Potential_Outcomes_RCTs/R/Output"
)
if (!nzchar(out_dir)) {
  # An empty override is treated as "not set"
  out_dir <- "C:/Users/User/Desktop/Lasso_Potential_Outcomes_RCTs/R/Output"
}

if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE)
}
# dir.create() only warns when it fails, so stop here with a clear message
# rather than letting the first write.csv() fail.
if (!dir.exists(out_dir)) {
  stop("Could not create output directory: ", out_dir, call. = FALSE)
}

# Simulated DataFrame (full precision)
write.csv(df,
          file = file.path(out_dir, "simulated_data.csv"),
          row.names = FALSE)

# Group means (rounded to 3 decimals)
write.csv(round_df(group_means, 3),
          file = file.path(out_dir, "balance_group_means.csv"),
          row.names = FALSE)

# Welch t-test table (rounded to 4 decimals)
write.csv(round_df(balance, 4),
          file = file.path(out_dir, "balance_ttest_welch.csv"),
          row.names = FALSE)

cat("Exported to:", out_dir, "\n")



Exported to: C:/Users/User/Desktop/Lasso_Potential_Outcomes_RCTs/R/Output 


# 2) Estimating the Average Treatment Effect (3 points)


In [3]:
# (1 pt) Simple ATE: Y ~ D
# Regress the outcome on the treatment indicator only; the coefficient on D
# is the ATE estimate.
m_simple <- lm(Y ~ D, data = df)
# coef() on the summary returns the coefficient table without relying on
# `$coef` partially matching the `coefficients` element.
coefs_simple <- coef(summary(m_simple))

ATE_simple <- coefs_simple["D", "Estimate"]
SE_simple  <- coefs_simple["D", "Std. Error"]
CI_simple  <- confint(m_simple)["D", ]   # 95% interval (confint default level)
p_simple   <- coefs_simple["D", "Pr(>|t|)"]

cat("== 3.2.1 Simple ATE: Y ~ D ==\n")
cat(sprintf("ATE: %.4f  SE: %.4f  95%% CI: [%.4f, %.4f]  p=%.4g\n",
            ATE_simple, SE_simple, CI_simple[1], CI_simple[2], p_simple))

## ------------------------------------------------------------
## (1 pt) ATE with controls: Y ~ D + X1 + X2 + X3 + X4
# Same regression with all four covariates added as controls.
m_ctrl <- lm(Y ~ D + X1 + X2 + X3 + X4, data = df)
coefs_ctrl <- coef(summary(m_ctrl))

ATE_ctrl <- coefs_ctrl["D", "Estimate"]
SE_ctrl  <- coefs_ctrl["D", "Std. Error"]
CI_ctrl  <- confint(m_ctrl)["D", ]
p_ctrl   <- coefs_ctrl["D", "Pr(>|t|)"]

cat("\n== 3.2.2 ATE with controls ==\n")
cat(sprintf("ATE: %.4f  SE: %.4f  95%% CI: [%.4f, %.4f]  p=%.4g\n",
            ATE_ctrl, SE_ctrl, CI_ctrl[1], CI_ctrl[2], p_ctrl))

## ------------------------------------------------------------
## (1 pt) Comparison
# Differences are (with controls) minus (simple); a ratio below 1 means the
# controlled model has the smaller standard error.
delta_ate <- ATE_ctrl - ATE_simple
delta_se  <- SE_ctrl - SE_simple
ratio_se  <- SE_ctrl / SE_simple

cat("\n== 3.2.3 Comparison ==\n")
cat(sprintf("Change in ATE (controls - simple): %.4f\n", delta_ate))
cat(sprintf("Change in SE: %.4f   SE ratio (ctrl/simple): %.3f\n", delta_se, ratio_se))

# NOTE(review): the interpretation below is fixed text and is printed
# regardless of the numbers computed above.
cat("\nInterpretation:\n")
cat("- In a randomized experiment, the point estimate of the ATE is expected to remain stable,\n",
    "  which we observe here (difference is very small).\n")
cat("- Including covariates that are correlated with the outcome reduces residual variance,\n",
    "  which leads to smaller standard errors and narrower confidence intervals.\n")
cat("- The precision gain comes without altering the unbiasedness of the treatment effect estimate.\n")


== 3.2.1 Simple ATE: Y ~ D ==
ATE: 1.8651  SE: 0.0735  95% CI: [1.7210, 2.0093]  p=4.265e-110

== 3.2.2 ATE with controls ==
ATE: 1.9162  SE: 0.0633  95% CI: [1.7920, 2.0404]  p=3.408e-143

== 3.2.3 Comparison ==
Change in ATE (controls - simple): 0.0511
Change in SE: -0.0102   SE ratio (ctrl/simple): 0.862

Interpretation:
- In a randomized experiment, the point estimate of the ATE is expected to remain stable,
   which we observe here (difference is very small).
- Including covariates that are correlated with the outcome reduces residual variance,
   which leads to smaller standard errors and narrower confidence intervals.
- The precision gain comes without altering the unbiasedness of the treatment effect estimate.


In [5]:
# Install glmnet only when it is not already available, so re-running the
# notebook does not reinstall the package (and its dependencies) every time.
if (!requireNamespace("glmnet", quietly = TRUE)) {
  install.packages("glmnet")
}


Installing package into 'C:/Users/User/AppData/Local/R/win-library/4.5'
(as 'lib' is unspecified)

also installing the dependencies 'iterators', 'foreach', 'shape', 'Rcpp', 'RcppEigen'




package 'iterators' successfully unpacked and MD5 sums checked
package 'foreach' successfully unpacked and MD5 sums checked
package 'shape' successfully unpacked and MD5 sums checked
package 'Rcpp' successfully unpacked and MD5 sums checked
package 'RcppEigen' successfully unpacked and MD5 sums checked
package 'glmnet' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\User\AppData\Local\Temp\RtmpUphwdN\downloaded_packages


# 3) Lasso and Variable Selection (3 points)

In [4]:
# (1 pt) cv.glmnet: Y ~ X1+X2+X3+X4 (exclude D)
suppressPackageStartupMessages({
  library(glmnet)
})

# glmnet needs a numeric design matrix; the treatment D is left out so the
# LASSO only chooses among the covariates.
X <- as.matrix(df[, c("X1", "X2", "X3", "X4")])
y <- df$Y

# Seed fixed before cv.glmnet() so the cross-validation run is reproducible
set.seed(123)
cvfit <- cv.glmnet(X, y, alpha = 1, nfolds = 10, standardize = TRUE)

cat("== 3.3.1 LASSO selection ==\n")
cat(sprintf("lambda.min: %.6f   lambda.1se: %.6f\n",
            cvfit$lambda.min, cvfit$lambda.1se))

# Covariates with a non-zero coefficient at lambda.min (intercept dropped)
cmin <- coef(cvfit, s = "lambda.min")
nz   <- which(cmin[, 1] != 0)
selected <- setdiff(rownames(cmin)[nz], "(Intercept)")
print(list(Selected_lambda_min = selected))

# ------------------------------------------------------------
# (1 pt) Re-estimate ATE with covariates selected by LASSO
# reformulate() builds Y ~ D + <selected> from the term names; when nothing
# is selected it reduces to Y ~ D, so no special case is needed.
form <- reformulate(c("D", selected), response = "Y")
m_lasso <- lm(form, data = df)

# coef() on the summary avoids relying on `$coef` partially matching the
# `coefficients` element.
coefs_lasso <- coef(summary(m_lasso))
ATE_lasso <- coefs_lasso["D", "Estimate"]
SE_lasso  <- coefs_lasso["D", "Std. Error"]
CI_lasso  <- confint(m_lasso)["D", ]
p_lasso   <- coefs_lasso["D", "Pr(>|t|)"]

cat("\n== 3.3.2 ATE with selected covariates (λ_min) ==\n")
cat(sprintf("ATE: %.4f  SE: %.4f  95%% CI: [%.4f, %.4f]  p=%.4g\n",
            ATE_lasso, SE_lasso, CI_lasso[1], CI_lasso[2], p_lasso))

# ------------------------------------------------------------
# (1 pt) Comparison with 3.2
# Refit the section 3.2 models only if they are not already in the workspace,
# so this cell can be run on its own.
if (!exists("m_simple")) m_simple <- lm(Y ~ D, data = df)
if (!exists("m_ctrl"))   m_ctrl   <- lm(Y ~ D + X1 + X2 + X3 + X4, data = df)

ATE_simple <- coef(summary(m_simple))["D", "Estimate"]
ATE_ctrl   <- coef(summary(m_ctrl))["D", "Estimate"]

cat("\n== 3.3.3 Comparison ==\n")
cat(sprintf("Simple ATE (3.2.1): %.4f\n", ATE_simple))
cat(sprintf("ATE with all controls (3.2.2): %.4f\n", ATE_ctrl))
cat(sprintf("ATE with LASSO-selected controls (3.3.2): %.4f\n", ATE_lasso))

# NOTE(review): the recorded run selected only X1 and X2, although X3 enters
# the simulated outcome with coefficient 0.2 -- "irrelevant covariates" in
# the message below is therefore loose wording; confirm before reuse.
cat("Comment: In a randomized experiment, the ATE should remain stable. ",
    "The value here does not change much, which is expected. ",
    "The contribution of LASSO is parsimony: it discards irrelevant covariates, ",
    "which can improve precision (smaller SE) and interpretability.\n")


== 3.3.1 LASSO selection ==
lambda.min: 0.034760   lambda.1se: 0.185505
$Selected_lambda_min
[1] "X1" "X2"


== 3.3.2 ATE with selected covariates (λ_min) ==
ATE: 1.9154  SE: 0.0633  95% CI: [1.7912, 2.0395]  p=2.851e-143

== 3.3.3 Comparison ==
Simple ATE (3.2.1): 1.8651
ATE with all controls (3.2.2): 1.9162
ATE with LASSO-selected controls (3.3.2): 1.9154
Comment: In a randomized experiment, the ATE should remain stable.  The value here does not change much, which is expected.  The contribution of LASSO is parsimony: it discards irrelevant covariates,  which can improve precision (smaller SE) and interpretability.
