In [2]:
## 3.1 DATA SIMULATION (3 points) — R
## ------------------------------------------------------------
## (2 pts) Simulate n = 1000 observations with covariates X1..X4,
## treatment D ~ Bernoulli(0.5) and noise eps ~ N(0, 1).

# Fix the RNG state so the draws are reproducible. The draws below must stay
# in this order: reordering them would change every simulated value.
set.seed(123)
n <- 1000

# Covariates
X1 <- rnorm(n, mean = 0, sd = 1)       # continuous
X2 <- rnorm(n, mean = 2, sd = 1)       # continuous
X3 <- rbinom(n, size = 1, prob = 0.4)  # binary
X4 <- runif(n, min = -1, max = 1)      # continuous; not used in Y below

# Treatment indicator and noise term
D <- rbinom(n, size = 1, prob = 0.5)
eps <- rnorm(n, mean = 0, sd = 1)

# Outcome equation: coefficient 2 on D, linear in X1, X2 and X3
Y <- 2 * D + 0.5 * X1 - 0.3 * X2 + 0.2 * X3 + eps

df <- data.frame(Y = Y, D = D, X1 = X1, X2 = X2, X3 = X3, X4 = X4)

# Sanity check: the simulated data must contain no missing values
stopifnot(sum(is.na(df)) == 0)

## ------------------------------------------------------------
## (1 pt) Balance check: group means and a Welch t-test per covariate

covs <- c("X1", "X2", "X3", "X4")

# Mean of every covariate within each treatment arm (one row per value of D)
group_means <- aggregate(df[covs], by = list(D = df$D), FUN = mean)

# One row per covariate: arm means, their difference, and the t-test result
balance <- do.call(rbind, lapply(covs, function(v) {
  treated <- df[[v]][df$D == 1]
  control <- df[[v]][df$D == 0]

  # t.test() does not assume equal variances by default (Welch test)
  welch <- t.test(treated, control)

  mean_treated <- mean(treated)
  mean_control <- mean(control)

  data.frame(
    Covariable = v,
    Media_Tratado = mean_treated,
    Media_Control = mean_control,
    Diferencia = mean_treated - mean_control,
    t = unname(welch$statistic),
    p_valor = unname(welch$p.value)
  )
}))

## ------------------------------------------------------------
## Round only the numeric columns of a data frame.
##
## @param df     A data frame. Non-numeric columns are returned untouched.
## @param digits Number of decimal places, passed on to round().
## @return A copy of `df` in which every numeric column has been rounded to
##   `digits`; the input itself is not modified.
round_df <- function(df, digits) {
  # vapply() always yields a logical vector, even for a zero-column data
  # frame, where sapply() would return an empty list and break the indexing.
  num <- vapply(df, is.numeric, logical(1))
  out <- df
  # Skip the assignment entirely when there is nothing to round.
  if (any(num)) {
    out[num] <- lapply(df[num], round, digits = digits)
  }
  out
}

## Results
# Group means, shown with 3 decimals
cat("== Medias por grupo ==\n")
print(round_df(group_means, digits = 3))

# Balance table, shown with 4 decimals
cat("\n== Balance por t-test (Welch) ==\n")
print(round_df(balance, digits = 4))



== Medias por grupo ==
  D     X1    X2    X3    X4
1 0  0.086 2.082 0.389 0.010
2 1 -0.058 2.001 0.379 0.012

== Balance por t-test (Welch) ==
  Covariable Media_Tratado Media_Control Diferencia       t p_valor
1         X1       -0.0580        0.0862    -0.1442 -2.3023  0.0215
2         X2        2.0011        2.0816    -0.0805 -1.2581  0.2087
3         X3        0.3786        0.3891    -0.0105 -0.3411  0.7331
4         X4        0.0119        0.0103     0.0015  0.0429  0.9658


In [3]:
## 3.2 ESTIMATING THE AVERAGE TREATMENT EFFECT (3 points) — R
## ------------------------------------------------------------
## Requires the data.frame df with Y, D, X1..X4 from section 3.1

## (1 pt) Simple ATE: Y ~ D
m_simple <- lm(Y ~ D, data = df)
# Use the coef() extractor for the coefficient table: `$coef` only works
# through partial matching of the `coefficients` component.
coefs_simple <- coef(summary(m_simple))

# Row "D" of the coefficient table holds the treatment-effect estimate
ATE_simple <- coefs_simple["D", "Estimate"]
SE_simple  <- coefs_simple["D", "Std. Error"]
CI_simple  <- confint(m_simple)["D", ]
p_simple   <- coefs_simple["D", "Pr(>|t|)"]

cat("== 3.2.1 ATE simple: Y ~ D ==\n")
cat(sprintf("ATE: %.4f  SE: %.4f  95%% CI: [%.4f, %.4f]  p=%.4g\n",
            ATE_simple, SE_simple, CI_simple[1], CI_simple[2], p_simple))

## ------------------------------------------------------------
## (1 pt) ATE with controls: Y ~ D + X1 + X2 + X3 + X4
m_ctrl <- lm(Y ~ D + X1 + X2 + X3 + X4, data = df)
coefs_ctrl <- coef(summary(m_ctrl))

ATE_ctrl <- coefs_ctrl["D", "Estimate"]
SE_ctrl  <- coefs_ctrl["D", "Std. Error"]
CI_ctrl  <- confint(m_ctrl)["D", ]
p_ctrl   <- coefs_ctrl["D", "Pr(>|t|)"]

cat("\n== 3.2.2 ATE con controles ==\n")
cat(sprintf("ATE: %.4f  SE: %.4f  95%% CI: [%.4f, %.4f]  p=%.4g\n",
            ATE_ctrl, SE_ctrl, CI_ctrl[1], CI_ctrl[2], p_ctrl))

## ------------------------------------------------------------
## (1 pt) Comparison of the two specifications
delta_ate <- ATE_ctrl - ATE_simple
delta_se  <- SE_ctrl - SE_simple
# A ratio below 1 means the model with controls has the smaller standard error
ratio_se  <- SE_ctrl / SE_simple

cat("\n== 3.2.3 Comparación ==\n")
cat(sprintf("Cambio en ATE (controles - simple): %.4f\n", delta_ate))
cat(sprintf("Cambio en SE: %.4f   Ratio SE (ctrl/simple): %.3f\n", delta_se, ratio_se))
cat("Nota: En un RCT balanceado, el ATE no debería cambiar mucho.\n")
cat("Agregar covariables predictivas de Y puede reducir la varianza (SE).\n")


== 3.2.1 ATE simple: Y ~ D ==
ATE: 1.8651  SE: 0.0735  95% CI: [1.7210, 2.0093]  p=4.265e-110

== 3.2.2 ATE con controles ==
ATE: 1.9162  SE: 0.0633  95% CI: [1.7920, 2.0404]  p=3.408e-143

== 3.2.3 Comparación ==
Cambio en ATE (controles - simple): 0.0511
Cambio en SE: -0.0102   Ratio SE (ctrl/simple): 0.862
Nota: En un RCT balanceado, el ATE no debería cambiar mucho.
Agregar covariables predictivas de Y puede reducir la varianza (SE).


In [5]:
# Install glmnet only when it is missing, so re-running the notebook does not
# download and reinstall the package every time.
if (!requireNamespace("glmnet", quietly = TRUE)) {
  install.packages("glmnet")
}


Installing package into 'C:/Users/User/AppData/Local/R/win-library/4.5'
(as 'lib' is unspecified)

also installing the dependencies 'iterators', 'foreach', 'shape', 'Rcpp', 'RcppEigen'




package 'iterators' successfully unpacked and MD5 sums checked
package 'foreach' successfully unpacked and MD5 sums checked
package 'shape' successfully unpacked and MD5 sums checked
package 'Rcpp' successfully unpacked and MD5 sums checked
package 'RcppEigen' successfully unpacked and MD5 sums checked
package 'glmnet' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\User\AppData\Local\Temp\RtmpUphwdN\downloaded_packages


In [6]:
## 3.3 LASSO AND VARIABLE SELECTION (3 points) — R
## Requires df with Y, D, X1..X4

# ------------------------------------------------------------
# (1 pt) cv.glmnet: Y ~ X1 + X2 + X3 + X4 (D is excluded)
suppressPackageStartupMessages({
  library(glmnet)
})

# glmnet takes a numeric predictor matrix and a response vector
X <- as.matrix(df[, c("X1", "X2", "X3", "X4")])
y <- df$Y

# Seed fixed right before cross-validation so the run is reproducible
set.seed(123)
cvfit <- cv.glmnet(X, y, alpha = 1, nfolds = 10, standardize = TRUE)

cat("== 3.3.1 LASSO selección ==\n")
cat(sprintf("lambda.min: %.6f   lambda.1se: %.6f\n",
            cvfit$lambda.min, cvfit$lambda.1se))

# Covariates with a non-zero coefficient at lambda.min, intercept dropped
cmin <- coef(cvfit, s = "lambda.min")
nz   <- which(cmin[, 1] != 0)
selected <- setdiff(rownames(cmin)[nz], "(Intercept)")
print(list(Seleccionadas_lambda_min = selected))

# ------------------------------------------------------------
# (1 pt) Re-estimate the ATE with the selected covariates
# reformulate() builds Y ~ D + <selected> directly from the term labels and
# yields Y ~ D when nothing was selected, so no string pasting is needed.
form <- reformulate(c("D", selected), response = "Y")
m_lasso <- lm(form, data = df)

# Use the coef() extractor for the coefficient table: `$coef` only works
# through partial matching of the `coefficients` component.
coefs_lasso <- coef(summary(m_lasso))
ATE_lasso <- coefs_lasso["D", "Estimate"]
SE_lasso  <- coefs_lasso["D", "Std. Error"]
CI_lasso  <- confint(m_lasso)["D", ]
p_lasso   <- coefs_lasso["D", "Pr(>|t|)"]

cat("\n== 3.3.2 ATE con seleccionadas (λ_min) ==\n")
cat(sprintf("ATE: %.4f  SE: %.4f  95%% CI: [%.4f, %.4f]  p=%.4g\n",
            ATE_lasso, SE_lasso, CI_lasso[1], CI_lasso[2], p_lasso))

# ------------------------------------------------------------
# (1 pt) Comparison with 3.2
# Refit the section 3.2 models only if they are not already in the session
if (!exists("m_simple")) m_simple <- lm(Y ~ D, data = df)
if (!exists("m_ctrl"))   m_ctrl   <- lm(Y ~ D + X1 + X2 + X3 + X4, data = df)

ATE_simple <- coef(summary(m_simple))["D", "Estimate"]
ATE_ctrl   <- coef(summary(m_ctrl))["D", "Estimate"]

cat("\n== 3.3.3 Comparación ==\n")
cat(sprintf("ATE simple (3.2.1): %.4f\n", ATE_simple))
cat(sprintf("ATE con todos los controles (3.2.2): %.4f\n", ATE_ctrl))
cat(sprintf("ATE con controles LASSO (3.3.2): %.4f\n", ATE_lasso))
cat("Comentario: en un RCT, el ATE suele ser estable; LASSO aporta parsimonia y puede reducir varianza al excluir covariables irrelevantes.\n")


== 3.3.1 LASSO selección ==
lambda.min: 0.034760   lambda.1se: 0.185505
$Seleccionadas_lambda_min
[1] "X1" "X2"


== 3.3.2 ATE con seleccionadas (λ_min) ==
ATE: 1.9154  SE: 0.0633  95% CI: [1.7912, 2.0395]  p=2.851e-143

== 3.3.3 Comparación ==
ATE simple (3.2.1): 1.8651
ATE con todos los controles (3.2.2): 1.9162
ATE con controles LASSO (3.3.2): 1.9154
Comentario: en un RCT, el ATE suele ser estable; LASSO aporta parsimonia y puede reducir varianza al excluir covariables irrelevantes.
