# 1) Data Simulation (3 points)


In [6]:
using Random, Distributions, DataFrames, HypothesisTests, Statistics, CSV

# ------------------------------------------------------------
# (2 pts) Simulate n=1000 with X1..X4, D~Bernoulli(0.5), eps~N(0,1)
Random.seed!(123)
n = 1000

X1 = rand(Normal(0,1), n)                # continuous
X2 = rand(Normal(2,1), n)                # continuous
X3 = rand(Bernoulli(0.4), n)             # binary
X4 = rand(Uniform(-1,1), n)              # continuous

D  = rand(Bernoulli(0.5), n)             # treatment
ϵ  = rand(Normal(0,1), n)

Y = 2 .* D .+ 0.5 .* X1 .- 0.3 .* X2 .+ 0.2 .* X3 .+ ϵ

df = DataFrame(Y=Y, D=D, X1=X1, X2=X2, X3=X3, X4=X4)

# ------------------------------------------------------------
# (1 pt) Balance check: group means and Welch t-tests
covs = [:X1, :X2, :X3, :X4]

println("== Group means ==")
group_means = combine(groupby(df, :D), covs .=> mean)
show(group_means, allrows=true, allcols=true)

println("\n\n== Balance by Welch t-test ==")
results = DataFrame(
    Covariate     = String[],
    Mean_Treated  = Float64[],
    Mean_Control  = Float64[],
    Difference    = Float64[],
    t_stat        = Float64[],
    p_value       = Float64[]
)

for v in covs
    a = df[df.D .== 1, v]
    b = df[df.D .== 0, v]
    test = UnequalVarianceTTest(a, b)
    push!(results, (
        string(v),
        mean(a),
        mean(b),
        mean(a) - mean(b),
        Statistics.mean(test.t),
        pvalue(test)
    ))
end

show(results, allrows=true, allcols=true)

# ------------------------------------------------------------
# Export to Output folder
out_dir = raw"C:\Users\User\Desktop\Lasso_Potential_Outcomes_RCTs\Julia\Output"
CSV.write(joinpath(out_dir, "simulated_data.csv"), df)
CSV.write(joinpath(out_dir, "group_means.csv"), group_means)

println("Files exported to: ", out_dir)



[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling CSV [336ed68f-0bac-5ca0-87d4-7b16caf5d00b] (cache misses: wrong dep version loaded (2))


== Group means ==
[1m2×5 DataFrame[0m
[1m Row [0m│[1m D     [0m[1m X1_mean    [0m[1m X2_mean [0m[1m X3_mean  [0m[1m X4_mean    [0m
     │[90m Bool  [0m[90m Float64    [0m[90m Float64 [0m[90m Float64  [0m[90m Float64    [0m
─────┼──────────────────────────────────────────────────
   1 │ false  -0.0638543  1.99191  0.369072  0.00625674
   2 │  true  -0.0370065  2.04436  0.35534   0.018983

== Balance by Welch t-test ==
[1m4×6 DataFrame[0m
[1m Row [0m│[1m Covariate [0m[1m Mean_Treated [0m[1m Mean_Control [0m[1m Difference [0m[1m t_stat    [0m[1m p_value  [0m
     │[90m String    [0m[90m Float64      [0m[90m Float64      [0m[90m Float64    [0m[90m Float64   [0m[90m Float64  [0m
─────┼────────────────────────────────────────────────────────────────────────
   1 │ X1           -0.0370065   -0.0638543    0.0268478   0.42936   0.667755
   2 │ X2            2.04436      1.99191      0.0524445   0.830258  0.406592
   3 │ X3            0.35534  

# 2) Estimating the Average Treatment Effect (3 points)


In [9]:
using DataFrames, GLM, StatsModels, Statistics, Distributions

# (1 pt) Simple ATE: Y ~ D
m_simple = lm(@formula(Y ~ D), df)
βs = coef(m_simple); se_s = stderror(m_simple)
ate_simple = βs[2]; se_simple = se_s[2]
t_simple = ate_simple / se_simple
p_simple = 2 * (1 - cdf(TDist(dof_residual(m_simple)), abs(t_simple)))
ci_simple = ate_simple .+ [-1, 1] .* 1.96 .* se_simple

println("== 3.2.1 Simple ATE: Y ~ D ==")
println("ATE: $(round(ate_simple, digits=4))  SE: $(round(se_simple, digits=4))  ",
        "95% CI: [$(round(ci_simple[1], digits=4)), $(round(ci_simple[2], digits=4))]  p=$(round(p_simple, sigdigits=4))")

# (1 pt) ATE with controls: Y ~ D + X1 + X2 + X3 + X4
m_ctrl = lm(@formula(Y ~ D + X1 + X2 + X3 + X4), df)
βc = coef(m_ctrl); se_c = stderror(m_ctrl)
ate_ctrl = βc[2]; se_ctrl = se_c[2]
t_ctrl = ate_ctrl / se_ctrl
p_ctrl = 2 * (1 - cdf(TDist(dof_residual(m_ctrl)), abs(t_ctrl)))
ci_ctrl = ate_ctrl .+ [-1, 1] .* 1.96 .* se_ctrl

println("\n== 3.2.2 ATE with controls ==")
println("ATE: $(round(ate_ctrl, digits=4))  SE: $(round(se_ctrl, digits=4))  ",
        "95% CI: [$(round(ci_ctrl[1], digits=4)), $(round(ci_ctrl[2], digits=4))]  p=$(round(p_ctrl, sigdigits=4))")

# (1 pt) Comparison
delta_ate = ate_ctrl - ate_simple
delta_se  = se_ctrl - se_simple
ratio_se  = se_ctrl / se_simple

println("\n== 3.2.3 Comparison ==")
println("Change in ATE (controls - simple): $(round(delta_ate, digits=4))")
println("Change in SE: $(round(delta_se, digits=4))   Ratio SE (ctrl/simple): $(round(ratio_se, digits=3))")


== 3.2.1 Simple ATE: Y ~ D ==
ATE: 1.9732  SE: 0.072  95% CI: [1.8321, 2.1142]  p=0.0

== 3.2.2 ATE with controls ==
ATE: 1.9787  SE: 0.0626  95% CI: [1.8559, 2.1014]  p=0.0

== 3.2.3 Comparison ==
Change in ATE (controls - simple): 0.0055
Change in SE: -0.0093   Ratio SE (ctrl/simple): 0.87


# 3) Lasso and Variable Selection (3 points)

In [14]:
using GLMNet, Random, Distributions

# ---------- 3.3.1 LASSO for variable selection ----------
# Covariate matrix (excluding D)
X = Matrix(select(df, [:X1, :X2, :X3, :X4]))
y = df.Y

# Fit with cross-validation
Random.seed!(123)
cv = glmnetcv(X, y, Normal(); alpha=1)

# Locate the index of lambda that minimizes error
j     = argmin(cv.meanloss)
λ_min = cv.lambda[j]

# Coefficients at that lambda
βx = cv.path.betas[:, j]   # coefficients for X1..X4
β0 = cv.path.a0[j]         # intercept

# Selected variables (nonzero coefficients)
selected_idx  = findall(abs.(βx) .> 1e-6)
selected_vars = [:X1, :X2, :X3, :X4][selected_idx]

println("λ_min = ", λ_min)
println("Intercept = ", β0)
println("Coefficients of X = ", βx)
println("Selected covariates: ", selected_vars)



λ_min = 0.0017969759447506045
Intercept = 0.9745614130239888
Coefficients of X = [0.49200329777780993, -0.26205053398858535, 0.23909950187581056, 0.0583239265964184]
Selected covariates: [:X1, :X2, :X3, :X4]


In [16]:
using DataFrames, GLM, StatsModels, Distributions, Statistics

# ---------- 3.3.2 Re-estimate ATE with selected covariates ----------
# Assumes `selected_vars` from 3.3.1 already exists (Vector{Symbol})

rhs = Term(:D)
if !isempty(selected_vars)
    for s in selected_vars
        rhs = rhs + Term(s)
    end
end
fmla = Term(:Y) ~ rhs

m_sel = lm(fmla, df)

# Extract coefficient of D
names_coefs = coefnames(m_sel)
posD = findfirst(==("D"), names_coefs)

β    = coef(m_sel)
se   = stderror(m_sel)

ate_sel = β[posD]
se_sel  = se[posD]

t_sel  = ate_sel / se_sel
p_sel  = 2 * (1 - cdf(TDist(dof_residual(m_sel)), abs(t_sel)))
ci_sel = ate_sel .+ [-1, 1] .* 1.96 .* se_sel

println("== 3.3.2 ATE with selected covariates ==")
println("Formula: ", fmla)
println("ATE: ", round(ate_sel, digits=4),
        "   SE: ", round(se_sel, digits=4),
        "   95% CI: [", round(ci_sel[1], digits=4), ", ", round(ci_sel[2], digits=4), "]",
        "   p=", round(p_sel, sigdigits=4))



== 3.3.2 ATE with selected covariates ==
Formula: Y ~ D + X1 + X2 + X3 + X4
ATE: 1.9787   SE: 0.0626   95% CI: [1.8559, 2.1014]   p=0.0


In [18]:
# ---------- 3.3.3 Comparison with 3.2 ----------
println("\n== 3.3.3 Comparison ==")
println("Simple ATE (Y~D): ", round(ate_simple, digits=4))
println("ATE with controls (Y~D+X1+X2+X3+X4): ", round(ate_ctrl, digits=4))
println("ATE with LASSO-selected covariates: ", round(ate_sel, digits=4))

println("\nDiscussion:")
println("- The LASSO estimator is usually close to the ATE with all controls,")
println("  but it reduces dimensionality by removing less relevant covariates.")
println("- It can improve precision (lower variance) and avoid overfitting,")
println("  especially in contexts with many covariates.")
println("- In this case, since D is randomized, the three estimates are similar,")
println("  but LASSO is useful if the number of covariates were large.")



== 3.3.3 Comparison ==
Simple ATE (Y~D): 1.9732
ATE with controls (Y~D+X1+X2+X3+X4): 1.9787
ATE with LASSO-selected covariates: 1.9787

Discussion:
- The LASSO estimator is usually close to the ATE with all controls,
  but it reduces dimensionality by removing less relevant covariates.
- It can improve precision (lower variance) and avoid overfitting,
  especially in contexts with many covariates.
- In this case, since D is randomized, the three estimates are similar,
  but LASSO is useful if the number of covariates were large.
