In [2]:
# 3.1 DATA SIMULATION (3 puntos) — Julia
# ------------------------------------------------------------
using Random, Distributions, DataFrames, HypothesisTests, Statistics

# (2 pts) Simular n=1000 con X1..X4, D~Bernoulli(0.5), eps~N(0,1)
Random.seed!(123)
n = 1000

X1 = rand(Normal(0,1), n)                # continuo
X2 = rand(Normal(2,1), n)                # continuo
X3 = rand(Bernoulli(0.4), n)             # binario
X4 = rand(Uniform(-1,1), n)              # continuo

D  = rand(Bernoulli(0.5), n)             # tratamiento
ϵ  = rand(Normal(0,1), n)

Y = 2 .* D .+ 0.5 .* X1 .- 0.3 .* X2 .+ 0.2 .* X3 .+ ϵ

df = DataFrame(Y=Y, D=D, X1=X1, X2=X2, X3=X3, X4=X4)

# ------------------------------------------------------------
# (1 pt) Balance check: medias por grupo y t-tests Welch
covs = [:X1, :X2, :X3, :X4]

println("== Medias por grupo ==")
group_means = combine(groupby(df, :D), covs .=> mean)
show(group_means, allrows=true, allcols=true)

println("\n\n== Balance por t-test (Welch) ==")
results = DataFrame(
    Covariable = String[],
    Media_Tratado = Float64[],
    Media_Control = Float64[],
    Diferencia = Float64[],
    t_stat = Float64[],
    p_valor = Float64[]
)

for v in covs
    a = df[df.D .== 1, v]
    b = df[df.D .== 0, v]
    test = UnequalVarianceTTest(a, b)
    push!(results, (
        string(v),
        mean(a),
        mean(b),
        mean(a) - mean(b),
        Statistics.mean(test.t),   # valor t
        pvalue(test)               # valor p
    ))
end

show(results, allrows=true, allcols=true)


== Medias por grupo ==
[1m2×5 DataFrame[0m
[1m Row [0m│[1m D     [0m[1m X1_mean    [0m[1m X2_mean [0m[1m X3_mean  [0m[1m X4_mean    [0m
     │[90m Bool  [0m[90m Float64    [0m[90m Float64 [0m[90m Float64  [0m[90m Float64    [0m
─────┼──────────────────────────────────────────────────
   1 │ false  -0.0638543  1.99191  0.369072  0.00625674
   2 │  true  -0.0370065  2.04436  0.35534   0.018983

== Balance por t-test (Welch) ==
[1m4×6 DataFrame[0m
[1m Row [0m│[1m Covariable [0m[1m Media_Tratado [0m[1m Media_Control [0m[1m Diferencia [0m[1m t_stat    [0m[1m p_valor  [0m
     │[90m String     [0m[90m Float64       [0m[90m Float64       [0m[90m Float64    [0m[90m Float64   [0m[90m Float64  [0m
─────┼───────────────────────────────────────────────────────────────────────────
   1 │ X1             -0.0370065    -0.0638543    0.0268478   0.42936   0.667755
   2 │ X2              2.04436       1.99191      0.0524445   0.830258  0.406592
   3 │ 

In [6]:
# 3.2 ESTIMATING THE AVERAGE TREATMENT EFFECT (3 puntos) — Julia

using DataFrames, GLM, StatsModels, Statistics, Distributions

# (1 pt) ATE simple: Y ~ D
m_simple = lm(@formula(Y ~ D), df)
βs = coef(m_simple); se_s = stderror(m_simple)
ate_simple = βs[2]; se_simple = se_s[2]
t_simple = ate_simple / se_simple
p_simple = 2 * (1 - cdf(TDist(dof_residual(m_simple)), abs(t_simple)))
ci_simple = ate_simple .+ [-1, 1] .* 1.96 .* se_simple

println("== 3.2.1 ATE simple: Y ~ D ==")
println("ATE: $(round(ate_simple, digits=4))  SE: $(round(se_simple, digits=4))  ",
        "95% CI: [$(round(ci_simple[1], digits=4)), $(round(ci_simple[2], digits=4))]  p=$(round(p_simple, sigdigits=4))")

# (1 pt) ATE con controles: Y ~ D + X1 + X2 + X3 + X4
m_ctrl = lm(@formula(Y ~ D + X1 + X2 + X3 + X4), df)
βc = coef(m_ctrl); se_c = stderror(m_ctrl)
ate_ctrl = βc[2]; se_ctrl = se_c[2]
t_ctrl = ate_ctrl / se_ctrl
p_ctrl = 2 * (1 - cdf(TDist(dof_residual(m_ctrl)), abs(t_ctrl)))
ci_ctrl = ate_ctrl .+ [-1, 1] .* 1.96 .* se_ctrl

println("\n== 3.2.2 ATE con controles ==")
println("ATE: $(round(ate_ctrl, digits=4))  SE: $(round(se_ctrl, digits=4))  ",
        "95% CI: [$(round(ci_ctrl[1], digits=4)), $(round(ci_ctrl[2], digits=4))]  p=$(round(p_ctrl, sigdigits=4))")

# (1 pt) Comparación
delta_ate = ate_ctrl - ate_simple
delta_se  = se_ctrl - se_simple
ratio_se  = se_ctrl / se_simple

println("\n== 3.2.3 Comparación ==")
println("Cambio en ATE (controles - simple): $(round(delta_ate, digits=4))")
println("Cambio en SE: $(round(delta_se, digits=4))   Ratio SE (ctrl/simple): $(round(ratio_se, digits=3))")
println("Nota: En un RCT balanceado, el ATE cambia poco; agregar controles predictivos reduce varianza.")



== 3.2.1 ATE simple: Y ~ D ==
ATE: 1.9732  SE: 0.072  95% CI: [1.8321, 2.1142]  p=0.0

== 3.2.2 ATE con controles ==
ATE: 1.9787  SE: 0.0626  95% CI: [1.8559, 2.1014]  p=0.0

== 3.2.3 Comparación ==
Cambio en ATE (controles - simple): 0.0055
Cambio en SE: -0.0093   Ratio SE (ctrl/simple): 0.87
Nota: En un RCT balanceado, el ATE cambia poco; agregar controles predictivos reduce varianza.


In [30]:

import Pkg; Pkg.add("Lasso")


[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m IterTools ─── v1.10.0
[32m[1m   Installed[22m[39m MLBase ────── v0.9.2
[32m[1m   Installed[22m[39m Setfield ──── v1.1.2
[32m[1m   Installed[22m[39m Polynomials ─ v4.1.0
[32m[1m   Installed[22m[39m DSP ───────── v0.7.10
[32m[1m   Installed[22m[39m Lasso ─────── v0.7.2
[32m[1m    Updating[22m[39m `C:\Users\User\.julia\environments\v1.11\Project.toml`
  [90m[b4fcebef] [39m[92m+ Lasso v0.7.2[39m
[32m[1m    Updating[22m[39m `C:\Users\User\.julia\environments\v1.11\Manifest.toml`
[33m⌅[39m [90m[717857b8] [39m[92m+ DSP v0.7.10[39m
  [90m[c8e1da08] [39m[92m+ IterTools v1.10.0[39m
  [90m[b4fcebef] [39m[92m+ Lasso v0.7.2[39m
  [90m[f0e99cf1] [39m[92m+ MLBase v0.9.2[39m
  [90m[f27b6e38] [39m[92m+ Polynomials v4.1.0[39m
  [90m[efcf1570] [39m[92m+ Setfield v1.1.2[39m
[36m[1m        Info[22m[39m Packages marked with [33m⌅[39m have new versions available 

In [36]:
using Random, Lasso, DataFrames, Statistics, LinearAlgebra

# Datos: covariables (X) y outcome (y)
X = Matrix(select(df, [:X1, :X2, :X3, :X4]))
y = Vector(df.Y)

# (1 pt) Ajuste de la ruta LASSO
Random.seed!(123)
path = Lasso.fit(LassoPath, X, y; λ = exp10.(-4:0.1:1))

# Seleccionar el λ más pequeño de la grilla (menos penalización)
λ_min = minimum(path.λs)
β = coef(path, λ_min)   # coeficientes [X1..X4]

names_X = ["X1","X2","X3","X4"]
selected_vars = [names_X[i] for i in 1:4 if β[i] != 0.0]
println("== 3.3.1 LASSO selección ==")
println("lambda_min = ", λ_min)
println("Covariables seleccionadas: ", selected_vars)

# ------------------------------------------------------------
# (1 pt) Re-estimar ATE con OLS manual
n = size(X,1)
D = Vector(df.D)

Z = hcat(ones(n), D)  # intercepto y D
for v in selected_vars
    Z = hcat(Z, Vector(df[!, Symbol(v)]))
end

β_ols = (Z'Z) \ (Z'y)
resid = y .- Z*β_ols
k = size(Z,2)
σ2 = sum(resid.^2) / (n - k)
vcov = σ2 * inv(Z'Z)
se   = sqrt.(diag(vcov))

ate_lasso = β_ols[2]   # coef de D
se_D      = se[2]
ci_D      = ate_lasso .+ [-1,1] .* 1.96 .* se_D

println("\n== 3.3.2 ATE con seleccionadas (λ_min) ==")
println("ATE: ", round(ate_lasso, digits=4),
        "  SE: ", round(se_D, digits=4),
        "  95% CI: [", round(ci_D[1], digits=4), ", ", round(ci_D[2], digits=4), "]")

# ------------------------------------------------------------
# (1 pt) Comparación mínima
println("\n== 3.3.3 Comparación ==")
println("ATE con controles seleccionados por LASSO: ", round(ate_lasso, digits=4))
println("Comentario: el ATE es estable; LASSO elimina covariables irrelevantes y da parsimonia.")


LoadError: type LassoPath has no field λs