# 1) Data Simulation (3 points)


In [1]:
# (2 pts) Simulate n=1000 with X1..X4, D~Bernoulli(0.5), epsilon~N(0,1)
import os
import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(123)
n = 1000

# Covariates: mix of continuous/binary
x1 = rng.normal(0, 1, n)              # continuous
x2 = rng.normal(2, 1, n)              # continuous
x3 = rng.binomial(1, 0.4, n)          # binary
x4 = rng.uniform(-1, 1, n)            # continuous

# Treatment assignment
D  = rng.binomial(1, 0.5, n)

# Error and outcome: Y = 2D + 0.5X1 − 0.3X2 + 0.2X3 + ε, ε~N(0,1)
eps = rng.normal(0, 1, n)
Y = 2*D + 0.5*x1 - 0.3*x2 + 0.2*x3 + eps

# Build DataFrame
df = pd.DataFrame({"Y": Y, "D": D, "X1": x1, "X2": x2, "X3": x3, "X4": x4})

# Sanity check: no missing values
assert df.isna().sum().sum() == 0, "Missing values found in DataFrame"

# ------------------------------------------------------------
# (1 pt) Balance check: compare means of X1..X4 between D=1 and D=0 using Welch t-test
covs = ["X1", "X2", "X3", "X4"]

# Group means
group_means = df.groupby("D")[covs].mean().rename(index={0: "Control", 1: "Treated"})

# Welch t-tests and summary table
rows = []
for c in covs:
    a = df.loc[df["D"] == 1, c]
    b = df.loc[df["D"] == 0, c]
    t_stat, p_val = stats.ttest_ind(a, b, equal_var=False)
    rows.append({
        "Covariate": c,
        "Treated_Mean": a.mean(),
        "Control_Mean": b.mean(),
        "Difference": a.mean() - b.mean(),
        "t_stat": t_stat,
        "p_value": p_val
    })
balance = pd.DataFrame(rows)

# Outputs
print("== Group means ==")
print(group_means.round(3))
print("\n== Balance by Welch t-test ==")
print(balance.round(4))

# ------------------------------------------------------------
# Export DataFrame and tables to Output folder
out_dir = r"C:\Users\User\Desktop\Lasso_Potential_Outcomes_RCTs\Python\Output"
os.makedirs(out_dir, exist_ok=True)

df.to_csv(os.path.join(out_dir, "simulated_data.csv"), index=False)
group_means.round(3).to_csv(os.path.join(out_dir, "balance_group_means.csv"))
balance.round(4).to_csv(os.path.join(out_dir, "balance_ttest_welch.csv"), index=False)

print(f"\nFiles saved to: {out_dir}")


== Group means ==
            X1     X2     X3     X4
D                                  
Control  0.039  1.996  0.380 -0.005
Treated  0.005  1.993  0.388  0.012

== Balance by Welch t-test ==
  Covariate  Treated_Mean  Control_Mean  Difference  t_stat  p_value
0        X1        0.0050        0.0391     -0.0341 -0.5359   0.5921
1        X2        1.9934        1.9965     -0.0031 -0.0472   0.9624
2        X3        0.3884        0.3795      0.0089  0.2900   0.7719
3        X4        0.0120       -0.0047      0.0166  0.4651   0.6419

Files saved to: C:\Users\User\Desktop\Lasso_Potential_Outcomes_RCTs\Python\Output


# 2) Estimating the Average Treatment Effect (3 points)

In [2]:
# Requires 'df' from Step 3.1: columns Y, D, X1..X4.
# If it does not exist, re-create with the same seed for reproducibility.
import numpy as np, pandas as pd
import statsmodels.api as sm

if 'df' not in globals():
    rng = np.random.default_rng(123); n=1000
    x1 = rng.normal(0,1,n); x2 = rng.normal(2,1,n); x3 = rng.binomial(1,0.4,n); x4 = rng.uniform(-1,1,n)
    D  = rng.binomial(1,0.5,n); eps = rng.normal(0,1,n)
    Y = 2*D + 0.5*x1 - 0.3*x2 + 0.2*x3 + eps
    df = pd.DataFrame({"Y":Y,"D":D,"X1":x1,"X2":x2,"X3":x3,"X4":x4})

def fit_ols(y, X, robust=True):
    Xc = sm.add_constant(X, has_constant='add')
    model = sm.OLS(y, Xc).fit(cov_type='HC1' if robust else 'nonrobust')
    return model

# (1 pt) Simple ATE: Y ~ D
m_simple = fit_ols(df['Y'], df[['D']], robust=True)
ate_simple = m_simple.params['D']
se_simple  = m_simple.bse['D']
ci_simple  = m_simple.conf_int().loc['D'].tolist()
p_simple   = m_simple.pvalues['D']

print("== 3.2.1 Simple ATE: Y ~ D ==")
print(f"ATE (coef D): {ate_simple:.4f}  SE: {se_simple:.4f}  95% CI: [{ci_simple[0]:.4f}, {ci_simple[1]:.4f}]  p={p_simple:.4g}")

# (1 pt) ATE with controls: Y ~ D + X1 + X2 + X3 + X4
m_ctrl = fit_ols(df['Y'], df[['D','X1','X2','X3','X4']], robust=True)
ate_ctrl = m_ctrl.params['D']
se_ctrl  = m_ctrl.bse['D']
ci_ctrl  = m_ctrl.conf_int().loc['D'].tolist()
p_ctrl   = m_ctrl.pvalues['D']

print("\n== 3.2.2 ATE with controls: Y ~ D + X1 + X2 + X3 + X4 ==")
print(f"ATE (coef D): {ate_ctrl:.4f}  SE: {se_ctrl:.4f}  95% CI: [{ci_ctrl[0]:.4f}, {ci_ctrl[1]:.4f}]  p={p_ctrl:.4g}")

# (1 pt) 3.2.3 Comparison: does the ATE change? what happens with the SE?
delta_ate = ate_ctrl - ate_simple
se_change = se_ctrl - se_simple
ratio_se  = se_ctrl / se_simple if se_simple!=0 else np.nan

print("\n== 3.2.3 Comparison of estimates ==")
print(f"Change in ATE (controls - simple): {delta_ate:.4f}")
print(f"Change in SE: {se_change:.4f}   SE ratio (ctrl/simple): {ratio_se:.3f}")
print("Note: In a well-balanced RCT, the ATE should remain close and the SEs usually decrease when adding controls predictive of Y.")


== 3.2.1 Simple ATE: Y ~ D ==
ATE (coef D): 2.0527  SE: 0.0712  95% CI: [1.9131, 2.1924]  p=1.521e-182

== 3.2.2 ATE with controls: Y ~ D + X1 + X2 + X3 + X4 ==
ATE (coef D): 2.0633  SE: 0.0625  95% CI: [1.9409, 2.1857]  p=2.562e-239

== 3.2.3 Comparison of estimates ==
Change in ATE (controls - simple): 0.0106
Change in SE: -0.0088   SE ratio (ctrl/simple): 0.877
Note: In a well-balanced RCT, the ATE should remain close and the SEs usually decrease when adding controls predictive of Y.


# 3) Lasso and Variable Selection (3 points)

In [5]:
# Equivalent to cv.glmnet in R using LassoCV from scikit-learn
import numpy as np, pandas as pd
from sklearn.linear_model import LassoCV
import statsmodels.api as sm

# ------------------------------------------------------------
# (1 pt) Fit LASSO of Y ~ X1..X4, EXCLUDING D
X = df[['X1','X2','X3','X4']].to_numpy()
y = df['Y'].to_numpy()

# Cross-validation (10 folds by default). Standardization handled internally.
lasso = LassoCV(cv=10, random_state=123).fit(X, y)

coef = pd.Series(lasso.coef_, index=['X1','X2','X3','X4'])
selected = coef[coef != 0].index.tolist()

print("== 3.3.1 LASSO SELECTION ==")
print("Optimal alpha (λ):", lasso.alpha_)
print("Estimated coefficients:")
print(coef.round(4))
print("Selected covariates at λ_min:", selected)

# ------------------------------------------------------------
# (1 pt) Re-estimate ATE with covariates selected by LASSO
if selected:
    X_sel = df[['D'] + selected]
else:
    X_sel = df[['D']]

Xc = sm.add_constant(X_sel, has_constant='add')
m_lasso = sm.OLS(df['Y'], Xc).fit(cov_type='HC1')
ate_lasso = m_lasso.params['D']
se_lasso  = m_lasso.bse['D']
ci_lasso  = m_lasso.conf_int().loc['D'].tolist()

print("\n== 3.3.2 ATE with covariates selected by LASSO ==")
print(f"ATE (coef D): {ate_lasso:.4f}  SE: {se_lasso:.4f}  95% CI: [{ci_lasso[0]:.4f}, {ci_lasso[1]:.4f}]")

# ------------------------------------------------------------
# (1 pt) Comparison with 3.2
# (assuming m_simple and m_ctrl were already estimated)
print("\n== 3.3.3 Comparison ==")
print(f"Simple ATE (3.2.1): {m_simple.params['D']:.4f}")
print(f"ATE with all controls (3.2.2): {m_ctrl.params['D']:.4f}")
print(f"ATE with controls selected by LASSO (3.3.2): {ate_lasso:.4f}")
print("Comment: The ATE should be stable because the treatment was randomly assigned.")
print("Using LASSO can reduce noise, improve precision, and select only the most relevant covariates.")


== 3.3.1 LASSO SELECTION ==
Optimal alpha (λ): 0.0007311184038558634
Estimated coefficients:
X1    0.4201
X2   -0.3031
X3    0.2599
X4    0.1007
dtype: float64
Selected covariates at λ_min: ['X1', 'X2', 'X3', 'X4']

== 3.3.2 ATE with covariates selected by LASSO ==
ATE (coef D): 2.0633  SE: 0.0625  95% CI: [1.9409, 2.1857]

== 3.3.3 Comparison ==
Simple ATE (3.2.1): 2.0527
ATE with all controls (3.2.2): 2.0633
ATE with controls selected by LASSO (3.3.2): 2.0633
Comment: The ATE should be stable because the treatment was randomly assigned.
Using LASSO can reduce noise, improve precision, and select only the most relevant covariates.
