# Workgroup 4

Authors: Valerie Dube, Erzo Garay, Juan Marcos Guerrero y Matias Villalba

## Bootstraping

## Causal Forest

In [1]:
# Libraries
library(grf)
library(sandwich)
library(lmtest)
library(Hmisc)
library(ggplot2)

ERROR: Error in library(tidyverse): there is no package called 'tidyverse'


### 1. Preprocessing

In [102]:
# Import synthetic data from data folder
df <- read_csv("../../data/synthetic_data.csv")


In [103]:
df.head()

Unnamed: 0,schoolid,Z,Y,S3,C1,C2,C3,XC,X1,X2,X3,X4,X5
0,76,1,0.081602,6,4,2,1,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
1,76,1,-0.385869,4,12,2,1,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
2,76,1,0.398184,6,4,2,0,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
3,76,1,-0.175037,6,4,2,0,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
4,76,1,0.884583,6,4,1,0,4,0.334544,0.648586,-1.310927,0.224077,-0.426757


In [104]:
head(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10391 entries, 0 to 10390
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   schoolid  10391 non-null  int64  
 1   Z         10391 non-null  int64  
 2   Y         10391 non-null  float64
 3   S3        10391 non-null  int64  
 4   C1        10391 non-null  int64  
 5   C2        10391 non-null  int64  
 6   C3        10391 non-null  int64  
 7   XC        10391 non-null  int64  
 8   X1        10391 non-null  float64
 9   X2        10391 non-null  float64
 10  X3        10391 non-null  float64
 11  X4        10391 non-null  float64
 12  X5        10391 non-null  float64
dtypes: float64(6), int64(7)
memory usage: 1.0 MB


In [105]:
# Save school clusters in variable
df$school_id <- as.factor(df$schoolid)

In [106]:
# Fit treatment (w) OLS
formula <- as.formula(paste("Z ~ ", paste(setdiff(names(df), c("Z", "Y")), collapse = " + ")))
w_lm <- glm(formula, data = df, family = binomial())

# Print summary of the GLM model
summary(w_lm)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      Z   No. Observations:                10391
Model:                            GLM   Df Residuals:                    10379
Model Family:                Binomial   Df Model:                           11
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -6519.5
Date:                Tue, 04 Jun 2024   Deviance:                       13039.
Time:                        19:06:52   Pearson chi2:                 1.04e+04
No. Iterations:                     4   Pseudo R-squ. (CS):           0.007280
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.0758      0.146     -7.348      0.0

In the previous OLS, we can observe that only the ctudent’s self-reported expectations for success (S3), student gender (C2), student first-generation status (C3), and school-level mean of students’ fixed mindsets (X1) variables are significat

In [107]:
# Define W, Y, and X_raw
W <- df$Z
Y <- df$Y
X_raw <- df %>% select(-schoolid, -Z, -Y)  # Drop school ID, Z (treatment), and Y (outcome)


In [108]:
# Create model matrices for categorical variables
C1_exp <- model.matrix(~ C1 - 1, data = X_raw)
XC_exp <- model.matrix(~ XC - 1, data = X_raw)

In [109]:
# Combine these matrices with the rest of the data
X <- X_raw %>% select(-C1, -XC) %>% bind_cols(as.data.frame(C1_exp), as.data.frame(XC_exp))

### 2. Estimation

#### 2.1. Cluster-Robust Random Forests

In this section, we grow a forest. We add extra trees for the causal forest.

First, we train a regression forest that can be used to estimate the conditional mean function mu(x) = E[Y | X = x]

In [110]:
Y_forest <- regression_forest(X, Y, clusters = df$school_id, num.trees = 1000)

Y_hat <- predict(Y_forest)$predictions

W_forest <- regression_forest(X, W, clusters = df$school_id, num.trees = 1000)


W_hat <- predict(W_forest)$predictions

#### 2.2. Causal Forests for Observational Studies

In [2]:
# Train the causal forest
cf_raw <- causal_forest(X, Y, W, Y_hat = Y_hat, W_hat = W_hat, clusters = school_id, equalize.cluster.weights = TRUE)

# Variable importance and feature selection
varimp <- variable_importance(cf_raw)
selected_idx <- which(varimp > mean(varimp))


# Retrain the causal forest with selected features
cf <- causal_forest(X[,selected_idx], Y, W, Y_hat = Y_hat, W_hat = W_hat, clusters = school_id, equalize.cluster.weights = TRUE, tune.parameters = "all")
tau_hat <- predict(cf)$predictions

ERROR: Error in causal_forest(X, Y, W, Y_hat = Y_hat, W_hat = W_hat, clusters = school_id, : could not find function "causal_forest"


**Q1: How the tree was built?**

Answer: ...

**Q2: Estimate the ATE**

Answer: ...

In [None]:
ATE <- average_treatment_effect(cf)
paste("95% CI for the ATE:", round(ATE[1], 3), "+/-", round(qnorm(0.975) * ATE[2], 3))

**Q3: Run best linear predictor analysis**

Answer: ...

In [None]:
test_calibration(cf)

**Q4
: Look at school-wise heterogeneity**

Answer: ...

In [None]:
# Calculate the doubly robust (DR) score for each school
dr_score <- tau_hat + W / cf$W_hat * (Y - cf$Y_hat - (1 - cf$W_hat) * tau_hat) -
  (1 - W) / (1 - cf$W_hat) * (Y - cf$Y_hat + cf$W_hat * tau_hat)
school_score <- t(school.mat) %*% dr_score / school.size
# Plot histogram of school treatment effect estimates
pdf("school_hist.pdf")
par(mar = c(5, 4, 4, 2) + 0.5, cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex.sub = 1.5)
hist(school_score, xlab = "School Treatment Effect Estimate", main = "")
dev.off()

**Q5: Analysis Ignoring Clusters**

Answer: ...

In [None]:
# Train causal forest ignoring clusters
cf_noclust <- causal_forest(X[,selected_idx], Y, W, Y_hat = Y_hat, W_hat = W_hat, tune.parameters = "all")

# Estimate ATE ignoring clusters
ATE_noclust <- average_treatment_effect(cf_noclust)
paste("95% CI for the ATE:", round(ATE_noclust[1], 3), "+/-", round(qnorm(0.975) * ATE_noclust[2], 3))

# Calibration test ignoring clusters
test_calibration(cf_noclust)

# Predict tau_hat ignoring clusters
tau_hat_noclust <- predict(cf_noclust)$predictions

# Plot school ID versus tau_hat ignoring clusters
pdf("tau_hat_noclust_vs_school.pdf")
plot(school_id, tau_hat_noclust, xlab = "School ID", ylab = "Estimated CATE (Ignoring Clusters)")
dev.off()



In [None]:
# Cross-validation to assess the model without clusters
nfold <- 5
school_levels <- unique(school_id)
cluster_folds <- sample.int(nfold, length(school_levels), replace = TRUE)

tau_hat_crossfold <- rep(NA, length(Y))
for (foldid in 1:nfold) {
  infold <- school_id %in% school_levels[cluster_folds == foldid]
  cf_fold <- causal_forest(X[!infold, selected_idx], Y[!infold], W[!infold], Y_hat = Y_hat[!infold], W_hat = W_hat[!infold], tune.parameters = "all")
  pred_fold <- predict(cf_fold, X[infold, selected_idx])$predictions
  tau_hat_crossfold[infold] <- pred_fold
}

cf_noclust_cpy <- cf_noclust
cf_noclust_cpy$predictions <- tau_hat_crossfold
cf_noclust_cpy$clusters <- school_id
test_calibration(cf_noclust_cpy)

# Calculate R-loss
Rloss <- mean(((Y - Y_hat) - tau_hat * (W - W_hat))^2)
Rloss_noclust <- mean(((Y - Y_hat) - tau_hat_noclust * (W - W_hat))^2)
Rloss_crossfold <- mean(((Y - Y_hat) - tau_hat_crossfold * (W - W_hat))^2)

c(Rloss_noclust - Rloss, Rloss_crossfold - Rloss)

# ANOVA to compare DR scores across schools
summary(aov(dr_score ~ factor(school_id)))


**Q6: Analysis without fitting the propensity scores**

Answer: ...

In [None]:
# Train causal forest without fitting the propensity score
cf_noprop <- causal_forest(X[,selected.idx], Y, W, Y_hat = Y_hat, W_hat = mean(W), tune.parameters = "all", equalize.cluster.weights = TRUE, clusters = school_id)
tau_hat_noprop <- predict(cf_noprop)$predictions

# Estimate ATE without fitting the propensity score
ATE_noprop <- average_treatment_effect(cf_noprop)
paste("95% CI for the ATE:", round(ATE_noprop[1], 3), "+/-", round(qnorm(0.975) * ATE_noprop[2], 3))

# Plot estimated CATE without fitting the propensity score
pdf("tauhat_noprop.pdf")
par(mar = c(5, 4, 4, 2) + 0.5, cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex.sub = 1.5)
plot(tau_hat, tau_hat_noprop, xlim = range(tau_hat, tau_hat_noprop), ylim = range(tau_hat, tau_hat_noprop), xlab = "Orthogonalized causal forest estimates", ylab = "Non-orthogonalized causal forest")
abline(0, 1, lwd = 2, lty = 2, col = 4)
dev.off()

**Q7: The code plot six plots in the Make some plots section, so explain what you find there**

Answer: ...

**Q8: Visualize school-level covariates by treatment heterogeneity**

Answer: ...

In [None]:
# Standardize school-level covariates
school_X_std <- scale(school.X)

# Create terciles based on the predicted school treatment effects
school_tercile <- cut(school.pred, breaks = c(-Inf, quantile(school.pred, c(1/3, 2/3)), Inf))
school_tercile_mat <- model.matrix(~ school_tercile + 0)

# Calculate means for each school-level covariate within each tercile
school_means <- diag(1 / colSums(school_tercile_mat)) %*% t(school_tercile_mat) %*% as.matrix(school_X_std)

# Generate heatmap colors
MM <- max(abs(school_means))
HC <- heat.colors(21)
school_col <- apply(school_means, 1:2, function(aa) HC[1 + round(20 * (0.5 + aa / MM))])

# Create a data frame for plotting
DF_plot <- data.frame(
  tercile = rep(factor(1:3, labels = c("low", "mid", "high")), ncol(school_X_std)),
  mean = as.numeric(school_means),
  feature = factor(rep(colnames(school_X_std), each = 3))
)

# Plot the heatmap using ggplot2
ggplot(data = DF_plot, aes(x = feature, y = tercile, fill = mean)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "steelblue") +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    panel.background = element_blank()
  )

# Save the plot
ggsave("tercile_plot.pdf", width = 8, height = 4.5, dpi = 120)


**Q9: CATE by school**

Answer: ...

In [None]:
# Load necessary libraries
library(grf)
library(sandwich)
library(lmtest)
library(Hmisc)
library(ggplot2)

# Set seed for reproducibility
set.seed(1)

# Load the synthetic data
data_all <- read.csv("synthetic_data.csv")
data_all$schoolid <- factor(data_all$schoolid)

# Preprocess data
DF <- data_all[,-1]
school_id <- as.numeric(data_all$schoolid)
W <- DF$Z
Y <- DF$Y
X_raw <- DF[,-(1:2)]
C1_exp <- model.matrix(~ factor(X_raw$C1) + 0)
XC_exp <- model.matrix(~ factor(X_raw$XC) + 0)
X <- cbind(X_raw[,-which(names(X_raw) %in% c("C1", "XC"))], C1_exp, XC_exp)

# Train regression forests for Y and W
Y_forest <- regression_forest(X, Y, clusters = school_id, equalize.cluster.weights = TRUE)
Y_hat <- predict(Y_forest)$predictions
W_forest <- regression_forest(X, W, clusters = school_id, equalize.cluster.weights = TRUE)
W_hat <- predict(W_forest)$predictions

# Train causal forest with clustering
cf <- causal_forest(X, Y, W, Y_hat = Y_hat, W_hat = W_hat, clusters = school_id, equalize.cluster.weights = TRUE)
tau_hat <- predict(cf)$predictions

# Analysis ignoring clusters
cf_noclust <- causal_forest(X[,selected.idx], Y, W, Y_hat = Y_hat, W_hat = W_hat, tune.parameters = "all")
tau_hat_noclust <- predict(cf_noclust)$predictions

# Reorder schools based on predicted treatment effects
ord <- order(order(school.pred))
school.sort <- ord[school.id]

# Plot boxplot of CATE by school
pdf("school_boxplot.pdf")
par(mar = c(5, 4, 4, 2) + 0.5, cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex.sub = 1.5)
boxplot(tau_hat_noclust ~ school.sort, xaxt = "n", xlab = "school", ylab = "estimated CATE")
points(1:76, sort(school.pred), col = 4, pch = 16)
legend("topleft", c("school mean CATE", "CATE w/o clustering"), pch = c(16, 1), col = c(4, 1), cex = 1.5)
dev.off()
