# Workgroup 4

Authors: Valerie Dube, Erzo Garay, Juan Marcos Guerrero y Matias Villalba

## Bootstraping

## Causal Forest

In [101]:
# Libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from econml.grf import CausalForest, RegressionForest

### 1. Preprocessing

In [102]:
# Import synthetic data from data folder
df = pd.read_csv("../../data/synthetic_data.csv")

In [103]:
df.head()

Unnamed: 0,schoolid,Z,Y,S3,C1,C2,C3,XC,X1,X2,X3,X4,X5
0,76,1,0.081602,6,4,2,1,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
1,76,1,-0.385869,4,12,2,1,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
2,76,1,0.398184,6,4,2,0,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
3,76,1,-0.175037,6,4,2,0,4,0.334544,0.648586,-1.310927,0.224077,-0.426757
4,76,1,0.884583,6,4,1,0,4,0.334544,0.648586,-1.310927,0.224077,-0.426757


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10391 entries, 0 to 10390
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   schoolid  10391 non-null  int64  
 1   Z         10391 non-null  int64  
 2   Y         10391 non-null  float64
 3   S3        10391 non-null  int64  
 4   C1        10391 non-null  int64  
 5   C2        10391 non-null  int64  
 6   C3        10391 non-null  int64  
 7   XC        10391 non-null  int64  
 8   X1        10391 non-null  float64
 9   X2        10391 non-null  float64
 10  X3        10391 non-null  float64
 11  X4        10391 non-null  float64
 12  X5        10391 non-null  float64
dtypes: float64(6), int64(7)
memory usage: 1.0 MB


In [105]:
# Save school clusters in variable
school_id = df['schoolid'].astype('category').cat.codes

In [106]:
# Fit treatment (w) OLS
formula = 'Z ~ ' + ' + '.join(df.columns.drop(['Z', 'Y']))
w_lm = smf.glm(formula=formula, data=df, family=sm.families.Binomial()).fit()

# Print summary of the GLM model
print(w_lm.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      Z   No. Observations:                10391
Model:                            GLM   Df Residuals:                    10379
Model Family:                Binomial   Df Model:                           11
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -6519.5
Date:                Tue, 04 Jun 2024   Deviance:                       13039.
Time:                        19:06:52   Pearson chi2:                 1.04e+04
No. Iterations:                     4   Pseudo R-squ. (CS):           0.007280
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.0758      0.146     -7.348      0.0

In the previous OLS, we can observe that only the ctudent’s self-reported expectations for success (S3), student gender (C2), student first-generation status (C3), and school-level mean of students’ fixed mindsets (X1) variables are significat

In [107]:
# We define W, Y, and X_raw
W = df['Z']
Y = df['Y']
X_raw = df.drop(columns=['schoolid', 'Z', 'Y']) # School ID does not affect pscore

In [108]:
# Create model matrices for categorical variables
C1_exp = pd.get_dummies(X_raw['C1'], prefix='C1')
XC_exp = pd.get_dummies(X_raw['XC'], prefix='XC')

In [109]:
# Combine these matrices with the rest of the data
X = pd.concat([X_raw.drop(columns=['C1', 'XC']), C1_exp, XC_exp], axis=1)

### 2. Estimation

#### 2.1. Cluster-Robust Random Forests

In this section, we grow a forest. We add extra trees for the causal forest.

First, we train a regression forest that can be used to estimate the conditional mean function mu(x) = E[Y | X = x]

In [110]:
regf = RegressionForest(max_depth=None, random_state=0, n_estimators=1000)

Y_hat = regf.fit(X, Y, sample_weight=school_id).predict(X)
W_hat = regf.fit(X, W, sample_weight=school_id).predict(X)

#### 2.2. Causal Forests for Observational Studies

In [111]:
cf_raw = CausalForest(max_depth=None, random_state=0, n_estimators=1000)

cf_raw.fit(X, Y, W, Y_hat, W_hat)

TypeError: CausalForest.fit() takes 4 positional arguments but 6 were given

**Q1: How the tree was built?**

Answer: ...