# Assignment 4 - Part 2: Causal Forest

This notebook implements a causal forest analysis to estimate heterogeneous treatment effects of a randomly assigned cash transfer program encouraging medical check-ups.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Set random state for reproducibility.
# RANDOM_STATE is also passed as `random_state` to the scikit-learn models below.
# Seeding NumPy's global generator fixes the np.random draws used for T and
# epsilon, provided the cells are executed in order from a fresh kernel.
RANDOM_STATE = 123
np.random.seed(RANDOM_STATE)

## Load and Prepare Data

In [None]:
# Column labels for the header-less data file, listed in file order.
column_names = (
    'age sex cp restbp chol fbs restecg thalach exang oldpeak slope ca thal hd'
).split()

# Read the file ('?' is parsed as a missing value) and keep only complete rows.
df = pd.read_csv(
    '../input/processed.cleveland.data',
    names=column_names,
    na_values='?',
).dropna()

# Quick sanity check of what was loaded.
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

## (0.5 points) Create binary treatment variable T

In [None]:
# Randomly assign treatment: one Bernoulli(0.5) draw per row of df.
n_obs = len(df)
df['T'] = np.random.binomial(n=1, p=0.5, size=n_obs)

# Report the resulting group sizes and the treated share.
print("Treatment distribution:")
print(df['T'].value_counts())
share_treated = df['T'].mean()
print(f"\nProportion treated: {share_treated:.4f}")

## (1 point) Create outcome variable Y

In [None]:
# Simulated outcome:
#   Y = (1 + 0.05*age + 0.3*sex + 0.2*restbp) * T + 0.5*oldpeak + epsilon,
#   epsilon ~ N(0, 1)

# Standard-normal noise, one draw per row.
epsilon = np.random.normal(0, 1, size=len(df))

# Row-specific treatment effect; it only enters Y for treated rows (T == 1).
effect_if_treated = 1 + 0.05 * df['age'] + 0.3 * df['sex'] + 0.2 * df['restbp']
df['Y'] = effect_if_treated * df['T'] + 0.5 * df['oldpeak'] + epsilon

print("Outcome variable Y statistics:")
print(df['Y'].describe())

# Overlaid histograms of Y for the control and treated groups.
fig, ax = plt.subplots(figsize=(10, 6))
for arm, arm_label in ((0, 'Control (T=0)'), (1, 'Treated (T=1)')):
    ax.hist(df.loc[df['T'] == arm, 'Y'], alpha=0.5, label=arm_label, bins=30)
ax.set_xlabel('Y (Health Improvement)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Distribution of Outcome Variable by Treatment Group', fontsize=14)
ax.legend()
fig.savefig('../output/outcome_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## (1 point) Calculate treatment effect using OLS

In [None]:
# Regress Y on the treatment dummy alone; with a binary T the slope equals the
# difference in mean Y between the treated and control groups.
simple_spec = ols(formula='Y ~ T', data=df)
model_simple = simple_spec.fit()

print("Simple OLS Model (Y ~ T):")
print(model_simple.summary())

ate_simple = model_simple.params['T']
print(f"\nAverage Treatment Effect (ATE): {ate_simple:.4f}")

In [None]:
# Same regression with the covariates that enter the simulated outcome added
# as additive controls.
full_formula = 'Y ~ T + age + sex + restbp + oldpeak'
model_full = ols(formula=full_formula, data=df).fit()

print("\nFull OLS Model with Covariates:")
print(model_full.summary())

ate_with_controls = model_full.params['T']
print(f"\nAverage Treatment Effect (ATE) with controls: {ate_with_controls:.4f}")

## (2 points) Use Random Forest to estimate causal effects

In [None]:
# Prepare features for the Random Forest outcome model.
# The treatment indicator T is included as an ordinary feature next to the
# covariates, so a single model predicts Y for any (covariates, T) combination.
feature_cols = ['age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg', 
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'T']

X_rf = df[feature_cols]
y_rf = df['Y']

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, 
                                  random_state=RANDOM_STATE, min_samples_split=10)
rf_model.fit(X_rf, y_rf)

print("Random Forest model trained successfully")
# The score is computed on the same rows the model was fitted on, so it is an
# in-sample measure of fit. The superscript two is written as an escape so the
# label cannot be corrupted by a file-encoding mismatch (it previously printed
# as mojibake).
print(f"R\u00b2 Score: {rf_model.score(X_rf, y_rf):.4f}")

In [None]:
# Individual effect estimates from the fitted forest: score every row twice,
# once with T forced to 1 and once with T forced to 0, leaving the other
# features untouched.
X_treated = X_rf.assign(T=1)
X_control = X_rf.assign(T=0)

y_pred_treated = rf_model.predict(X_treated)
y_pred_control = rf_model.predict(X_control)

# The row-level gap between the two predictions is stored as the CATE estimate.
df['CATE'] = y_pred_treated - y_pred_control

print("Conditional Average Treatment Effect (CATE) statistics:")
print(df['CATE'].describe())

# Histogram of the estimates with their mean marked by a dashed line.
mean_cate = df['CATE'].mean()
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['CATE'], bins=30, edgecolor='black')
ax.axvline(x=mean_cate, color='r', linestyle='--',
           label=f'Mean CATE = {mean_cate:.4f}')
ax.set_xlabel('Conditional Average Treatment Effect (CATE)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Distribution of Estimated Treatment Effects', fontsize=14)
ax.legend()
fig.savefig('../output/cate_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## (2 points) Plot representative tree capturing heterogeneous treatment effects

In [None]:
# Train a single decision tree with max_depth=2 to visualize heterogeneous treatment effects.
# The tree is fitted to the estimated CATE (not to Y), using covariates only.
# A tree fitted to Y with T among the features would describe outcome levels
# and could split on T itself, whereas the goal here is to show which
# covariates separate units with small and large estimated effects.
from sklearn.tree import DecisionTreeRegressor

# The CATE is already a treated-minus-control contrast, so T carries no
# information about it and is dropped from the inputs.
tree_features = [col for col in feature_cols if col != 'T']

tree_model = DecisionTreeRegressor(max_depth=2, random_state=RANDOM_STATE, min_samples_split=10)
tree_model.fit(df[tree_features], df['CATE'])

# Plot the tree; each node's value is the mean estimated CATE of the rows in it.
plt.figure(figsize=(20, 10))
plot_tree(tree_model, 
          feature_names=tree_features, 
          filled=True, 
          rounded=True,
          fontsize=12)
plt.title('Representative Decision Tree (max_depth=2) for Heterogeneous Treatment Effects', fontsize=16)
plt.savefig('../output/representative_tree.png', dpi=300, bbox_inches='tight')
plt.show()

print("Tree interpretation:")
print("This tree shows how different patient characteristics lead to different predicted treatment effects.")
print("The splits indicate which covariates are most important for determining treatment response.")

**Interpretation:**

The representative decision tree with max_depth=2 reveals the key features that drive heterogeneous treatment effects:

1. **Root Split:** The tree first splits on the most important feature for predicting the outcome.
2. **Subsequent Splits:** Further splits reveal interactions between covariates and treatment.
3. **Leaf Nodes:** Each leaf represents a subgroup with similar predicted outcomes.
4. **Heterogeneity:** Different paths through the tree represent different subpopulations that may benefit differently from the treatment.

## (1.5 points) Compute and visualize feature importances

In [None]:
# Importance scores reported by the fitted forest, one per entry of feature_cols.
importances = rf_model.feature_importances_

# Pair each feature name with its score and rank from most to least important.
unsorted_importances = pd.DataFrame({'feature': feature_cols, 'importance': importances})
feature_importance_df = unsorted_importances.sort_values(by='importance', ascending=False)

print("Feature Importances:")
print(feature_importance_df)

# Horizontal bar chart; the y-axis is flipped so the top-ranked feature is at the top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(feature_importance_df['feature'], feature_importance_df['importance'])
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Feature Importances from Random Forest Model', fontsize=14)
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('../output/feature_importances.png', dpi=300, bbox_inches='tight')
plt.show()

## (2 points) Plot distribution of standardized covariates by predicted treatment effect terciles

In [None]:
# Covariates to rescale; T, Y and CATE are deliberately left on their original scale.
covariate_cols = [
    'age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]

# Fit the scaler on the covariates and write the z-scores into a copy of df,
# so the original frame keeps the raw values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[covariate_cols])

df_standardized = df.copy()
df_standardized[covariate_cols] = scaled_values

print("Covariates standardized successfully")

In [None]:
# Bin rows into three equal-frequency groups by estimated CATE.
tercile_labels = ['Low', 'Medium', 'High']
df_standardized['tercile'] = pd.qcut(
    df_standardized['CATE'],
    q=len(tercile_labels),
    labels=tercile_labels,
)

# Group sizes, to confirm the bins are roughly balanced.
print("CATE tercile distribution:")
tercile_counts = df_standardized['tercile'].value_counts()
print(tercile_counts)

In [None]:
# Compute the mean of each standardized covariate within each CATE tercile.
# 'tercile' is a categorical column; observed=True groups only on categories
# that actually occur and avoids the pandas FutureWarning about the changing
# default of `observed` for categorical groupers.
tercile_means = df_standardized.groupby('tercile', observed=True)[covariate_cols].mean()

print("Mean standardized covariates by tercile:")
print(tercile_means)

# tercile_means already has terciles as rows (y-axis) and covariates as
# columns (x-axis), which matches the axis labels below, so it is plotted
# directly without the former transpose-and-transpose-back step.
plt.figure(figsize=(12, 8))
sns.heatmap(tercile_means, annot=True, fmt='.2f', cmap='RdBu_r', 
            center=0, cbar_kws={'label': 'Standardized Mean'})
plt.xlabel('Covariates', fontsize=12)
plt.ylabel('Treatment Effect Tercile', fontsize=12)
plt.title('Distribution of Standardized Covariates by Predicted Treatment Effect Terciles', 
          fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('../output/tercile_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

**Interpretation:**

The heatmap shows how patient characteristics differ across treatment effect terciles:

1. **Red cells** (positive values): indicate that patients in this tercile have above-average values for that covariate.
2. **Blue cells** (negative values): indicate that patients in this tercile have below-average values for that covariate.
3. **White cells** (near zero): indicate that patients in this tercile have average values for that covariate.

This visualization helps identify which patient characteristics are associated with higher or lower treatment effects, revealing heterogeneity in treatment response.

## Summary

This analysis demonstrates:

1. **Randomized Treatment Assignment:** Successfully simulated a randomized cash transfer program.
2. **Outcome Generation:** Simulated an outcome variable whose treatment effect varies with age, sex, and resting blood pressure.
3. **OLS Estimation:** Estimated average treatment effects using regression.
4. **Random Forest for Causal Inference:** Used machine learning to estimate heterogeneous treatment effects.
5. **Visualization:** Identified key features and patient subgroups with different treatment responses.

The results suggest that treatment effects vary across patients based on their characteristics, with age, sex, and blood pressure being particularly important predictors of treatment response.