# Project 2 - Double ML and Causal Forest

Data from: Heyes, Anthony, and Soodeh Saberian. 2019. "Temperature and Decisions: Evidence from 207,000 Court Cases." American Economic Journal: Applied Economics, 11 (2): 238–65.

Notebooks used throughout the code:
- Double Machine Learning Examples-econml

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install econml

In [None]:
from econml.dml import DML, LinearDML, SparseLinearDML, CausalForestDML
from itertools import product
from sklearn.linear_model import (Lasso, MultiTaskElasticNetCV)
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


## Data Description 

In [None]:
# Load the Stata dataset from the working directory and display summary statistics.
df = pd.read_stata('matched_corrected.dta')
df.describe()

In [None]:
# List every column name so the available variables can be identified
columns_list = list(df.columns)
print(columns_list)

In [None]:
# Dummy for asylum type: 1 when c_asy_type equals 'E', 0 otherwise (missing values -> 0).
# A vectorised comparison replaces the row-wise apply and always yields an integer column.
df['dummy_asylum'] = (df['c_asy_type'] == 'E').astype(int)
# Dummy for gender: 1 when gender equals 'female', 0 otherwise (missing values -> 0).
df['dummy_gender'] = (df['gender'] == 'female').astype(int)

In [None]:
# As outlined in the correction article, drop the observations for China.
# .copy() makes the filtered frame independent of the loaded one, so the columns
# assigned to it in later cells are not written to a slice of another frame.
df = df[df['nat_name'] != 'CHINA'].copy()

In [None]:
# Get unique values to identify variables for the dummy variables
# NOTE(review): neither array is read again in the visible notebook, and
# `locations` is later reassigned to the list of region column names.
unique__names = df['nat_name'].unique()
locations = df['location'].unique()

In [None]:
# Define the list of regions
# Each list holds the `nat_name` values assigned to one region of origin; the
# lists are matched against df['nat_name'] with .isin() to build region dummies.
# NOTE(review): "IRAN" appears in both `middle_eastern_countries` and `asia`, so
# those rows are flagged in two regions — confirm which one is intended.
middle_eastern_countries = ["BAHRAIN", "CYPRUS", "EGYPT", "IRAN", "IRAQ", "ISRAEL", "JORDAN", 
    "KUWAIT", "LEBANON", "OMAN", "PALESTINE", "QATAR", "SAUDI ARABIA", 
    "SYRIA", "TURKEY", "UNITED ARAB EMIRATES", "YEMEN"]

africa = ["ERITREA", "RWANDA", "SOMALIA", "SUDAN", "CONGO", "ETHIOPIA", "LIBYA", 
    "MALI", "ANGOLA", "BURUNDI", "TANZANIA", "NIGERIA", "GABON", "GHANA", 
    "SENEGAL", "CHAD", "DJIBOUTI", "CAMEROON", "UGANDA", "KENYA", 
    "ZAMBIA", "MAURITANIA", "SOUTH AFRICA", "GUINEA", "BURKINA FASO", 
    "MOROCCO", "ALGERIA", "COMORO ISLANDS", "EQUATORIAL GUINEA", 
    "CENTRAL AFRICAN REPUBLIC", "CAPE VERDE", "LESOTHO", "SWAZILAND", 
    "GAMBIA", "SIERRA LEONE", "GUINEA BISSAU"]

america = ["GUATEMALA", "EL SALVADOR", "PANAMA", "COLOMBIA", 
    "ARGENTINA", "HAITI", "VENEZUELA", "MEXICO", "CUBA", "DOMINICAN REPUBLIC", 
    "BRAZIL", "CHILE", "SURINAME", "TRINIDAD AND TOBAGO", "JAMAICA", 
    "CANADA", "USA", "ST. KITTS, WEST INDIES", "ANTIGUA AND BARBUDA", 
    "BARBADOS", "BAHAMAS", "BELIZE", "DOMINICA", "GRENADA", 
    "NICARAGUA", "URUGUAY", "PARAGUAY", "ST. LUCIA", "ST. VINCENT AND THE GRENADINES"]

asia = ["PAKISTAN", "VIETNAM", "INDONESIA", "AFGHANISTAN", 
    "IRAN", "BANGLADESH", "PHILIPPINES", "TAIWAN", "MALAYSIA", 
    "KAZAKHSTAN", "KYRGYZSTAN", "THAILAND", "TURKMENISTAN", "UZBEKISTAN", 
    "MONGOLIA", "SRI LANKA", "BHUTAN", "LAOS", "NEPAL", 
    "MYANMAR", "KAMPUCHEA", "BRUNEI", "BURMA", "KOREA", "NORTH KOREA"]

europe = ["RUSSIA", "ARMENIA", "ALBANIA", "YUGOSLAVIA", "UNITED KINGDOM", 
    "BULGARIA", "ROMANIA", "HUNGARY", "POLAND", "CZECH REPUBLIC", 
    "SLOVAK REPUBLIC", "GERMANY", "FRANCE", "ITALY", "SPAIN", 
    "SWEDEN", "DENMARK", "FINLAND", "AUSTRIA", "SWITZERLAND", 
    "BELGIUM", "GREECE", "NETHERLANDS", "CROATIA", "SLOVENIA", 
    "MONACO", "LITHUANIA", "LATVIA", "ESTONIA", "ICELAND"]

# Region dummy column -> nationalities that switch it on.
region_members = {
    'middleast': middle_eastern_countries,
    'america': america,
    'africa': africa,
    'asia': asia,
    'europe': europe,
}

# Start every region dummy at 0 ...
for region_col in region_members:
    df[region_col] = 0

# ... then set it to 1 where the nationality belongs to that region.
for region_col, members in region_members.items():
    df.loc[df['nat_name'].isin(members), region_col] = 1

#Create interaction terms
# <region>_dev = region dummy multiplied by the temp6t4 column.
for region_col in region_members:
    df[f'{region_col}_dev'] = df[region_col] * df['temp6t4']

In [None]:
#Create a categorical variable for location and group locations into regions
# Each list holds the `location` values assigned to one court region. Entries
# must match the data exactly (including double spaces such as '*RI  DOC').
#
# Fixes relative to the previous version:
#   * a missing comma made Python concatenate '*JESSUP' and '*BOP ALLENWOOD'
#     into one string, so neither location was ever matched;
#   * exact duplicates ('YORK COUNTY DET', 'SAN ANTONIO DET') are listed once;
#   * 'CLEVELAND' was in both northeast and midwest, flagging those rows in two
#     regions; it is kept in midwest alongside the other Ohio entries.
northeast = ['NEWARK', 'BOSTON', 'NEW YORK CITY', 'BUFFALO', 'PHILADELPHIA',
    'NEW YORK ANNEX', 'NY DET (VARICK ST.)', 'HARTFORD',
    '*PA DOC.', '*BOP  DANBURY', '*RI  DOC',
    '*WISCONSIN DOC', '*NH  DOC', '*SUFFOLK COUNTY', '*NEWARK VIDEO HEARINGS', '*JESSUP',
    '*BOP ALLENWOOD', '*NORTHERN STATE NJ DOC', 'YORK COUNTY DET']

midwest = ['CHICAGO', 'DETROIT', 'CINCINNATI', 'CLEVELAND', 'ST. LOUIS',
    'MEMPHIS', 'KANSAS CITY', 'OMAHA', '*MI  DOC',
    '*IL DOC - STATESVILLE', '*MO DOC', '*OHIO DOC',
    '*INDIANA YOUTH CENTER']

south = ['ARLINGTON', 'DALLAS', 'HOUSTON', 'MIAMI', 'ATLANTA',
    'NEW ORLEANS', 'SAN ANTONIO', 'DALLAS DET', 'SAN ANTONIO DET',
    'HOUSTON DET', 'ATLANTA DET', '*GEORGIA DOC', '*VA DOC',
    '*DADE COUNTY FL DOC', '*BROWARD  FL DOC', 'ORLANDO', 'KROME DET',
    'PORT ISABEL DET', 'EL PASO', 'EL PASO DET', '*TX DOC',
    'LOUISVILLE', 'OKLAHOMA CITY', 'OKLAHOMA CITY DET',
    'BATAVIA SPC', 'BROWARD TRANS CTR', 'ST. THOMAS', 'ST. CROIX', 'ROLLING PLAINS DETENTION CENTER',
    '*BOP BIG SPRING AIRPARK', 'BRADENTON DET']

west = ['DENVER', 'SAN DIEGO', 'LOS ANGELES', 'SAN FRANCISCO',
    'PHOENIX', 'LAS VEGAS', 'RENO', 'SALT LAKE CITY', 'OTAY MESA',
    'TUCSON', 'HONOLULU', 'SAN JUAN', 'SEATTLE', 'PORTLAND',
    'SAN FRANCISCO DET', 'DENVER DET', 'SAN DIEGO DETAINED',
    'MIRA LOMA DET', 'HONOLULU DET', '*CO DOC', '*AZ DOC',
    '*WA DOC', '*AK DOC', 'ANCHORAGE', 'SAN PEDRO',
    'IMPERIAL', '*NM DOC', 'PORTLAND DET', '*MONROE WA DOC', 'SAN FRANCISCO ANNEX']


# Court-region dummy column -> court locations that switch it on.
region_locations = {
    'northeast': northeast,
    'midwest': midwest,
    'south': south,
    'west': west,
}

# Start every court-region dummy at 0 ...
for region_col in region_locations:
    df[region_col] = 0

# ... then set it to 1 where the hearing location belongs to that region.
for region_col, sites in region_locations.items():
    df.loc[df['location'].isin(sites), region_col] = 1

In [None]:
# Take the calendar month from `date` and expand it into month_<n> indicator
# columns (no category is dropped).
df = pd.get_dummies(
    df.assign(month=df['date'].dt.month),
    columns=['month'],
    prefix='month',
    drop_first=False,
)

In [None]:
# Calendar year of each case
df['year'] = df['date'].dt.year

years = [2000, 2001, 2002, 2003, 2004]
locations = ['northeast', 'midwest', 'south', 'west']

# One indicator column per year: initialise at 0, then flag the matching rows.
for year in years:
    df[f'year{year}'] = 0
for year in years:
    df.loc[df['year'] == year, f'year{year}'] = 1

# Interaction term for location and year
for year, location in product(years, locations):
    df[f'{location}_year{year}'] = df[location] * df[f'year{year}']

In [None]:
#Clean the dataset

#Drop asylum cases with no classification (keep only types 'E' and 'I')
df = df[df['c_asy_type'].isin(['E', 'I'])]
# Drop every row that has a missing value in any column. The explicit copy makes
# df_final an independent frame, since new columns are assigned to it later.
df_final = df.dropna(axis=0).copy()

# Double Machine Learning

In [None]:
# Binary treatment: 1 when `deviation` exceeds 0.000095, 0 otherwise.
# NOTE(review): the notebook does not say where the 0.000095 cut-off comes from — confirm.
above_cutoff = df_final['deviation'].gt(0.000095)
df_final['T_binary'] = above_cutoff.astype(int)
print(df_final['T_binary'].value_counts())

# Column names of the treatment, the outcome and the covariates.
T = 'T_binary'
Y = 'res'
X = [
    'chair', 'dummy_asylum', 'dummy_gender',
    'middleast', 'america', 'africa', 'europe',
    'northeast', 'midwest', 'south',
    'year2000', 'year2001', 'year2002', 'year2003',
    'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
    'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
]

In [None]:
# Summary statistics of the variable the binary treatment is derived from.
df_final['deviation'].describe()

In [None]:
# Work on a 30% random subsample of the cleaned data (fixed seed).
sampled_df = df_final.sample(frac=0.3, random_state=42)

In [None]:
# Split the subsample 50/50 into a training and a test set (fixed seed).
df_train, df_test = train_test_split(sampled_df, test_size=0.5, random_state=42)

In [None]:
# Outcome, covariates and treatment for the training half ...
Y_train, X_train, T_train = df_train[Y], df_train[X], df_train[T]

# ... and for the test half.
Y_test, X_test, T_test = df_test[Y], df_test[X], df_test[T]

In [None]:
# Linear DML with random-forest classifiers as nuisance models for the binary
# outcome and the binary treatment, using 6-fold cross-fitting.
# Seeds are fixed (same value the DML cell uses) so that the forests and the
# cross-fitting folds, and therefore the estimates, are reproducible.
est = LinearDML(model_y=RandomForestClassifier(min_samples_leaf=10, random_state=123),
                model_t=RandomForestClassifier(min_samples_leaf=10, random_state=123),
                discrete_treatment=True,
                discrete_outcome=True,
                cv=6,
                random_state=123)
est.fit(Y=Y_train, T=T_train, X=X_train)
# Effect estimates for the test covariates, with interval bounds at alpha=0.01.
te_pred = est.effect(X_test)
lb, ub = est.effect_interval(X_test, alpha=0.01)

In [None]:
# Average of the estimated effects over the test set
print(te_pred.mean())

In [None]:
# Display the estimated effect for each test observation.
te_pred

In [None]:
# Histogram of the LinearDML effect estimates on the test set.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('#f5f5f5')
# Hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.hist(te_pred, bins=30, color='lightblue', alpha=0.7, edgecolor='none')
ax.set_xlabel("Predicted Treatment Effect")
ax.set_ylabel("Frequency")
ax.grid(True, alpha=0.6)
plt.show()

In [None]:
def elast(data, y, t):
    """Return the slope of a univariate least-squares regression of ``y`` on ``t``.

    Computed as ``sum((t - mean(t)) * (y - mean(y))) / sum((t - mean(t)) ** 2)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    y : str
        Name of the outcome column.
    t : str
        Name of the treatment column.

    Returns
    -------
    float
        The estimated slope. The division is not guarded: when ``t`` is
        constant the denominator is zero and the result is not finite.
    """
    t_centered = data[t] - data[t].mean()
    y_centered = data[y] - data[y].mean()
    return np.sum(t_centered * y_centered) / np.sum(t_centered ** 2)


def cumulative_gain(dataset, prediction, y, t, min_periods=30, steps=100):
    """Return the cumulative gain curve of ``prediction`` as a NumPy array.

    Rows are sorted by ``prediction`` in descending order. For a growing number
    of top rows, the slope of ``y`` on ``t`` (see ``elast``) is computed on those
    rows and scaled by the share of the dataset they represent.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the ``prediction``, ``y`` and ``t`` columns.
    prediction : str
        Column used to rank the rows.
    y, t : str
        Outcome and treatment column names.
    min_periods : int, default 30
        Number of rows in the first evaluated subset.
    steps : int, default 100
        Approximate number of evaluation points between ``min_periods`` and the
        full dataset.

    Returns
    -------
    numpy.ndarray
        One value per evaluated subset size; the last entry uses all rows.
    """
    size = dataset.shape[0]
    ordered_df = dataset.sort_values(prediction, ascending=False).reset_index(drop=True)
    # size // steps is 0 when there are fewer rows than steps, and range()
    # rejects a zero step; fall back to evaluating every row count.
    stride = max(1, size // steps)
    n_rows = list(range(min_periods, size, stride)) + [size]
    return np.array([elast(ordered_df.head(rows), y, t) * (rows / size) for rows in n_rows])

In [None]:
# Effect estimates from the LinearDML model on both halves of the split.
cate_train = est.effect(X_train)
cate_test = est.effect(X_test)

# Mean of the estimated effects over the test set.
ATE_test = cate_test.mean()
print(f"ATE (Test): {ATE_test}")

# Attach the estimates as a `cate` column so the rows can be ranked by it.
df_train = df_train.assign(cate=cate_train)
df_test = df_test.assign(cate=cate_test)

# Cumulative gain functions
gain_curve_test = cumulative_gain(df_test, "cate", y="res", t="T_binary")
gain_curve_train = cumulative_gain(df_train, "cate", y="res", t="T_binary")

plt.plot(gain_curve_test, color="C0", label="Test")
plt.plot(gain_curve_train, color="C1", label="Train")
# Baseline: straight line from 0 to the slope on the full test set. Its
# x-extent follows the length of the test curve instead of a fixed 100.
plt.plot([0, len(gain_curve_test) - 1], [0, elast(df_test, y="res", t="T_binary")],
         linestyle="--", color="black", label="Baseline")
plt.xlabel("Step (rows ranked by estimated effect, highest first)")
plt.ylabel("Cumulative gain")
plt.legend()
plt.show()

In [None]:
#Non Linear DML
# DML whose final stage is a Lasso (no intercept) fitted on degree-3 polynomial
# features of the covariates; nuisance models are random-forest classifiers.
est2 = DML(
    model_y=RandomForestClassifier(min_samples_leaf=10),
    model_t=RandomForestClassifier(min_samples_leaf=10),
    model_final=Lasso(alpha=0.1, fit_intercept=False),
    featurizer=PolynomialFeatures(degree=3),
    discrete_treatment=True,
    discrete_outcome=True,
    random_state=123,
)
est2.fit(Y_train, T_train, X=X_train)

# Effect estimates for the test covariates
te_pred2 = est2.effect(X_test)

In [None]:
# Average of the estimated effects over the test set
print(te_pred2.mean())

In [None]:
# Histogram of the DML (polynomial + Lasso) effect estimates on the test set.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('#f5f5f5')
# Hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.hist(te_pred2, bins=30, color='lightblue', alpha=0.7, edgecolor='none')
ax.set_xlabel("Predicted Treatment Effect")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
# Effect estimates from the DML (est2) model on both halves of the split.
cate_train_2 = est2.effect(X_train)
cate_test_2 = est2.effect(X_test)

# Mean of the estimated effects over the test set.
ATE_test_2 = cate_test_2.mean()
print(f"ATE (Test): {ATE_test_2}")

# Copies of the split frames carrying this model's estimates in `cate`.
df_train_2 = df_train.assign(cate=cate_train_2)
df_test_2 = df_test.assign(cate=cate_test_2)

# Cumulative gain functions
gain_curve_test_2 = cumulative_gain(df_test_2, "cate", y="res", t="T_binary")
gain_curve_train_2 = cumulative_gain(df_train_2, "cate", y="res", t="T_binary")

plt.plot(gain_curve_test_2, color="C0", label="Test")
plt.plot(gain_curve_train_2, color="C1", label="Train")
# Baseline: straight line from 0 to the slope on the full test set. Its
# x-extent follows the length of the test curve instead of a fixed 100.
plt.plot([0, len(gain_curve_test_2) - 1], [0, elast(df_test_2, y="res", t="T_binary")],
         linestyle="--", color="black", label="Baseline")
plt.xlabel("Step (rows ranked by estimated effect, highest first)")
plt.ylabel("Cumulative gain")
plt.legend()
plt.show()

In [None]:
#Causal Forest
# Changes relative to the previous version:
#   * model_y is a classifier, matching discrete_outcome=True and the other
#     estimators in this notebook (it was a RandomForestRegressor);
#   * the model is fitted once. It used to be refitted after the intervals were
#     computed, so te_pred3 and (lb3, ub3) came from two different forests;
#   * seeds are fixed so the forest and the nuisance models are reproducible.
est3 = CausalForestDML(model_y=RandomForestClassifier(min_samples_leaf=10, random_state=123),
                       model_t=RandomForestClassifier(min_samples_leaf=10, random_state=123),
                       discrete_treatment=True,
                       discrete_outcome=True,
                       n_estimators=1000,
                       min_impurity_decrease=0.001,
                       verbose=0,
                       cv=6,
                       random_state=123)
# Tune the forest hyper-parameters on the training data, then fit.
est3.tune(Y=Y_train, T=T_train, X=X_train)
est3.fit(Y=Y_train, T=T_train, X=X_train)
# Effect estimates and interval bounds (alpha=0.01) from the same fitted model.
te_pred3 = est3.effect(X_test)
lb3, ub3 = est3.effect_interval(X_test, alpha=0.01)

In [None]:
# Average of the estimated effects over the test set
print(te_pred3.mean())

In [None]:
# Histogram of the causal-forest effect estimates on the test set.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('#f5f5f5')
# Hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.hist(te_pred3, bins=30, color='lightblue', alpha=0.7, edgecolor='none')
ax.set_xlabel("Predicted Treatment Effect")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
# Effect estimates from the causal forest (est3) on both halves of the split.
cate_train_3 = est3.effect(X_train)
cate_test_3 = est3.effect(X_test)

# Mean of the estimated effects over the test set.
ATE_test_3 = cate_test_3.mean()
print(f"ATE (Test): {ATE_test_3}")

# Copies of the split frames carrying this model's estimates in `cate`.
df_train_3 = df_train.assign(cate=cate_train_3)
df_test_3 = df_test.assign(cate=cate_test_3)

# Cumulative gain functions
gain_curve_test_3 = cumulative_gain(df_test_3, "cate", y="res", t="T_binary")
gain_curve_train_3 = cumulative_gain(df_train_3, "cate", y="res", t="T_binary")

plt.plot(gain_curve_test_3, color="C0", label="Test")
plt.plot(gain_curve_train_3, color="C1", label="Train")
# Baseline: straight line from 0 to the slope on the full test set. Its
# x-extent follows the length of the test curve instead of a fixed 100.
plt.plot([0, len(gain_curve_test_3) - 1], [0, elast(df_test_3, y="res", t="T_binary")],
         linestyle="--", color="black", label="Baseline")
plt.xlabel("Step (rows ranked by estimated effect, highest first)")
plt.ylabel("Cumulative gain")
plt.legend()
plt.show()

## Visualize Heterogeneous Effects

In [None]:
# Causal-forest effect estimates against the 'america' dummy.
feature_to_plot = 'america'

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test[feature_to_plot], te_pred3, alpha=0.5)
ax.set_xlabel(feature_to_plot)
ax.set_ylabel("Estimated Treatment Effect")
ax.set_title("Heterogeneous Treatment Effects by Feature")
plt.show()

In [None]:
# Causal-forest effect estimates against the 'middleast' dummy.
feature_to_plot = 'middleast'

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test[feature_to_plot], te_pred3, alpha=0.5)
ax.set_xlabel(feature_to_plot)
ax.set_ylabel("Estimated Treatment Effect")
ax.set_title("Heterogeneous Treatment Effects by Feature")
plt.show()

In [None]:
# Causal-forest effect estimates against the 'africa' dummy.
feature_to_plot = 'africa'

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test[feature_to_plot], te_pred3, alpha=0.5)
ax.set_xlabel(feature_to_plot)
ax.set_ylabel("Estimated Treatment Effect")
ax.set_title("Heterogeneous Treatment Effects by Feature")
plt.show()

In [None]:
# Causal-forest effect estimates against the 'europe' dummy.
# `feature_to_plot` keeps this value for the cells that follow.
feature_to_plot = 'europe'

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test[feature_to_plot], te_pred3, alpha=0.5)
ax.set_xlabel(feature_to_plot)
ax.set_ylabel("Estimated Treatment Effect")
ax.set_title("Heterogeneous Treatment Effects by Feature")
plt.show()

In [None]:
# Flatten the selected covariate and the effect estimates to 1-D arrays.
# np.ravel accepts pandas objects and arrays of any rank; the previous code
# called .flatten(), which pandas objects do not provide.
feature_column = np.ravel(X_test[feature_to_plot])
treatment_effects = np.ravel(te_pred3)

In [None]:
# Pair every test observation's covariate value with its estimated effect.
data = pd.DataFrame({
    feature_to_plot: feature_column,
    "Treatment Effect": treatment_effects
})

# Mean and standard error of the mean of the estimates within each group.
effects_by_group = data.groupby(feature_to_plot)["Treatment Effect"]
mean_effects = effects_by_group.mean()
std_errors = effects_by_group.sem()

# Tick labels follow the selected feature instead of being fixed to 'Europe'.
feature_label = feature_to_plot.capitalize()

plt.figure(figsize=(8, 6))
plt.bar(mean_effects.index, mean_effects, yerr=std_errors, capsize=5, color=["#3498db", "#e74c3c"])
plt.xlabel(feature_to_plot)
plt.ylabel("Mean Estimated Treatment Effect")
plt.title("Average Heterogeneous Treatment Effects by Feature")
plt.xticks([0, 1], [f'Not {feature_label}', feature_label])
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
#Relabel for better graphs
# Maps covariate column names to human-readable labels for plot axes.
relabel_mapping = {
    'chair': 'Judge',
    'dummy_asylum': 'Asylum Status (Dummy)',
    'dummy_gender': 'Gender (Dummy)',
    'middleast': 'Middle East',
    'america': 'America',
    'africa': 'Africa',
    'europe': 'Europe',
    'northeast': 'Northeast Region',
    'midwest': 'Midwest Region',
    'south': 'Southern Region',
    'year2000': 'Year 2000',
    'year2001': 'Year 2001',
    'year2002': 'Year 2002',
    'year2003': 'Year 2003',
    'month_1': 'January',
    'month_2': 'February',
    'month_3': 'March',
    'month_4': 'April',
    'month_5': 'May',
    'month_6': 'June',
    'month_7': 'July',
    'month_8': 'August',
    'month_9': 'September',
    'month_10': 'October',
    'month_11': 'November'
}

In [None]:
# Feature importances of the causal forest, labelled with readable names
# (columns without an entry in relabel_mapping keep their raw name).
importances = est3.feature_importances_
features = X_test.columns
relabelled_features = [relabel_mapping.get(name, name) for name in features]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=importances, y=relabelled_features, orient='h', palette='viridis', ax=ax)
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Features', fontsize=14)
plt.show()

## Simulate Variables

In [None]:
## Simulate the deviation variable data
# NOTE(review): `sample_size` and `random_indices` (an unseeded draw of 80% of
# the index) are computed but not used in the visible notebook; the +0.002
# shift below is applied to every row. Confirm whether the shift was meant for
# the sampled rows only.
sample_size = int(len(df_final) * 0.8)
random_indices = np.random.choice(df_final.index, size=sample_size, replace=False)
df_final['adjusted_deviation'] = df_final['deviation'] + 0.002

In [None]:
# Simulated binary treatment: same 0.000095 cut-off, applied to the shifted deviation.
shifted_above_cutoff = df_final['adjusted_deviation'].gt(0.000095)
df_final['T_binary_simulated'] = shifted_above_cutoff.astype(int)
print(df_final['T_binary_simulated'].value_counts())

# Column name of the simulated treatment
T_simulated = 'T_binary_simulated'

In [None]:
# Draw a 30% subsample again now that df_final carries the simulated columns
# (same fraction and seed as the earlier subsample).
sampled_df_simulated = df_final.sample(frac=0.3, random_state=42)

In [None]:
# 50/50 train/test split of the simulated subsample (same seed as the earlier split).
df_train_s, df_test_s = train_test_split(sampled_df_simulated, test_size=0.5, random_state=42)

In [None]:
# Simulated treatment column for the training and the test half.
T_simulated_train = df_train_s[T_simulated]

T_simulated_test = df_test_s[T_simulated]

In [None]:
# Re-estimate the linear DML with the simulated treatment.
# Outcome, covariates and treatment are all read from the simulated split, so
# the rows are aligned by construction. Previously Y and X came from the first
# split and T from the simulated one, which only lined up because both
# subsamples and both splits were drawn with the same seeds.
# Seeds are fixed so the forests and cross-fitting folds are reproducible.
# NOTE: this rebinds `est`, `lb` and `ub` from the first LinearDML fit.
est = LinearDML(model_y=RandomForestClassifier(min_samples_leaf=10, random_state=123),
                model_t=RandomForestClassifier(min_samples_leaf=10, random_state=123),
                discrete_treatment=True,
                discrete_outcome=True,
                cv=6,
                random_state=123)
est.fit(Y=df_train_s[Y], T=df_train_s[T_simulated], X=df_train_s[X])
# Effect estimates for the test covariates, with interval bounds at alpha=0.01.
te_pred_est = est.effect(df_test_s[X])
lb, ub = est.effect_interval(df_test_s[X], alpha=0.01)

In [None]:
# Histogram of the effect estimates obtained with the simulated treatment.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('#f5f5f5')
# Hide the frame around the plotting area
for side in ax.spines:
    ax.spines[side].set_visible(False)
ax.hist(te_pred_est, bins=30, color='lightblue', alpha=0.7, edgecolor='none')
ax.set_xlabel("Predicted Treatment Effect")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()