In [1]:
import pandas as pd
import numpy as np
from src.boot import compute_treatment_effects

**TODO:** Add a function that replicates how the authors compute clustered standard errors.

In [2]:
def aggregate_treatment_effects(data, treatment_column, estimand='overall', groupby_column=None, time_column=None):
    """
    Aggregate observation-level treatment effects into a specific estimand.

    The input frame is never modified: elapsed time is computed as a local
    Series instead of being written back to ``data`` as a new column.

    Parameters:
        data (DataFrame): The data containing the treatment effects, one row per observation.
        treatment_column (str): The column name for treatment effects.
        estimand (str): The type of estimand to calculate ('overall', 'cohort', 'event').
        groupby_column (str): The cohort column; required for 'cohort' and 'event'. It is
            subtracted from ``time_column``, so both must be expressed on the same time scale.
        time_column (str): The time column; required for 'cohort' and 'event'.

    Returns:
        float or Series: The aggregated treatment effect estimand.
            * 'overall': mean of the non-missing effects (NaN if there are none).
            * 'cohort': unnamed Series indexed by cohort. Within each cohort the
              (cohort, elapsed time) cell means are weighted by the cell's share of
              the cohort's observations.
            * 'event': Series named ``treatment_column`` and indexed by ``elapsed_time``.
              Within each elapsed time the cell means are weighted by the cell's share
              of the observations at that elapsed time.
            A cohort / elapsed time with no non-missing effect is reported as NaN.

    Raises:
        ValueError: If ``estimand`` is not one of the supported values, or if
            ``groupby_column`` / ``time_column`` is missing for 'cohort' or 'event'.
    """
    if estimand == 'overall':
        # Equal weights 1/n over the non-missing effects is simply their mean.
        return data[treatment_column].mean()

    if estimand not in ('cohort', 'event'):
        raise ValueError('Invalid estimand type. Choose among "overall", "cohort", or "event".')

    if groupby_column is None or time_column is None:
        raise ValueError(f'Both groupby and time columns must be provided for {estimand}-specific estimands.')

    # Time since the cohort's reference date, kept local so `data` is left untouched.
    elapsed_time = (data[time_column] - data[groupby_column]).rename('elapsed_time')

    # One cell per (cohort, elapsed time): level 0 is the cohort, level 1 the elapsed time.
    cells = data[treatment_column].groupby([data[groupby_column], elapsed_time])
    cell_means = cells.mean()
    cell_counts = cells.count()  # counts non-missing effects only

    # Aggregate over the *other* level: cohorts pool their elapsed times, events pool their cohorts.
    level = 0 if estimand == 'cohort' else 1

    # Each cell is weighted by its share of the observations in the group it is pooled into.
    weights = cell_counts / cell_counts.groupby(level=level).transform('sum')

    # min_count=1 keeps a group with no usable cell as NaN instead of a misleading 0.0.
    aggregated = (cell_means * weights).groupby(level=level).sum(min_count=1)

    if estimand == 'cohort':
        # The cohort result has always been a plain unnamed Series keyed by cohort value.
        aggregated = aggregated.rename_axis(None)
        aggregated.name = None

    return aggregated

In [3]:
# Panel dimensions: every unit is observed in every period (n_units * n_time rows).
# NOTE(review): n_cohorts is not read below; the cohort labels are hard-coded in the draw.
n_units = 10
n_time = 10
n_cohorts = 3

# Seed NumPy's global generator so the simulated panel is reproducible.
np.random.seed(42)

# Base panel. The random columns are drawn in the order
# treatment -> covariate1 -> covariate2 -> cohort; reordering them changes the data.
# NOTE(review): 'cohort' is drawn independently for every row, so it varies within a unit.
df = pd.DataFrame({
    'unit_id': np.repeat(range(1, n_units + 1), n_time),
    'time_id': list(range(1, n_time + 1)) * n_units,
    'treatment': np.random.choice([0, 1], size=n_units * n_time),
    'covariate1': np.random.normal(loc=0, scale=1, size=n_units * n_time),
    'covariate2': np.random.normal(loc=0, scale=1, size=n_units * n_time),
    'cohort': np.random.choice([2010, 2011, 2012], size=n_units * n_time),
})

# Derived columns, evaluated top to bottom so later ones can use earlier ones.
df = df.assign(
    # Units 1 and 2 are never treated.
    treatment=lambda d: d['treatment'].mask(d['unit_id'].isin([1, 2]), 0),
    # Potential outcome under control: linear in covariates, unit and time, plus N(0, 1) noise.
    Y0=lambda d: (
        5 + 0.5 * d['covariate1'] + 0.3 * d['covariate2'] + 0.2 * d['unit_id'] + 0.1 * d['time_id']
        + np.random.normal(0, 1, n_units * n_time)
    ),
    # Constant treatment effect of 2 for every observation.
    treatment_effect=2,
    # Potential outcome under treatment.
    Y1=lambda d: d['Y0'] + d['treatment_effect'],
    # Observed outcome: Y1 where treated, Y0 otherwise.
    outcome=lambda d: np.where(d['treatment'] == 1, d['Y1'], d['Y0']),
    # Label periods before time 5 as pre-treatment. This is only a label: the
    # 'treatment' column above is drawn at random and does not depend on it.
    period_type=lambda d: np.where(d['time_id'] < 5, 'pre-treatment', 'post-treatment'),
)

In [4]:
# Preview the first rows of the simulated panel.
df.head()

Unnamed: 0,unit_id,time_id,treatment,covariate1,covariate2,cohort,Y0,treatment_effect,Y1,outcome,period_type
0,1,1,0,0.738467,0.22746,2011,4.49924,2,6.49924,4.49924,pre-treatment
1,1,2,0,0.171368,1.307143,2012,5.053176,2,7.053176,5.053176,pre-treatment
2,1,3,0,-0.115648,-1.607483,2011,6.911211,2,8.911211,6.911211,pre-treatment
3,1,4,0,-0.301104,0.184634,2010,5.464753,2,7.464753,5.464753,pre-treatment
4,1,5,0,-1.478522,0.259883,2011,5.568139,2,7.568139,5.568139,post-treatment


In [5]:
# Run the project's estimator (imported from src.boot) on the simulated panel.
# NOTE(review): the positional arguments look like (frame, outcome column, treatment column,
# unit id column, time id column) — confirm against the function's signature.
# The recorded output shows the call printing a notice that it skips the textual column period_type.
result_df = compute_treatment_effects(df, 'outcome', 'treatment', 'unit_id', 'time_id', covariates=['covariate1', 'covariate2'])

Skipping column period_type as it appears to be truly textual.


In [6]:
# Preview the estimator output. In the recorded output it carries two extra columns,
# y_0 and t_effects, both empty for these untreated rows of unit 1.
result_df.head()

Unnamed: 0,unit_id,time_id,treatment,covariate1,covariate2,cohort,Y0,treatment_effect,Y1,outcome,period_type,y_0,t_effects
0,1,1,0,0.738467,0.22746,2011,4.49924,2,6.49924,4.49924,pre-treatment,,
1,1,2,0,0.171368,1.307143,2012,5.053176,2,7.053176,5.053176,pre-treatment,,
2,1,3,0,-0.115648,-1.607483,2011,6.911211,2,8.911211,6.911211,pre-treatment,,
3,1,4,0,-0.301104,0.184634,2010,5.464753,2,7.464753,5.464753,pre-treatment,,
4,1,5,0,-1.478522,0.259883,2011,5.568139,2,7.568139,5.568139,post-treatment,,


In [7]:
# Keep only the treated observations. Filter first and copy afterwards so that `data` is an
# independent frame rather than a slice of another one; columns can then be added to it
# later without pandas raising SettingWithCopyWarning.
data = result_df.loc[result_df['treatment'] == 1].copy()
data.head()

Unnamed: 0,unit_id,time_id,treatment,covariate1,covariate2,cohort,Y0,treatment_effect,Y1,outcome,period_type,y_0,t_effects
20,3,1,1,-0.479174,-0.974682,2010,6.31238,2,8.31238,8.31238,pre-treatment,6.31238,2.0
22,3,3,1,-1.106335,1.158596,2010,5.067553,2,7.067553,7.067553,pre-treatment,5.067553,2.0
23,3,4,1,-1.196207,-0.820682,2010,3.813685,2,5.813685,5.813685,pre-treatment,3.813685,2.0
24,3,5,1,0.812526,0.963376,2012,6.469303,2,8.469303,8.469303,post-treatment,6.469303,2.0
25,3,6,1,1.35624,0.412781,2012,10.255939,2,12.255939,12.255939,post-treatment,10.255939,2.0


In [13]:
# Sanity check: the mean estimated effect over treated rows should be close to the
# simulated constant treatment effect of 2.
data['t_effects'].mean()

1.999999999994619

In [8]:
# Show a bounded preview instead of dumping the whole treated frame into the notebook.
data.head(10)

Unnamed: 0,unit_id,time_id,treatment,covariate1,covariate2,cohort,Y0,treatment_effect,Y1,outcome,period_type,y_0,t_effects
20,3,1,1,-0.479174,-0.974682,2010,6.31238,2,8.31238,8.31238,pre-treatment,6.31238,2.0
22,3,3,1,-1.106335,1.158596,2010,5.067553,2,7.067553,7.067553,pre-treatment,5.067553,2.0
23,3,4,1,-1.196207,-0.820682,2010,3.813685,2,5.813685,5.813685,pre-treatment,3.813685,2.0
24,3,5,1,0.812526,0.963376,2012,6.469303,2,8.469303,8.469303,post-treatment,6.469303,2.0
25,3,6,1,1.35624,0.412781,2012,10.255939,2,12.255939,12.255939,post-treatment,10.255939,2.0
26,3,7,1,-0.07201,0.82206,2010,4.965614,2,6.965614,6.965614,post-treatment,4.965614,2.0
27,3,8,1,1.003533,1.896793,2011,6.628069,2,8.628069,8.628069,post-treatment,6.628069,2.0
28,3,9,1,0.361636,-0.245388,2010,6.999243,2,8.999243,8.999243,post-treatment,6.999243,2.0
29,3,10,1,-0.64512,-0.753736,2012,5.926794,2,7.926794,7.926794,post-treatment,5.926794,2.0
32,4,3,1,-0.035826,-0.077102,2012,5.205697,2,7.205697,7.205697,pre-treatment,5.205697,2.0


In [9]:
# Smoke-test every estimand supported by aggregate_treatment_effects on the treated rows,
# printing each result in turn: overall, then per cohort, then per elapsed time.
cohort_time_columns = {'groupby_column': 'cohort', 'time_column': 'time_id'}
for estimand_name, extra_kwargs in (
    ('overall', {}),
    ('cohort', cohort_time_columns),
    ('event', cohort_time_columns),
):
    print(aggregate_treatment_effects(data, 't_effects', estimand=estimand_name, **extra_kwargs))

1.9999999999946187
2010    2.0
2011    2.0
2012    2.0
dtype: float64
elapsed_time
-2011    2.0
-2009    2.0
-2008    2.0
-2007    2.0
-2006    2.0
-2005    2.0
-2004    2.0
-2003    2.0
-2002    2.0
-2001    2.0
-2000    2.0
Name: t_effects, dtype: float64
