In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

from src.data_gen import data_gen
from src.csc_ipca import CSC_IPCA
from src.csc_ife import CSC_IFE
from src.scm import SCM

# render every figure in 8pt Times New Roman
plt.rcParams.update({'font.family': 'Times New Roman', 'font.size': 8})
# keep seaborn's current colour palette around; the bare name shows it as the cell output
colors = sns.color_palette()
colors

### 1. Finite sample performance of the CSC-IPCA estimator

In [2]:
# Design constants shared by every simulation below
N_tr = 5   # number of treated units
T1 = 5     # number of post-treatment periods
L = 9      # total number of covariates
K = 2      # number of factors
drift = 2  # drift handed to data_gen

In [3]:
# define a function to gen simulated true and estimated att
def simulation_fun(T0, N_co, alpha, n_simulations, method, seed=None):
    """Run a Monte Carlo experiment for one estimator and summarise its ATT error.

    Every replication draws a fresh panel with ``data_gen``, fits the requested
    estimator and compares the estimated average treatment effect on the
    treated (ATT) over the last ``T1`` periods with the true one, taken from
    the ``eff`` column of the treated group.

    The function reads the notebook-level constants ``T1``, ``N_tr``, ``L``,
    ``K`` and ``drift``.

    Parameters
    ----------
    T0 : int
        Passed straight to ``data_gen`` (presumably the number of
        pre-treatment periods -- confirm against ``data_gen``).
    N_co : int
        Passed straight to ``data_gen`` (presumably the number of control
        units -- confirm against ``data_gen``).
    alpha : float
        Share of the ``L`` covariates made available to the estimator. The
        number of observed covariates is ``int(alpha * L)``, i.e. truncated
        toward zero. Ignored when ``method == 'SCM'``.
    n_simulations : int
        Number of replications.
    method : {'CSC_IPCA', 'CSC_IFE', 'SCM'}
        Estimator to evaluate.
    seed : int, optional
        When given, ``np.random.seed(seed)`` is called once before the first
        replication. This only controls draws from NumPy's global generator;
        whether ``data_gen`` and the estimators use that generator is not
        visible from here. The default ``None`` leaves the RNG untouched.

    Returns
    -------
    tuple
        ``(avg_bias, sd_att_est, rmse)``: mean of ``att_est - att``, sample
        standard deviation of ``att_est`` (pandas default, ``ddof=1``) and
        root mean squared bias across replications.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Fail fast: reject an unknown method before any data is simulated.
    if method not in ('CSC_IPCA', 'CSC_IFE', 'SCM'):
        raise ValueError('method should be one of: CSC_IPCA, CSC_IFE, SCM')

    if seed is not None:
        np.random.seed(seed)

    # Loop-invariant: candidate covariate names and how many of them are observed.
    covariates = ['x' + str(j) for j in range(1, L+1)]
    L_obs = int(alpha*len(covariates))

    records = []
    for _ in range(n_simulations):
        # generate data
        df = data_gen(T0, T1, N_co, N_tr, L, K, drift)
        # add a constant
        df['const'] = 1

        # Per-period means over the treated group; .iloc makes the
        # "last T1 periods" selection explicitly positional.
        treated_by_time = df.query("tr_group==1").groupby('time')
        att = treated_by_time['eff'].mean().iloc[-T1:].mean()
        y_treated = treated_by_time['y'].mean().iloc[-T1:]

        # Random subset of observed covariates, plus the constant. Drawn every
        # replication (even for SCM) so the RNG stream matches the original code.
        obs_covariates = list(np.random.choice(covariates, size=L_obs, replace=False)) + ['const']

        if method == 'SCM':
            model = SCM()
            model.fit(df, 'id', 'time', 'y', 'treated', None)
            y_syn = model.predict()[-T1:]
        else:
            model = CSC_IPCA() if method == 'CSC_IPCA' else CSC_IFE()
            model.fit(df, 'id', 'time', 'y', 'treated', obs_covariates, K)
            # predict() output is averaged over axis 0 before taking the last T1 entries
            y_syn = model.predict().mean(axis=0)[-T1:]

        att_est = (y_treated - y_syn).mean()
        records.append((att, att_est, att_est - att))

    results_df = pd.DataFrame(records, columns=['att', 'att_est', 'bias'], dtype=float)
    avg_bias = results_df['bias'].mean()
    sd_att_est = results_df['att_est'].std()
    rmse = np.sqrt((results_df['bias']**2).mean())

    return avg_bias, sd_att_est, rmse

In [4]:
# Define a function to wrap the call to simulation_fun for readability and ease of use with joblib
def simulate(alpha, t, n, method, n_simulations=1000):
    """Run ``simulation_fun`` for one design point and label the result.

    Parameters
    ----------
    alpha : float
        Forwarded to ``simulation_fun`` as ``alpha``.
    t : int
        Forwarded to ``simulation_fun`` as ``T0``.
    n : int
        Forwarded to ``simulation_fun`` as ``N_co``.
    method : str
        Forwarded to ``simulation_fun`` as ``method``.
    n_simulations : int, default 1000
        Number of replications; the default keeps the previously
        hard-coded value, so existing calls behave as before.

    Returns
    -------
    dict
        Keys ``alpha``, ``T0``, ``N_co`` (the inputs) and ``bias``,
        ``std_att``, ``rmse_att`` (the three values returned by
        ``simulation_fun``, in that order).
    """
    bias, std_att, rmse_att = simulation_fun(T0=t, N_co=n, alpha=alpha, n_simulations=n_simulations, method=method)
    return {"alpha": alpha, "T0": t, "N_co": n, "bias": bias, "std_att": std_att, "rmse_att": rmse_att}

In [5]:
# shares of the covariates that are observed
alphas = [1/3, 2/3, 1]
# values tried for the T0 argument of the simulations
T0 = [10, 20, 40]
# values tried for the N_co argument (same grid, kept as an independent list)
N_co = list(T0)

In [12]:
# CSC-IPCA: one joblib task per (alpha, T0, N_co) combination;
# n_jobs=-1 spreads the tasks over all available CPU cores
results1 = Parallel(n_jobs=-1)(
    delayed(simulate)(alpha, t, n, 'CSC_IPCA')
    for alpha in alphas
    for t in T0
    for n in N_co
)

# collect the per-combination dicts into a DataFrame (one row each)
df1 = pd.DataFrame(results1)

In [13]:
# results of the 1000 simulations, with the constant included
(
    df1
    .pivot_table(values=['bias', 'std_att', 'rmse_att'], index=['T0', 'N_co'], columns='alpha')
    .round(3)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,bias,bias,bias,rmse_att,rmse_att,rmse_att,std_att,std_att,std_att
Unnamed: 0_level_1,alpha,0.333333,0.666667,1.000000,0.333333,0.666667,1.000000,0.333333,0.666667,1.000000
T0,N_co,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
10,10,2.328,0.703,0.13,4.77,3.068,1.642,4.175,3.032,1.684
10,20,1.367,0.312,0.053,3.484,2.209,0.914,3.26,2.219,1.008
10,40,1.026,0.196,0.051,2.776,1.752,0.714,2.616,1.781,0.821
20,10,2.957,1.029,0.217,4.817,2.696,1.135,3.814,2.544,1.179
20,20,1.435,0.438,0.055,3.28,1.754,0.745,2.982,1.773,0.86
20,40,1.093,0.167,0.042,2.613,1.348,0.602,2.43,1.409,0.757
40,10,2.905,1.232,0.145,4.911,3.035,0.969,3.972,2.797,1.065
40,20,1.67,0.399,0.019,3.592,1.718,0.724,3.221,1.737,0.861
40,40,0.876,0.295,0.006,2.675,1.418,0.574,2.556,1.441,0.697


In [14]:
# write the rounded CSC-IPCA summary table to a LaTeX file
(
    df1
    .pivot_table(values=['bias', 'std_att', 'rmse_att'], index=['T0', 'N_co'], columns='alpha')
    .round(3)
    .to_latex('figs/sim_results1.tex', float_format="%.3f")
)

### 2. Finite sample performance of the CSC-IFE estimator

In [10]:
# CSC-IFE: one joblib task per (alpha, T0, N_co) combination, run on all cores
results2 = Parallel(n_jobs=-1)(
    delayed(simulate)(alpha, t, n, 'CSC_IFE')
    for alpha in alphas
    for t in T0
    for n in N_co
)

# collect the per-combination dicts into a DataFrame (one row each)
df2 = pd.DataFrame(results2)

In [11]:
# CSC-IFE summary: rows are (T0, N_co), columns are metric x alpha
(
    df2
    .pivot_table(values=['bias', 'std_att', 'rmse_att'], index=['T0', 'N_co'], columns='alpha')
    .round(3)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,bias,bias,bias,rmse_att,rmse_att,rmse_att,std_att,std_att,std_att
Unnamed: 0_level_1,alpha,0.333333,0.666667,1.000000,0.333333,0.666667,1.000000,0.333333,0.666667,1.000000
T0,N_co,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
10,10,6.478,3.184,0.045,7.996,4.311,0.747,4.701,2.939,0.854
10,20,6.173,2.885,-0.018,8.07,4.05,0.599,5.245,2.9,0.769
10,40,4.51,2.516,-0.007,6.785,3.931,0.527,5.096,3.044,0.675
20,10,6.65,3.536,-0.007,8.051,4.843,0.777,4.593,3.336,0.904
20,20,6.402,3.198,-0.013,8.085,4.529,0.587,4.939,3.272,0.74
20,40,5.69,2.555,0.001,7.633,3.864,0.57,5.111,2.935,0.72
40,10,7.353,3.523,-0.008,11.132,5.258,0.696,8.364,3.95,0.846
40,20,6.904,3.23,0.036,9.185,5.084,0.602,6.053,3.935,0.747
40,40,5.978,2.928,0.003,9.368,4.85,0.65,7.227,3.927,0.782


In [15]:
# write the rounded CSC-IFE summary table to a LaTeX file
(
    df2
    .pivot_table(values=['bias', 'std_att', 'rmse_att'], index=['T0', 'N_co'], columns='alpha')
    .round(3)
    .to_latex('figs/sim_results2.tex', float_format="%.3f")
)

### 3. Finite sample performance of the SCM estimator

In [8]:
# SCM: one joblib task per (alpha, T0, N_co) combination;
# n_jobs=-1 spreads the tasks over all available CPU cores
results3 = Parallel(n_jobs=-1)(
    delayed(simulate)(alpha, t, n, 'SCM')
    for alpha in alphas
    for t in T0
    for n in N_co
)

# collect the per-combination dicts into a DataFrame (one row each)
df3 = pd.DataFrame(results3)

In [9]:
# SCM summary: rows are (T0, N_co), columns are metric x alpha
(
    df3
    .pivot_table(values=['bias', 'std_att', 'rmse_att'], index=['T0', 'N_co'], columns='alpha')
    .round(3)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,bias,bias,bias,rmse_att,rmse_att,rmse_att,std_att,std_att,std_att
Unnamed: 0_level_1,alpha,0.333333,0.666667,1.000000,0.333333,0.666667,1.000000,0.333333,0.666667,1.000000
T0,N_co,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
10,10,10.026,10.188,9.909,10.997,11.323,10.964,4.554,4.96,4.721
10,20,9.874,10.007,9.924,11.011,11.168,11.028,4.872,4.991,4.84
10,40,9.72,10.088,9.521,10.891,11.388,10.655,4.936,5.304,4.808
20,10,10.596,10.526,10.674,12.036,11.841,12.149,5.714,5.454,5.816
20,20,10.269,10.25,10.113,11.935,11.671,11.745,6.109,5.614,5.985
20,40,9.719,9.654,10.206,11.309,11.206,12.353,5.81,5.699,6.986
40,10,10.678,11.069,11.117,12.794,12.946,13.705,7.087,6.731,8.033
40,20,10.97,11.207,11.148,13.719,13.459,13.601,8.249,7.451,7.78
40,40,10.742,10.851,10.221,13.303,13.786,12.684,7.853,8.52,7.539


In [16]:
# write the rounded SCM summary table to a LaTeX file
(
    df3
    .pivot_table(values=['bias', 'std_att', 'rmse_att'], index=['T0', 'N_co'], columns='alpha')
    .round(3)
    .to_latex('figs/sim_results3.tex', float_format="%.3f")
)