In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats

In [2]:
# Function to estimate your model
def run_model(data):
    X = sm.add_constant(data[['x1', 'x2']])
    y = data['y']
    model = sm.OLS(y, X).fit()
    return model.params['x1'], model.bse['x1']

# Bootstrap-t function
def bootstrap_t(df, model_func, B=1000, alpha=0.05):
    N = len(df['entity'].unique())
    original_estimate, _ = model_func(df)

    bootstrap_estimates = np.zeros(B)
    for b in range(B):
        bootstrap_sample = df.sample(frac=1, replace=True)
        bootstrap_estimate, _ = model_func(bootstrap_sample)
        bootstrap_estimates[b] = np.sqrt(N) * (bootstrap_estimate - original_estimate)

    q25, q75 = np.percentile(bootstrap_estimates, [25, 75])
    z25, z75 = stats.norm.ppf([0.25, 0.75])
    normalization_factor = (q75 - q25) / (z75 - z25)

    normalized_t_stats = bootstrap_estimates / normalization_factor
    lower_percentile = np.percentile(normalized_t_stats, 100 * (alpha / 2))
    upper_percentile = np.percentile(normalized_t_stats, 100 * (1 - alpha / 2))

    SE_original_estimate = np.std(bootstrap_estimates)
    CI_lower = original_estimate - upper_percentile * SE_original_estimate
    CI_upper = original_estimate - lower_percentile * SE_original_estimate

    return original_estimate, CI_lower, CI_upper, SE_original_estimate

In [3]:
# Simulated data
N = 100
T = 10
np.random.seed(42)
df = pd.DataFrame({
    'y': np.random.randn(N * T),
    'x1': np.random.randn(N * T),
    'x2': np.random.randn(N * T),
    'entity': np.array([[i] * T for i in range(N)]).flatten(),
    'time': np.array([list(range(T)) for _ in range(N)]).flatten()
})

# Bootstrap results
original_estimate, CI_lower, CI_upper, bootstrap_SE = bootstrap_t(df, run_model)
print(f"Bootstrap Results:")
print(f"Original Estimate: {original_estimate}")
print(f"Bootstrap Confidence Interval: ({CI_lower}, {CI_upper})")
print(f"Bootstrap Standard Error: {bootstrap_SE}")

# Statsmodels results
X = sm.add_constant(df[['x1', 'x2']])
y = df['y']
model = sm.OLS(y, X).fit()
statsmodels_estimate = model.params['x1']
statsmodels_SE = model.bse['x1']
statsmodels_CI_lower = statsmodels_estimate - 1.96 * statsmodels_SE
statsmodels_CI_upper = statsmodels_estimate + 1.96 * statsmodels_SE

print(f"\nStatsmodels Results:")
print(f"Estimate: {statsmodels_estimate}")
print(f"Confidence Interval: ({statsmodels_CI_lower}, {statsmodels_CI_upper})")
print(f"Standard Error: {statsmodels_SE}")

Bootstrap Results:
Original Estimate: -0.03942280690283285
Bootstrap Confidence Interval: (-0.590087548044941, 0.5499552657649268)
Bootstrap Standard Error: 0.30706585835553074

Statsmodels Results:
Estimate: -0.03942280690283285
Confidence Interval: (-0.1003013460770375, 0.021455732271371805)
Standard Error: 0.031060479170512578


In [4]:
# ... (previous code remains unchanged)

# Block Bootstrap-t function
def block_bootstrap_t(df, model_func, B=1000, alpha=0.05):
    N = len(df['entity'].unique())
    
    original_estimate, original_se = model_func(df)
    block_ids = df['entity'].unique()

    bootstrap_estimates = np.zeros(B)
    bootstrap_se = np.zeros(B)

    for b in range(B):
        sampled_blocks = np.random.choice(block_ids, size=N, replace=True)
        bootstrap_sample = df[df['entity'].isin(sampled_blocks)]
        
        bootstrap_estimate, bootstrap_std_error = model_func(bootstrap_sample)
        bootstrap_estimates[b] = bootstrap_estimate
        bootstrap_se[b] = bootstrap_std_error

    # Studentized bootstrap
    t_star = (bootstrap_estimates - original_estimate) / bootstrap_se
    bootstrap_se_estimate = np.std(t_star)  # Bootstrap standard error
    lower = np.percentile(t_star, 100 * alpha / 2)
    upper = np.percentile(t_star, 100 * (1 - alpha / 2))
    
    CI_lower = original_estimate - upper * original_se
    CI_upper = original_estimate - lower * original_se
    
    return original_estimate, CI_lower, CI_upper, bootstrap_se_estimate


In [5]:
# Simulated data
N = 100
T = 10
np.random.seed(42)
df = pd.DataFrame({
    'y': np.random.randn(N * T),
    'x1': np.random.randn(N * T),
    'x2': np.random.randn(N * T),
    'entity': np.array([[i] * T for i in range(N)]).flatten(),
    'time': np.array([list(range(T)) for _ in range(N)]).flatten()
})

# Block Bootstrap results
original_estimate, CI_lower, CI_upper, bootstrap_se_estimate = block_bootstrap_t(df, run_model)
print(f"Block Bootstrap Results:")
print(f"Original Estimate: {original_estimate}")
print(f"Block Bootstrap Confidence Interval: ({CI_lower}, {CI_upper})")
print(f"Bootstrap Standard Error: {bootstrap_se_estimate}")

# Statsmodels results
X = sm.add_constant(df[['x1', 'x2']])
y = df['y']
model = sm.OLS(y, X).fit()
statsmodels_estimate = model.params['x1']
statsmodels_SE = model.bse['x1']
statsmodels_CI_lower = statsmodels_estimate - 1.96 * statsmodels_SE
statsmodels_CI_upper = statsmodels_estimate + 1.96 * statsmodels_SE

print(f"\nStatsmodels Results:")
print(f"Estimate: {statsmodels_estimate}")
print(f"Confidence Interval: ({statsmodels_CI_lower}, {statsmodels_CI_upper})")
print(f"Standard Error: {statsmodels_SE}")


Block Bootstrap Results:
Original Estimate: -0.03942280690283285
Block Bootstrap Confidence Interval: (-0.07340271790063685, -0.0030960253386790246)
Bootstrap Standard Error: 0.5597827362359974

Statsmodels Results:
Estimate: -0.03942280690283285
Confidence Interval: (-0.1003013460770375, 0.021455732271371805)
Standard Error: 0.031060479170512578


In [6]:


# Dummy function to simulate your model (Replace with your actual model)
def run_model(df):
    X = sm.add_constant(df['x'])
    model = sm.OLS(df['y'], X).fit()
    return model.params['x'], model.bse['x']

# Simplified Bootstrap-t function
def bootstrap_t(df, model_func, B=1000, alpha=0.05):
    N = len(df['entity'].unique())
    original_estimate, _ = model_func(df)

    bootstrap_estimates = np.zeros(B)
    for b in range(B):
        bootstrap_sample = df.sample(frac=1, replace=True)
        bootstrap_estimate, _ = model_func(bootstrap_sample)
        bootstrap_estimates[b] = np.sqrt(N) * (bootstrap_estimate - original_estimate)

    lower_percentile = np.percentile(bootstrap_estimates, 100 * (alpha / 2))
    upper_percentile = np.percentile(bootstrap_estimates, 100 * (1 - alpha / 2))

    SE_original_estimate = np.std(bootstrap_estimates)

    CI_lower = original_estimate - upper_percentile / np.sqrt(N)
    CI_upper = original_estimate - lower_percentile / np.sqrt(N)

    return original_estimate, CI_lower, CI_upper, SE_original_estimate

# Simulated data: Replace with your actual panel data
np.random.seed(42)
df = pd.DataFrame({
    'y': np.random.normal(0, 1, 1000),
    'x': np.random.normal(0, 1, 1000),
    'entity': np.array([i//10 for i in range(1000)])
})

# Run Bootstrap-t
original_estimate, CI_lower, CI_upper, SE_bootstrap = bootstrap_t(df, run_model, B=10000)
print(f"Bootstrap-t Results:")
print(f"Original Estimate: {original_estimate}")
print(f"Bootstrap-t Confidence Interval: ({CI_lower}, {CI_upper})")
print(f"Bootstrap-t Standard Error: {SE_bootstrap}")

# Run Statsmodels
X = sm.add_constant(df['x'])
model = sm.OLS(df['y'], X).fit()
statsmodels_estimate = model.params['x']
statsmodels_se = model.bse['x']
statsmodels_CI = model.conf_int(alpha=0.05).loc['x']

print(f"\nStatsmodels Results:")
print(f"Estimate: {statsmodels_estimate}")
print(f"Confidence Interval: ({statsmodels_CI[0]}, {statsmodels_CI[1]})")
print(f"Standard Error: {statsmodels_se}")

Bootstrap-t Results:
Original Estimate: -0.03966116022360016
Bootstrap-t Confidence Interval: (-0.1010221301937804, 0.02226658865685311)
Bootstrap-t Standard Error: 0.31372040631252984

Statsmodels Results:
Estimate: -0.03966116022360016
Confidence Interval: (-0.10059247702789614, 0.02127015658069583)
Standard Error: 0.03105027537440998


In [7]:
# Setting a seed for reproducibility
np.random.seed(0)

# Number of entities in each cohort and periods
n_entities_cohort1 = 50
n_entities_cohort2 = 50
n_entities_cohort3 = 50
n_periods = 5

# Generate a base level for outcome variable Y for each cohort and period (let's say income)
base_y_cohort1 = np.random.normal(50, 10, (n_entities_cohort1, n_periods))
base_y_cohort2 = np.random.normal(60, 10, (n_entities_cohort2, n_periods))
base_y_cohort3 = np.random.normal(55, 10, (n_entities_cohort3, n_periods))

# Treatment effects (these will vary by cohort and period)
# Assume these are the true treatment effects you want to estimate
treatment_effects_cohort1 = np.array([0, 0, 5, 5, 5])  # Treated in period 3
treatment_effects_cohort2 = np.array([0, 0, 0, 7, 7])  # Treated in period 4
treatment_effects_cohort3 = np.array([0, 0, 0, 0, 0])  # Never treated

# Apply treatment effects to the Y variable (plus some random noise for each individual)
for i in range(n_periods):
    base_y_cohort1[:, i] += treatment_effects_cohort1[i] + np.random.normal(0, 2, n_entities_cohort1)  # Assume a std error of 2
    base_y_cohort2[:, i] += treatment_effects_cohort2[i] + np.random.normal(0, 2, n_entities_cohort2)  # Assume a std error of 2
    base_y_cohort3[:, i] += treatment_effects_cohort3[i] + np.random.normal(0, 2, n_entities_cohort3)  # Assume a std error of 2

# Prepare data frame
df_cohort1 = pd.DataFrame(base_y_cohort1, columns=[f'period_{i+1}' for i in range(n_periods)])
df_cohort1['entity'] = [f'cohort1_{i+1}' for i in range(n_entities_cohort1)]
df_cohort1 = df_cohort1.melt(id_vars=['entity'], var_name='period', value_name='y')

df_cohort2 = pd.DataFrame(base_y_cohort2, columns=[f'period_{i+1}' for i in range(n_periods)])
df_cohort2['entity'] = [f'cohort2_{i+1}' for i in range(n_entities_cohort2)]
df_cohort2 = df_cohort2.melt(id_vars=['entity'], var_name='period', value_name='y')

df_cohort3 = pd.DataFrame(base_y_cohort3, columns=[f'period_{i+1}' for i in range(n_periods)])
df_cohort3['entity'] = [f'cohort3_{i+1}' for i in range(n_entities_cohort3)]
df_cohort3 = df_cohort3.melt(id_vars=['entity'], var_name='period', value_name='y')

# Combine all cohorts into one DataFrame
df = pd.concat([df_cohort1, df_cohort2, df_cohort3], ignore_index=True)

# Add a cohort indicator
df['cohort'] = df['entity'].apply(lambda x: x.split('_')[0])

# Add a treatment indicator (1 if treated, 0 otherwise)
df['treatment'] = 0
df.loc[(df['cohort'] == 'cohort1') & (df['period'].isin(['period_3', 'period_4', 'period_5'])), 'treatment'] = 1
df.loc[(df['cohort'] == 'cohort2') & (df['period'].isin(['period_4', 'period_5'])), 'treatment'] = 1

print(df.head())

      entity    period          y   cohort  treatment
0  cohort1_1  period_1  69.185913  cohort1          0
1  cohort1_2  period_1  36.904025  cohort1          0
2  cohort1_3  period_1  52.336826  cohort1          0
3  cohort1_4  period_1  56.729106  cohort1          0
4  cohort1_5  period_1  24.440386  cohort1          0
