In [3]:
import pandas as pd

In [5]:
# Path (relative to this notebook) of the processed two-digit NAICS CSV
# that combines the business-dynamics columns with the TFP columns.
filepath = '../processed_data/two_digit_NAICS_bds_tfp.csv'

# Read the whole file into memory, then preview the first five rows.
df = pd.read_csv(filepath)
df.head(5)

Unnamed: 0,year,NAICS,firms,estabs,emp,denom,estabs_entry,estabs_entry_rate,estabs_exit,estabs_exit_rate,...,reallocation_rate,firmdeath_firms,firmdeath_estabs,firmdeath_emp,Industry,Basis,weight,tfp_pct_change,tfp_index_2017,firms_percent_destroyed
0,1987,11,18421,18977,170858,170501,3109,16.513,2811,14.93,...,54.075,2058,2060,11605,"Agriculture, forestry, fishing, and hunting",All workers,0.009333,N.A.,68.776,11.172032
1,1987,21,21621,27222,578807,632507,3291,11.498,6090,21.278,...,30.951,4029,4064,31974,Mining,All workers,0.006984,N.A.,66.059,18.634661
2,1987,22,5844,12393,774290,772758,654,5.294,575,4.655,...,11.146,313,315,1606,Utilities,All workers,0.01357,N.A.,76.927,5.355921
3,1987,23,485658,492062,4965017,4877830,90479,18.838,66956,13.94,...,50.605,49892,49934,282447,Construction,All workers,0.030664,N.A.,116.637,10.273073
4,1987,31-33,286017,332737,16522435,16658447,35126,10.577,33854,10.194,...,22.779,23996,24227,300581,Manufacturing sector,All workers,0.101077,N.A.,76.544,8.389711


In [None]:
# List every column label in the loaded frame.
df.columns

Index(['year', 'NAICS', 'firms', 'estabs', 'emp', 'denom', 'estabs_entry',
       'estabs_entry_rate', 'estabs_exit', 'estabs_exit_rate', 'job_creation',
       'job_creation_births', 'job_creation_continuers',
       'job_creation_rate_births', 'job_creation_rate', 'job_destruction',
       'job_destruction_deaths', 'job_destruction_continuers',
       'job_destruction_rate_deaths', 'job_destruction_rate',
       'net_job_creation', 'net_job_creation_rate', 'reallocation_rate',
       'firmdeath_firms', 'firmdeath_estabs', 'firmdeath_emp', 'Industry',
       'Basis', 'weight', 'tfp_pct_change', 'tfp_index_2017',
       'firms_percent_destroyed'],
      dtype='object')

In [17]:
# Show the dtype pandas inferred for each column (object columns hold strings).
df.dtypes

year                             int64
NAICS                           object
firms                            int64
estabs                           int64
emp                              int64
denom                            int64
estabs_entry                     int64
estabs_entry_rate              float64
estabs_exit                      int64
estabs_exit_rate               float64
job_creation                     int64
job_creation_births              int64
job_creation_continuers          int64
job_creation_rate_births       float64
job_creation_rate              float64
job_destruction                  int64
job_destruction_deaths           int64
job_destruction_continuers       int64
job_destruction_rate_deaths    float64
job_destruction_rate           float64
net_job_creation                 int64
net_job_creation_rate          float64
reallocation_rate              float64
firmdeath_firms                  int64
firmdeath_estabs                 int64
firmdeath_emp            

In [None]:
'''
Which way round does the causality run? One would expect business dynamism to go on the RHS (as a regressor), serving as a proxy for creative destruction.
'''

In [29]:
# Business dynamism measures used as regressors.
# Currently left out: 'job_creation_rate', 'job_destruction_rate'
bus_vars = ['reallocation_rate', 'estabs_entry_rate', 'estabs_exit_rate']

# Generate one-year lags within each industry (NAICS).
#
# groupby(...).shift(1) lags by *row position*, not by calendar year, so the
# rows are ordered by year first instead of relying on the CSV's row order.
# Sorting on "year" alone is enough: groupby keeps each industry's rows in the
# order they appear in the sorted frame.
panel = df.sort_values("year")
by_industry = panel.groupby("NAICS")

# A shifted value is only a true one-year lag when the previous row of the same
# industry is exactly one year earlier; anything else (a missing year) gets NaN.
is_consecutive = (panel["year"] - by_industry["year"].shift(1)).eq(1)

for v in bus_vars:
    # Assignment aligns on the index, so df keeps its original row order.
    # NOTE(review): this relies on df having a unique index (the default
    # RangeIndex produced by read_csv) - confirm if df is re-indexed upstream.
    df[f"{v}_lag"] = by_industry[v].shift(1).where(is_consecutive)

In [30]:
# tfp_pct_change is read in as text because some entries are the placeholder
# "N.A."; coerce the column to numbers so those placeholders become NaN.
df['tfp_pct_change'] = pd.to_numeric(df['tfp_pct_change'], errors='coerce')

# Analysis sample: every row that has a numeric TFP growth figure.
df_clean = df.dropna(axis='index', how='any', subset=['tfp_pct_change'])

In [31]:
# Preview the cleaned frame (rows without a numeric tfp_pct_change removed).
df_clean.head()

Unnamed: 0,year,NAICS,firms,estabs,emp,denom,estabs_entry,estabs_entry_rate,estabs_exit,estabs_exit_rate,...,Basis,weight,tfp_pct_change,tfp_index_2017,firms_percent_destroyed,estabs_entry_rate_lag,estabs_exit_rate_lag,job_creation_rate_lag,job_destruction_rate_lag,reallocation_rate_lag
1,1988,11,18757,19233,178943,174832,3139,16.423,2899,15.168,...,All workers,0.008802,-7.9,63.369,10.993229,16.513,14.93,27.669,27.037,54.075
2,1989,11,19066,19577,179228,179791,3020,15.554,2698,13.896,...,All workers,0.008845,5.5,66.843,10.510857,16.423,15.168,29.056,24.317,48.634
3,1990,11,19708,20253,183371,181703,3324,16.681,2672,13.409,...,All workers,0.008622,5.5,70.492,9.813274,15.554,13.896,24.794,25.445,49.589
4,1991,11,20140,20668,183846,185670,2904,14.042,2931,14.172,...,All workers,0.008256,0.8,71.025,10.307845,16.681,13.409,25.378,23.566,47.133
5,1992,11,20283,20789,186909,184322,3036,14.671,2847,13.757,...,All workers,0.008289,7.1,76.087,10.39787,14.042,14.172,24.052,26.196,48.104


In [32]:
import statsmodels.formula.api as smf

# Two-way fixed-effects regression of TFP growth on business-dynamism measures.
# Regressors: each measure in bus_vars (contemporaneous) and its one-year lag,
# plus industry (NAICS) and year dummies.
lag_terms = [f"{v}_lag" for v in bus_vars]
formula = (
    "tfp_pct_change ~ "
    + " + ".join(bus_vars + lag_terms)   # contemporaneous, then lagged
    + " + C(NAICS) + C(year)"            # industry & year FE
)

# Build the estimation sample explicitly. The formula interface silently drops
# rows with missing values, while the cluster labels below are passed
# separately; taking both from the same NaN-free frame keeps them the same
# length and in the same row order.
model_columns = ["tfp_pct_change", "NAICS", "year"] + bus_vars + lag_terms
reg_data = df_clean.dropna(subset=model_columns)

# Standard errors are clustered by year.
# NOTE(review): the model also includes year fixed effects and the panel has
# few years; consider whether clustering on NAICS is the intended choice.
fe_mod = smf.ols(formula=formula, data=reg_data).fit(
    cov_type="cluster", cov_kwds={"groups": reg_data["year"]}
)
print(fe_mod.summary())



                            OLS Regression Results                            
Dep. Variable:         tfp_pct_change   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     19.62
Date:                Sat, 24 May 2025   Prob (F-statistic):           1.56e-13
Time:                        14:34:31   Log-Likelihood:                -1561.0
No. Observations:                 630   AIC:                             3238.
Df Residuals:                     572   BIC:                             3496.
Df Model:                          57                                         
Covariance Type:              cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                -1.08

