In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.tsa.stattools import grangercausalitytests
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from itertools import product

In [2]:
df = pd.read_csv('../processed_data/three_digit_NAICS_final.csv')

Unnamed: 0,year,NAICS,firms,estabs,emp,denom,estabs_entry,estabs_entry_rate,estabs_exit,estabs_exit_rate,...,Basis,weight,tfp_pct_change,tfp_index_2017,firms_percent_destroyed,eer,pct_high_growth_emp,tfp_log,tfp_diff3,tfp_ann_pct
0,1987,211,7297,8942,130532,142230,909,9.665,1836,19.52,...,All workers,0.02326,,60.776,16.499931,5.170498,4.685441,4.107195,,
1,1987,212,6245,8515,285319,296796,1120,13.007,1312,15.236,...,All workers,0.012166,,86.967,12.281825,6.673607,6.086521,4.465529,,
2,1987,213,8315,9765,162956,193482,1262,11.9,2942,27.742,...,All workers,0.00377,,58.849,24.78653,7.373813,6.601782,4.074975,,
3,1987,321,14489,16107,517089,510664,1884,11.717,1828,11.369,...,All workers,0.01619,,104.62,9.11036,4.360401,5.893956,4.650335,,
4,1987,322,3683,5633,572571,572627,330,5.847,351,6.22,...,All workers,0.02875,,96.878,6.163454,1.993968,1.237576,4.573452,,


In [3]:
num_industries = df['Industry'].nunique()
print(num_industries)

36


In [6]:
df = df.sort_values(['Industry', 'year'])

df['tfp_ann_pct_lead'] = df.groupby('Industry')['tfp_ann_pct'].shift(-1)
df['tfp_pct_change_lead'] = df.groupby('Industry')['tfp_pct_change'].shift(-1)


Basic regressions

In [102]:
df = df.sort_values(['Industry', 'year'])

mod1 = smf.ols(
    'tfp_ann_pct_lead ~ eer',
    data=df
).fit(cov_type='HC1')          

print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.7380
Date:                Mon, 26 May 2025   Prob (F-statistic):              0.390
Time:                        12:27:54   Log-Likelihood:                -2936.2
No. Observations:                1188   AIC:                             5876.
Df Residuals:                    1186   BIC:                             5887.
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.7089      0.119      5.940      0.0

In [9]:

df = df.sort_values(['Industry', 'year'])

dependent_vars = [
    'tfp_ann_pct_lead', 
    'tfp_pct_change_lead', 
    'tfp_diff3', 
    'tfp_ann_pct', 
    'tfp_log', 
    'tfp_index_2017'
]

independent_vars = [
    'eer', 
    'pct_high_growth_emp', 
    'reallocation_rate'
]

results = {}

print("="*80)
print("SIMPLE OLS REGRESSION RESULTS")
print("="*80)

for dep_var in dependent_vars:
    for indep_var in independent_vars:
        
        # Create the formula
        formula = f'{dep_var} ~ {indep_var}'
        
        try:
            # Run the regression
            model = smf.ols(formula, data=df).fit(cov_type='HC1')
            
            # Store results
            key = f'{dep_var}_vs_{indep_var}'
            results[key] = model
            
            # Print summary information
            print(f"\n{'-'*60}")
            print(f"MODEL: {dep_var} ~ {indep_var}")
            print(f"{'-'*60}")
            print(f"R-squared: {model.rsquared:.4f}")
            print(f"Adj. R-squared: {model.rsquared_adj:.4f}")
            print(f"F-statistic: {model.fvalue:.4f}")
            print(f"Prob (F-statistic): {model.f_pvalue:.4f}")
            print(f"N observations: {model.nobs:.0f}")
            
            # Print coefficient information
            coef = model.params[indep_var]
            se = model.HC1_se[indep_var]  # Robust standard errors
            t_stat = model.tvalues[indep_var]
            p_value = model.pvalues[indep_var]
            
            print(f"\nCoefficient on {indep_var}:")
            print(f"  Estimate: {coef:.6f}")
            print(f"  Std Error: {se:.6f}")
            print(f"  t-statistic: {t_stat:.4f}")
            print(f"  p-value: {p_value:.4f}")
            print(f"  Significance: {'***' if p_value < 0.01 else '**' if p_value < 0.05 else '*' if p_value < 0.10 else ''}")
            
        except Exception as e:
            print(f"\nERROR with {dep_var} ~ {indep_var}: {str(e)}")
            continue

print("\n" + "="*80)
print("SUMMARY TABLE OF ALL REGRESSIONS")
print("="*80)

summary_data = []
for dep_var in dependent_vars:
    for indep_var in independent_vars:
        key = f'{dep_var}_vs_{indep_var}'
        if key in results:
            model = results[key]
            coef = model.params[indep_var]
            se = model.HC1_se[indep_var]
            p_val = model.pvalues[indep_var]
            r2 = model.rsquared
            n_obs = model.nobs
            
            summary_data.append({
                'Dependent Variable': dep_var,
                'Independent Variable': indep_var,
                'Coefficient': f"{coef:.6f}",
                'Std Error': f"{se:.6f}",
                'p-value': f"{p_val:.4f}",
                'R-squared': f"{r2:.4f}",
                'N': f"{n_obs:.0f}",
                'Significance': '***' if p_val < 0.01 else '**' if p_val < 0.05 else '*' if p_val < 0.10 else ''
            })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

summary_df.to_csv('/Users/danielseymour/Developer/EC334-Summative/processed_data/ols_regression_results.csv', index=False)
print(f"\nResults saved to: /Users/danielseymour/Developer/EC334-Summative/processed_data/ols_regression_results.csv")

SIMPLE OLS REGRESSION RESULTS

------------------------------------------------------------
MODEL: tfp_ann_pct_lead ~ eer
------------------------------------------------------------
R-squared: 0.0007
Adj. R-squared: -0.0001
F-statistic: 0.7380
Prob (F-statistic): 0.3905
N observations: 1188

Coefficient on eer:
  Estimate: -0.016165
  Std Error: 0.018817
  t-statistic: -0.8591
  p-value: 0.3903
  Significance: 

------------------------------------------------------------
MODEL: tfp_ann_pct_lead ~ pct_high_growth_emp
------------------------------------------------------------
R-squared: 0.0106
Adj. R-squared: 0.0097
F-statistic: 23.6689
Prob (F-statistic): 0.0000
N observations: 1188

Coefficient on pct_high_growth_emp:
  Estimate: -0.069148
  Std Error: 0.014213
  t-statistic: -4.8651
  p-value: 0.0000
  Significance: ***

------------------------------------------------------------
MODEL: tfp_ann_pct_lead ~ reallocation_rate
---------------------------------------------------------

In [10]:
data = df[['tfp_ann_pct','pct_high_growth_emp']].dropna()
grangercausalitytests(data[['pct_high_growth_emp','tfp_ann_pct']], maxlag=3)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=2.5507  , p=0.1105  , df_denom=1184, df_num=1
ssr based chi2 test:   chi2=2.5571  , p=0.1098  , df=1
likelihood ratio test: chi2=2.5544  , p=0.1100  , df=1
parameter F test:         F=2.5507  , p=0.1105  , df_denom=1184, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.8516  , p=0.4270  , df_denom=1181, df_num=2
ssr based chi2 test:   chi2=1.7105  , p=0.4252  , df=2
likelihood ratio test: chi2=1.7093  , p=0.4254  , df=2
parameter F test:         F=0.8516  , p=0.4270  , df_denom=1181, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.6303  , p=0.5955  , df_denom=1178, df_num=3
ssr based chi2 test:   chi2=1.9021  , p=0.5930  , df=3
likelihood ratio test: chi2=1.9006  , p=0.5933  , df=3
parameter F test:         F=0.6303  , p=0.5955  , df_denom=1178, df_num=3


{np.int64(1): ({'ssr_ftest': (np.float64(2.550674270696185),
    np.float64(0.11051403225024332),
    np.float64(1184.0),
    np.int64(1)),
   'ssr_chi2test': (np.float64(2.557137127800989),
    np.float64(0.10979725119605252),
    np.int64(1)),
   'lrtest': (np.float64(2.5543866752914255),
    np.float64(0.10998849063099143),
    np.int64(1)),
   'params_ftest': (np.float64(2.5506742706966268),
    np.float64(0.11051403225022109),
    np.float64(1184.0),
    1.0)},
  [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1120da050>,
   <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1120da0b0>,
   array([[0., 1., 0.]])]),
 np.int64(2): ({'ssr_ftest': (np.float64(0.851637517140171),
    np.float64(0.426977492512716),
    np.float64(1181.0),
    np.int64(2)),
   'ssr_chi2test': (np.float64(1.710486190225644),
    np.float64(0.4251798193820432),
    np.int64(2)),
   'lrtest': (np.float64(1.709253916632406),
    np.float64(0.42544186903487125),
    np.int64

Fixed Effects

In [16]:
# ------------------------------------------------------------------
# 3.2  Year OR industry fixed effects, cluster by industry
# ------------------------------------------------------------------
# tfp_ann_pct_lead, tfp_pct_change_lead, tfp_diff3, tfp_ann_pct, tfp_log, tfp_index_2017
# eer, pct_high_growth_emp, reallocation_rate

# 1. First, check for missing values in the data
missing_values = df[['tfp_ann_pct_lead']].isnull().sum()
print("Missing values in each column:")
print(missing_values)

# 2. Drop missing values explicitly to ensure consistency
df_clean = df.dropna(subset=['tfp_ann_pct_lead'])
print(f"Original dataframe shape: {df.shape}, Clean dataframe shape: {df_clean.shape}")

# 3. Run the regression with the clean data
mod2 = smf.ols(
    'tfp_ann_pct_lead ~ pct_high_growth_emp + C(year)',
    data=df_clean
).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_clean['year']}
)

print(mod2.summary())

Missing values in each column:
tfp_ann_pct_lead    108
dtype: int64
Original dataframe shape: (1296, 39), Clean dataframe shape: (1188, 39)
                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     10.09
Date:                Mon, 26 May 2025   Prob (F-statistic):            0.00330
Time:                        13:50:52   Log-Likelihood:                -2912.1
No. Observations:                1188   AIC:                             5892.
Df Residuals:                    1154   BIC:                             6065.
Df Model:                          33                                         
Covariance Type:              cluster                                         
                          coef    std err          z      P>|z|      [



In [18]:
missing_values = df[['tfp_pct_change_lead']].isnull().sum()
print("Missing values in each column:")
print(missing_values)

df_clean = df.dropna(subset=['tfp_ann_pct_lead'])
print(f"Original dataframe shape: {df.shape}, Clean dataframe shape: {df_clean.shape}")

mod2 = smf.ols(
    'tfp_pct_change_lead ~ pct_high_growth_emp + C(year)',
    data=df_clean
).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_clean['year']}
)

print(mod2.summary())

Missing values in each column:
tfp_pct_change_lead    36
dtype: int64
Original dataframe shape: (1296, 39), Clean dataframe shape: (1188, 39)
                             OLS Regression Results                            
Dep. Variable:     tfp_pct_change_lead   R-squared:                       0.048
Model:                             OLS   Adj. R-squared:                  0.021
Method:                  Least Squares   F-statistic:                     172.5
Date:                 Mon, 26 May 2025   Prob (F-statistic):           1.95e-14
Time:                         13:55:45   Log-Likelihood:                -3712.8
No. Observations:                 1188   AIC:                             7494.
Df Residuals:                     1154   BIC:                             7666.
Df Model:                           33                                         
Covariance Type:               cluster                                         
                          coef    std err          z      



In [19]:
missing_values = df[['tfp_ann_pct_lead']].isnull().sum()
print("Missing values in each column:")
print(missing_values)

df_clean = df.dropna(subset=['tfp_ann_pct_lead'])
print(f"Original dataframe shape: {df.shape}, Clean dataframe shape: {df_clean.shape}")

mod2 = smf.ols(
    'tfp_ann_pct_lead ~ reallocation_rate + C(year)',
    data=df_clean
).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_clean['year']}
)

print(mod2.summary())

Missing values in each column:
tfp_ann_pct_lead    108
dtype: int64
Original dataframe shape: (1296, 39), Clean dataframe shape: (1188, 39)
                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     3.950
Date:                Mon, 26 May 2025   Prob (F-statistic):             0.0555
Time:                        13:56:06   Log-Likelihood:                -2918.7
No. Observations:                1188   AIC:                             5905.
Df Residuals:                    1154   BIC:                             6078.
Df Model:                          33                                         
Covariance Type:              cluster                                         
                        coef    std err          z      P>|z|      [0.



In [20]:
missing_values = df[['tfp_ann_pct_lead']].isnull().sum()
print("Missing values in each column:")
print(missing_values)

df_clean = df.dropna(subset=['tfp_ann_pct_lead'])
print(f"Original dataframe shape: {df.shape}, Clean dataframe shape: {df_clean.shape}")

mod2 = smf.ols(
    'tfp_ann_pct_lead ~ eer + C(year)',
    data=df_clean
).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_clean['year']}
)

print(mod2.summary())

Missing values in each column:
tfp_ann_pct_lead    108
dtype: int64
Original dataframe shape: (1296, 39), Clean dataframe shape: (1188, 39)
                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     3.414
Date:                Mon, 26 May 2025   Prob (F-statistic):             0.0739
Time:                        13:56:39   Log-Likelihood:                -2919.1
No. Observations:                1188   AIC:                             5906.
Df Residuals:                    1154   BIC:                             6079.
Df Model:                          33                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.02



In [15]:
df = df.sort_values(['Industry', 'year'])

# Define dependent and independent variables
dependent_vars = [
    'tfp_ann_pct_lead', 
    'tfp_pct_change_lead', 
    'tfp_diff3', 
    'tfp_ann_pct', 
    'tfp_log', 
    'tfp_index_2017'
]

independent_vars = [
    'eer', 
    'pct_high_growth_emp', 
    'reallocation_rate'
]

results_fe = {}

print("="*80)
print("OLS REGRESSION RESULTS WITH INDUSTRY FIXED EFFECTS")
print("="*80)

for dep_var in dependent_vars:
    for indep_var in independent_vars:
        
        formula = f'{dep_var} ~ {indep_var} + C(Industry)'
        
        try:
            model = smf.ols(formula, data=df).fit(cov_type='HC1')
            
            key = f'{dep_var}_vs_{indep_var}_FE'
            results_fe[key] = model
            
            # Print summary information
            print(f"\n{'-'*60}")
            print(f"MODEL: {dep_var} ~ {indep_var} + Industry FE")
            print(f"{'-'*60}")
            print(f"R-squared: {model.rsquared:.4f}")
            print(f"Adj. R-squared: {model.rsquared_adj:.4f}")
            print(f"F-statistic: {model.fvalue:.4f}")
            print(f"Prob (F-statistic): {model.f_pvalue:.4f}")
            print(f"N observations: {model.nobs:.0f}")
            
            coef = model.params[indep_var]
            se = model.HC1_se[indep_var]  # Robust standard errors
            t_stat = model.tvalues[indep_var]
            p_value = model.pvalues[indep_var]
            
            print(f"\nCoefficient on {indep_var}:")
            print(f"  Estimate: {coef:.6f}")
            print(f"  Std Error: {se:.6f}")
            print(f"  t-statistic: {t_stat:.4f}")
            print(f"  p-value: {p_value:.4f}")
            print(f"  Significance: {'***' if p_value < 0.01 else '**' if p_value < 0.05 else '*' if p_value < 0.10 else ''}")
            
            industry_params = [param for param in model.params.index if 'C(Industry)' in param]
            print(f"  Number of Industry FE: {len(industry_params)}")
            
        except Exception as e:
            print(f"\nERROR with {dep_var} ~ {indep_var} + Industry FE: {str(e)}")
            continue

print("\n" + "="*80)
print("SUMMARY TABLE OF ALL REGRESSIONS WITH INDUSTRY FIXED EFFECTS")
print("="*80)

summary_data_fe = []
for dep_var in dependent_vars:
    for indep_var in independent_vars:
        key = f'{dep_var}_vs_{indep_var}_FE'
        if key in results_fe:
            model = results_fe[key]
            coef = model.params[indep_var]
            se = model.HC1_se[indep_var]
            p_val = model.pvalues[indep_var]
            r2 = model.rsquared
            adj_r2 = model.rsquared_adj
            n_obs = model.nobs
            
            # Count industry fixed effects
            industry_params = [param for param in model.params.index if 'C(Industry)' in param]
            n_industries = len(industry_params)
            
            summary_data_fe.append({
                'Dependent Variable': dep_var,
                'Independent Variable': indep_var,
                'Coefficient': f"{coef:.6f}",
                'Std Error': f"{se:.6f}",
                'p-value': f"{p_val:.4f}",
                'R-squared': f"{r2:.4f}",
                'Adj R-squared': f"{adj_r2:.4f}",
                'N': f"{n_obs:.0f}",
                'Industry FE': f"{n_industries}",
                'Significance': '***' if p_val < 0.01 else '**' if p_val < 0.05 else '*' if p_val < 0.10 else ''
            })

summary_df_fe = pd.DataFrame(summary_data_fe)
print(summary_df_fe.to_string(index=False))

summary_df_fe.to_csv('/Users/danielseymour/Developer/EC334-Summative/processed_data/ols_industry_fe_results.csv', index=False)
print(f"\nResults saved to: /Users/danielseymour/Developer/EC334-Summative/processed_data/ols_industry_fe_results.csv")

print(f"\nTo access individual models, use: results_fe['dependent_var_vs_independent_var_FE']")
print(f"Example: results_fe['tfp_ann_pct_lead_vs_eer_FE'].summary()")

print(f"\n" + "="*80)
print("EXAMPLE: Full summary for tfp_ann_pct_lead ~ eer + Industry FE")
print("="*80)
if 'tfp_ann_pct_lead_vs_eer_FE' in results_fe:
    print(results_fe['tfp_ann_pct_lead_vs_eer_FE'].summary())

print(f"\n" + "="*80)
print("INDUSTRY FIXED EFFECTS INCLUDED")
print("="*80)
if 'tfp_ann_pct_lead_vs_eer_FE' in results_fe:
    model_example = results_fe['tfp_ann_pct_lead_vs_eer_FE']
    industry_effects = [param for param in model_example.params.index if 'C(Industry)' in param]
    print(f"Total number of industry fixed effects: {len(industry_effects)}")
    
    # Show a few examples of the industry coefficients
    print(f"\nExample industry fixed effect coefficients:")
    for i, effect in enumerate(industry_effects[:5]):  # Show first 5
        coef = model_example.params[effect]
        print(f"  {effect}: {coef:.6f}")
    if len(industry_effects) > 5:
        print(f"  ... and {len(industry_effects) - 5} more industry fixed effects")

print(f"\n" + "="*80)
print("NOTE: COMPARISON WITH SIMPLE OLS")
print("="*80)
print("The industry fixed effects control for time-invariant industry characteristics.")
print("Compare R-squared values with the simple OLS results to see the improvement.")
print("The coefficient on your main variables now represents within-industry variation.")

OLS REGRESSION RESULTS WITH INDUSTRY FIXED EFFECTS

------------------------------------------------------------
MODEL: tfp_ann_pct_lead ~ eer + Industry FE
------------------------------------------------------------
R-squared: 0.1534
Adj. R-squared: 0.1269
F-statistic: 4.0528
Prob (F-statistic): 0.0000
N observations: 1188

Coefficient on eer:
  Estimate: 0.028829
  Std Error: 0.025905
  t-statistic: 1.1129
  p-value: 0.2658
  Significance: 
  Number of Industry FE: 35

------------------------------------------------------------
MODEL: tfp_ann_pct_lead ~ pct_high_growth_emp + Industry FE
------------------------------------------------------------
R-squared: 0.1529
Adj. R-squared: 0.1264
F-statistic: 4.1223
Prob (F-statistic): 0.0000
N observations: 1188

Coefficient on pct_high_growth_emp:
  Estimate: -0.028557
  Std Error: 0.028007
  t-statistic: -1.0196
  p-value: 0.3079
  Significance: 
  Number of Industry FE: 35

------------------------------------------------------------
MOD

Lags

In [80]:
resid = mod2.resid
out = df_clean.copy()
out['resid'] = resid

out = out.sort_values(['Industry','year'])
out['resid_lag1'] = out.groupby('Industry')['resid'].shift(1)
out['d_resid']    = out['resid'] - out['resid_lag1']

test_df = out.dropna(subset=['resid_lag1','d_resid'])

aux = sm.OLS(test_df['d_resid'], sm.add_constant(test_df['resid_lag1'])).fit()

print(aux.summary())

                            OLS Regression Results                            
Dep. Variable:                d_resid   R-squared:                       0.150
Model:                            OLS   Adj. R-squared:                  0.149
Method:                 Least Squares   F-statistic:                     203.2
Date:                Mon, 26 May 2025   Prob (F-statistic):           1.41e-42
Time:                        15:29:11   Log-Likelihood:                -2495.0
No. Observations:                1152   AIC:                             4994.
Df Residuals:                    1150   BIC:                             5004.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2.689e-17      0.062   4.32e-16      1.0

In [81]:
df_clean['tfp_ann_pct_lead_lag1'] = (
    df_clean
    .sort_values(['Industry','year'])
    .groupby('Industry')['tfp_ann_pct_lead']
    .shift(1)
)

df_dyn = df_clean.dropna(subset=['tfp_ann_pct_lead_lag1', 'tfp_ann_pct_lead'])

mod_ar1 = smf.ols(
    'tfp_ann_pct_lead ~ tfp_ann_pct_lead_lag1 + C(year)',
    data=df_dyn
).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_dyn['NAICS']}
)

print("AR(1) Model - TFP with its own lag:")
print(mod_ar1.summary())

AR(1) Model - TFP with its own lag:
                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.469
Model:                            OLS   Adj. R-squared:                  0.453
Method:                 Least Squares   F-statistic:                     146.5
Date:                Mon, 26 May 2025   Prob (F-statistic):           3.63e-29
Time:                        15:33:05   Log-Likelihood:                -2492.6
No. Observations:                1152   AIC:                             5051.
Df Residuals:                    1119   BIC:                             5218.
Df Model:                          32                                         
Covariance Type:              cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['tfp_ann_pct_lead_lag1'] = (


In [None]:

df_clean['tfp_ann_pct_lead_lag1'] = (
    df_clean
    .sort_values(['Industry','year'])
    .groupby('Industry')['tfp_ann_pct_lead']
    .shift(1)
)
df_dyn = df_clean.dropna(subset=['tfp_ann_pct_lead_lag1', 'tfp_ann_pct_lead'])

mod_dyn = smf.ols(
    'tfp_ann_pct_lead ~ tfp_ann_pct_lead_lag1 + eer + C(year)',
    data=df_dyn
).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_dyn['NAICS']}    # cluster by industry
)

print(mod_dyn.summary())

                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.469
Model:                            OLS   Adj. R-squared:                  0.453
Method:                 Least Squares   F-statistic:                     390.3
Date:                Mon, 26 May 2025   Prob (F-statistic):           1.28e-36
Time:                        14:03:00   Log-Likelihood:                -2492.6
No. Observations:                1152   AIC:                             5053.
Df Residuals:                    1118   BIC:                             5225.
Df Model:                          33                                         
Covariance Type:              cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                -0.37

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['tfp_ann_pct_lead_lag1'] = (


In [83]:
dyn_vars = ['eer', 'pct_high_growth_emp', 'reallocation_rate']

for var in dyn_vars:
    df_clean[f'{var}_lag1'] = (
        df_clean
        .sort_values(['Industry','year'])
        .groupby('Industry')[var]
        .shift(1)
    )

lag_vars = ['tfp_ann_pct_lead_lag1'] + [f'{var}_lag1' for var in dyn_vars]
df_dyn = df_clean.dropna(subset=lag_vars + ['tfp_ann_pct_lead'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[f'{var}_lag1'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[f'{var}_lag1'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[f'{var}_lag1'] = (


In [None]:
for var in dyn_vars:
    print(f"\n{'='*60}")
    print(f"Bivariate: TFP ~ TFP_lag + {var}_lag")
    print(f"{'='*60}")
    
    df_bivariate = df_clean.dropna(subset=['tfp_ann_pct_lead', 'tfp_ann_pct_lead_lag1', f'{var}_lag1'])
    
    mod_bivariate = smf.ols(
        f'tfp_ann_pct_lead ~ tfp_ann_pct_lead_lag1 + {var}_lag1 + C(year)',
        data=df_bivariate
    ).fit(
        cov_type='cluster',
        cov_kwds={'groups': df_bivariate['NAICS']}
    )
    
    print(mod_bivariate.summary())


Bivariate: TFP ~ TFP_lag + eer_lag
                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     23.07
Date:                Mon, 26 May 2025   Prob (F-statistic):           1.19e-15
Time:                        15:36:05   Log-Likelihood:                -2838.9
No. Observations:                1152   AIC:                             5744.
Df Residuals:                    1119   BIC:                             5910.
Df Model:                          32                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Interc

In [86]:
for var in dyn_vars:
    print(f"\n{'='*60}")
    print(f"Bivariate: TFP ~ TFP_lag + {var}_lag")
    print(f"{'='*60}")
    
    df_bivariate = df_clean.dropna(subset=['tfp_ann_pct_lead', 'tfp_ann_pct_lead_lag1', f'{var}_lag1'])
    
    mod_bivariate = smf.ols(
        f'tfp_ann_pct_lead ~ {var}_lag1 + C(year)',
        data=df_bivariate
    ).fit(
        cov_type='cluster',
        cov_kwds={'groups': df_bivariate['NAICS']}
    )
    
    print(mod_bivariate.summary())


Bivariate: TFP ~ TFP_lag + eer_lag
                            OLS Regression Results                            
Dep. Variable:       tfp_ann_pct_lead   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     23.07
Date:                Mon, 26 May 2025   Prob (F-statistic):           1.19e-15
Time:                        15:36:33   Log-Likelihood:                -2838.9
No. Observations:                1152   AIC:                             5744.
Df Residuals:                    1119   BIC:                             5910.
Df Model:                          32                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Interc

In [88]:
for var in dyn_vars:
    print(f"\n{'='*60}")
    print(f"Bivariate: TFP ~ {var}_lag1 + {var}_lag2")
    print(f"{'='*60}")
    
    df_bivariate = df_clean.dropna(subset=['tfp_ann_pct_lead', f'{var}_lag1', f'{var}_lag2'])
    
    mod_bivariate = smf.ols(
        f'tfp_ann_pct_lead ~ {var}_lag1 + {var}_lag2 + C(year)',
        data=df_bivariate
    ).fit(
        cov_type='cluster',
        cov_kwds={'groups': df_bivariate['NAICS']}
    )
    
    print(f"R-squared: {mod_bivariate.rsquared:.4f}")
    print(f"N: {mod_bivariate.nobs}")
    print(f"\nCoefficients of interest:")
    print(f"{var}_lag1: {mod_bivariate.params[f'{var}_lag1']:.4f} (p={mod_bivariate.pvalues[f'{var}_lag1']:.4f})")
    print(f"{var}_lag2: {mod_bivariate.params[f'{var}_lag2']:.4f} (p={mod_bivariate.pvalues[f'{var}_lag2']:.4f})")


Bivariate: TFP ~ eer_lag1 + eer_lag2
R-squared: 0.0298
N: 1116.0

Coefficients of interest:
eer_lag1: -0.0195 (p=0.2658)
eer_lag2: -0.0178 (p=0.1944)

Bivariate: TFP ~ pct_high_growth_emp_lag1 + pct_high_growth_emp_lag2
R-squared: 0.0379
N: 1116.0

Coefficients of interest:
pct_high_growth_emp_lag1: -0.0691 (p=0.0087)
pct_high_growth_emp_lag2: -0.0038 (p=0.8587)

Bivariate: TFP ~ reallocation_rate_lag1 + reallocation_rate_lag2
R-squared: 0.0289
N: 1116.0

Coefficients of interest:
reallocation_rate_lag1: -0.0062 (p=0.6244)
reallocation_rate_lag2: -0.0095 (p=0.3110)


Long differences

In [56]:
early_years = (1988, 1992)  
late_years  = (2015, 2019)

dyn_vars = ['eer','estabs_entry_rate', 'reallocation_rate', 'pct_high_growth_emp']
prod_var = 'tfp_index_2017'

early = (df['year'].between(*early_years))
late  = (df['year'].between(*late_years))

common = dyn_vars + [prod_var, 'weight']

early_means = (
    df[early]
      .groupby('NAICS')[ common ]
      .mean()
      .rename(columns=lambda c: f"{c}_early")
)

late_means = (
    df[late]
      .groupby('NAICS')[ common ]
      .mean()
      .rename(columns=lambda c: f"{c}_late")
)

wide = early_means.join(late_means, how='inner')

In [57]:
for v in dyn_vars:
    wide[f"long_run_diff_log_{v}"] = np.log(wide[f"{v}_late"]) - np.log(wide[f"{v}_early"])

In [58]:
wide["long_run_diff_log_TFP"] = np.log(wide["tfp_index_2017_late"]) - np.log(wide["tfp_index_2017_early"])

In [59]:
wide.head()

Unnamed: 0_level_0,eer_early,estabs_entry_rate_early,reallocation_rate_early,pct_high_growth_emp_early,tfp_index_2017_early,weight_early,eer_late,estabs_entry_rate_late,reallocation_rate_late,pct_high_growth_emp_late,tfp_index_2017_late,weight_late,long_run_diff_log_eer,long_run_diff_log_estabs_entry_rate,long_run_diff_log_reallocation_rate,long_run_diff_log_pct_high_growth_emp,long_run_diff_log_TFP
NAICS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
211,5.547239,10.4566,33.5176,5.390975,59.9544,0.018983,4.978778,6.981,25.2726,3.69123,98.4102,0.02151,-0.108116,-0.404041,-0.28235,-0.378767,0.49556
212,5.468806,10.8988,29.5378,5.450933,100.7284,0.01138,3.00022,6.1048,21.4978,2.508506,97.5342,0.006841,-0.600375,-0.579577,-0.31772,-0.776099,-0.032225
213,7.921582,13.0946,44.2532,8.941696,67.3168,0.00397,5.46812,12.0606,28.0966,6.194326,101.6398,0.008265,-0.370656,-0.082256,-0.454279,-0.367092,0.412025
321,3.900014,10.2046,23.9386,4.569284,102.947,0.014146,2.801099,7.7126,17.106,2.862043,98.0934,0.00824,-0.330968,-0.279983,-0.336063,-0.467821,-0.048294
322,1.676463,5.2398,12.7562,1.285503,96.3174,0.028315,1.371881,3.2436,12.9132,0.789106,102.7226,0.013158,-0.200503,-0.4796,0.012233,-0.488005,0.064383


In [60]:
df.columns

Index(['year', 'NAICS', 'firms', 'estabs', 'emp', 'denom', 'estabs_entry',
       'estabs_entry_rate', 'estabs_exit', 'estabs_exit_rate', 'job_creation',
       'job_creation_births', 'job_creation_continuers',
       'job_creation_rate_births', 'job_creation_rate', 'job_destruction',
       'job_destruction_deaths', 'job_destruction_continuers',
       'job_destruction_rate_deaths', 'job_destruction_rate',
       'net_job_creation', 'net_job_creation_rate', 'reallocation_rate',
       'firmdeath_firms', 'firmdeath_estabs', 'firmdeath_emp', 'Industry',
       'Basis', 'weight', 'tfp_pct_change', 'tfp_index_2017',
       'firms_percent_destroyed', 'eer', 'pct_high_growth_emp', 'tfp_log',
       'tfp_diff3', 'tfp_ann_pct', 'tfp_ann_pct_lead', 'tfp_pct_change_lead'],
      dtype='object')

In [61]:
wide.columns

Index(['eer_early', 'estabs_entry_rate_early', 'reallocation_rate_early',
       'pct_high_growth_emp_early', 'tfp_index_2017_early', 'weight_early',
       'eer_late', 'estabs_entry_rate_late', 'reallocation_rate_late',
       'pct_high_growth_emp_late', 'tfp_index_2017_late', 'weight_late',
       'long_run_diff_log_eer', 'long_run_diff_log_estabs_entry_rate',
       'long_run_diff_log_reallocation_rate',
       'long_run_diff_log_pct_high_growth_emp', 'long_run_diff_log_TFP'],
      dtype='object')

In [62]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vifs = pd.DataFrame({
    'var': X.columns,
    'VIF': [variance_inflation_factor(X.values, i) 
            for i in range(X.shape[1])]
})
print(vifs)

                                   var        VIF
0                                const  24.042431
1  long_run_diff_log_estabs_entry_rate   1.012439
2                     log_weight_early   1.012439


In [64]:
X = wide[[f"long_run_diff_log_{v}" for v in dyn_vars]]
X = sm.add_constant(X)
y = wide["long_run_diff_log_TFP"]

model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': wide.index})
print(model.summary())


                              OLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.160
Model:                               OLS   Adj. R-squared:                  0.052
Method:                    Least Squares   F-statistic:                     3.648
Date:                   Mon, 26 May 2025   Prob (F-statistic):             0.0138
Time:                           14:47:45   Log-Likelihood:                -8.1193
No. Observations:                     36   AIC:                             26.24
Df Residuals:                         31   BIC:                             34.16
Df Model:                              4                                         
Covariance Type:                 cluster                                         
                                            coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------

In [65]:
for var in dyn_vars:
    print(f"\n{'='*60}")
    print(f"Bivariate regression: Δlog TFP ~ Δlog {var}")
    print(f"{'='*60}")
    
    X_single = wide[[f"long_run_diff_log_{var}"]]
    X_single = sm.add_constant(X_single)
    y = wide["long_run_diff_log_TFP"]
    
    # Run regression with clustered standard errors
    model_single = sm.OLS(y, X_single).fit(cov_type='cluster', cov_kwds={'groups': wide.index})
    print(model_single.summary())


Bivariate regression: Δlog TFP ~ Δlog eer
                              OLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.108
Model:                               OLS   Adj. R-squared:                  0.082
Method:                    Least Squares   F-statistic:                     9.768
Date:                   Mon, 26 May 2025   Prob (F-statistic):            0.00356
Time:                           14:47:47   Log-Likelihood:                -9.2090
No. Observations:                     36   AIC:                             22.42
Df Residuals:                         34   BIC:                             25.59
Df Model:                              1                                         
Covariance Type:                 cluster                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------

In [67]:
wide.columns

Index(['eer_early', 'estabs_entry_rate_early', 'reallocation_rate_early',
       'pct_high_growth_emp_early', 'tfp_index_2017_early', 'weight_early',
       'eer_late', 'estabs_entry_rate_late', 'reallocation_rate_late',
       'pct_high_growth_emp_late', 'tfp_index_2017_late', 'weight_late',
       'long_run_diff_log_eer', 'long_run_diff_log_estabs_entry_rate',
       'long_run_diff_log_reallocation_rate',
       'long_run_diff_log_pct_high_growth_emp', 'long_run_diff_log_TFP',
       'log_weight_early', 'log_tfp0'],
      dtype='object')

In [68]:
wide['log_weight_early']   = np.log(wide['weight_early'])
wide['log_tfp0']           = np.log(wide['tfp_index_2017_early'])

X = wide[[
    'long_run_diff_log_eer',
    'long_run_diff_log_reallocation_rate',
    'long_run_diff_log_pct_high_growth_emp',
    'log_weight_early',
]]

X = sm.add_constant(X)

y = wide['long_run_diff_log_TFP']

model = sm.OLS(y, X).fit(cov_type='HC1')
print(model.summary())

                              OLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.132
Model:                               OLS   Adj. R-squared:                  0.020
Method:                    Least Squares   F-statistic:                     2.949
Date:                   Mon, 26 May 2025   Prob (F-statistic):             0.0356
Time:                           15:03:03   Log-Likelihood:                -8.7201
No. Observations:                     36   AIC:                             27.44
Df Residuals:                         31   BIC:                             35.36
Df Model:                              4                                         
Covariance Type:                     HC1                                         
                                            coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------

In [69]:
X = wide[[
    'long_run_diff_log_eer',
    'long_run_diff_log_reallocation_rate',
    'long_run_diff_log_pct_high_growth_emp',
    'log_weight_early'
]].copy()

X['const'] = 1.0

vif_data = pd.DataFrame({
    'variable': X.columns,
    'VIF': [
        variance_inflation_factor(X.values, i)
        for i in range(X.shape[1])
    ]
})

print(vif_data)

                                variable        VIF
0                  long_run_diff_log_eer   4.499490
1    long_run_diff_log_reallocation_rate   3.574638
2  long_run_diff_log_pct_high_growth_emp   4.047290
3                       log_weight_early   1.280081
4                                  const  26.148922


In [None]:

w = wide['weight_early']
X = wide[['long_run_diff_log_eer']]      
X = sm.add_constant(X)
y = wide['long_run_diff_log_TFP']

wls_mod = sm.WLS(y, X, weights=w).fit(cov_type='HC1')
print(wls_mod.summary())

                              WLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.131
Model:                               WLS   Adj. R-squared:                  0.105
Method:                    Least Squares   F-statistic:                     2.817
Date:                   Mon, 26 May 2025   Prob (F-statistic):              0.102
Time:                           15:15:53   Log-Likelihood:                -24.260
No. Observations:                     36   AIC:                             52.52
Df Residuals:                         34   BIC:                             55.69
Df Model:                              1                                         
Covariance Type:                     HC1                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------


In [None]:
wide['log_weight_early']   = np.log(wide['weight_early'])
X = wide[[
    'long_run_diff_log_eer',
    'log_weight_early',
]]

X = sm.add_constant(X)

y = wide['long_run_diff_log_TFP']

model = sm.OLS(y, X).fit(cov_type='HC1')
print(model.summary())

                              OLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.118
Model:                               OLS   Adj. R-squared:                  0.064
Method:                    Least Squares   F-statistic:                     5.530
Date:                   Mon, 26 May 2025   Prob (F-statistic):            0.00848
Time:                           15:17:13   Log-Likelihood:                -9.0086
No. Observations:                     36   AIC:                             24.02
Df Residuals:                         33   BIC:                             28.77
Df Model:                              2                                         
Covariance Type:                     HC1                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------


In [74]:
wide['log_weight_early']   = np.log(wide['weight_early'])
X = wide[[
    'long_run_diff_log_pct_high_growth_emp',
]]
X = sm.add_constant(X)
y = wide['long_run_diff_log_TFP']

model = sm.OLS(y, X).fit(cov_type='HC1')
print(model.summary())

                              OLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.052
Model:                               OLS   Adj. R-squared:                  0.024
Method:                    Least Squares   F-statistic:                     4.126
Date:                   Mon, 26 May 2025   Prob (F-statistic):             0.0501
Time:                           15:19:58   Log-Likelihood:                -10.302
No. Observations:                     36   AIC:                             24.60
Df Residuals:                         34   BIC:                             27.77
Df Model:                              1                                         
Covariance Type:                     HC1                                         
                                            coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------

In [75]:
w = wide['weight_early']
X = wide[['long_run_diff_log_pct_high_growth_emp']]   
X = sm.add_constant(X)
y = wide['long_run_diff_log_TFP']

wls_mod = sm.WLS(y, X, weights=w).fit(cov_type='HC1')
print(wls_mod.summary())

                              WLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.032
Model:                               WLS   Adj. R-squared:                  0.003
Method:                    Least Squares   F-statistic:                     3.907
Date:                   Mon, 26 May 2025   Prob (F-statistic):             0.0562
Time:                           15:20:24   Log-Likelihood:                -26.203
No. Observations:                     36   AIC:                             56.41
Df Residuals:                         34   BIC:                             59.57
Df Model:                              1                                         
Covariance Type:                     HC1                                         
                                            coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------

In [76]:
wide['log_weight_early']   = np.log(wide['weight_early'])

X = wide[[
    'long_run_diff_log_reallocation_rate',
]]

X = sm.add_constant(X)
y = wide['long_run_diff_log_TFP']
model = sm.OLS(y, X).fit(cov_type='HC1')
print(model.summary())

                              OLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.055
Model:                               OLS   Adj. R-squared:                  0.027
Method:                    Least Squares   F-statistic:                     4.633
Date:                   Mon, 26 May 2025   Prob (F-statistic):             0.0386
Time:                           15:21:06   Log-Likelihood:                -10.245
No. Observations:                     36   AIC:                             24.49
Df Residuals:                         34   BIC:                             27.66
Df Model:                              1                                         
Covariance Type:                     HC1                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

In [77]:
w = wide['weight_early']
X = wide[['long_run_diff_log_reallocation_rate']]     
X = sm.add_constant(X)
y = wide['long_run_diff_log_TFP']

wls_mod = sm.WLS(y, X, weights=w).fit(cov_type='HC1')
print(wls_mod.summary())

                              WLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.028
Model:                               WLS   Adj. R-squared:                 -0.001
Method:                    Least Squares   F-statistic:                     2.593
Date:                   Mon, 26 May 2025   Prob (F-statistic):              0.117
Time:                           15:21:06   Log-Likelihood:                -26.277
No. Observations:                     36   AIC:                             56.55
Df Residuals:                         34   BIC:                             59.72
Df Model:                              1                                         
Covariance Type:                     HC1                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

In [78]:
wide['log_weight_early']   = np.log(wide['weight_early'])
wide['log_tfp0']           = np.log(wide['tfp_index_2017_early'])

X = wide[[
    'long_run_diff_log_estabs_entry_rate'
]]

X = sm.add_constant(X)
y = wide['long_run_diff_log_TFP']

model = sm.OLS(y, X).fit(cov_type='HC1')
print(model.summary())

                              OLS Regression Results                             
Dep. Variable:     long_run_diff_log_TFP   R-squared:                       0.153
Model:                               OLS   Adj. R-squared:                  0.128
Method:                    Least Squares   F-statistic:                     7.951
Date:                   Mon, 26 May 2025   Prob (F-statistic):            0.00796
Time:                           15:21:22   Log-Likelihood:                -8.2832
No. Observations:                     36   AIC:                             20.57
Df Residuals:                         34   BIC:                             23.73
Df Model:                              1                                         
Covariance Type:                     HC1                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------