In [None]:
import pandas as pd
import numpy as np
from linearmodels.panel import PanelOLS
import statsmodels.api as sm
from scipy import stats  # Add this import for the t-distribution

In [None]:
# Load the raw total-factor-productivity CSV into `tfp_disp_df`.
# The file is decoded as latin1 rather than pandas' default utf-8.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
filepath = '/Users/danielseymour/Developer/EC334-Summative/raw_data/productivity_data/total_factor_productivity.csv'
tfp_disp_df = pd.read_csv(filepath_or_buffer=filepath, encoding='latin1')

In [None]:
# Preview the first five rows of the freshly loaded table to check that the
# columns (naics4, year and the dispersion measures) parsed as expected.
tfp_disp_df.head()

Unnamed: 0,naics4,year,sd,d7525,d9010,d9990,d1001,sd*,d7525*,d9010*,d9990*,d1001*
0,3111,1987.0,0.2883,0.2239,0.6477,0.6196,0.1455,0.2789,0.1947,0.4894,0.8043,0.1535
1,3111,1988.0,0.2782,0.1996,0.533,0.8425,0.1384,0.3102,0.1529,0.5752,0.8945,0.1245
2,3111,1989.0,0.3253,0.184,0.722,0.7709,0.1533,0.3094,0.1547,0.5261,0.8397,0.1173
3,3111,1990.0,0.2954,0.2001,0.5154,1.037,0.1929,0.2927,0.1822,0.4714,0.7978,0.1004
4,3111,1991.0,0.2842,0.2235,0.5372,0.6796,0.2426,0.2842,0.1806,0.4711,0.7504,0.2352


In [None]:

# Load the processed (pivoted) four-digit TFP table into `manufacturing_tfp`,
# using pandas' default encoding.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
filepath = '/Users/danielseymour/Developer/EC334-Summative/processed_data/tfp_four_digit_pivoted.csv'
manufacturing_tfp = pd.read_csv(filepath_or_buffer=filepath)


In [None]:
# Load the raw bds2022_vcn4 dynamism CSV into `bds2022_vcn4`.
# The file is decoded as latin1 rather than pandas' default utf-8.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
filepath = '/Users/danielseymour/Developer/EC334-Summative/raw_data/dynamism_data/bds2022_vcn4.csv'
bds2022_vcn4 = pd.read_csv(filepath_or_buffer=filepath, encoding='latin1')

In [None]:
# Preview the first five rows of the dynamism table; the industry code lives in
# `vcnaics4` and the period in `year`, which are the join keys used later.
bds2022_vcn4.head() 

Unnamed: 0,year,vcnaics4,firms,estabs,emp,denom,estabs_entry,estabs_entry_rate,estabs_exit,estabs_exit_rate,...,job_destruction_deaths,job_destruction_continuers,job_destruction_rate_deaths,job_destruction_rate,net_job_creation,net_job_creation_rate,reallocation_rate,firmdeath_firms,firmdeath_estabs,firmdeath_emp
0,1978,1131,193,200,3067,2948,42,21.483,33,16.88,...,474,311,16.081,26.633,255,8.651,53.265,22,22,37
1,1978,1132,84,100,1745,1762,12,11.707,17,16.585,...,42,546,2.384,33.381,-51,-2.895,60.971,10,10,25
2,1978,1133,12079,12196,98382,93837,2382,19.806,2043,16.987,...,7320,10648,7.801,19.148,9138,9.738,38.296,1521,1524,5347
3,1978,1141,1504,1540,10486,9559,383,25.008,366,23.898,...,1253,1344,13.108,27.168,1837,19.217,54.336,272,272,799
4,1978,1142,147,147,694,650,34,23.368,31,21.306,...,71,142,10.931,32.794,91,14.011,65.589,25,25,64


In [None]:
# Put the industry-code key on both sides into string form so the two
# columns compare equal during the join.
tfp_disp_df['naics4'] = tfp_disp_df['naics4'].astype(str)
bds2022_vcn4['vcnaics4'] = bds2022_vcn4['vcnaics4'].astype(str)

# Inner join on (industry code, year): only industry-years present in BOTH
# tables survive. The key columns have different names on each side, so both
# `naics4` and `vcnaics4` are kept in the result.
temp_df = tfp_disp_df.merge(
    bds2022_vcn4,
    how='inner',
    left_on=['naics4', 'year'],
    right_on=['vcnaics4', 'year'],
)

In [None]:
# List the columns of the first merge result to confirm that both the
# dispersion measures and the dynamism variables came through.
temp_df.columns

Index(['naics4', 'year', 'sd', 'd7525', 'd9010', 'd9990', 'd1001', 'sd*',
       'd7525*', 'd9010*', 'd9990*', 'd1001*', 'vcnaics4', 'firms', 'estabs',
       'emp', 'denom', 'estabs_entry', 'estabs_entry_rate', 'estabs_exit',
       'estabs_exit_rate', 'job_creation', 'job_creation_births',
       'job_creation_continuers', 'job_creation_rate_births',
       'job_creation_rate', 'job_destruction', 'job_destruction_deaths',
       'job_destruction_continuers', 'job_destruction_rate_deaths',
       'job_destruction_rate', 'net_job_creation', 'net_job_creation_rate',
       'reallocation_rate', 'firmdeath_firms', 'firmdeath_estabs',
       'firmdeath_emp'],
      dtype='object')

In [None]:
# List the columns of the TFP table; `NAICS` and `year` are the keys used for
# the merge in a later cell.
manufacturing_tfp.columns

Index(['NAICS', 'IndustryTitle', 'Basis', 'year', 'tfp_pct_change',
       'tfp_index_2017'],
      dtype='object')

In [None]:
# (rows, columns) of the first merge result, as a baseline row count to compare
# against after the next merge.
temp_df.shape

(2924, 37)

In [None]:
# Align the join-key dtypes on both sides before merging.
# Industry codes: errors='raise' makes any non-numeric code fail loudly here
# (it never produces NaN), then the values are stored as int64.
temp_df['naics4'] = pd.to_numeric(temp_df['naics4'], errors='raise').astype('int64')
manufacturing_tfp['NAICS'] = pd.to_numeric(manufacturing_tfp['NAICS'], errors='raise').astype('int64')

# Years on the left side: plain integer cast (raises if a year is missing).
temp_df['year'] = temp_df['year'].astype(int)

# Years on the TFP side may be float or strings like '1988.0'. Coerce
# unparseable values to missing, drop those rows, then store plain integers.
# This is done as one chain that builds a new frame: assigning a column into
# the frame returned by dropna() can trigger SettingWithCopyWarning on pandas
# versions without Copy-on-Write.
manufacturing_tfp = (
    manufacturing_tfp
    .assign(year=pd.to_numeric(manufacturing_tfp['year'], errors='coerce').astype('Int64'))
    .dropna(subset=['year'])
    .astype({'year': int})
)

# Inner join on (industry code, year): only industry-years present in both
# frames are kept. Both key columns (`naics4` and `NAICS`) remain in the result.
merged_df = pd.merge(
    temp_df,
    manufacturing_tfp,
    how='inner',
    left_on=['naics4', 'year'],
    right_on=['NAICS', 'year']
)

In [None]:
# Preview the fully merged frame. Note that `tfp_pct_change` still contains
# text placeholders such as 'N.A.' at this point.
merged_df.head()

Unnamed: 0,naics4,year,sd,d7525,d9010,d9990,d1001,sd*,d7525*,d9010*,...,net_job_creation_rate,reallocation_rate,firmdeath_firms,firmdeath_estabs,firmdeath_emp,NAICS,IndustryTitle,Basis,tfp_pct_change,tfp_index_2017
0,3111,1987,0.2883,0.2239,0.6477,0.6196,0.1455,0.2789,0.1947,0.4894,...,-0.703,23.69,72,75,619,3111,Animal food manufacturing,All workers,N.A.,102.8
1,3111,1988,0.2782,0.1996,0.533,0.8425,0.1384,0.3102,0.1529,0.5752,...,-1.347,26.411,76,77,1085,3111,Animal food manufacturing,All workers,-1.6,101.189
2,3111,1989,0.3253,0.184,0.722,0.7709,0.1533,0.3094,0.1547,0.5261,...,-2.389,20.768,70,70,704,3111,Animal food manufacturing,All workers,1.4,102.619
3,3111,1990,0.2954,0.2001,0.5154,1.037,0.1929,0.2927,0.1822,0.4714,...,2.093,20.153,76,77,690,3111,Animal food manufacturing,All workers,3.0,105.689
4,3111,1991,0.2842,0.2235,0.5372,0.6796,0.2426,0.2842,0.1806,0.4711,...,-0.703,21.388,61,63,745,3111,Animal food manufacturing,All workers,-3.7,101.807


In [None]:
# List the columns of the merged frame (dispersion + dynamism + TFP columns).
merged_df.columns

Index(['naics4', 'year', 'sd', 'd7525', 'd9010', 'd9990', 'd1001', 'sd*',
       'd7525*', 'd9010*', 'd9990*', 'd1001*', 'vcnaics4', 'firms', 'estabs',
       'emp', 'denom', 'estabs_entry', 'estabs_entry_rate', 'estabs_exit',
       'estabs_exit_rate', 'job_creation', 'job_creation_births',
       'job_creation_continuers', 'job_creation_rate_births',
       'job_creation_rate', 'job_destruction', 'job_destruction_deaths',
       'job_destruction_continuers', 'job_destruction_rate_deaths',
       'job_destruction_rate', 'net_job_creation', 'net_job_creation_rate',
       'reallocation_rate', 'firmdeath_firms', 'firmdeath_estabs',
       'firmdeath_emp', 'NAICS', 'IndustryTitle', 'Basis', 'tfp_pct_change',
       'tfp_index_2017'],
      dtype='object')

In [None]:
# Parse both TFP columns as numbers. errors='coerce' turns anything that is
# not a number (e.g. the 'N.A.' placeholder) into NaN instead of raising.
for tfp_col in ('tfp_pct_change', 'tfp_index_2017'):
    merged_df[tfp_col] = pd.to_numeric(merged_df[tfp_col], errors='coerce')

In [None]:
# Columns that must be numeric before aggregation but may have been read as
# text (for example because the CSV holds non-numeric placeholders).
columns_to_convert = [
    # Dispersion measures
    'd9990', 'd1001', 'd9990*', 'd1001*',

    # Entry/exit counts
    'estabs_entry', 'estabs_exit',
    'firmdeath_firms', 'firmdeath_estabs', 'firmdeath_emp',

    # Entry/exit rates
    'estabs_entry_rate', 'estabs_exit_rate',

    # Job flow counts
    'job_creation_births', 'job_creation_continuers',
    'job_destruction_deaths', 'job_destruction_continuers',

    # Job flow rates
    'job_creation_rate_births', 'job_destruction_rate_deaths'
]

# Convert each text-typed column to numeric and report how many values could
# not be parsed.
print("Converting object columns to numeric...")
for col in columns_to_convert:
    if col not in merged_df.columns:
        continue

    # Treat both object columns and dedicated string-dtype columns as text;
    # comparing dtype to 'object' alone would skip the latter.
    is_text = (
        pd.api.types.is_object_dtype(merged_df[col])
        or pd.api.types.is_string_dtype(merged_df[col])
    )
    if not is_text:
        continue

    # Count missing values before and after so the message reports only the
    # NaNs introduced by the coercion, not ones that were already present.
    nan_before = merged_df[col].isna().sum()
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')
    nan_count = merged_df[col].isna().sum() - nan_before

    if nan_count > 0:
        print(f"  {col}: converted ({nan_count} NaN values created)")
    else:
        print(f"  {col}: converted successfully")

Converting object columns to numeric...
  d9990: converted (17 NaN values created)
  d1001: converted (16 NaN values created)
  d9990*: converted (8 NaN values created)
  d1001*: converted (8 NaN values created)
  estabs_entry: converted successfully
  estabs_exit: converted successfully
  firmdeath_firms: converted (4 NaN values created)
  firmdeath_estabs: converted (4 NaN values created)
  firmdeath_emp: converted (4 NaN values created)
  estabs_entry_rate: converted successfully
  estabs_exit_rate: converted successfully
  job_creation_births: converted successfully
  job_creation_continuers: converted successfully
  job_destruction_deaths: converted successfully
  job_destruction_continuers: converted successfully
  job_creation_rate_births: converted successfully
  job_destruction_rate_deaths: converted successfully


In [None]:
# Summary statistics for the numeric columns. A `count` below the row total
# marks columns where coercion to numeric left missing values.
merged_df.describe()

Unnamed: 0,naics4,year,sd,d7525,d9010,d9990,d1001,sd*,d7525*,d9010*,...,job_destruction_rate,net_job_creation,net_job_creation_rate,reallocation_rate,firmdeath_firms,firmdeath_estabs,firmdeath_emp,NAICS,tfp_pct_change,tfp_index_2017
count,2924.0,2924.0,2924.0,2924.0,2924.0,2907.0,2908.0,2924.0,2924.0,2924.0,...,2924.0,2924.0,2924.0,2924.0,2920.0,2920.0,2920.0,2924.0,2838.0,2924.0
mean,3266.453488,2003.5,0.438629,0.500009,1.049623,0.766493,0.310975,0.361527,0.468077,0.874176,...,11.059133,-1776.145007,-1.445345,16.838861,222.292123,224.005137,2432.108904,3266.453488,0.595137,95.33977
std,89.749796,9.812386,0.149306,0.213762,0.389669,0.451225,0.189244,0.135884,0.274022,0.358649,...,4.577867,10892.721269,5.36038,5.126989,383.068255,385.090653,3920.690916,89.749796,4.991368,14.763867
min,3111.0,1987.0,0.158,0.08544,0.3487,0.002956,0.002989,0.1184,0.06308,0.2236,...,1.869,-152430.0,-33.949,3.738,3.0,3.0,5.0,3111.0,-31.9,3.493
25%,3169.0,1995.0,0.342675,0.3725,0.810275,0.46105,0.2151,0.28545,0.3307,0.671175,...,7.99575,-3609.5,-3.676,13.47825,47.0,48.0,552.0,3169.0,-2.0,89.18725
50%,3295.0,2003.5,0.4097,0.45195,0.96875,0.6629,0.2907,0.33985,0.41575,0.8123,...,10.0895,-436.5,-0.606,16.3475,93.0,94.5,1213.0,3295.0,0.5,97.2245
75%,3341.0,2012.0,0.496425,0.566825,1.178,0.9522,0.3755,0.40615,0.534175,1.003,...,12.96725,1863.0,1.77675,19.73825,207.0,208.0,2721.25,3341.0,2.8,102.80325
max,3399.0,2020.0,1.726,2.125,4.182,4.01,4.321,1.574,3.243,3.756,...,41.635,40502.0,20.283,40.673,3468.0,3476.0,43304.0,3399.0,41.6,155.354


In [None]:
# List the columns of the merged frame again before aggregation.
# NOTE(review): the recorded output includes `window_id` and `window_end_year`,
# which no earlier visible cell creates. Presumably this cell was last run
# after a later (or since-removed) cell added them; confirm with
# Restart Kernel -> Run All.
merged_df.columns

Index(['naics4', 'year', 'sd', 'd7525', 'd9010', 'd9990', 'd1001', 'sd*',
       'd7525*', 'd9010*', 'd9990*', 'd1001*', 'vcnaics4', 'firms', 'estabs',
       'emp', 'denom', 'estabs_entry', 'estabs_entry_rate', 'estabs_exit',
       'estabs_exit_rate', 'job_creation', 'job_creation_births',
       'job_creation_continuers', 'job_creation_rate_births',
       'job_creation_rate', 'job_destruction', 'job_destruction_deaths',
       'job_destruction_continuers', 'job_destruction_rate_deaths',
       'job_destruction_rate', 'net_job_creation', 'net_job_creation_rate',
       'reallocation_rate', 'firmdeath_firms', 'firmdeath_estabs',
       'firmdeath_emp', 'NAICS', 'IndustryTitle', 'Basis', 'tfp_pct_change',
       'tfp_index_2017', 'window_id', 'window_end_year'],
      dtype='object')

In [None]:
# Aggregation plan for collapsing yearly rows into 2-year windows, written as
# ordered (aggregation, columns) groups. Presumably consumed by a
# groupby(...).agg(agg_dict) call in a later cell (not shown here).
#   'mean' -> average within the window
#   'last' -> last value in the window (end-of-period snapshot)
#   'sum'  -> total across the window
_aggregation_groups = [
    # Dispersion measures: averaged within the window
    ('mean', ['sd', 'd7525', 'd9010', 'd9990', 'd1001',
              'sd*', 'd7525*', 'd9010*', 'd9990*', 'd1001*']),

    # TFP metrics: average % change, and the index level at the window's end
    ('mean', ['tfp_pct_change']),
    ('last', ['tfp_index_2017']),

    # Stock variables: end-of-period snapshot
    ('last', ['firms', 'estabs', 'emp', 'denom']),

    # Entry/exit counts: summed across the window
    ('sum', ['estabs_entry', 'estabs_exit',
             'firmdeath_firms', 'firmdeath_estabs', 'firmdeath_emp']),

    # Entry/exit rates: averaged across the window
    ('mean', ['estabs_entry_rate', 'estabs_exit_rate']),

    # Job flow counts: summed across the window
    ('sum', ['job_creation', 'job_creation_births', 'job_creation_continuers',
             'job_destruction', 'job_destruction_deaths',
             'job_destruction_continuers', 'net_job_creation']),

    # Job flow rates: averaged across the window
    ('mean', ['job_creation_rate', 'job_creation_rate_births',
              'job_destruction_rate', 'job_destruction_rate_deaths',
              'net_job_creation_rate', 'reallocation_rate']),
]

# Flatten to {column: aggregation}; insertion order follows the groups above.
agg_dict = {
    column: how
    for how, columns in _aggregation_groups
    for column in columns
}