In [1]:
import itertools

import pandas as pd
import numpy as np

import linearmodels

from pathlib import Path

# Prep
Read in data and create output directory

In [2]:
combined_3digit = pd.read_csv('data/final/combined_3digit.csv')
combined_3digit.head()

Unnamed: 0,year,naics_code,empl_births,empl_contract,empl_deaths,empl_expand,empl_initial,estb_births,estb_contract,estb_deaths,...,empl_destroyed,estb_dhs,empl_dhs,estb_birth_rate,empl_create_rate,empl_destroy_rate,regdata20,regdata21,regdata22,regdata31
0,1998.0,113,5619.0,-10608.0,-6013.0,10818.0,84150.0,1446.0,3495.0,1647.0,...,16621.0,,,,,,393.953985,4387.428106,,
1,1998.0,114,1106.0,-1244.0,-785.0,1579.0,9312.0,287.0,267.0,233.0,...,2029.0,,,,,,19014.211936,16010.209662,5592.539131,3316.4159
2,1998.0,115,8096.0,-11921.0,-6773.0,14643.0,93588.0,1190.0,2104.0,1095.0,...,18694.0,,,,,,,,2145.552466,1553.904
3,1998.0,211,5945.0,-15757.0,-5644.0,8268.0,97037.0,564.0,2043.0,968.0,...,21401.0,,,,,,22298.961579,66989.810756,10369.798674,4184.1912
4,1998.0,212,7521.0,-22715.0,-10536.0,16639.0,225356.0,598.0,2266.0,636.0,...,33251.0,,,,,,25860.865965,21824.120103,,12604.6857


In [3]:
combined_4digit = pd.read_csv('data/final/combined_4digit.csv')
combined_3digit.head()

Unnamed: 0,year,naics_code,empl_births,empl_contract,empl_deaths,empl_expand,empl_initial,estb_births,estb_contract,estb_deaths,...,empl_destroyed,estb_dhs,empl_dhs,estb_birth_rate,empl_create_rate,empl_destroy_rate,regdata20,regdata21,regdata22,regdata31
0,1998.0,113,5619.0,-10608.0,-6013.0,10818.0,84150.0,1446.0,3495.0,1647.0,...,16621.0,,,,,,393.953985,4387.428106,,
1,1998.0,114,1106.0,-1244.0,-785.0,1579.0,9312.0,287.0,267.0,233.0,...,2029.0,,,,,,19014.211936,16010.209662,5592.539131,3316.4159
2,1998.0,115,8096.0,-11921.0,-6773.0,14643.0,93588.0,1190.0,2104.0,1095.0,...,18694.0,,,,,,,,2145.552466,1553.904
3,1998.0,211,5945.0,-15757.0,-5644.0,8268.0,97037.0,564.0,2043.0,968.0,...,21401.0,,,,,,22298.961579,66989.810756,10369.798674,4184.1912
4,1998.0,212,7521.0,-22715.0,-10536.0,16639.0,225356.0,598.0,2266.0,636.0,...,33251.0,,,,,,25860.865965,21824.120103,,12604.6857


In [4]:
Path('data/tables').mkdir(parents=True, exist_ok=True)

# Summary Tables

In [5]:
summary_table_rows = {
    'estb_birth_rate': 'Startup Rate',
    'empl_create_rate': 'Job Creation Rate',
    'empl_destroy_rate': 'Job Destruction Rate',
    'estb_births': 'Establishment Births',
    'estb_deaths': 'Establishment Deaths',
    'empl_created': 'New Hires',
    'regdata20': 'RegData 2.0 Regulation Index',
    'regdata21': 'RegData 2.1 Regulation Index',
    'regdata22': 'RegData 2.2 Regulation Index',
    'regdata31': 'RegData 3.1 Regulation Index'
}

summary_3digit = (
    combined_3digit
    .loc[lambda df: df['year'] <= 2010]
    .describe()
    [[i for i in summary_table_rows if i in combined_3digit]]
    .round(2)
    .rename(columns=summary_table_rows)
    .T
    [['count', 'mean', 'std', 'min', 'max']]
    .rename(columns={'std':'standard deviation'})
    .assign(count=lambda df: df['count'].astype(int))
    .applymap(lambda x: f'{x:,.0f}' if x > 100 else f'{x:.02f}')
)
summary_3digit.to_csv('data/tables/summary_3digit.csv')
summary_3digit.to_latex('data/tables/summary_3digit.tex')
summary_3digit

Unnamed: 0,count,mean,standard deviation,min,max
Startup Rate,1030,10.84,4.83,1.05,46.15
Job Creation Rate,970,14.58,6.65,1.35,81.29
Job Destruction Rate,969,15.11,5.33,0.94,61.84
Establishment Births,1118,8422.0,14666.0,1.0,105010.0
Establishment Deaths,1116,8110.0,13526.0,1.0,94476.0
New Hires,1075,209215.0,333911.0,746.0,2297342.0
RegData 2.0 Regulation Index,869,5480.0,10007.0,0.36,63506.0
RegData 2.1 Regulation Index,975,32585.0,28725.0,4387.0,143593.0
RegData 2.2 Regulation Index,650,8717.0,13006.0,21.12,66351.0
RegData 3.1 Regulation Index,650,16644.0,20416.0,233.0,97502.0


In [6]:
summary_4digit = (
    combined_4digit.describe()
    [[i for i in summary_table_rows if i in combined_4digit]]
    .round(2)
    .rename(columns=summary_table_rows)
    .T
    [['count', 'mean', 'std', 'min', 'max']]
    .rename(columns={'std':'standard deviation'})
    .assign(count=lambda df: df['count'].astype(int))
    .applymap(lambda x: f'{x:,.0f}' if x > 100 else f'{x:.02f}')
)
summary_4digit.to_csv('data/tables/summary_4digit.csv')
summary_4digit.to_latex('data/tables/summary_4digit.tex')
summary_4digit

Unnamed: 0,count,mean,standard deviation,min,max
Startup Rate,4056,10.58,8.05,0.0,370.0
Job Creation Rate,3658,14.57,6.8,1.35,84.67
Job Destruction Rate,3645,14.78,5.75,0.94,81.63
Establishment Births,4637,2482.0,4423.0,1.0,38742.0
Establishment Deaths,4636,2353.0,4090.0,1.0,40944.0
New Hires,4377,61888.0,115468.0,162.0,1396593.0
RegData 2.0 Regulation Index,2956,1124.0,3935.0,0.0,36131.0
RegData 2.2 Regulation Index,1632,2778.0,3970.0,8.61,25482.0
RegData 3.1 Regulation Index,2142,10226.0,16530.0,149.0,84274.0


# Regressions

In [7]:
def get_model(dep, ind, data, logdep=True, cluster=False):
    safe_data = (
        data
        [[dep, ind, 'naics_code', 'year']]
        .dropna()
        .set_index(['naics_code', 'year'])
    )
    if logdep:
        formula = f'np.log({dep}) ~ 1 + np.log({ind}) + EntityEffects + TimeEffects'
    else:
        formula = f'{dep} ~ 1 + np.log({ind}) + EntityEffects + TimeEffects'
    print(formula)
    model = linearmodels.PanelOLS.from_formula(formula, data=safe_data)
    if cluster:
        fitted = model.fit(cov_type='clustered', cluster_entity=True)
    else:
        fitted = model.fit(cov_type='robust')
    print(fitted.summary)
    return (
        fitted.params[1],
        fitted.std_errors[1],
        fitted.pvalues[1]
    )


In [8]:
DIGITS = (
    (3, combined_3digit),
    (4, combined_4digit)
)

INDVARS = (
    'regdata20',
    'regdata21',
    'regdata22',
    'regdata31',
)

DEPVARS = (
    ('estb_births', True), # Variable, Log
    ('estb_deaths', True),
    ('empl_created', True),
    ('estb_birth_rate', False),
    ('empl_create_rate', False),
    ('empl_destroy_rate', False)
)

CLUSTER = (True, False)

In [9]:
results = []
for (digits, data), indvar, (depvar, logdep), cluster in itertools.product(DIGITS, INDVARS, DEPVARS, CLUSTER):
    if not indvar in data:
        continue
    param, stderr, pval = get_model(depvar, indvar, data, logdep, cluster)
    results.append([digits, depvar, indvar, param, stderr, pval, 'clustered' if cluster else 'robust'])
results = pd.DataFrame(results, columns=['digits', 'depvar', 'indvar', 'param', 'stderr', 'pval', 'errtype'])

np.log(estb_births) ~ 1 + np.log(regdata20) + EntityEffects + TimeEffects
                           PanelOLS Estimation Summary                           
Dep. Variable:     np.log(estb_births)   R-squared:                        0.0025
Estimator:                    PanelOLS   R-squared (Between):              0.0040
No. Observations:                  936   R-squared (Within):               0.0065
Date:                 Thu, Jun 13 2019   R-squared (Overall):              0.0030
Time:                         16:07:12   Log-likelihood                    47.611
Cov. Estimator:              Clustered                                           
                                         F-statistic:                      2.1099
Entities:                           67   P-value                           0.1467
Avg Obs:                        13.970   Distribution:                   F(1,855)
Min Obs:                        12.000                                           
Max Obs:                

# Regression Summary Table

In [10]:
regdata_labels = {
    'regdata20': '2.0',
    'regdata21': '2.1',
    'regdata22': '2.2',
    'regdata31': '3.1',
}

var_labels = {
    'estb_births': 'Log Establishment Births',
    'estb_deaths': 'Log Establishment Deaths',
    'empl_created': 'Log New Hires',
    'estb_birth_rate': 'Startup Rate',
    'empl_create_rate': 'Job Creation Rate',
    'empl_destroy_rate': 'Job Destruction Rate',
}

study_labels = {
    'estb_births': 'Bailey and Thomas Measures',
    'estb_deaths': 'Bailey and Thomas Measures',
    'empl_created': 'Bailey and Thomas Measures',
    'estb_birth_rate': 'Goldschlag and Tabarrok Measures',
    'empl_create_rate': 'Goldschlag and Tabarrok Measures',
    'empl_destroy_rate': 'Goldschlag and Tabarrok Measures',
}

index_labels = {
    'indvar': "RegData Version",
    'digits': "NAICS Level"
}

def format_row(row):
    return (
        f'{row["param"]:01.02f} '
        f'({row["pval"]:01.02f})'
        f'{"*" if row["pval"] < 0.1 else ""}'
        f'{"*" if row["pval"] < 0.05 else ""}'
        f'{"*" if row["pval"] < 0.01 else ""}'
    )

regression_results = (
    results
    .assign(indvar=lambda df: df['indvar'].map(regdata_labels))
    .assign(digits=lambda df: df['digits'].map('{}-digit'.format))
    .assign(study=lambda df: df['depvar'].map(study_labels))
    .rename(columns=index_labels)
    .set_index(['errtype', 'RegData Version', 'NAICS Level', 'study', 'depvar'])
    .apply(format_row, axis=1)
    .unstack(level=['study', 'depvar'])
    .sort_index()
    .rename(columns=var_labels, level=1)
)
regression_results

Unnamed: 0_level_0,Unnamed: 1_level_0,study,Bailey and Thomas Measures,Bailey and Thomas Measures,Bailey and Thomas Measures,Goldschlag and Tabarrok Measures,Goldschlag and Tabarrok Measures,Goldschlag and Tabarrok Measures
Unnamed: 0_level_1,Unnamed: 1_level_1,depvar,Log Establishment Births,Log Establishment Deaths,Log New Hires,Startup Rate,Job Creation Rate,Job Destruction Rate
errtype,RegData Version,NAICS Level,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
clustered,2.0,3-digit,-0.06 (0.47),-0.08 (0.45),-0.21 (0.11),0.16 (0.83),-0.97 (0.33),-0.44 (0.51)
clustered,2.0,4-digit,-0.02 (0.20),-0.01 (0.68),-0.06 (0.01)***,-0.02 (0.89),0.19 (0.36),0.17 (0.39)
clustered,2.1,3-digit,-0.32 (0.02)**,-0.25 (0.15),-0.52 (0.00)***,3.85 (0.23),5.76 (0.01)**,2.54 (0.06)*
clustered,2.2,3-digit,0.05 (0.53),0.07 (0.31),0.01 (0.90),0.20 (0.69),0.14 (0.81),0.41 (0.23)
clustered,2.2,4-digit,-0.16 (0.06)*,-0.10 (0.17),-0.11 (0.03)**,-0.20 (0.63),-0.03 (0.95),0.28 (0.46)
clustered,3.1,3-digit,-0.19 (0.31),-0.22 (0.25),-0.27 (0.13),0.41 (0.59),1.14 (0.28),0.42 (0.53)
clustered,3.1,4-digit,0.18 (0.15),0.11 (0.33),0.14 (0.32),1.73 (0.23),0.78 (0.39),0.14 (0.84)
robust,2.0,3-digit,-0.06 (0.17),-0.08 (0.09)*,-0.21 (0.00)***,0.16 (0.72),-0.97 (0.09)*,-0.44 (0.25)
robust,2.0,4-digit,-0.02 (0.03)**,-0.01 (0.49),-0.06 (0.00)***,-0.02 (0.88),0.19 (0.27),0.17 (0.35)
robust,2.1,3-digit,-0.32 (0.00)***,-0.25 (0.01)**,-0.52 (0.00)***,3.85 (0.02)**,5.76 (0.00)***,2.54 (0.13)


## Results with Robust SEs

In [11]:
results_robust = regression_results.xs('robust')
results_robust.to_csv('data/tables/regression_summary_robust.csv')
results_robust.to_latex('data/tables/regression_summary_robust.tex')
results_robust

Unnamed: 0_level_0,study,Bailey and Thomas Measures,Bailey and Thomas Measures,Bailey and Thomas Measures,Goldschlag and Tabarrok Measures,Goldschlag and Tabarrok Measures,Goldschlag and Tabarrok Measures
Unnamed: 0_level_1,depvar,Log Establishment Births,Log Establishment Deaths,Log New Hires,Startup Rate,Job Creation Rate,Job Destruction Rate
RegData Version,NAICS Level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2.0,3-digit,-0.06 (0.17),-0.08 (0.09)*,-0.21 (0.00)***,0.16 (0.72),-0.97 (0.09)*,-0.44 (0.25)
2.0,4-digit,-0.02 (0.03)**,-0.01 (0.49),-0.06 (0.00)***,-0.02 (0.88),0.19 (0.27),0.17 (0.35)
2.1,3-digit,-0.32 (0.00)***,-0.25 (0.01)**,-0.52 (0.00)***,3.85 (0.02)**,5.76 (0.00)***,2.54 (0.13)
2.2,3-digit,0.05 (0.18),0.07 (0.03)**,0.01 (0.79),0.20 (0.56),0.14 (0.76),0.41 (0.29)
2.2,4-digit,-0.16 (0.00)***,-0.10 (0.00)***,-0.11 (0.00)***,-0.20 (0.51),-0.03 (0.93),0.28 (0.37)
3.1,3-digit,-0.19 (0.01)**,-0.22 (0.00)***,-0.27 (0.00)***,0.41 (0.53),1.14 (0.21),0.42 (0.48)
3.1,4-digit,0.18 (0.00)***,0.11 (0.02)**,0.14 (0.02)**,1.73 (0.10),0.78 (0.32),0.14 (0.81)


## Results with Clustered SEs

In [12]:
results_robust = regression_results.xs('clustered')
results_robust.to_csv('data/tables/regression_summary_clustered.csv')
results_robust.to_latex('data/tables/regression_summary_clustered.tex')
results_robust

Unnamed: 0_level_0,study,Bailey and Thomas Measures,Bailey and Thomas Measures,Bailey and Thomas Measures,Goldschlag and Tabarrok Measures,Goldschlag and Tabarrok Measures,Goldschlag and Tabarrok Measures
Unnamed: 0_level_1,depvar,Log Establishment Births,Log Establishment Deaths,Log New Hires,Startup Rate,Job Creation Rate,Job Destruction Rate
RegData Version,NAICS Level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2.0,3-digit,-0.06 (0.47),-0.08 (0.45),-0.21 (0.11),0.16 (0.83),-0.97 (0.33),-0.44 (0.51)
2.0,4-digit,-0.02 (0.20),-0.01 (0.68),-0.06 (0.01)***,-0.02 (0.89),0.19 (0.36),0.17 (0.39)
2.1,3-digit,-0.32 (0.02)**,-0.25 (0.15),-0.52 (0.00)***,3.85 (0.23),5.76 (0.01)**,2.54 (0.06)*
2.2,3-digit,0.05 (0.53),0.07 (0.31),0.01 (0.90),0.20 (0.69),0.14 (0.81),0.41 (0.23)
2.2,4-digit,-0.16 (0.06)*,-0.10 (0.17),-0.11 (0.03)**,-0.20 (0.63),-0.03 (0.95),0.28 (0.46)
3.1,3-digit,-0.19 (0.31),-0.22 (0.25),-0.27 (0.13),0.41 (0.59),1.14 (0.28),0.42 (0.53)
3.1,4-digit,0.18 (0.15),0.11 (0.33),0.14 (0.32),1.73 (0.23),0.78 (0.39),0.14 (0.84)
