In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

# NAICS Crosswalks

Following G&T, we create a weighted mapping from each standard to the next based on the concordance of 6-digit industries.

In [2]:
def get_weights(concord, fromstd, tostd, digits):
    return (
        concord
        [[f'{fromstd}_{digits}digit', f'{tostd}_{digits}digit']]
        .assign(weight=lambda df: (
            1 / (df[f'{fromstd}_{digits}digit'].map(
                df.groupby(f'{fromstd}_{digits}digit')
                .count()[f'{tostd}_{digits}digit']
            ))
        ))
        .groupby([f'{fromstd}_{digits}digit', f'{tostd}_{digits}digit'])
        .sum()
        .reset_index()
        .rename(columns=lambda x: x.split('_')[0])
    )

In [3]:
concord9702 = (
    pd.read_excel(
        'data/xwalks/1997_NAICS_to_2002_NAICS.xls',
        sheet_name=1,
        skipfooter=1,
        usecols=[0, 2],
        names=['naics97_6digit', 'naics02_6digit']
    )
    .assign(naics97_4digit=lambda df: df['naics97_6digit'] // 100)
    .assign(naics97_3digit=lambda df: df['naics97_6digit'] // 1000)
    .assign(naics02_4digit=lambda df: df['naics02_6digit'] // 100)
    .assign(naics02_3digit=lambda df: df['naics02_6digit'] // 1000)
)
concord9702.head()

Unnamed: 0,naics97_6digit,naics02_6digit,naics97_4digit,naics97_3digit,naics02_4digit,naics02_3digit
0,111110,111110,1111,111,1111,111
1,111120,111120,1111,111,1111,111
2,111130,111130,1111,111,1111,111
3,111140,111140,1111,111,1111,111
4,111150,111150,1111,111,1111,111


In [4]:
concord0207 = (
    pd.read_excel(
        'data/xwalks/2002_to_2007_NAICS.xls',
        skiprows=2,
        usecols=[0, 2],
        names=['naics02_6digit', 'naics07_6digit']
    )
    .assign(naics02_4digit=lambda df: df['naics02_6digit'] // 100)
    .assign(naics02_3digit=lambda df: df['naics02_6digit'] // 1000)
    .assign(naics07_4digit=lambda df: df['naics07_6digit'] // 100)
    .assign(naics07_3digit=lambda df: df['naics07_6digit'] // 1000)
)
concord0207.head()

Unnamed: 0,naics02_6digit,naics07_6digit,naics02_4digit,naics02_3digit,naics07_4digit,naics07_3digit
0,111110,111110,1111,111,1111,111
1,111120,111120,1111,111,1111,111
2,111130,111130,1111,111,1111,111
3,111140,111140,1111,111,1111,111
4,111150,111150,1111,111,1111,111


In [5]:
concord1207 = (
    pd.read_excel(
        'data/xwalks/2007_to_2012_NAICS.xls',
        skiprows=2,
        usecols=[0, 2],
        names=['naics07_6digit', 'naics12_6digit']
    )
    .assign(naics12_4digit=lambda df: df['naics12_6digit'] // 100)
    .assign(naics12_3digit=lambda df: df['naics12_6digit'] // 1000)
    .assign(naics07_4digit=lambda df: df['naics07_6digit'] // 100)
    .assign(naics07_3digit=lambda df: df['naics07_6digit'] // 1000)
)
concord1207.head()

Unnamed: 0,naics07_6digit,naics12_6digit,naics12_4digit,naics12_3digit,naics07_4digit,naics07_3digit
0,111110,111110,1111,111,1111,111
1,111120,111120,1111,111,1111,111
2,111130,111130,1111,111,1111,111
3,111140,111140,1111,111,1111,111
4,111150,111150,1111,111,1111,111


In [6]:
xwalk9702 = pd.concat(tuple(
    get_weights(concord9702, 'naics97', 'naics02', digits)
    for digits in (3, 4)
))
xwalk0207 = pd.concat(tuple(
    get_weights(concord0207, 'naics02', 'naics07', digits)
    for digits in (3, 4)
))
# NB: Converting from NAICS 2012 to 2007, not vice versa
xwalk1207 = pd.concat(tuple(
    get_weights(concord1207, 'naics12', 'naics07', digits)
    for digits in (3, 4)
))

# Compiling SUSB

We read in 98/99 through 06/07 from the SUSB Table files, which all follow the same format.
We read in 07/08 through 15/16 from the SUSB Database files.

The NAICS standards used are:

start year | end year | standard
:-:|:-:|:-:
98/99 | 02/03 | NAICS 97
03/04 | 07/08 | NAICS 02
08/09 | 12/13 | NAICS 07
13/14 | 15/16 | NAICS 12

All SUSB vintages are converted to NAICS 2007


In [53]:
def convert_naics(raw, fromstd, tostd, xwalk):
    return (
        raw
        .rename(columns={'naics_code': fromstd})
        .merge(xwalk.rename(columns={tostd: 'naics_code'}))
        .assign(value = lambda df: df['value'].mul(df['weight']))
        .groupby(['initial_year', 'naics_code', 'var'])[['value']]
        .sum(min_count=1)
        .reset_index()
    )

In [54]:
VARMAP_TABLE = {
    1: 'estb_initial',
    4: 'estb_births',
    5: 'estb_deaths',
    6: 'estb_expand',
    7: 'estb_contract',
    10: 'empl_initial',
    13: 'empl_births',
    14: 'empl_deaths',
    15: 'empl_expand',
    16: 'empl_contract',    
}

def read_susb_table(initial_year):
    print(initial_year)
    return (
        pd.read_excel(
            f'data/susb/us_4digitnaics_emplchange_{initial_year}-{initial_year+1}.xls',
            skiprows=7,
            na_values=['(D)', '[j]', '[k]', '[i]'],
        )
        .rename(columns={
            'CODE': 'naics_code',
            'CODE.1': 'var_code',
            'TOTAL': 'value',
        })
        .loc[lambda df: df['naics_code'].astype(str).str.contains('\d+')]
        .assign(naics_code=lambda df: (
            df['naics_code']
            .astype(str)
            .str.strip('-')
            .str.split('-').str[0]
            .astype(int)
        ))
        .loc[lambda df: df['naics_code'] >= 100]
        .loc[lambda df: df['var_code'].isin(VARMAP_TABLE)]
        .assign(var=lambda df: df['var_code'].map(VARMAP_TABLE))
        .assign(initial_year=initial_year)
        [['initial_year', 'naics_code', 'var', 'value']]
    )

In [55]:
VARMAP_DB = {
    'NAICS': 'naics_code',
    'INIT_ESTB': 'estb_initial',
    'INIT_EMPL': 'empl_initial',
    'INIT_EMPLFL_N': 'empl_initial_flag',
    'BIRTHS_ESTB': 'estb_births',
    'BIRTHS_EMPL': 'empl_births',
    'BIRTHS_EMPLFL_N': 'empl_births_flag',
    'DEATHS_ESTB': 'estb_deaths',
    'DEATHS_EMPL': 'empl_deaths',
    'DEATHS_EMPLFL_N': 'empl_deaths_flag',
    'EXP_ESTB': 'estb_expand',
    'EXP_EMPL': 'empl_expand',
    'EXP_EMPLFL_N': 'empl_expand_flag',
    'CONTR_ESTB': 'estb_contract',
    'CONTR_EMPL': 'empl_contract',
    'CONTR_EMPLFL_N': 'empl_contract_flag'
}
def read_susb_db(initial_year):
    print(initial_year)
    return (
        pd.read_csv(f'data/susb/us_state_emplchange_{initial_year}-{initial_year + 1}.txt')
        .loc[lambda df: df['STATE'] == 0]
        .loc[lambda df: df['ENTRSIZE'] == 1]
        [VARMAP_DB.keys()]
        .rename(columns=VARMAP_DB)
        .loc[lambda df: df['naics_code'].astype(str).str.contains('\d+')]
        .assign(naics_code=lambda df: (
            df['naics_code']
            .astype(str)
            .str.strip('-')
            .str.split('-').str[0]
            .astype(int)
        ))
        .loc[lambda df: df['naics_code'] >= 100]
        .assign(empl_initial=lambda df: (
            df['empl_initial']
            [~df['empl_initial_flag'].isin({'D', 'S'})] # Missing or Suppressed
        ))
        .assign(empl_births=lambda df: (
            df['empl_births']
            [~df['empl_births_flag'].isin({'D', 'S'})]
        ))
        .assign(empl_deaths=lambda df: (
            df['empl_deaths']
            [~df['empl_deaths_flag'].isin({'D', 'S'})]
        ))
        .assign(empl_expand=lambda df: (
            df['empl_expand']
            [~df['empl_expand_flag'].isin({'D', 'S'})]
        ))
        .assign(empl_contract=lambda df: (
            df['empl_contract']
            [~df['empl_contract_flag'].isin({'D', 'S'})]
        ))
        .drop([i for i in VARMAP_DB.values() if i.endswith('_flag')], axis='columns')
        .melt(id_vars=['naics_code'], var_name='var')
        .assign(initial_year=initial_year)
        [['initial_year', 'naics_code', 'var', 'value']]
    )

In [56]:
susb_naics97 = pd.concat(tuple(
    read_susb_table(initial_year)
    for initial_year in range(1998, 2003)
))
susb_naics97.sort_values(['initial_year', 'naics_code', 'var']).head()

1998
1999
2000
2001
2002


Unnamed: 0,initial_year,naics_code,var,value
52,1998,113,empl_births,5619.0
55,1998,113,empl_contract,-10608.0
53,1998,113,empl_deaths,-6013.0
54,1998,113,empl_expand,10818.0
49,1998,113,empl_initial,84150.0


In [57]:
susb_naics02 = [convert_naics(susb_naics97, 'naics97', 'naics02', xwalk9702)]
susb_naics02.extend([
    read_susb_table(initial_year)
    for initial_year in range(2003, 2007)
])
susb_naics02.append(read_susb_db(2007))
susb_naics02 = pd.concat(susb_naics02)
susb_naics02.sort_values(['initial_year', 'naics_code', 'var']).head()

2003
2004
2005
2006
2007


Unnamed: 0,initial_year,naics_code,var,value
0,1998,113,empl_births,5619.0
1,1998,113,empl_contract,-10608.0
2,1998,113,empl_deaths,-6013.0
3,1998,113,empl_expand,10818.0
4,1998,113,empl_initial,84150.0


In [58]:
susb_naics12 = pd.concat([
    read_susb_db(initial_year)
    for initial_year in range(2013, 2015)
])

2013
2014


In [59]:
susb_naics07 = [
    convert_naics(susb_naics02, 'naics02', 'naics07', xwalk0207)
]
susb_naics07.extend([
    read_susb_db(initial_year)
    for initial_year in range(2008, 2012)
])
susb_naics07.append(
    convert_naics(susb_naics12, 'naics12', 'naics07', xwalk1207)
)
susb_naics07 = pd.concat(susb_naics07)

2008
2009
2010
2011


In [84]:
Path('data/final').mkdir(parents=True, exist_ok=True)
susb_naics07.to_csv('data/final/susb_combined.csv')

In [85]:
(
    pd.read_csv(f'data/susb/us_state_emplchange_2015-2016.txt')
    .loc[lambda df: df['STATE'] == 0]
    .loc[lambda df: df['ENTRSIZE'] == 1]
    [VARMAP_DB.keys()]
    .rename(columns=VARMAP_DB)
    .loc[lambda df: df['naics_code'].astype(str).str.contains('\d+')]
    .assign(naics_code=lambda df: (
        df['naics_code']
        .astype(str)
        .str.strip('-')
        .str.split('-').str[0]
        .astype(int)
    ))
    .loc[lambda df: df['naics_code'] >= 100]
)

Unnamed: 0,naics_code,estb_initial,empl_initial,empl_initial_flag,estb_births,empl_births,empl_births_flag,estb_deaths,empl_deaths,empl_deaths_flag,estb_expand,empl_expand,empl_expand_flag,estb_contract,empl_contract,empl_contract_flag
16,113,7749,56155,G,848,3153,G,739,-2451,G,1975,5338,G,2112,-6526,G
24,1131,422,3666,H,33,123,G,30,-67,J,113,727,H,72,-203,H
32,1132,149,1395,G,21,94,J,23,-130,G,38,125,H,31,-135,G
40,1133,7178,51094,G,794,2936,G,686,-2254,G,1824,4486,G,2009,-6188,G
48,114,1530,7509,G,285,646,H,213,-922,H,214,600,G,190,-711,G
56,1141,1240,5905,G,258,596,H,185,-841,H,148,374,G,129,-569,G
64,1142,290,1604,G,27,50,G,28,-81,H,66,226,G,61,-142,G
72,115,9181,96455,G,1107,5585,G,998,-4805,H,2022,9057,G,1934,-8923,G
80,1151,3949,65303,G,450,3784,G,398,-3035,H,993,6175,G,1001,-6253,G
88,1152,3877,19468,G,504,1321,G,456,-1285,G,750,1705,G,634,-1370,G


In [90]:
(
    susb_naics07
    .loc[lambda df: (
        (df['var'] == 'estb_births')
        & (df['value'] == 0)
    )]
)

Unnamed: 0,initial_year,naics_code,var,value
4375,2014,521,estb_births,0.0
6595,2014,5211,estb_births,0.0
6645,2014,5232,estb_births,0.0
