# Census LEHD WAC Supportive Stats

This notebook is meant to explore summary statistics that may be used to support the 2015 LQ, 2002–2015 LQ diff, and job density analyses.

Ideas could be:
- determine raw job count changes at the county level for each county
- look for noticeable changes by job sector (e.g. places where manufacturing jobs took a hit and were replaced by another job category)

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import sys, os

## Load Data

In [2]:
# specify paths for csv and shapefile data
dirname = os.path.dirname(os.path.realpath("__file__"))
wac2015_filepath = os.path.join(dirname, "../data/wac/ca_wac_S000_JT00_2015.csv.gz")
wac2002_filepath = os.path.join(dirname, "../data/wac/ca_wac_S000_JT00_2002.csv.gz")
cxwalk_filepath = os.path.join(dirname, "../data/wac/ca_xwalk.csv.gz")
tracts_shp_filepath = os.path.join(dirname, "../data/census_tracts/tracts_2010_4326.shp")

In [3]:
# load 2002 & 2015 census wac data, plus crosswalk file
wac2015 = pd.read_csv(wac2015_filepath, sep=",", delimiter=None, header="infer", names=None, index_col=None, usecols=None, compression="gzip")
wac2002 = pd.read_csv(wac2002_filepath, sep=",", delimiter=None, header="infer", names=None, index_col=None, usecols=None, compression="gzip")
cxwalk = pd.read_csv(cxwalk_filepath, sep=",", delimiter=None, header="infer", names=None, index_col=None, usecols=None, compression="gzip", encoding="ISO-8859-1", low_memory=False)

## Filter and Rollup 

In [4]:
# filter crosswalk table by 9 counties of SF Bay Area
cty_fips_list = [6001, 6013, 6041, 6055, 6075, 6081, 6085, 6095, 6097]
cxwalk = cxwalk[cxwalk['cty'].isin(cty_fips_list)]

In [5]:
# keep only the block and tract id columns
cxwalk = cxwalk[['tabblk2010', 'trct', 'cty', 'ctyname']]

In [6]:
# join 2015 and 2002 wac files to cxwalk using fields w_geocode and tabblk2010
wac2015 = wac2015.merge(cxwalk, how="inner", left_on="w_geocode", right_on="tabblk2010")
wac2002 = wac2002.merge(cxwalk, how="inner", left_on="w_geocode", right_on="tabblk2010")

In [7]:
# NAICS codes for each super category
makers = ['CNS01', 'CNS02', 'CNS03', 'CNS04', 'CNS05', 'CNS06', 'CNS08']
services = ['CNS07', 'CNS14', 'CNS17', 'CNS18']
professions = ['CNS09', 'CNS10', 'CNS11', 'CNS12', 'CNS13']
support = ['CNS15', 'CNS16', 'CNS19', 'CNS20']

In [8]:
# create new aggregate columns for various job sectors
wac2015['makers'] = wac2015[makers].sum(axis=1)
wac2015['services'] = wac2015[services].sum(axis=1)
wac2015['professions'] = wac2015[professions].sum(axis=1)
wac2015['support'] = wac2015[support].sum(axis=1)
wac2015['total'] = wac2015['C000']

wac2002['makers'] = wac2002[makers].sum(axis=1)
wac2002['services'] = wac2002[services].sum(axis=1)
wac2002['professions'] = wac2002[professions].sum(axis=1)
wac2002['support'] = wac2002[support].sum(axis=1)
wac2002['total'] = wac2002['C000']

In [9]:
# make sure things add up
assert sum(wac2015['C000'] -(wac2015['makers'] + wac2015['services'] + wac2015['professions'] + wac2015['support'])) == 0
assert sum(wac2002['C000'] -(wac2002['makers'] + wac2002['services'] + wac2002['professions'] + wac2002['support'])) == 0

In [10]:
# keep only the columns we need from the wac dataframe
to_keep = ['trct', 'cty', 'ctyname', 'makers', 'services', 'professions', 'support', 'total']
wac2015 = wac2015[to_keep]
wac2002 = wac2002[to_keep]

In [11]:
# create a map of county names to county fips codes
cty_map = wac2015[['cty', 'ctyname']].copy().drop_duplicates()

In [12]:
# group and aggregate data by county
wac2015_cty = wac2015.groupby('cty', as_index=False).agg(np.sum)
wac2002_cty = wac2002.groupby('cty', as_index=False).agg(np.sum)

wac2015_cty

Unnamed: 0,cty,trct,makers,services,professions,support,total
0,6001,67137994689004,189422,193351,143201,225266,751240
1,6013,39297182491667,61866,102059,71442,124395,359762
2,6041,11592901957680,16390,34820,21659,39602,112471
3,6055,6957425901726,25190,22246,5669,20499,73604
4,6075,30709363047279,74938,192580,238913,194185,700616
5,6081,29678247471438,86735,101077,115786,84334,387932
6,6085,67567382424048,262518,245661,256660,242029,1006868
7,6095,16835088103469,30630,37413,11800,64630,144473
8,6097,23681339408254,52227,56591,22549,61678,193045


In [13]:
# new dataframe for county stats
cd = pd.DataFrame()

In [14]:
cd['county_fips'] = wac2015_cty['cty']

cd = cd.merge(cty_map, left_on='county_fips', right_on='cty')
cd = cd[['ctyname']]
cd['ctyname'] = cd['ctyname'].str[:-10]

cd['maker_change'] = wac2015_cty['makers'] - wac2002_cty['makers']
cd['prof_change'] = wac2015_cty['professions'] - wac2002_cty['professions']
cd['services_change'] = wac2015_cty['services'] - wac2002_cty['services']
cd['support_change'] = wac2015_cty['support'] - wac2002_cty['support']
cd['total_change'] = wac2015_cty['total'] - wac2002_cty['total']

cd['maker_pct_change'] = (cd['maker_change'] / wac2002_cty['makers']).round(2)
cd['prof_pct_change'] = (cd['prof_change'] / wac2002_cty['professions']).round(2)
cd['services_pct_change'] = (cd['services_change'] / wac2002_cty['services']).round(2)
cd['support_pct_change'] = (cd['support_change'] / wac2002_cty['support']).round(2)

cd = cd.sort_values('total_change', ascending=False)

print(cd)

          ctyname  maker_change  prof_change  services_change  support_change  \
4  San Francisco           6406        77280            35238           63860   
6    Santa Clara         -20752        62599            43517           58457   
0        Alameda         -18162         -852            32079           47572   
5      San Mateo          -8786        31506             9817           14624   
1   Contra Costa          -3155         2673            10900           25038   
7         Solano          -1118         -827             1975           26611   
8         Sonoma          -2506        -1205             9040            9202   
3           Napa           3068          122             5251            5655   
2          Marin          -1055        -3105             1707            9353   

   total_change  maker_pct_change  prof_pct_change  services_pct_change  \
4        182784              0.09             0.48                 0.22   
6        143821             -0.07      

In [15]:
# save above table to csv
cd.to_csv(os.path.join(dirname, "county_level_change.csv"), index=False)

## Regional change of each category from 2002 – 2015

In [16]:
# new data frame for 9 county sf bay area region
reg = pd.DataFrame(
    columns=['category', '2002', '2015', 'change', 'pct_change'],
    index=[0, 1, 2, 3, 4]
)

reg['category'] = ['makers', 'services', 'professions', 'support', 'total']

#print(reg)

In [17]:
# calc 2015 & 2002 yearly totals for each category
reg['2015'] = [
    wac2015['makers'].sum(),
    wac2015['services'].sum(),
    wac2015['professions'].sum(),
    wac2015['support'].sum(),
    wac2015['total'].sum()
]

reg['2002'] = [
    wac2002['makers'].sum(),
    wac2002['services'].sum(),
    wac2002['professions'].sum(),
    wac2002['support'].sum(),
    wac2002['total'].sum()
]

#print(reg)

In [18]:
# calc the net and percentage diff between years
reg['change'] = reg['2015'] - reg['2002']
reg['pct_change'] = (reg['change'] / reg['2002'] * 100).round(2)

print(reg)

      category     2002     2015  change  pct_change
0       makers   845976   799916  -46060       -5.44
1     services   836274   985798  149524       17.88
2  professions   719488   887679  168191       23.38
3      support   796246  1056618  260372       32.70
4        total  3197984  3730011  532027       16.64


In [20]:
# save above table to csv
reg.to_csv(os.path.join(dirname, "region_level.csv"), index=False)