In [None]:
import pandas as pd
import os
from collections import defaultdict
from data_setup import FixID
import config

### Note on use
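This notebook assumes the County Business Patterns (CBP) data files from https://www.census.gov/programs-surveys/cbp/data/datasets.html have been downloaded and placed in a folder called `cbp_data/` in the same directory as this notebook. The year of each file is read from the fourth and fifth characters of its name, so the files should have names in which those characters are the two-digit year.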

In [None]:
# List the downloaded CBP files in a deterministic (sorted) order, since os.listdir
# returns entries in arbitrary order. Dot-files such as .DS_Store are skipped so
# they are never handed to the CSV reader.
cbp_files = sorted(name for name in os.listdir('./cbp_data/') if not name.startswith('.'))

In [None]:
# Read every CBP file into a DataFrame keyed by its four-digit year string.
# The two-digit year is taken from characters 3-4 of the file name
# (assumes names shaped like 'cbpYY...' -- TODO confirm against the download).
cbp_dfs = {}
for file in cbp_files:
    year_key = '20' + file[3:5]
    # Tag each row with its year so the source survives the later stacking step.
    cbp_dfs[year_key] = pd.read_csv('./cbp_data/' + file).assign(year=year_key)

In [None]:
# Lower-case the column names of every year's frame (not only 2015), so a
# differently-cased header in any file cannot split one field into two columns
# when the frames are stacked, and a missing 2015 file no longer raises KeyError.
for cbp_year_df in cbp_dfs.values():
    cbp_year_df.columns = [column.lower() for column in cbp_year_df.columns]

In [None]:
# Stack the per-year frames into one long table (dict order = load order)
CBP_merged = pd.concat(list(cbp_dfs.values()))

In [None]:
# Keep only the total rows, i.e. those whose naics field is the placeholder '------'
is_total_row = CBP_merged['naics'] == '------'
CBP_merged = CBP_merged.loc[is_total_row]

In [None]:
# Display the rows tagged with year '2015' as a quick visual check
CBP_merged.loc[CBP_merged['year'] == '2015']

In [None]:
# Run both FIPS columns through FixID (county with 3, state with 2).
# NOTE(review): FixID presumably pads the code to that width -- confirm in data_setup.
CBP_merged = CBP_merged.assign(
    fipscty=CBP_merged['fipscty'].apply(lambda code: FixID(code, 3)),
    fipstate=CBP_merged['fipstate'].apply(lambda code: FixID(code, 2)),
)

In [None]:
# Combined county identifier: state code immediately followed by county code
CBP_merged = CBP_merged.assign(county=CBP_merged['fipstate'] + CBP_merged['fipscty'])

In [None]:
# Create the output folder if needed so the write does not fail on a fresh checkout
os.makedirs('./pickles', exist_ok=True)
# Persist the cleaned CBP totals for the later cells of this notebook
CBP_merged.to_pickle('./pickles/CBP.p', protocol = 4)

In [None]:
# Select path to credentials: the value comes from the local `config` module and is
# exported through the GOOGLE_APPLICATION_CREDENTIALS environment variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=config.GOOGLE_APPLICATION_CREDENTIALS
# Load the BigQuery IPython extension, which provides the %%bigquery cell magic used below
%load_ext google.cloud.bigquery

In [None]:
# Reload the CBP table written earlier in this notebook, plus the ZRI table
# (ZRI_filtered.p is not produced in this notebook and must already exist)
CBP, ZRI = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('./pickles/CBP.p', './pickles/ZRI_filtered.p')
)


In [None]:
%%bigquery --use_rest_api hud
-- Fetch the county/ZIP crosswalk columns; later cells read the result as the DataFrame hud.
SELECT COUNTY, ZIP, BUS_RATIO, DATE
FROM `high-empire-220313.hud_crosswalk.county_zip_formatted` 

In [None]:
# Derive Year and Quarter columns from DATE.
# NOTE(review): FixID(x, 6) presumably yields a 6-character string whose first two
# characters are the month and the rest the year -- confirm in data_setup.
date_text = hud['DATE'].astype(str)
# Year: everything after the first two characters
hud['Year'] = date_text.apply(lambda raw: FixID(raw, 6)[2:])
# Quarter: the leading two-character number divided by 3, truncated, kept as a string
hud['Quarter'] = date_text.apply(lambda raw: str(int(int(FixID(raw, 6)[:2]) / 3)))

In [None]:
# Average BUS_RATIO over the rows of each (COUNTY, ZIP, Year) group.
# Only BUS_RATIO is aggregated: Quarter holds strings, and taking the mean of every
# remaining column raises a TypeError on pandas >= 2.0 instead of silently dropping
# it. The later cells of this notebook read only COUNTY, ZIP, Year and BUS_RATIO.
hud_agg = hud.groupby(['COUNTY','ZIP','Year'], as_index=False)['BUS_RATIO'].mean()

In [None]:
# Every (ZIP, Year) pairing present in the aggregated crosswalk
zip_year = hud_agg.loc[:, ['ZIP', 'Year']]

In [None]:
# A ZIP can map to several counties, so collapse repeated (ZIP, Year) rows to one
zip_year = zip_year.drop_duplicates(keep='first')

In [None]:
# ZRI region names passed through FixID so they can be compared with the ZIP column
# NOTE(review): relies on FixID's default second argument -- confirm it matches ZIP width.
zri_zips = ZRI['RegionName'].apply(FixID)

In [None]:
%%time
zip_filtered = zip_year[zip_year.ZIP.apply(lambda x: x in list(zri_zips))]

In [None]:
# Restrict to years strictly after 2010 and strictly before 2019
year_number = zip_filtered['Year'].astype(int)
within_window = (year_number > 2010) & (year_number < 2019)
zip_filtered = zip_filtered[within_window]

In [None]:
# (ZIP, Year) pairs in row order. Stored as a list because a bare zip object is
# exhausted after one pass, which would leave any second consumer (or a re-run of a
# consuming cell) with nothing to iterate.
zipped_years = list(zip(zip_filtered.ZIP, zip_filtered.Year))

In [None]:
def zip_business(zip_code, year, value):
    """Allocate a county-level CBP measure to one ZIP for one year.

    Reads two notebook-level DataFrames: ``hud_agg`` (columns COUNTY, ZIP, Year,
    BUS_RATIO) and ``CBP`` (columns county, year and the measure columns).

    Parameters
    ----------
    zip_code : value compared against ``hud_agg.ZIP``.
    year : value compared against both ``hud_agg.Year`` and ``CBP.year``.
    value : name of the CBP column to allocate (the notebook uses 'est', 'emp', 'ap').

    Returns
    -------
    The sum, over the crosswalk rows for this ZIP and year, of
    ``BUS_RATIO * CBP[value]`` for the matching county and year. Counties with no
    CBP row for that year contribute nothing; the result is 0 when nothing matches.

    Raises
    ------
    KeyError
        If ``value`` is not a column of ``CBP`` and at least one crosswalk row exists.
    """
    crosswalk = hud_agg[(hud_agg.ZIP == zip_code) & (hud_agg.Year == year)]
    # The year filter does not depend on the county, so apply it once up front.
    cbp_for_year = CBP[CBP.year == year]
    zip_value = 0
    # hud_agg holds one row per (COUNTY, ZIP, Year), so each crosswalk row is a
    # distinct county and carries its own ratio.
    for county, ratio in zip(crosswalk.COUNTY, crosswalk.BUS_RATIO):
        matches = cbp_for_year.loc[cbp_for_year.county == county, value]
        if matches.empty:
            # Best effort: a county without CBP data for this year adds nothing.
            continue
        zip_value += ratio * matches.iloc[0]
    return zip_value

In [None]:
%%time
est =  [zip_business(zip_code,year,'est') for zip_code,year in zipped_years]

In [None]:
%%time
emp =  [zip_business(zip_code,year,'emp') for zip_code,year in zip(list(zip_filtered.ZIP),list(zip_filtered.Year))]

In [None]:
%%time
ap =  [zip_business(zip_code,year,'ap') for zip_code,year in zip(list(zip_filtered.ZIP),list(zip_filtered.Year))]

In [None]:
%%time
# Disabled alternative: row-wise DataFrame.apply version of the 'emp' computation.
# Kept for reference only; nothing in this cell executes.
#zip_filtered['emp'] = zip_filtered.apply(lambda x: zip_business(x.loc['ZIP'],x.loc['Year'],'emp'),axis = 1)

In [None]:
# Attach the three allocated measures as columns; each list is in zip_filtered row order
allocated_measures = {'est': est, 'emp': emp, 'ap': ap}
zip_filtered = zip_filtered.assign(**allocated_measures)

In [None]:
# Persist the ZIP-level CBP table (pickle protocol 4)
zip_filtered.to_pickle(path='./pickles/cbp_zip.p', protocol=4)