## Pull BPS Permit data for NYC Metro counties outside of NYC & NYC data from Bytes of the Big Apple

##### Requires the NYC Housing Database file to be downloaded and stored as a CSV in the project repo (see `nyc_db_datapath` below)

https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-housing-database.page#housingdevelopmentproject
For greatest similarity to BPS data, the HousingDB_post2010 (inactives included) file is used.

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [2]:
from geo import stco_fips,sub

#### SET THESE VARIABLES

In [3]:
# Analysis window, kept as strings: they are cast to int for the year filters
# and their last two digits are used to suffix the output column names.
base_yr, end_yr = '2010', '2020'

# Location of the NYC Housing Database CSV (downloaded manually).
nyc_db_datapath = (
    '../data/nychdb_inactiveincluded_20q4_csv/HousingDB_post2010_inactive_included.csv'
)

### Build Master Table of all County Permit Data from BPS

In [4]:
# Directory listing that holds the BPS county permit files.
urls = 'https://www2.census.gov/econ/bps/County/'

# Fail loudly on a hung connection or an HTTP error page instead of silently
# parsing it into an empty list of files.
resp = requests.get(urls, timeout=60)
resp.raise_for_status()

# Pull annual files ("a"): keep the link text of every anchor whose text
# contains 'a.txt'.
soup = BeautifulSoup(resp.text, 'html.parser')
file_links = [
    anchor.get_text()
    for anchor in soup.find_all('a', href=True)
    if 'a.txt' in anchor.get_text()
]

In [5]:
# Identifier columns that lead every row of a BPS county file.
id_fields = [
    'Survey Date',
    'FIPS State',
    'FIPS County',
    'Region Code',
    'Division Code',
    'County Name',
]

# Each unit-size class contributes three columns, in this order:
# buildings, units (the "HP" column), and valuation.
_unit_classes = [('1', 'HP1'), ('2', 'HP2'), ('3-4', 'HP3-4'), ('5+', 'HP5')]
col_names = [
    name
    for label, units_col in _unit_classes
    for name in (f'{label}un_bldg', units_col, f'{label}un_val')
]

# Full layout: id fields, the unit columns, then the same unit columns again
# with a 'reported_' prefix.
model_colnames = id_fields + col_names + [f'reported_{name}' for name in col_names]

In [6]:
# Build one county-level table from every annual file found in `file_links`.
# Keyed by link so a repeated link is only kept once.
yearly_frames = {}

for link in file_links:
    # Row 0 of each file is skipped; row 1 is consumed as the header and then
    # overwritten with the model column names. Everything is read as text so
    # FIPS codes keep their leading zeros.
    df = pd.read_csv(f'{urls}{link}', header=1, sep=',', skipinitialspace=True,
                     low_memory=False, dtype=str)
    df.columns = model_colnames
    df.columns = df.columns.str.replace(' ', '_')
    # The year is whatever digits appear in the file name (raw string: '\D' is
    # a regex class, not a Python string escape).
    df['year'] = re.sub(r'\D+', '', link)
    # State + county FIPS gives the county id used in later cells.
    df['id'] = df.FIPS_State.str.strip() + df.FIPS_County.str.strip()

    yearly_frames[link] = df

all_data = pd.concat(yearly_frames.values())
# Every column is still a string at this point; trim stray padding everywhere.
all_data = all_data.apply(lambda x: x.str.strip())
all_data['year'] = all_data.year.astype(int)

### Build NYC Metro County table & remove NYC

In [7]:
# Keep only the region's counties and only the base_yr..end_yr window. The
# upper bound matters: the output columns are labelled with both years, so
# later annual files must not leak into the totals.
unit_cols = ['HP1', 'HP2', 'HP3-4', 'HP5']
in_region = all_data.id.isin(stco_fips)
in_period = all_data.year.between(int(base_yr), int(end_yr))
metro = all_data.loc[in_region & in_period, ['id'] + unit_cols].copy()
metro = metro.astype({col: int for col in unit_cols})

# Collapse the 2-unit and 3-4-unit classes into a single 2-4 class, then add
# the all-sizes total. (Replaces the deprecated groupby(..., axis=1) idiom.)
metro['HP24'] = metro['HP2'] + metro['HP3-4']
metro['HP'] = metro['HP1'] + metro['HP24'] + metro['HP5']

# Sum across years per county. Resulting column order is
# id, HP1, HP24, HP5, HP.
metro = metro.groupby('id')[['HP1', 'HP24', 'HP5', 'HP']].sum().reset_index()
metro['id'] = metro['id'].astype(int)

# Drop these five ids (NYC, per the section heading); they are taken from the
# NYC Housing Database instead.
nyc = [36005, 36047, 36061, 36081, 36085]
metro = metro[~metro['id'].isin(nyc)]

### Pull NYC Housing Database permits

In [8]:
# data must be downloaded from bytes of the big apple
# https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-housing-database.page#housingdevelopmentproject

nyc_db = pd.read_csv(f'{nyc_db_datapath}', low_memory=False)

# Blank permit years are stored as a single space; turn them into NaN so the
# column can be made numeric.
nyc_db.loc[nyc_db.PermitYear == ' ', 'PermitYear'] = np.nan
nyc_db['PermitYear'] = pd.to_numeric(nyc_db['PermitYear'])

# Borough id: the first five characters of the 2010 census block code.
nyc_db['id'] = [int(str(block)[:5]) for block in nyc_db.CenBlock10]

# New buildings permitted inside the base_yr..end_yr window only. The upper
# bound keeps the totals in line with the years used to label the output.
# .copy() so the column added below is not written to a slice of the full table.
is_new_building = nyc_db.Job_Type == 'New Building'
in_period = nyc_db.PermitYear.between(int(base_yr), int(end_yr))
nyc_db = nyc_db[is_new_building & in_period].copy()

# Size class by ClassAProp: (0.1, 1] -> HP1, (1, 4] -> HP24, above 4 -> HP5.
# Values at or below 0.1 (and missing values) get no class.
nyc_db['size'] = pd.cut(nyc_db['ClassAProp'], bins=[0.1, 1, 4, np.inf],
                        include_lowest=False, labels=['HP1', 'HP24', 'HP5'])

In [9]:
# Keep only the columns needed for the summary and drop rows that did not
# fall into any size class.
nyc_db = nyc_db[['id', 'size', 'ClassAProp']].dropna(subset=['size'])
# Plain strings rather than a categorical, so the pivot only builds columns
# for classes that actually occur.
nyc_db['size'] = nyc_db['size'].astype(str)

# One row per id, one column per size class, summing ClassAProp.
# aggfunc is given as the string 'sum': passing np.sum is deprecated in
# pandas 2.x and the string form keeps the current behaviour.
nyc_db = pd.pivot_table(nyc_db, values='ClassAProp', index=['id'], columns=['size'],
                        aggfunc='sum', margins=False).reset_index()

In [10]:
# All-sizes total per id: the sum of the three size-class columns.
nyc_total = nyc_db['HP1'].add(nyc_db['HP24']).add(nyc_db['HP5'])
nyc_db['HP'] = nyc_total

### Combine tables into final permits table

In [11]:
# Stack the BPS county rows on top of the NYC Housing Database rows.
df = pd.concat(objs=[metro, nyc_db], axis=0)

In [12]:
# Ids were cast to int upstream; restore them as 5-character strings with
# leading zeros.
df['id'] = df['id'].astype(str).str.pad(width=5, side='left', fillchar='0')

# Subregion totals: map each id to its subregion via `sub`, then sum the
# permit columns per subregion. The value columns are named explicitly rather
# than sliced by position, so a change in column order cannot silently pick up
# the wrong fields. aggfunc='sum' replaces the deprecated np.sum callable.
subreg = df.copy()
subreg['sub'] = subreg.id.map(sub)
subreg = pd.pivot_table(subreg, values=['HP1', 'HP24', 'HP5', 'HP'], index='sub',
                        aggfunc='sum', margins=True).reset_index()

# margins=True appends a grand-total row labelled 'All'; relabel it and line
# the key column up with the county table.
subreg.loc[subreg['sub'] == 'All', ['sub']] = 'METRO'
subreg = subreg.rename(columns={'sub': 'id'})

In [13]:
# County rows followed by subregion/METRO rows.
cosubreg = pd.concat([df, subreg])

# Tag every column except the first with the two-digit start and end years.
period_suffix = f'{base_yr[-2:]}{end_yr[-2:]}'
cosubreg = cosubreg.rename(
    columns={col: f'{col}_{period_suffix}' for col in cosubreg.columns[1:]}
)

In [14]:
# Clean for master join: these three ids are written without their leading zero.
for padded_id in ('09001', '09005', '09009'):
    cosubreg.loc[cosubreg['id'] == padded_id, ['id']] = padded_id.lstrip('0')

In [15]:
# Write the combined county/subregion permit table, without the index column.
houperm_path = '../output/intermediate/houperm_csr.csv'
cosubreg.to_csv(houperm_path, index=False)