## Pull BPS Permit data for NYC Metro municipalities outside of NYC & NYC data from Bytes of the Big Apple - SINGLE YEAR

NYC Housing Database download page (Bytes of the Big Apple): https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-housing-database.page#housingdevelopmentproject

For greatest similarity to BPS data, the HousingDB_post2010 (inactives included) file is used.

In [16]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [17]:
from geo import stco_fips,sub

#### SET THESE VARIABLES AND DATA LINKS

In [18]:
# first and last year of BPS annual files to download, and the single year
# that the final tables are filtered to
base_yr, end_yr, pull_yr = '2010', '2020', '2020'

# NYC Housing Database extract, read further down when building the NYC table
nyc_db_datapath = '../data/permits/HousingDB_post2010_inactive_included.csv'

# geo crosswalk files: geo_subpl is joined to the BPS records on 'id',
# geo_nyc is joined to the NYC records on 'ct_id'
geo_subpl = pd.read_csv('../data/geo/subpl10.csv')
geo_nyc = pd.read_csv('../data/geo/nyc_subbor_10.csv')

### Build Table for NYC Metro Municipalities (no NYC)

In [19]:
# set link location
urls = 'https://www2.census.gov/econ/bps/Place/Northeast%20Region/'
resp = requests.get(urls)
# stop here on an HTTP error instead of scraping an error page into an empty
# file list (which would only surface later as a confusing concat failure)
resp.raise_for_status()

# pull annual files ("a"): keep the link text of every anchor naming an 'a.txt' file
soup = BeautifulSoup(resp.text, 'html.parser')
file_links = [anchor.get_text() for anchor in soup.find_all('a', href=True)
              if 'a.txt' in anchor.get_text()]

# reduce years to what we want vs. all historical files
data_yrs = [str(x) for x in range(int(base_yr), int(end_yr) + 1)]
# characters 2-5 of the file name are compared against the wanted years
# NOTE(review): assumes names shaped like '<2-char prefix><yyyy>a.txt' - confirm against the listing
file_links = [x for x in file_links if x[2:6] in data_yrs]

In [20]:
# state+county FIPS codes treated as NYC; these are filtered out of the BPS table
nyc = ['36' + county for county in ('005', '047', '061', '081', '085')]

# identifier columns kept from each BPS file (lower-cased, two header rows joined)
id_fields = [
    'surveydate',
    'statecode',
    '6-digitid',
    'countycode',
    'fips placecode',
    'fips mcdcode',
    'placename',
]

# each N of units has Buildings, Units, and Valuation columns, in that order;
# the Units column is the one named HP*
val_cols = [
    colname
    for size, units in (('1un', 'HP1'), ('2un', 'HP2'), ('3-4un', 'HP3-4'), ('5+un', 'HP5'))
    for colname in (f'{size}_bldg', units, f'{size}_val')
]

In [21]:
# build muni table of all data for full Northeast
from io import BytesIO

# one cleaned DataFrame per annual file, keyed by file name
all_data = dict.fromkeys(file_links)

for link in file_links:
    # download each annual file once and reuse the payload for both the
    # header inspection and the pandas parse
    file_resp = requests.get(f'{urls}{link}')
    file_resp.raise_for_status()

    # the column header is spread over the first two lines; join them per
    # column (row0 is padded with one empty label before zipping by position)
    rows = file_resp.text.split('\n')
    row0 = [x.lower() for x in rows[0].split(',')] + ['']
    row1 = [x.lower() for x in rows[1].split(',')]
    cols = [row0[ind] + col_y for ind, col_y in enumerate(row1)]

    # positions and names of the identifier columns present in this file
    cols_id = [cols.index(x) for x in cols if x in id_fields]
    model_colnames = [id_field for id_field in id_fields if id_field in cols] + \
                        val_cols + ['reported_' + colname for colname in val_cols]

    df = pd.read_csv(BytesIO(file_resp.content), header=1, sep=',', skipinitialspace=True,
                     low_memory=False, dtype=str)

    # every column from the first 'bldgs' header onward is a value column
    cols_val = cols.index('bldgs')
    cols_id += list(range(cols_val, len(df.columns)))

    # explicit copy so the column assignments below act on an independent frame
    df = df.iloc[:, cols_id].copy()
    df.columns = model_colnames
    df['stco'] = df.statecode + df.countycode
    df = df[df.stco.isin(stco_fips)].copy()
    df.columns = df.columns.str.replace(' ', '_')
    # the year is every digit in the file name (raw string: '\D' is a regex
    # class, not a string escape)
    df['year'] = re.sub(r'\D+', '', link)

    all_data[link] = df

In [22]:
# stack the per-file frames into one long table
permits = pd.concat(objs=list(all_data.values()))

In [23]:
# reduce table to rest of metro (drop the NYC counties)
outside_nyc = ~permits['stco'].isin(nyc)
permits = permits[outside_nyc].copy()

# cleanup place codes: missing place code becomes '00000', trailing blanks removed
permits['fips_placecode'] = permits['fips_placecode'].replace(np.nan, '00000').str.rstrip()

# adjust Long Island reporting geos: in counties 36059 and 36103 a record with
# a real place code is identified by state+place, everything else by
# state+county+MCD
on_long_island = permits['stco'].isin(['36059', '36103'])
has_place = permits['fips_placecode'] != '00000'
place_id = (permits['statecode'] + permits['fips_placecode']).str.strip()
cousub_id = (permits['stco'] + permits['fips_mcdcode']).str.strip()
permits['id'] = np.where(on_long_island & has_place, place_id, cousub_id)
permits = permits.dropna(subset=['id'])

# join to geo crosswalk data, clean up ids
# Manually correct Palm Tree NY & Kiryas Joel
permits.loc[permits['id'] == '3607156185', 'id'] = '3607147999'
permits['id'] = permits['id'].astype(int)
permits = permits.merge(geo_subpl, how='left', on='id')
# records without a crosswalk match are dropped
permits = permits.dropna(subset=['geoid'])

In [24]:
# reduce table to single year: keep only rows whose file year equals pull_yr
permits = permits.loc[permits['year'] == pull_yr]

In [25]:
# reduced table for final nyc metro munis: geography id plus the four unit
# counts, all cast to integers in one pass
reg = permits[['geoid', 'HP1', 'HP2', 'HP3-4', 'HP5']].astype(int)
# HP2 and HP3-4 are both relabelled HP24, leaving two columns with that name
reg = reg.rename(columns={'geoid': 'id', 'HP2': 'HP24', 'HP3-4': 'HP24'})

In [26]:
# HP2 and HP3-4 were both renamed to HP24, so reg carries two HP24 columns.
# Collapse same-named columns by transposing and grouping on the label;
# DataFrame.groupby(..., axis=1) is deprecated in pandas 2.1+.
reg = reg.T.groupby(level=0).sum().T
# total each unit-size class per geography id
reg = reg.groupby(['id']).sum().reset_index()
# HP is the all-sizes total
reg['HP'] = reg['HP1'] + reg['HP24'] + reg['HP5']

### Pull NYC Housing Database permits

In [27]:
# data must be downloaded & retrieved from folder in project
nyc_db = pd.read_csv(nyc_db_datapath, low_memory=False)
# a single-space PermitYear is treated as missing so the column parses as numeric
nyc_db.loc[nyc_db.PermitYear == ' ', 'PermitYear'] = np.nan
nyc_db['PermitYear'] = pd.to_numeric(nyc_db['PermitYear'])

# filter for new buildings and pull year; .copy() so the columns added below
# are set on an independent frame rather than on a slice of the full table
is_new_building = nyc_db.Job_Type == 'New Building'
in_pull_year = nyc_db.PermitYear == int(pull_yr)
nyc_db = nyc_db[is_new_building & in_pull_year].copy()

# make tract id: first 11 characters of the census block id
# (computed after filtering, so only the retained rows are converted)
nyc_db['ct_id'] = [int(str(block)[:11]) for block in nyc_db.CenBlock10]

# separate permits into same categories as BPS data:
# (0.1, 1] -> HP1, (1, 4] -> HP24, (4, inf) -> HP5; values <= 0.1 get no class
nyc_db['size'] = pd.cut(nyc_db['ClassAProp'], bins=[0.1, 1, 4, np.inf], include_lowest=False,
                        labels=['HP1', 'HP24', 'HP5'])

# reduce table and clean up: rows without a size class are dropped
nyc_db = nyc_db[['ct_id', 'size', 'ClassAProp']]
nyc_db = nyc_db.dropna(subset=['size'])
nyc_db['size'] = nyc_db['size'].astype(str)

In [28]:
# recode to nyc sub borough breakdowns: attach the crosswalk by tract id,
# then discard the crosswalk columns that are not needed downstream
nyc_db = nyc_db.merge(geo_nyc, how='left', on='ct_id')
nyc_db = nyc_db.drop(['boro', 'ct_id', 'puma', 'nta_id', 'nta_nm', 'name'], axis=1)

In [29]:
# pivot for the pull year: one row per sub-borough id, one column per size
# class, summing ClassAProp. aggfunc is given as the string 'sum' because
# passing np.sum is deprecated in recent pandas.
nyc_db = pd.pivot_table(nyc_db, values='ClassAProp', index=['id'], columns=['size'],
                        aggfunc='sum', fill_value=0, margins=False)
# guarantee all three size columns exist even if a class has no permits
nyc_db = nyc_db.reindex(columns=['HP1', 'HP24', 'HP5'], fill_value=0).reset_index()
nyc_db['HP'] = nyc_db['HP1'] + nyc_db['HP24'] + nyc_db['HP5']

### Combine tables into final permits table
Calculate subregion, region totals & export to intermediate csv

In [31]:
# stack the rest-of-metro table on top of the NYC table
df = pd.concat(objs=[reg, nyc_db], axis=0)

In [None]:
# preview the first five rows of the combined table
df.head(n=5)

In [32]:
# name the export after the configured pull year so changing pull_yr cannot
# mislabel the file or overwrite another year's output
df.to_csv(f'{pull_yr}_houperm_m.csv', index=False)

In [33]:
# display the NYC table (one row per id with HP1, HP24, HP5 and HP columns)
nyc_db

size,id,HP1,HP24,HP5,HP
0,36005CS,1,40,1979,2020
1,36005NE,0,23,346,369
2,36005W,4,4,3326,3334
3,36047C,2,24,2270,2296
4,36047E,0,19,1244,1263
5,36047N,1,27,953,981
6,36047S,20,45,623,688
7,36047W,13,12,988,1013
8,36061E,0,0,521,521
9,36061L,0,0,349,349
