# ACS Housing Unit by Building Size

#### This notebook pulls raw building size categorizations from the U.S. Census Bureau ACS profile tables and recategorizes to DCP-defined building size categories: 1-unit, 2-unit, 3 to 4-unit, and 5+ unit buildings.

Multiple end years can be stored in the `years` list to generate figures for more than one ACS period. The user must set the variables and other parameters below before running. To generate new aggregate margins of error and statistical reliability figures, the margin-of-error columns for each aggregated category must be kept separate from the estimate columns.

- https://www.census.gov/data/developers/data-sets/acs-5year.html
- https://api.census.gov/data/2010/acs/acs5/profile/variables.html
- https://api.census.gov/data/2019/acs/acs5/profile/variables.html

Last date updated: 1/7/2022

### SET PROXY PRIOR TO RUNNING

In [1]:
# SET TO True/False.
# When this is False the proxy cell below sets p = None (off-network);
# otherwise p is imported from the local `proxy` module.
onDCPServer = True

In [2]:
import pandas as pd
import json
import requests
import math
import numpy as np
import os

In [3]:
import get_acs as get
import utilcalcs as calc
import geo_agg
from geo import *

In [4]:
## Proxy assignment: `p` is passed to every getter call below.
## It is None when working off-network.
if onDCPServer:
    # `proxy` is a local module and is only imported when on the DCP network
    from proxy import p
else:
    p = None

## Set parameters for API call

In [5]:
# Indicate the source, year(s), and column group to use in the data getter functions

# ACS 1-year profile endpoint (changed from the 5-year estimates for the
# housing-jobs balance analysis).
# NOTE(review): ACS 1-year estimates are only published for geographies with
# populations of 65,000 or more, so the place / MCD / tract pulls further down
# may return little or no data with this source -- confirm before running them.
source ='acs/acs1/profile'

# Table DP04 has housing units by structure size
col = 'group(DP04)'

# End years of the ACS periods to pull: years[0] is "year 0", years[1] is "year 1"
years = ['2010','2019']

Housing unit variables we want from table DP04:
- 0006 = total units
- 0007 = 1 unit detached
- 0008 = 1 unit attached
- 0009 = 2 units
- 0010 = 3 or 4 units
- 0011 = 5 to 9 units 
- 0012 = 10 to 19 units
- 0013 = 20+ units
- 0014 = Mobile home
- 0015 = RV, boat, other

In [6]:
# Estimate (E) and margin-of-error (M) column names are kept in separate lists
# so that each aggregate can sum its estimates and recompute its MOE on its own.

# 1-unit buildings: detached + attached
U1E = ['DP04_0007E', 'DP04_0008E']
U1M = ['DP04_0007M', 'DP04_0008M']

# 5+ unit buildings: 5 to 9, 10 to 19, 20+
U5E = ['DP04_0011E', 'DP04_0012E', 'DP04_0013E']
U5M = ['DP04_0011M', 'DP04_0012M', 'DP04_0013M']

# Other: mobile home + RV, boat, other
UOthE = ['DP04_0014E', 'DP04_0015E']
UOthM = ['DP04_0014M', 'DP04_0015M']

# Every column to keep: geography ID, total units, then each category in order
var_data = [
    'GEO_ID',
    'DP04_0006E', 'DP04_0006M',
    *U1E, *U1M,
    'DP04_0009E', 'DP04_0009M',
    'DP04_0010E', 'DP04_0010M',
    *U5E, *U5M,
    *UOthE, *UOthM,
]

### Import geography crosswalk files

In [7]:
# Crosswalk for NYC subborough calculations
geo_nyc = pd.read_csv('../../data/geo/nyc_subbor_10.csv')

# Adjusted subplace list, used so calculations are comparable over time.
# IDs are converted to strings for the isin() filters further down.
geo_subpl = pd.read_csv('../../data/geo/subpl10.csv')
subpl10 = list(map(str, geo_subpl['id']))

# THIS IS CUSTOM FOR 2019
# Note the naming: liplace19 is the DataFrame, liplaces19 is the list of ID strings.
liplace19 = pd.read_csv('../../data/geo/liplaces19.csv')
liplaces19 = list(map(str, liplace19['GEOID']))

#### Calculate building size columns function

In [8]:
def group_bsize(df,year):
    """Sum selected DP04 building-size columns and recalculate their MOEs.

    Parameters
    ----------
    df : DataFrame
        Raw table from one of the `get` getters. It is first passed through
        get.clean_data together with var_data.
    year : str
        End year of the ACS period, e.g. '2019'. Its last two characters are
        inserted into every data column name.

    Returns
    -------
    DataFrame
        GEO_ID plus estimate/MOE pairs for UT (total), U1, U2, U34, U5 and
        UOth, named like 'U1_19E' / 'U1_19M'.
    """
    df = get.clean_data(df,var_data)

    # For each aggregated category: sum the component estimates, then let
    # calc.get_moe derive the category MOE from the component MOE columns.
    aggregates = (('U1', U1E, U1M), ('U5', U5E, U5M), ('UOth', UOthE, UOthM))
    for label, est_cols, moe_cols in aggregates:
        df[f'{label}_E'] = df.loc[:, est_cols].sum(axis=1)
        df[f'{label}_M'] = df.apply(lambda row: calc.get_moe(row[moe_cols]), axis=1)

    # Drop the raw component columns and give the pass-through columns short names
    df = df.drop(columns=U1E+U1M+U5E+U5M+UOthE+UOthM).rename(
        columns={'DP04_0006E':'UT_E', 'DP04_0006M':'UT_M',
                 'DP04_0009E':'U2_E', 'DP04_0009M':'U2_M',
                 'DP04_0010E':'U34_E', 'DP04_0010M':'U34_M'})

    # Add the end year into each data column name ('U1_E' -> 'U1_19E').
    # GEO_ID is skipped by name rather than by position, and all columns are
    # renamed in a single call instead of one in-place rename per column.
    yy = year[-2:]
    suffixed = {name: f'{name[:-1]}{yy}{name[-1:]}'
                for name in df.columns if name != 'GEO_ID'}
    return df.rename(columns=suffixed)


# TODO: add a calc_change(df) function to calculate the change between the two years

### County + Subregion - Year 0 & Year 1

In [9]:
# Pull the county-level DP04 group once for each of the two end years
c_y0, c_y1 = [get.get_county(p, source, years[i], col) for i in (0, 1)]

In [10]:
# Recategorize each county pull into building-size groups, tagged with its end year
c_y0, c_y1 = [
    group_bsize(raw, end_year)
    for raw, end_year in ((c_y0, years[0]), (c_y1, years[1]))
]

In [11]:
# Put both years side by side, one row per county.
# Inner join: a GEO_ID missing from either year is dropped.
cou = c_y0.merge(c_y1, on='GEO_ID', how='inner')
cou.head()

Unnamed: 0,GEO_ID,UT_10E,UT_10M,U2_10E,U2_10M,U34_10E,U34_10M,U1_10E,U1_10M,U5_10E,...,U2_19E,U2_19M,U34_19E,U34_19M,U1_19E,U1_19M,U5_19E,U5_19M,UOth_19E,UOth_19M
0,9001,361355.0,1420.0,30828.0,3095.0,32502.0,3151.0,228072.0,4764.005458,68692.0,...,26683.0,2846.0,30628.0,2668.0,238853.0,5101.023917,77753.0,4850.845081,1451.0,630.436357
1,9005,87543.0,270.0,6899.0,1318.0,4874.0,1120.0,68606.0,1963.761951,6945.0,...,5489.0,1151.0,5938.0,1618.0,68614.0,2178.462302,8226.0,1534.784024,147.0,304.78353
2,9009,361966.0,1093.0,36450.0,3109.0,42612.0,3101.0,210443.0,4369.261723,70534.0,...,38272.0,3266.0,38917.0,3457.0,215884.0,5539.231716,73594.0,4235.386877,2013.0,771.129042
3,34003,352412.0,875.0,49977.0,3224.0,17509.0,1980.0,206623.0,4983.859649,76604.0,...,47023.0,4131.0,18716.0,2658.0,211523.0,5729.320466,82037.0,4643.124487,1523.0,642.343366
4,34013,312960.0,2038.0,46575.0,3313.0,51256.0,2896.0,124041.0,4257.473547,90285.0,...,44676.0,3665.0,48969.0,4144.0,126850.0,4409.002155,98740.0,5114.13238,454.0,377.974867


In [12]:
# Tag each county row with a 'sub' value looked up from sub_7.
# sub_7 comes from `from geo import *`; presumably it maps county GEO_ID to a
# subregion code -- TODO confirm. GEO_IDs that sub_7 does not cover become NaN.
cou['sub'] = cou['GEO_ID'].map(sub_7)

In [13]:
# Build the subregion table from the county rows:
# drop the county ID, move 'sub' to the front, aggregate by 'sub' with
# geo_agg.sumgeo_cv, then expose the subregion code under the GEO_ID name.
subreg = cou.drop(columns='GEO_ID').copy()
subreg = subreg[['sub', *(name for name in subreg.columns if name != 'sub')]]
subreg = geo_agg.sumgeo_cv(subreg, 'sub').rename(columns={'sub': 'GEO_ID'})

In [14]:
# Display the subregion aggregates
subreg

Unnamed: 0,GEO_ID,UOth_19E,UOth_19M,UOth_19C,U34_10E,U34_10M,U34_10C,UT_10E,UT_10M,UT_10C,...,U2_10C,U34_19E,U34_19M,U34_19C,U5_10E,U5_10M,U5_10C,UT_19E,UT_19M,UT_19C
0,CT,3611.0,1041.625173,17.535497,79988.0,4560.63614,3.466049,810864.0,1812.166935,0.135858,...,3.753951,75483.0,4656.929997,3.750461,146171.0,6056.058867,2.518622,832462.0,1217.907221,0.088937
1,INJ,5921.0,1364.504672,14.009223,166310.0,6080.996382,2.222749,1919412.0,3667.411485,0.116152,...,1.684571,167452.0,7443.584083,2.702252,478641.0,10222.149725,1.298274,1974898.0,2238.836528,0.068915
2,ONJ,12953.0,2103.102232,9.870158,25964.0,2684.14437,6.284466,836425.0,2048.130855,0.148855,...,5.670072,27337.0,3101.061592,6.895943,111599.0,5480.951195,2.985587,853033.0,1570.97613,0.111954
3,NYC,6056.0,1391.378453,13.96669,356925.0,8567.828313,1.459244,3370647.0,4781.717578,0.086239,...,1.254578,334866.0,9770.890594,1.77377,2012062.0,16074.57085,0.48566,3546601.0,3645.032099,0.062477
4,MHV,16520.0,2093.901144,7.705135,25221.0,2871.799262,6.921909,388642.0,3930.337772,0.614772,...,6.832887,23643.0,2896.972385,7.448621,48380.0,3588.591506,4.509125,403656.0,1393.110907,0.209801
5,LI,6388.0,1571.329692,14.953281,26808.0,2947.111467,6.68292,1038217.0,1778.560373,0.104139,...,4.28666,21043.0,2845.71397,8.220868,102052.0,4954.910292,2.951538,1051597.0,1098.219013,0.063485
6,LHV,2782.0,986.105471,21.547671,38769.0,3431.327586,5.380364,513042.0,2984.711041,0.353658,...,4.845231,43230.0,4394.663696,6.179801,141781.0,5490.971681,2.354319,522965.0,1013.686835,0.117833


In [15]:
# List the column names of the aggregated subregion table
subreg.columns

Index(['GEO_ID', 'UOth_19E', 'UOth_19M', 'UOth_19C', 'U34_10E', 'U34_10M',
       'U34_10C', 'UT_10E', 'UT_10M', 'UT_10C', 'U1_10E', 'U1_10M', 'U1_10C',
       'U1_19E', 'U1_19M', 'U1_19C', 'U2_19E', 'U2_19M', 'U2_19C', 'UOth_10E',
       'UOth_10M', 'UOth_10C', 'U5_19E', 'U5_19M', 'U5_19C', 'U2_10E',
       'U2_10M', 'U2_10C', 'U34_19E', 'U34_19M', 'U34_19C', 'U5_10E', 'U5_10M',
       'U5_10C', 'UT_19E', 'UT_19M', 'UT_19C'],
      dtype='object')

In [16]:
# Stack the county rows on top of the subregion rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each input's own row labels.
# Columns present in only one input ('sub' from cou, the *C columns from
# subreg) are filled with NaN for the other input's rows.
cousubreg = pd.concat([cou,subreg], ignore_index=True)

In [17]:
# Preview the first rows of the combined county + subregion table
cousubreg.head()

Unnamed: 0,GEO_ID,UT_10E,UT_10M,U2_10E,U2_10M,U34_10E,U34_10M,U1_10E,U1_10M,U5_10E,...,UT_10C,U1_10C,U1_19C,U2_19C,UOth_10C,U5_19C,U2_10C,U34_19C,U5_10C,UT_19C
0,9001,361355.0,1420.0,30828.0,3095.0,32502.0,3151.0,228072.0,4764.005458,68692.0,...,,,,,,,,,,
1,9005,87543.0,270.0,6899.0,1318.0,4874.0,1120.0,68606.0,1963.761951,6945.0,...,,,,,,,,,,
2,9009,361966.0,1093.0,36450.0,3109.0,42612.0,3101.0,210443.0,4369.261723,70534.0,...,,,,,,,,,,
3,34003,352412.0,875.0,49977.0,3224.0,17509.0,1980.0,206623.0,4983.859649,76604.0,...,,,,,,,,,,
4,34013,312960.0,2038.0,46575.0,3313.0,51256.0,2896.0,124041.0,4257.473547,90285.0,...,,,,,,,,,,


### Subplace - Year 0

In [19]:
# Places for year 0: fetch them, then keep only the IDs listed in subpl10.
# assumes GEO_ID holds strings here, since subpl10 is a list of strings -- TODO confirm
pl_y0 = get.get_place(p, source, years[0], col)
pl_y0 = pl_y0.loc[pl_y0['GEO_ID'].isin(subpl10)]

# All MCDs in the region for year 0
mcd_y0 = get.get_mcd(p, source, years[0], col)

# Stack places and MCDs, then recategorize into building-size groups
df_y0 = group_bsize(pd.concat([pl_y0, mcd_y0]), years[0])

### Subplace Year 1

In [20]:
# Places for year 1: fetch them, then keep only the IDs listed in liplaces19.
# assumes GEO_ID holds strings here, since liplaces19 is a list of strings -- TODO confirm
pl_y1 = get.get_place(p, source, years[1], col)
pl_y1 = pl_y1.loc[pl_y1['GEO_ID'].isin(liplaces19)]

# All MCDs in the region for year 1
mcd_y1 = get.get_mcd(p, source, years[1], col)

# Stack places and MCDs, then recategorize into building-size groups
df_y1 = group_bsize(pd.concat([pl_y1, mcd_y1]), years[1])

### Census Tract - Year 0

In [21]:
# Fetch the tract-level DP04 group for year 0 and recategorize it in one step
ct_y0 = group_bsize(get.get_tract(p, source, years[0], col), years[0])

### Census Tract - Year 1

In [22]:
# Fetch the tract-level DP04 group for year 1 and recategorize it in one step
ct_y1 = group_bsize(get.get_tract(p, source, years[1], col), years[1])

In [23]:
# Preview the first rows of the year-1 tract table
ct_y1.head()

## Export files to Excel

In [24]:
# Write the combined county + subregion table; the file name carries both end years
cousubreg_path = f'../../output/Housing/BuildSize_ACS_cousubreg_{years[0]}{years[1]}.xlsx'
cousubreg.to_excel(cousubreg_path, index=False)

In [25]:
# Write the year-1 tract table.
# The path is rooted at ../../ like every other data/output path in this
# notebook; it previously pointed at ../output, one directory level short.
ct_y1.to_excel(f'../../output/Housing/BuildSize_ACS_tract_{years[1]}.xlsx',index=False)