### NHPD data aggregation

#### Load and filter data

In [1]:
import pandas as pd

In [37]:
# Read in data and see first few lines`b
nhpd = pd.read_csv("data/active_and_inconclusive_properties.csv")
print(nhpd.shape)
nhpd.head()

  interactivity=interactivity, compiler=compiler, result=result)


(82287, 252)


Unnamed: 0,NHPDPropertyID,PropertyName,PropertyAddress,City,State,Zip,CBSACode,CBSAType,County,CountyCode,...,NumberActiveMR,NumberInconclusiveMR,NumberInactiveMR,Mr_1_Status,Mr_1_ProgramName,Mr_1_AssistedUnits,Mr_2_Status,Mr_2_ProgramName,Mr_2_AssistedUnits,OldNHPDPropertyID
0,1000000,IVY ESTATES,6729 Zeigler Blvd,Mobile,AL,36608-4253,33660.0,Metropolitan Statistical Area,Mobile,1097.0,...,0,0,0,,,,,,,
1,1000001,RENDU TERRACE WEST,7400 Old Shell Rd,Mobile,AL,36608-4549,33660.0,Metropolitan Statistical Area,Mobile,1097.0,...,0,0,0,,,,,,,
2,1000002,TWB RESIDENTIAL OPPORTUNITIES II,93 Canal Rd,Port Jefferson Station,NY,11776-3024,35620.0,Metropolitan,Suffolk,36103.0,...,0,0,0,,,,,,,
3,1000003,THE DAISY HOUSE,615 Clarissa St,Rochester,NY,14608-2485,40380.0,Metropolitan,Monroe,36055.0,...,0,0,0,,,,,,,
4,1000004,MAIN AVENUE APARTMENTS,105 E Walnut St,Sylacauga,AL,35150-3012,45180.0,Micropolitan Statistical Area,Talladega,1121.0,...,0,0,0,,,,,,,


In [11]:
# Subset to Hillsborough, Miami-Dade and Orange counties
nhpd_fl = nhpd[nhpd['County'].isin(['Hillsborough','Miami-Dade','Orange'])]

# See number of records for each of the three counties
nhpd_fl['County'].value_counts()

Orange          739
Miami-Dade      535
Hillsborough    335
Name: County, dtype: int64

#### Data Cleaning

In [73]:
# NA cleaning 
# Subset cols which have less NAs than max_NA_percentage and drop them 
max_NA_percentage = 0.5
print("Total Columns", nhpd_fl.shape[1])
nhpd_fl_without_na = nhpd_fl.loc[:, nhpd_fl.isnull().sum() < max_NA_percentage*nhpd_fl.shape[0]]
print("Total Columns after removing columns with high NAs", nhpd_fl_without_na.shape[1])

Total Columns 252
Total Columns after removing columns with high NAs 66


In [83]:
# We see that the percent of Extremely Low income households got dropped while removing columns with high NAs
# We need this column to find the census tracts with high percentage of low income households 
# We will impute this column with the mean per county
nhpd_fl.groupby('County').PercentofELIHouseholds.mean().reset_index(name = 'mean_PercentofELIHouseholds')
nhpd_fl['PercentofELIHouseholds'].fillna(value=nhpd_fl['mean_PercentofELIHouseholds'], inplace=True)

Unnamed: 0,County,mean_PercentofELIHouseholds
0,Hillsborough,82.623173
1,Miami-Dade,87.048136
2,Orange,84.841608


In [None]:
# NA cleaning after imputing PercentofELIHouseholds
# Subset cols which have less NAs than max_NA_percentage and drop them 
max_NA_percentage = 0.5
print("Total Columns", nhpd_fl.shape[1])
nhpd_fl_without_na = nhpd_fl.loc[:, nhpd_fl.isnull().sum() < max_NA_percentage*nhpd_fl.shape[0]]
print("Total Columns after removing columns with high NAs", nhpd_fl_without_na.shape[1])

In [87]:
nhpd_fl_without_na.head()

Unnamed: 0,NHPDPropertyID,PropertyName,PropertyAddress,City,State,Zip,CBSACode,CBSAType,County,CountyCode,...,NumberActiveState,NumberInconclusiveState,NumberInactiveState,NumberActivePBV,NumberInconclusivePBV,NumberInactivePBV,NumberActiveMR,NumberInconclusiveMR,NumberInactiveMR,OldNHPDPropertyID
159,1000163,BUENA VISTA APARTMENTS,521 SW 6th St,Miami,FL,33130-2773,33100.0,Metropolitan Statistical Area,Miami-Dade,12086.0,...,0,0,0,0,0,0,0,0,0,
161,1000165,VILLA BEATRIZ,776 NW 2nd St,Miami,FL,33128-1454,33100.0,Metropolitan Statistical Area,Miami-Dade,12086.0,...,0,0,0,0,0,0,0,0,0,
312,1000323,RED LION INN,9 Pleasant St,Randolph,VT,05060-1131,17200.0,Micropolitan,Orange,50017.0,...,0,0,0,0,0,0,0,0,0,
547,1000612,STIRRUP PLAZA PHASE TWO,3170 Mundy St,Miami,FL,33133-4310,33100.0,Metropolitan Statistical Area,Miami-Dade,12086.0,...,0,0,0,0,0,0,0,0,0,
548,1000613,JOE MORETTI II,535 SW 6th St,Miami,FL,33130-2745,33100.0,Metropolitan Statistical Area,Miami-Dade,12086.0,...,0,0,0,0,0,0,0,0,0,


#### Aggregate at a census tract level

In [95]:
# Make lists of columns to sum and those to average at a census tract level and create a dictionary to pass to agg function 
columns_to_sum = ['ActiveSubsidies', 'TotalInconclusiveSubsidies',
       'TotalInactiveSubsidies', 'TotalUnits', 
       'NumberActiveSection8', 'NumberInconclusiveSection8',
       'NumberInactiveSection8', 'NumberActiveSection202',
       'NumberInconclusiveSection202', 'NumberInactiveSection202',
       'NumberActiveSection236', 'NumberInconclusiveSection236',
       'NumberInactiveSection236', 'NumberActiveHUDInsured',
       'NumberInconclusiveHUDInsured', 'NumberInactiveHud',
       'NumberActiveLihtc', 'NumberInconclusiveLihtc', 'NumberInactiveLihtc',
       'NumberActiveSection515', 'NumberInconclusiveSection515',
       'NumberInactiveSection515', 'NumberActiveSection538',
       'NumberInconclusiveSection538', 'NumberInactiveSection538',
       'NumberActiveHome', 'NumberInconclusiveHome', 'NumberInactiveHome',
       'NumberActivePublicHousing', 'NumberInconclusivePublicHousing',
       'NumberInactivePublicHousing', 'NumberActiveState',
       'NumberInconclusiveState', 'NumberInactiveState', 'NumberActivePBV',
       'NumberInconclusivePBV', 'NumberInactivePBV', 'NumberActiveMR',
       'NumberInconclusiveMR', 'NumberInactiveMR']
columns_to_average = ['TotalUnits',
 'StudioOneBedroomUnits',
 'TwoBedroomUnits',
 'ThreePlusBedroomUnits',
 'PercentofELIHouseholds',
 'FairMarketRent_2BR']

agg_dict = dict()

for col in columns_to_sum:
    agg_dict[col] = "sum"
for col in columns_to_average:
    agg_dict[col] = "mean"
    
print(agg_dict)

{'ActiveSubsidies': 'sum', 'TotalInconclusiveSubsidies': 'sum', 'TotalInactiveSubsidies': 'sum', 'TotalUnits': 'mean', 'NumberActiveSection8': 'sum', 'NumberInconclusiveSection8': 'sum', 'NumberInactiveSection8': 'sum', 'NumberActiveSection202': 'sum', 'NumberInconclusiveSection202': 'sum', 'NumberInactiveSection202': 'sum', 'NumberActiveSection236': 'sum', 'NumberInconclusiveSection236': 'sum', 'NumberInactiveSection236': 'sum', 'NumberActiveHUDInsured': 'sum', 'NumberInconclusiveHUDInsured': 'sum', 'NumberInactiveHud': 'sum', 'NumberActiveLihtc': 'sum', 'NumberInconclusiveLihtc': 'sum', 'NumberInactiveLihtc': 'sum', 'NumberActiveSection515': 'sum', 'NumberInconclusiveSection515': 'sum', 'NumberInactiveSection515': 'sum', 'NumberActiveSection538': 'sum', 'NumberInconclusiveSection538': 'sum', 'NumberInactiveSection538': 'sum', 'NumberActiveHome': 'sum', 'NumberInconclusiveHome': 'sum', 'NumberInactiveHome': 'sum', 'NumberActivePublicHousing': 'sum', 'NumberInconclusivePublicHousing': 

In [96]:
# Aggregate the dataframe at a census tract level using the aggregate dictionary we just created
nhpd_fl_census_tract = nhpd_fl_without_na.groupby('CensusTract').agg(agg_dict)
nhpd_fl_census_tract.head()

Unnamed: 0_level_0,ActiveSubsidies,TotalInconclusiveSubsidies,TotalInactiveSubsidies,TotalUnits,NumberActiveSection8,NumberInconclusiveSection8,NumberInactiveSection8,NumberActiveSection202,NumberInconclusiveSection202,NumberInactiveSection202,...,NumberInconclusivePBV,NumberInactivePBV,NumberActiveMR,NumberInconclusiveMR,NumberInactiveMR,StudioOneBedroomUnits,TwoBedroomUnits,ThreePlusBedroomUnits,PercentofELIHouseholds,FairMarketRent_2BR
CensusTract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6059001000.0,1,0,1,56.0,1,0,0,0,0,0,...,0,0,0,0,0,55.0,0.0,0.0,96.0,2010.0
6059001000.0,2,0,4,58.0,1,0,0,0,0,0,...,0,0,0,0,0,14.0,43.0,36.0,83.056701,2010.0
6059001000.0,1,0,0,72.0,0,0,0,0,0,0,...,0,0,0,0,0,20.0,52.0,,85.113402,2010.0
6059002000.0,2,0,1,69.0,0,0,0,0,0,0,...,0,0,0,0,0,41.0,50.0,1.0,85.113402,2440.0
6059002000.0,1,0,0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,,,,85.113402,2440.0


### Combining with Housing loss data

In [14]:
# Read in housing loss data
hill_hl = pd.read_csv("data/hillsborough_fl_processed_2017_to_2019_20210916.csv")
miami_dade_hl = pd.read_csv("data/miami_dade_fl_processed_2017_to_2019_20210916.csv")
orange_hl = pd.read_csv("data/orange_fl_processed_2017_to_2019_20210916.csv")


### Combining with American Community Survey data

In [17]:
# Read in 2014 acs data
hill_acs_2014 = pd.read_csv("data/hillsborough_acs5-2014_census.csv")
miami_dade_acs_2014 = pd.read_csv("data/miami_dade_acs5-2014_census.csv")
orange_acs_2014 = pd.read_csv("data/orange_acs5-2014_census.csv")

# Read in 2019 acs data
hill_acs_2019 = pd.read_csv("data/hillsborough_acs5-2019_census.csv")
miami_dade_acs_2019 = pd.read_csv("data/miami_dade_acs5-2019_census.csv")
orange_acs_2019 = pd.read_csv("data/orange_acs5-2019_census.csv")

In [32]:
# Combine 2014 and 2019 data
hill_acs_2014['year'] = 2014
hill_acs_2019['year'] = 2019
#TODO: fix this
hills_acs = pd.concat([hill_acs_2014,hill_acs_2019], axis = 1)
hills_acs.head()

Unnamed: 0,index,DP02_0001E,DP02_0001PE,DP02_0002E,DP02_0002PE,DP02_0003E,DP02_0003PE,DP02_0004E,DP02_0004PE,DP02_0005E,...,B25087_038E,B25087_039E,B25088_001E,B25088_002E,B25088_003E,B25092_001E,B25092_002E,B25092_003E,GEOID,year
0,"Census Tract 111.03, Hillsborough County, Flor...",1268,1268,1012,79.8,374,29.5,826,65.1,302,...,0,0,991,1172,359,19.9,26.3,9.0,12057000401,2019
1,"Census Tract 114.08, Hillsborough County, Flor...",1078,1078,811,75.2,332,30.8,692,64.2,288,...,0,10,866,1549,391,17.3,23.3,9.0,12057010105,2019
2,"Census Tract 114.13, Hillsborough County, Flor...",2111,2111,1444,68.4,704,33.3,1012,47.9,465,...,0,0,608,1155,263,18.6,21.9,15.7,12057010203,2019
3,"Census Tract 116.08, Hillsborough County, Flor...",428,428,319,74.5,146,34.1,272,63.6,110,...,0,15,1241,1584,566,14.2,16.9,9.0,12057010600,2019
4,"Census Tract 116.11, Hillsborough County, Flor...",1596,1596,990,62.0,438,27.4,609,38.2,241,...,0,10,1174,1400,408,17.8,18.7,14.2,12057002300,2019
