### Read in NHPD data

In [189]:
#Pandas display options
import pandas as pd
pd.options.display.max_rows = 999

In [190]:
#Use gdown to download the NHPD data
!pip install gdown



In [191]:
#Download the NHPD data into the ./data/ folder
import gdown

url = 'https://drive.google.com/u/0/uc?id=19EMDNi6AT4pecx_iWltZv0GxLYqv4Eof&export=download'
output = './data/active_and_inconclusive_communities.xlsx'
gdown.download(url, output, quiet=False)


Downloading...
From: https://drive.google.com/u/0/uc?id=19EMDNi6AT4pecx_iWltZv0GxLYqv4Eof&export=download
To: /Users/netinupur/Desktop/sep21-housing-insecurity/code/netinupur/data/active_and_inconclusive_communities.xlsx
44.4MB [00:01, 36.8MB/s]


'./data/active_and_inconclusive_communities.xlsx'

In [192]:
#Read the downloaded file
#nhpd_data=pd.read_excel('./data/active_and_inconclusive_communities.xlsx')

In [193]:
# If you have converted to csv locally use the below line, it is faster to read in 
nhpd_data = pd.read_csv("./data/active_and_inconclusive_communities.csv")

In [194]:
# Subset to Hillsborough, Miami-Dade and Orange counties
nhpd_data = nhpd_data[nhpd_data['County'].isin(['Hillsborough','Miami-Dade','Orange'])]

### Column Selection

In [195]:
# NA cleaning 
# Subset cols which have less NAs than max_NA_percentage and drop them 
max_NA_percentage = 0.95
print("Total Columns", nhpd_data.shape[1])
nhpd_data = nhpd_data.loc[:, nhpd_data.isnull().sum() < max_NA_percentage*nhpd_data.shape[0]]
print("Total Columns after removing columns with high NAs", nhpd_data.shape[1])

Total Columns 252
Total Columns after removing columns with high NAs 128


In [196]:
list(nhpd_data.columns)

['NHPDPropertyID',
 'PropertyName',
 'PropertyAddress',
 'City',
 'State',
 'Zip',
 'CBSACode',
 'CBSAType',
 'County',
 'CountyCode',
 'CensusTract',
 'CongressionalDistrict',
 'Latitude',
 'Longitude',
 'PropertyStatus',
 'ActiveSubsidies',
 'TotalInconclusiveSubsidies',
 'TotalInactiveSubsidies',
 'TotalUnits',
 'EarliestStartDate',
 'EarliestEndDate',
 'LatestEndDate',
 'Owner',
 'OwnerType',
 'ManagerName',
 'ManagerType',
 'HUDPropertyID',
 'ReacScore1',
 'ReacScore1Date',
 'ReacScore2',
 'ReacScore2Date',
 'ReacScore3',
 'ReacScore3Date',
 'StudioOneBedroomUnits',
 'TwoBedroomUnits',
 'ThreePlusBedroomUnits',
 'PercentofELIHouseholds',
 'TargetTenantType',
 'FairMarketRent_2BR',
 'EarliestConstructionDate',
 'LatestConstructionDate',
 'OccupancyRate',
 'AverageMonthsOfTenancy',
 'NumberActiveSection8',
 'NumberInconclusiveSection8',
 'NumberInactiveSection8',
 'S8_1_ID',
 'S8_1_Status',
 'S8_1_ProgramName',
 'S8_1_RenewalStatus',
 'S8_1_StartDate',
 'S8_1_EndDate',
 'S8_1_Assist

In [197]:
#Incorporate Anabel and Sam's analysis of full and/or informative columns + which has usable data
colstokeep=[
'CountyCode',
'CensusTract',
'County',
'ActiveSubsidies',
'TotalInconclusiveSubsidies',
'TotalInactiveSubsidies',
'TotalUnits',
'OwnerType',
'TargetTenantType',
 'ManagerType',
'StudioOneBedroomUnits',
'TwoBedroomUnits',
'ThreePlusBedroomUnits',
'FairMarketRent_2BR']
nhpd_data=nhpd_data[colstokeep]

In [198]:
nhpd_data.shape

(1609, 14)

In [199]:
# Thanks to Laura Prichard - Homogenizing column formats

# Target Tenant Type cleaning
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Eldery or Disabled", "Elderly or Disabled")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Mixed;Link", "Mixed")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Indv. families - not eld/ handicap", "Family")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Senior", "Elderly")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Family & Elderly", "Mixed")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Homeless Veterans", "Veterans")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Mixed Income", "Mixed")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("OTHER", "Mixed")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.replace("Affordable", "Low Income")
nhpd_data['TargetTenantType'] = nhpd_data['TargetTenantType'].str.title()

  


### Column Aggregation per census Tract

In [214]:
# Make lists of columns to sum and those to average at a census tract level
columns_to_sum = ['ActiveSubsidies', 'TotalInconclusiveSubsidies',
       'TotalInactiveSubsidies', 'TotalUnits', ]
columns_to_average = ['TotalUnits',
 'StudioOneBedroomUnits',
 'TwoBedroomUnits',
 'ThreePlusBedroomUnits',
 'FairMarketRent_2BR']
columns_for_mode = ['TargetTenantType','ManagerType', 'OwnerType']

#Create a dictionary to pass to agg function 
agg_dict = dict()

for col in columns_to_sum:
    agg_dict[col] = "sum"
for col in columns_to_average:
    agg_dict[col] = "mean"

# Aggregate the dataframe at a census tract level using the aggregate dictionary we just created
nhpd_fl_census_tract = nhpd_data.groupby(['CensusTract', 'County', 'CountyCode']).agg(agg_dict).reset_index()

nhpd_fl_census_tract_mode = nhpd_data.groupby(['CensusTract', 'County', 'CountyCode'])[columns_for_mode].agg(pd.Series.mode).reset_index()


In [215]:
# Join mode grouped df back to sum and avergae data
nhpd_fl_census_tract = pd.merge(nhpd_fl_census_tract, nhpd_fl_census_tract_mode, on = ['CensusTract', 'County', 'CountyCode'], how = 'left')
nhpd_fl_census_tract.head()

Unnamed: 0,CensusTract,County,CountyCode,ActiveSubsidies,TotalInconclusiveSubsidies,TotalInactiveSubsidies,TotalUnits,StudioOneBedroomUnits,TwoBedroomUnits,ThreePlusBedroomUnits,FairMarketRent_2BR,TargetTenantType,ManagerType,OwnerType
0,6059001000.0,Orange,6059.0,1,0,1,56.0,55.0,0.0,0.0,2010.0,Elderly,Profit Motivated,Profit Motivated
1,6059001000.0,Orange,6059.0,2,0,4,58.0,14.0,43.0,36.0,2010.0,Family,For Profit,For Profit
2,6059001000.0,Orange,6059.0,1,0,0,72.0,20.0,52.0,,2010.0,[],[],For Profit
3,6059002000.0,Orange,6059.0,2,0,1,69.0,41.0,50.0,1.0,2440.0,Family,[],For Profit
4,6059002000.0,Orange,6059.0,1,0,0,1.0,,,,2440.0,[],[],[]


### Merging with Housing loss data

In [202]:
#Read housing loss data
#Hillsborough
url = 'https://drive.google.com/uc?id=1abt4fLPO__KxBLz9SXue5VKeZN3cUcCF&export=download'
output = './data/hills_loss.csv'
gdown.download(url, output, quiet=False)
#Miami-Dade
url = 'https://drive.google.com/uc?id=1gLojTGS6HQ1s60gmIxFCq2xObB1634BU&export=download'
output = './data/miami_loss.csv'
gdown.download(url, output, quiet=False)
#Orange
url = 'https://drive.google.com/uc?id=15ee2QrH8a_yuIfptGwAsVF-tWGTEYXCy&export=download'
output = './data/orange_loss.csv'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1abt4fLPO__KxBLz9SXue5VKeZN3cUcCF&export=download
To: /Users/netinupur/Desktop/sep21-housing-insecurity/code/netinupur/data/hills_loss.csv
100%|██████████| 207k/207k [00:00<00:00, 3.23MB/s]
Downloading...
From: https://drive.google.com/uc?id=1gLojTGS6HQ1s60gmIxFCq2xObB1634BU&export=download
To: /Users/netinupur/Desktop/sep21-housing-insecurity/code/netinupur/data/miami_loss.csv
100%|██████████| 321k/321k [00:00<00:00, 4.54MB/s]
Downloading...
From: https://drive.google.com/uc?id=15ee2QrH8a_yuIfptGwAsVF-tWGTEYXCy&export=download
To: /Users/netinupur/Desktop/sep21-housing-insecurity/code/netinupur/data/orange_loss.csv
100%|██████████| 143k/143k [00:00<00:00, 3.02MB/s]


'./data/orange_loss.csv'

In [203]:
housing_loss=pd.concat([pd.read_csv('./data/hills_loss.csv'),
                       pd.read_csv('./data/miami_loss.csv'),
                       pd.read_csv('./data/orange_loss.csv')])

In [204]:
#Get census tracts, compare with those in NHPD data
housing_loss.rename(columns={'census_tract_GEOID':'CensusTract'},inplace=True)

In [216]:
nhpd_fl_census_tract=nhpd_fl_census_tract.merge(housing_loss,how="inner",on=['CensusTract'])

In [217]:
nhpd_fl_census_tract.head()

Unnamed: 0,CensusTract,County,CountyCode,ActiveSubsidies,TotalInconclusiveSubsidies,TotalInactiveSubsidies,TotalUnits,StudioOneBedroomUnits,TwoBedroomUnits,ThreePlusBedroomUnits,...,lien-foreclosure-rate-2019,avg-eviction-rate,ratio-to-mean-foreclosure-rate,ratio-to-mean-eviction-rate,avg-housing-loss-rate,evictions-pct-total-housing-loss,housing-loss-index,county_GEOID,county,state
0,12057000000.0,Hillsborough,12057.0,8,0,1,55.571429,69.5,5.6,0.0,...,0.0,2.860353,1.311987,1.262704,2.448286,0.919438,1.613917,12057,Hillsborough County,Florida
1,12057000000.0,Hillsborough,12057.0,4,0,3,63.0,92.0,1.333333,0.0,...,0.613497,3.600563,1.947692,1.58947,3.444325,0.971989,2.270508,12057,Hillsborough County,Florida
2,12057000000.0,Hillsborough,12057.0,1,0,0,84.0,,,,...,,4.615454,1.983724,2.037495,3.711221,0.893916,2.446447,12057,Hillsborough County,Florida
3,12057000000.0,Hillsborough,12057.0,1,0,0,2.0,,1.0,1.0,...,0.111235,3.730073,2.862517,1.646643,3.314329,0.851795,2.184815,12057,Hillsborough County,Florida
4,12057000000.0,Hillsborough,12057.0,1,0,0,96.0,72.0,24.0,0.0,...,0.437637,5.487367,1.628673,2.422401,3.914863,0.893568,2.580689,12057,Hillsborough County,Florida


In [218]:
nhpd_fl_census_tract.shape

(354, 86)

#### Column selection in merge data

In [219]:
# Subset columns for analysis
cols_to_keep = ['ActiveSubsidies',
       'TotalInconclusiveSubsidies', 'TotalInactiveSubsidies', 'TotalUnits',
       'StudioOneBedroomUnits', 'TwoBedroomUnits', 'ThreePlusBedroomUnits',
       'FairMarketRent_2BR', 'total-households',
       'total-renter-occupied-households', 'total-owner-occupied-households',
       'total-owner-occupied-households-mortgage', 'median-gross-rent',
       'median-household-income', 'median-property-value',
       'median-monthly-housing-cost', 'pct-white', 'pct-af-am', 'pct-hispanic',
       'pct-am-indian', 'pct-asian', 'pct-nh-pi', 'pct-multiple', 'pct-other',
       'pct-below-poverty-level', 'households-children',
       'single-parent-household', 'older-adult-alone', 'level-of-education',
       'immigrant-status', 'english-fluency', 'drive-to-work',
       'public-transport-to-work', 'vacant-properties', 'live-in-mobile-home',
       'pct-renter-occupied', 'pct-owner-occupied',
       'pct-owner-occupied-mortgage', 'pct-owner-occupied-without-mortgage',
       'median-house-age', 'pct-non-white', 'pct-without-health-insurance',
       'total-evictions', 'avg-evictions', 'total-foreclosure-sales',
       'avg-foreclosure-sales', 'total-lien-foreclosures',
       'avg-lien-foreclosures', 'total-evictions-2017',
       'eviction-filings-2017', 'eviction-rate-2017', 'total-evictions-2018',
       'eviction-filings-2018', 'eviction-rate-2018', 'total-evictions-2019',
       'eviction-filings-2019', 'eviction-rate-2019', 'foreclosure-sales-2017',
       'foreclosure-sales-2018', 'foreclosure-sales-2019',
       'lien-foreclosures-2017', 'lien-foreclosures-2018',
       'lien-foreclosures-2019', 'avg-foreclosure-rate',
       'foreclosure-rate-2017', 'foreclosure-rate-2018',
       'foreclosure-rate-2019', 'avg-lien-foreclosure-rate',
       'lien-foreclosure-rate-2017', 'lien-foreclosure-rate-2018',
       'lien-foreclosure-rate-2019', 'avg-eviction-rate',
       'ratio-to-mean-foreclosure-rate', 'ratio-to-mean-eviction-rate',
       'avg-housing-loss-rate', 'evictions-pct-total-housing-loss',
       'housing-loss-index']

nhpd_fl_census_tract=nhpd_fl_census_tract[cols_to_keep]

In [220]:
# See percentage of NAs in joined data
nhpd_fl_census_tract.isnull().sum()/nhpd_fl_census_tract.shape[0]

ActiveSubsidies                             0.000000
TotalInconclusiveSubsidies                  0.000000
TotalInactiveSubsidies                      0.000000
TotalUnits                                  0.000000
StudioOneBedroomUnits                       0.220339
TwoBedroomUnits                             0.163842
ThreePlusBedroomUnits                       0.132768
FairMarketRent_2BR                          0.000000
total-households                            0.000000
total-renter-occupied-households            0.000000
total-owner-occupied-households             0.000000
total-owner-occupied-households-mortgage    0.000000
median-gross-rent                           0.000000
median-household-income                     0.000000
median-property-value                       0.000000
median-monthly-housing-cost                 0.000000
pct-white                                   0.000000
pct-af-am                                   0.000000
pct-hispanic                                0.

In [221]:
# Drop rows with NAs since columns that has NAs are <25% NAs
nhpd_fl_census_tract.dropna(inplace = True)

In [222]:
nhpd_fl_census_tract.shape

(225, 77)

### Finding correlations

In [212]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=30):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(nhpd_fl_census_tract))

Top Absolute Correlations
pct-white                        pct-non-white                               1.000000
pct-renter-occupied              pct-owner-occupied                          1.000000
total-evictions                  avg-evictions                               0.999976
total-foreclosure-sales          avg-foreclosure-sales                       0.999742
total-lien-foreclosures          avg-lien-foreclosures                       0.998025
total-evictions-2019             eviction-filings-2019                       0.986200
total-owner-occupied-households  total-owner-occupied-households-mortgage    0.977952
total-evictions-2018             eviction-filings-2018                       0.976844
total-evictions                  total-evictions-2018                        0.976151
avg-evictions                    total-evictions-2018                        0.976095
total-evictions                  total-evictions-2017                        0.969518
avg-evictions               

In [223]:
### Removing columns which are redundant 
cols_to_drop = ['pct-white','pct-owner-occupied','total-evictions','total-foreclosure-sales','total-lien-foreclosures',
               'total-lien-foreclosures', 'total-evictions-2019','total-owner-occupied-households','total-evictions-2018','total-evictions',
               'total-foreclosure-sales', 'total-lien-foreclosures','total-lien-foreclosures','total-evictions-2019',
                'total-owner-occupied-households','total-evictions-2018','total-evictions-2018','total-evictions',
               'foreclosure-sales-2017','eviction-filings-2017','eviction-filings-2018','total-evictions-2019',
               'eviction-filings-2017','eviction-filings-2018','lien-foreclosures-2017','total-evictions-2017', 'eviction-rate-2017', 'eviction-rate-2018',
       'eviction-filings-2019', 'eviction-rate-2019', 'foreclosure-sales-2018',
       'foreclosure-sales-2019', 'lien-foreclosures-2018',
       'lien-foreclosures-2019', 
       'foreclosure-rate-2017', 'foreclosure-rate-2018',
       'foreclosure-rate-2019', 
       'lien-foreclosure-rate-2017', 'lien-foreclosure-rate-2018',
       'lien-foreclosure-rate-2019']

nhpd_fl_census_tract = nhpd_fl_census_tract.drop(cols_to_drop, axis = 1)

In [224]:
nhpd_fl_census_tract.columns

Index(['ActiveSubsidies', 'TotalInconclusiveSubsidies',
       'TotalInactiveSubsidies', 'TotalUnits', 'StudioOneBedroomUnits',
       'TwoBedroomUnits', 'ThreePlusBedroomUnits', 'FairMarketRent_2BR',
       'total-households', 'total-renter-occupied-households',
       'total-owner-occupied-households-mortgage', 'median-gross-rent',
       'median-household-income', 'median-property-value',
       'median-monthly-housing-cost', 'pct-af-am', 'pct-hispanic',
       'pct-am-indian', 'pct-asian', 'pct-nh-pi', 'pct-multiple', 'pct-other',
       'pct-below-poverty-level', 'households-children',
       'single-parent-household', 'older-adult-alone', 'level-of-education',
       'immigrant-status', 'english-fluency', 'drive-to-work',
       'public-transport-to-work', 'vacant-properties', 'live-in-mobile-home',
       'pct-renter-occupied', 'pct-owner-occupied-mortgage',
       'pct-owner-occupied-without-mortgage', 'median-house-age',
       'pct-non-white', 'pct-without-health-insurance', 

In [225]:
nhpd_fl_census_tract.to_csv("data/nhpd_fl_census_tract_housing_loss.csv", index=False)

### Next Steps

For anyone working on this, you could take the output data of this, and build a regression model to predict a target of your choice. Something related to eviction rates or foreclosure and then find the stastistically significant predictors of the target to see which variables affect evictions.