In [None]:
#!ls data/

In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict

# Import Data

## EPA's Smart Location Database

In [None]:
# Load EPA's Smart Location Database (SLD) into `sld`.
# The cleaned parquet cache is preferred; if it does not exist, the raw CSV is
# cleaned, augmented with join keys, and written back out as that cache.
# NOTE(review): the cache is never invalidated, so after changing the cleaning
# steps below the parquet file must be deleted for them to take effect.
try:
    sld = pd.read_parquet('data/Smart_Location_Database.parquet')

except FileNotFoundError:
    # The SLD can alternatively be opened from the geodatabase with geopandas
    # (gpd.read_file("SmartLocationDatabase.gdb")); in that case drop the
    # massive, unneeded 'geometry' column before continuing.
    sld = pd.read_csv('data/Smart_Location_Database.csv')

    # State FIPS codes greater than 59 are for Am. Territories.
    # .copy() yields an independent frame, so the column assignments below do
    # not raise SettingWithCopyWarning for writing into a filtered slice.
    sld = sld[sld['STATEFP'] < 60].copy()

    # Create a FIPS column that combines zero-padded State (2) & County (3)
    sld['FIPS'] = (
        sld['STATEFP'].astype('str').str.zfill(2)
        + sld['COUNTYFP'].astype('str').str.zfill(3)
    )

    # All CBGs with population-weighted centroids that were further than
    # three-quarter miles from a transit stop were assigned a value of -99999;
    # convert that sentinel to NaN (vectorized instead of a row-wise apply).
    sld['D4A'] = sld['D4A'].replace(-99999, np.nan)

    # Create a State-County-Tract FIPS code to use for Food_Atlas joins:
    # drop the last character of GEOID10 (presumably the block-group digit)
    # and zero-pad the remainder to 11 characters.
    sld['StCtyTract'] = sld['GEOID10'].astype('str').str[:-1].str.zfill(11)

    # Write out the data to parquet format
    sld.to_parquet("data/Smart_Location_Database.parquet")

## Zillow's Home Value Index by County (ZHVI)

In [None]:
# Load Zillow's Home Value Index by county (ZHVI) into `zillow`, in long
# format (one row per county per monthly column). The parquet cache is used
# when present; otherwise it is rebuilt from the raw CSV and cached.
try:
    zillow = pd.read_parquet('data/zillow.parquet')

except FileNotFoundError:
    zillow = pd.read_csv('data/Zillow_County_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')

    # Limit years to 2013 onwards to focus on 2013-2023 timeframe.
    # range() excludes its end value, so the end must be 2013 for 2012 to be
    # dropped as well (range(2000, 2012) stopped at 2011).
    years_to_drop = [str(year) for year in range(2000, 2013)]

    # Monthly columns are sliced below as "YYYY-MM..." labels, so matching on
    # the 4-character prefix selects exactly the columns of the unwanted years
    # (one pass, no duplicates, no accidental mid-string matches).
    cols_to_drop = [col for col in zillow.columns if col[:4] in years_to_drop]

    # Add uninformative columns for removal from the dataset
    cols_to_drop.extend(['SizeRank', 'RegionType', 'StateName', 'RegionID', 'Metro'])

    # Drop the unrequired years and columns from the dataset
    zillow = zillow.drop(columns=cols_to_drop)

    # Create the FIPS code from the zero-padded State (2) & Municipal (3) Codes
    zillow['FIPS'] = (
        zillow['StateCodeFIPS'].astype('str').str.zfill(2)
        + zillow['MunicipalCodeFIPS'].astype('str').str.zfill(3)
    )

    # Convert zillow df from wide to long for easy merging & comparison of
    # feature columns. After the melt, 'Year' holds the original column label.
    zillow = zillow.melt(
        id_vars=['RegionName', 'State', 'StateCodeFIPS', 'MunicipalCodeFIPS', 'FIPS'],
        var_name='Year', value_name='Home Value')

    # Create integer year and month columns from the label's characters
    # 0-3 (year) and 5-6 (month), using the vectorized .str accessor.
    zillow['Y'] = zillow['Year'].str[:4].astype('int')
    zillow['M'] = zillow['Year'].str[5:7].astype('int')

    zillow.to_parquet('data/zillow.parquet')

## Zillow's Observed Rent Index by County (ZORI)

In [None]:
# Load Zillow's Observed Rent Index by county (ZORI) into `zillow_zori`, in
# long format (one row per county per monthly column). The parquet cache is
# used when present; otherwise it is rebuilt from the raw CSV and cached.
try:
    zillow_zori = pd.read_parquet('data/zillow_zori.parquet')

except FileNotFoundError:
    zillow_zori = pd.read_csv('data/County_zori_sm_sa_month.csv')

    # Zori begins in 2015.
    # Limit years to 2017 onwards to focus on 2017-2023 timeframe.
    # range() excludes its end value, so the end must be 2017 for 2016 to be
    # dropped as well (range(2015, 2016) only covered 2015).
    years_to_drop = [str(year) for year in range(2015, 2017)]

    # Monthly columns are sliced below as "YYYY-MM..." labels, so matching on
    # the 4-character prefix selects exactly the columns of the unwanted years.
    cols_to_drop = [col for col in zillow_zori.columns if col[:4] in years_to_drop]

    # Add uninformative columns for removal from the dataset
    cols_to_drop.extend(['SizeRank', 'RegionType', 'StateName', 'RegionID', 'Metro'])

    # Drop the unrequired years and columns from the dataset
    zillow_zori = zillow_zori.drop(columns=cols_to_drop)

    # Create the FIPS code from the zero-padded State (2) & Municipal (3) Codes
    zillow_zori['FIPS'] = (
        zillow_zori['StateCodeFIPS'].astype('str').str.zfill(2)
        + zillow_zori['MunicipalCodeFIPS'].astype('str').str.zfill(3)
    )

    # Convert zillow_zori df from wide to long for easy merging & comparison
    # of feature columns. After the melt, 'Year' holds the original column label.
    # NOTE(review): the value column is named 'Home Value' although this is a
    # rent index; the name is kept because other cells and the cached parquet
    # may rely on it -- confirm before renaming.
    zillow_zori = zillow_zori.melt(
        id_vars=['RegionName', 'State', 'StateCodeFIPS', 'MunicipalCodeFIPS', 'FIPS'],
        var_name='Year', value_name='Home Value')

    # Create integer year and month columns from the label's characters
    # 0-3 (year) and 5-6 (month), using the vectorized .str accessor.
    zillow_zori['Y'] = zillow_zori['Year'].str[:4].astype('int')
    zillow_zori['M'] = zillow_zori['Year'].str[5:7].astype('int')

    zillow_zori.to_parquet('data/zillow_zori.parquet')

## Food Atlas

In [None]:
# Load the Food Access Research Atlas data (`food_atlas`) and its variable
# lookup table (`food_atlas_vars`). Both are read from their parquet caches so
# that both names are defined on either path; if either cache file is missing,
# both are rebuilt from the raw CSVs and re-cached.
try:
    food_atlas = pd.read_parquet('data/food_atlas.parquet')
    food_atlas_vars = pd.read_parquet('data/food_atlas_vars.parquet')

except FileNotFoundError:
    food_atlas = pd.read_csv('data/FoodAccessResearchAtlasData2019.csv')
    food_atlas_vars = pd.read_csv('data/FoodAccessResearchAtlasData2019_VariableLookup.csv')

    # Turn the CensusTract into a string and zero fill it to add missing leading zeros
    food_atlas['CensusTract'] = food_atlas['CensusTract'].astype('str').str.zfill(11)

    food_atlas.to_parquet('data/food_atlas.parquet')
    food_atlas_vars.to_parquet('data/food_atlas_vars.parquet')

## Census Tract Reference 2010

In [None]:
# Load the 2010 census tract reference table into `census_tract`, preferring
# the parquet cache and rebuilding it from the raw CSV when it is absent.
try:
    census_tract = pd.read_parquet('data/census_tract.parquet')

except FileNotFoundError:
    # Read the CSV and, in the same chain, convert tract_fips to a string that
    # is zero-filled to 11 characters so missing leading zeros are restored.
    census_tract = pd.read_csv('data/CensusTractReference2010.csv').assign(
        tract_fips=lambda frame: frame['tract_fips'].astype('str').str.zfill(11)
    )

    # Cache the cleaned table for subsequent runs
    census_tract.to_parquet('data/census_tract.parquet')

## Supporting Reference Data

In [None]:
# Load the Equifax reference table from its parquet file. This cell has no
# CSV fallback, so data/equifax.parquet must already exist.
equifax = pd.read_parquet('data/equifax.parquet')

In [None]:
# Display the Equifax frame as this cell's output for a quick visual check.
equifax

Unnamed: 0,State,County,Year,Total Households,Median Household Income,Mortgage Interest,Property Taxes,Rental Costs,Property Management Fees,Housing For Someone At School,Lodging Away From Home,Electricity,Fuel Oil,Natural Gas,Septic Tank Cleaning,Trash And Garbage Collection,Water And Sewage,Cellular Phone Service,Total Utility Costs
0,Alabama,Autauga,2022,21948.0,60199.0,2501.76,1551.92,109.69,99.91,48.40,169.74,1799.45,31.12,254.27,6.61,199.30,531.61,1310.46,8614.24
1,Alabama,Baldwin,2022,94231.0,58644.0,2554.75,1752.60,133.36,117.66,51.77,188.50,1850.59,32.77,257.41,7.21,203.25,538.04,1312.10,9000.01
2,Alabama,Barbour,2022,9485.0,38535.0,1782.69,1100.59,78.27,77.36,28.61,123.81,1560.99,24.14,212.78,5.98,178.79,451.92,1099.28,6725.21
3,Alabama,Bibb,2022,7878.0,50337.0,2265.62,1428.14,104.18,94.39,43.88,155.50,1751.56,29.78,240.92,6.50,195.68,510.20,1249.50,8075.85
4,Alabama,Blount,2022,22261.0,55747.0,2397.33,1521.40,114.37,100.30,46.65,165.43,1787.76,31.18,249.19,6.75,198.24,526.79,1286.02,8431.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6279,Wyoming,Sweetwater,2027,,,4851.41,2807.79,217.30,148.23,55.54,250.98,1783.53,36.57,463.26,8.59,336.87,734.60,1677.86,13372.53
6280,Wyoming,Teton,2027,,,5330.73,3094.83,252.83,157.06,65.78,282.36,1828.81,39.44,478.37,9.00,348.06,759.96,1727.85,14375.08
6281,Wyoming,Uinta,2027,,,4849.66,2823.76,238.71,151.54,55.04,244.14,1757.17,36.97,449.65,9.06,331.97,723.59,1634.12,13305.38
6282,Wyoming,Washakie,2027,,,3804.75,2528.28,210.86,141.78,40.31,217.81,1649.44,33.07,412.57,8.74,308.74,664.36,1485.98,11506.69


In [None]:
# Load the state-name and FIPS reference tables from their parquet files.
# The imports cell at the top of the notebook must be run first: executing
# this cell on a fresh kernel fails with NameError because `pd` is undefined.
state_names = pd.read_parquet('data/state_names.parquet')
fips = pd.read_parquet('data/fips.parquet')


NameError: name 'pd' is not defined

In [None]:
# Show every SLD row that has a missing value in at least one column.
sld[sld.isna().any(axis=1)]

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,VMT_per_worker,VMT_tot_min,VMT_tot_max,VMT_tot_avg,GHG_per_worker,Annual_GHG,SLC_score,Shape__Area,Shape__Length,FIPS
1,2,481130078252,481130078252,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,26.465754,11.442995,82.636303,25.659327,23.580987,6131.056669,78.898635,4.849451e+05,3519.469110,48113
3,4,481130078241,481130078241,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,26.277851,11.442995,82.636303,25.659327,23.413565,6087.527012,79.162569,4.818284e+05,2922.609204,48113
11,12,481130115001,481130115001,48,113,11500,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,39.760982,11.442995,82.636303,25.659327,35.427035,9211.029194,60.223807,1.717037e+05,2628.841414,48113
15,16,483290003052,483290003052,48,329,305,2,372.0,"Midland-Odessa, TX",33260.0,...,15.745742,12.864587,24.544837,18.346436,14.029456,3647.658610,75.333105,6.200870e+05,3161.264393,48329
26,27,483290004022,483290004022,48,329,402,2,372.0,"Midland-Odessa, TX",33260.0,...,16.872426,12.864587,24.544837,18.346436,15.033332,3908.666305,65.687041,4.323937e+05,2684.854495,48329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220734,220735,780309610002,780309610002,78,30,961000,2,,,,...,,,,,,,,3.355857e+05,3414.446949,78030
220735,220736,780309610003,780309610003,78,30,961000,3,,,,...,,,,,,,,2.924305e+05,2421.025608,78030
220736,220737,780309610005,780309610005,78,30,961000,5,,,,...,,,,,,,,1.619395e+05,1955.909418,78030
220737,220738,780309607003,780309607003,78,30,960700,3,,,,...,,,,,,,,1.038966e+07,16896.768872,78030


In [None]:
# Set pandas' row-display limit to 100 rows.
# NOTE(review): the per-column null count shown next has 183 entries, which
# still exceeds this limit, so that output is still truncated -- confirm
# whether a larger limit was intended.
pd.set_option('display.max_rows', 100)

In [None]:
# Count the missing values in each SLD column.
sld.isna().sum()

OBJECTID            0
GEOID10             0
GEOID20             0
STATEFP             0
COUNTYFP            0
                 ... 
Annual_GHG       3000
SLC_score        3021
Shape__Area         0
Shape__Length       0
FIPS                0
Length: 183, dtype: int64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e75edf0e-32f1-42b8-8a27-9dc0078a206d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>