In [1]:
import os

import numpy as np
import pandas as pd
from area import area # pip install area
from census import Census # pip install census
from us import states # pip install us
# Census API client shared by every query in this notebook.
# The key is read from the CENSUS_API_KEY environment variable when it is set;
# the literal is only a fallback so existing runs keep working.
# FIXME(security): this key is committed with the notebook - rotate it and
# remove the fallback once CENSUS_API_KEY is configured everywhere.
c = Census(os.environ.get("CENSUS_API_KEY", "a9daf6ef81506b3306e32d8bb0d439a4aa98e1f5"))

In [2]:
# Placeholder street address used while testing the notebook.
property_address = '46 Hano St,Boston'

In [3]:
# The ZIP code and state are assumed to be available from the listing itself.
# The ZIP code is kept as a string so its leading zero is preserved.
property_zipcode, property_state = '02134', 'MA'

In [4]:
# Quick manual checks of the ZIP-code-level and state-level queries (uncomment to run)
# c.acs5.zipcode('B01003_001E', zcta = property_zipcode)
# c.acs5.state('B25034_010E', states.mapping('abbr','fips')[property_state])

In [5]:
# ACS variables of interest, listed as: description - variable ID(s)
# population - B01003_001E
# number of households - B11016_001E
# household size and type - B11016_001E~016E
# households with kids - B09002_001E
# tenure - B25003_002E
# vacancy - B25002_001E/003E
# median age - B01002_001E
# race - B02001_001E/002E/003E/005E 
# hispanic or latino - B03001_003E
# median household income - B19013_001E
# median monthly housing costs - B25105_001E
# property value - B25075_001E~027E, B25077_001E(median for owner-occupied)
# median year structure built - B25035_001E
# commute time - B08134_001E~111E

In [6]:
def zipcode_area(state, zipcode):
    """Return the area, in square miles, of a ZIP code's boundary geometry.

    The boundary is read from a per-state GeoJSON file expected in the current
    working directory, named ``<abbr>_<state_name>_zip_codes_geo.min.json``.

    Parameters
    ----------
    state : str
        State identifier understood by ``us.states.lookup`` (the notebook
        passes a two-letter abbreviation such as 'MA').
    zipcode : str or int
        ZIP code matched against each feature's ``ZCTA5CE10`` property.
        Values are converted to a string and zero-padded to five characters,
        so an integer like 2134 matches '02134'.

    Returns
    -------
    float
        ``area(geometry) / 2.59e+6``.

    Raises
    ------
    ValueError
        If ``state`` cannot be resolved, or if no feature in the state's file
        carries the requested ZIP code.
    """
    state_info = states.lookup(state)
    if state_info is None:
        raise ValueError('Unknown state: %r' % (state,))

    # geojson files downloaded from https://github.com/OpenDataDE/State-zip-code-GeoJSON
    # Multi-word state names are joined with underscores in that repository's
    # file names (e.g. "new_york"), so spaces must not leak into the path.
    state_slug = str(state_info).lower().replace(' ', '_')
    geojson_file = state_info.abbr.lower() + '_' + state_slug + '_zip_codes_geo.min.json'
    boundary = pd.read_json(geojson_file)

    # Approximate conversion factor; area() is expected to return square metres.
    sq_meters_per_sq_mile = 2.59e+6
    zcta = str(zipcode).zfill(5)

    for feature in boundary['features']:
        if feature['properties']['ZCTA5CE10'] == zcta:
            # Stop at the first match instead of scanning the rest of the file.
            return area(feature['geometry']) / sq_meters_per_sq_mile

    raise ValueError('ZIP code %s not found in %s' % (zcta, geojson_file))

In [7]:
def ACS_zipcode_data(zipcode, state = None):
    """Collect ACS 5-year features for one ZIP code tabulation area (ZCTA).

    Every variable is requested from the Census API in a single call rather
    than one request per variable, then combined into densities, shares and
    medians.

    Parameters
    ----------
    zipcode : str
        ZCTA to query, e.g. '02134'.
    state : str, optional
        State identifier forwarded to ``zipcode_area`` to locate the boundary
        file. Defaults to the notebook-level ``property_state``.

    Returns
    -------
    list
        ``[zipcode, pop_density, HH_density, Family, HH2, HH3, HH4, HH_Kids,
        Owner, Vacancy, Age, Race_W, Race_B, Race_A, Race_H, Income, Cost,
        Value, Year_Built, Commute, Drive, Carpool2, Carpool3, Transit, Walk,
        Other]`` - the same order as the ``labels`` list used to build the
        DataFrame.

    Raises
    ------
    ValueError
        If the API returns no row for ``zipcode``.
    """
    if state is None:
        # Fall back to the notebook global so one-argument calls keep working.
        state = property_state

    # Variable groups that are summed before use.
    hh_up_to_2 = ('B11016_003E', 'B11016_010E', 'B11016_011E')
    hh_3 = ('B11016_004E', 'B11016_012E')
    hh_4_plus = ('B11016_005E', 'B11016_006E', 'B11016_007E', 'B11016_008E',
                 'B11016_013E', 'B11016_014E', 'B11016_015E', 'B11016_016E')
    # (bucket label in minutes, variables that make up the bucket)
    commute_buckets = (
        (10, ('B08134_002E',)),                 # less than 10min
        (20, ('B08134_003E', 'B08134_004E')),   # 10 - 20 min
        (30, ('B08134_005E', 'B08134_006E')),   # 20 - 30 min
        (45, ('B08134_007E', 'B08134_008E')),   # 30 - 45 min
        (60, ('B08134_009E', 'B08134_010E')),   # 45+ min
    )
    single_vars = (
        'B01003_001E',  # population
        'B11016_001E',  # number of households
        'B11016_002E',  # family households
        'B09002_001E',  # households with kids (see NOTE below)
        'B25003_002E',  # owner-occupied
        'B25002_003E',  # vacant units
        'B25002_001E',  # total housing units
        'B01002_001E',  # median age
        'B02001_002E',  # white alone
        'B02001_003E',  # black or african american alone
        'B02001_005E',  # asian alone
        'B03001_003E',  # hispanic or latino
        'B19013_001E',  # median household income
        'B25105_001E',  # median monthly housing costs
        'B25077_001E',  # median property value
        'B25035_001E',  # median year structure built
        'B08134_001E',  # total commuters
        'B08134_011E', 'B08134_031E', 'B08134_041E',
        'B08134_061E', 'B08134_101E', 'B08134_111E',  # commute means
    )
    # 45 variables in total, which stays under the API's 50-variables-per-request cap.
    fields = single_vars + hh_up_to_2 + hh_3 + hh_4_plus
    for _, bucket_vars in commute_buckets:
        fields += bucket_vars

    rows = c.acs5.zipcode(fields, zcta = zipcode)
    if not rows:
        raise ValueError('No ACS data returned for ZCTA %r' % (zipcode,))
    acs = rows[0]

    def total(names):
        # Sum of the listed variables for this ZCTA.
        return sum(acs[name] for name in names)

    population = acs['B01003_001E']
    household = acs['B11016_001E']
    # Named area_sqmi so the imported area() function is not shadowed.
    area_sqmi = zipcode_area(state, zipcode)
    pop_density = population / area_sqmi
    HH_density = household / area_sqmi

    # households size and type (shares of all households)
    Family = acs['B11016_002E'] / household
    HH2 = total(hh_up_to_2) / household   # no more than 2-person
    HH3 = total(hh_3) / household         # 3-person
    HH4 = total(hh_4_plus) / household    # no less than 4-person
    # NOTE(review): B09002_001E may count children rather than households -
    # confirm against the ACS table definition before reading this as a share.
    HH_Kids = acs['B09002_001E'] / household

    # tenure
    Owner = acs['B25003_002E'] / household
    # vacancy: vacant units over all housing units
    Vacancy = acs['B25002_003E'] / acs['B25002_001E']
    # age
    Age = acs['B01002_001E']

    # race / ethnicity (shares of total population)
    Race_W = acs['B02001_002E'] / population
    Race_B = acs['B02001_003E'] / population
    Race_A = acs['B02001_005E'] / population
    Race_H = acs['B03001_003E'] / population

    Income = acs['B19013_001E']
    Cost = acs['B25105_001E']
    Value = acs['B25077_001E']
    Year_Built = acs['B25035_001E']

    # Commute time: this is a median, not an average. Each commuter is
    # represented by their bucket's label, so the result is one of
    # 10/20/30/45/60 (or a midpoint between two labels).
    commute_samples = []
    for label, bucket_vars in commute_buckets:
        commute_samples += [label] * int(total(bucket_vars))
    Commute = np.median(commute_samples)

    # commute means (shares of all commuters)
    # NOTE(review): in the notebook's sample output these six shares add up to
    # about 1.05, so some of these rows probably overlap - confirm what each
    # B08134 variable covers.
    Total_commute = acs['B08134_001E']
    Drive = acs['B08134_011E'] / Total_commute
    Carpool2 = acs['B08134_031E'] / Total_commute
    Carpool3 = acs['B08134_041E'] / Total_commute
    Transit = acs['B08134_061E'] / Total_commute
    Walk = acs['B08134_101E'] / Total_commute
    Other = acs['B08134_111E'] / Total_commute

    return [zipcode, pop_density, HH_density, Family, HH2, HH3, HH4, HH_Kids, Owner, Vacancy, Age,
            Race_W, Race_B, Race_A, Race_H, Income, Cost, Value, Year_Built, Commute,
            Drive, Carpool2, Carpool3, Transit, Walk, Other]

In [8]:
# Fetch the census feature list for the listing's ZIP code.
# This calls the Census API and reads the state's GeoJSON boundary file from disk.
census_features = ACS_zipcode_data(zipcode = property_zipcode)

In [9]:
# Column names, in the same order as the list returned by ACS_zipcode_data.
labels = (
    'zipcode pop_density HH_density '
    'Family HH2 HH3 HH4 HH_Kids '
    'Owner Vacancy Age '
    'Race_W Race_B Race_A Race_H '
    'Income Cost Value Year_Built '
    'Commute Drive Carpool2 Carpool3 Transit Walk Other'
).split()
# The feature list is wrapped in an outer list so it becomes a single row.
single_row = [census_features]
df = pd.DataFrame.from_records(single_row, columns = labels)

In [10]:
# Display the one-row feature table built in the previous cell.
df

Unnamed: 0,zipcode,pop_density,HH_density,Family,HH2,HH3,HH4,HH_Kids,Owner,Vacancy,...,Cost,Value,Year_Built,Commute,Drive,Carpool2,Carpool3,Transit,Walk,Other
0,2134,12819.730095,5485.428322,0.247145,0.732669,0.152722,0.114608,0.113546,0.138911,0.114743,...,1575.0,438400.0,1939.0,45.0,0.353231,0.032837,0.020879,0.421562,0.131916,0.09329
