In [2]:
import googlemaps  # pip install -U googlemaps
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import linear_model
import time
import subprocess
import requests
import json
from census import Census # pip install census
from us import states # pip install us
from area import area # pip install area
from tqdm import tqdm

In [14]:
# Google Map API
# up to 5,000 calls per month for nearby places search
# up to 40,000 calls per month for geocoding
# up to 40,000 calls per month for elevation
gmaps = googlemaps.Client(key='AIzaSyCvdQFsQNisoXrltPkiVgiVKQEu-EZaoog') 

# Walkscore API
walkscore_key = '4e5a6faea107e608b5cdd5b039a3a22b' # up to 5,000 calls per day

# Census API
c = Census("a9daf6ef81506b3306e32d8bb0d439a4aa98e1f5")

In [4]:
df_july = pd.read_csv('property_tax_records_July2016.csv')

In [28]:
# # format address and get coordinates 
# def format_address(property_address):
#     coordinates = df_july[df_july['PropertyLocation'] == property_address]['coordinates'].values[0]
#     property_state = 'CA'
#     property_zipcode = df_july[df_july['PropertyLocation'] == property_address]['ZIPcode5'].values[0]
#     return [property_address, coordinates, property_state, property_zipcode]

In [40]:
# format address and get coordinates 
def format_address(address):
    geocode_result = gmaps.geocode(address)
    property_address = geocode_result[0]['formatted_address']
    coordinates = (geocode_result[0]['geometry']['location']['lat'], geocode_result[0]['geometry']['location']['lng'])
#     property_state = property_address.split(',')[-2].strip().split(' ')[0]
#     property_zipcode = property_address.split(',')[-2].strip().split(' ')[1]
    property_state = 'CA'
    property_zipcode = address.split(' ')[-1]
    return [property_address, coordinates, property_state, property_zipcode]

In [6]:
def zipcode_area(state, zipcode):
    # geojson files downloaded from https://github.com/OpenDataDE/State-zip-code-GeoJSON
    geojson_file = state.lower() + '_' + str(states.lookup(state)).lower() + '_zip_codes_geo.min.json'
    boundary = pd.read_json(geojson_file)
    for i in range(len(boundary['features'])):
        if boundary['features'][i]['properties']['ZCTA5CE10'] == zipcode:
            geometry = boundary['features'][i]['geometry']
    area_sqmi = area(geometry)/2.59e+6
    return (area_sqmi)

In [7]:
def ACS_zipcode_data(state, zipcode):
    if c.acs5.zipcode('B01003_001E', zcta = zipcode) != []:
        population = c.acs5.zipcode('B01003_001E', zcta = zipcode)[0]['B01003_001E']
        household = c.acs5.zipcode('B11016_001E', zcta = zipcode)[0]['B11016_001E']
        area = zipcode_area(state, zipcode)
        pop_density = population / area
        HH_density = household / area
        # households size and type
        Family = c.acs5.zipcode('B11016_002E', zcta = zipcode)[0]['B11016_002E'] / household # get percentage for family households
        HH_Kids = c.acs5.zipcode('B09002_001E', zcta = zipcode)[0]['B09002_001E'] / household # get percentage for households with kids
        # tenure
        Owner = c.acs5.zipcode('B25003_002E', zcta = zipcode)[0]['B25003_002E'] / household # get percentage for owner-occupied households
        # vacancy
        Vacancy = c.acs5.zipcode('B25002_003E', zcta = zipcode)[0]['B25002_003E'] / c.acs5.zipcode('B25002_001E', zcta = zipcode)[0]['B25002_001E'] # get percentage for vacant properties
        # age
        Age = c.acs5.zipcode('B01002_001E', zcta = zipcode)[0]['B01002_001E']
        # race
        Race_W = c.acs5.zipcode('B02001_002E', zcta = zipcode)[0]['B02001_002E'] / population # get percentage for white alone
        Race_B = c.acs5.zipcode('B02001_003E', zcta = zipcode)[0]['B02001_003E'] / population # get percentage for black or african american alone
        Race_A = c.acs5.zipcode('B02001_005E', zcta = zipcode)[0]['B02001_005E'] / population # get percentage for asian alone
        Race_H = c.acs5.zipcode('B03001_003E', zcta = zipcode)[0]['B03001_003E'] / population # get percentage for hispanic or latino
        # median household income
        Income = c.acs5.zipcode('B19013_001E', zcta = zipcode)[0]['B19013_001E']
        # median year structure built
        Year_Built = c.acs5.zipcode('B25035_001E', zcta = zipcode)[0]['B25035_001E']
        # average commute time
        C10 = c.acs5.zipcode('B08134_002E', zcta = zipcode)[0]['B08134_002E'] # less than 10min
        C20 = c.acs5.zipcode('B08134_003E', zcta = zipcode)[0]['B08134_003E'] + c.acs5.zipcode('B08134_004E', zcta = zipcode)[0]['B08134_004E'] # 10 - 20 min
        C30 = c.acs5.zipcode('B08134_005E', zcta = zipcode)[0]['B08134_005E'] + c.acs5.zipcode('B08134_006E', zcta = zipcode)[0]['B08134_006E'] # 20 - 30 min
        C45 = c.acs5.zipcode('B08134_007E', zcta = zipcode)[0]['B08134_007E'] + c.acs5.zipcode('B08134_008E', zcta = zipcode)[0]['B08134_008E'] # 30 - 45 min
        C60 = c.acs5.zipcode('B08134_009E', zcta = zipcode)[0]['B08134_009E'] + c.acs5.zipcode('B08134_010E', zcta = zipcode)[0]['B08134_010E'] # 45+ min
        Commute = [10] * int(C10) + [20] * int(C20) + [30] * int(C30) + [45] * int(C45) + [60] * int(C60)
        Commute = np.median(Commute)
        # commute means
        Total_commute = c.acs5.zipcode('B08134_001E', zcta = zipcode)[0]['B08134_001E']
        Drive = c.acs5.zipcode('B08134_011E', zcta = zipcode)[0]['B08134_011E'] / Total_commute
        Carpool2 = c.acs5.zipcode('B08134_031E', zcta = zipcode)[0]['B08134_031E'] / Total_commute
        Carpool3 = c.acs5.zipcode('B08134_041E', zcta = zipcode)[0]['B08134_041E'] / Total_commute
        Transit = c.acs5.zipcode('B08134_061E', zcta = zipcode)[0]['B08134_061E'] / Total_commute
        Walk = c.acs5.zipcode('B08134_101E', zcta = zipcode)[0]['B08134_101E'] / Total_commute
        Other = c.acs5.zipcode('B08134_111E', zcta = zipcode)[0]['B08134_111E'] / Total_commute

        return [zipcode, pop_density, HH_density, Family, HH_Kids, Owner, Vacancy, Age, 
                Race_W, Race_B, Race_A, Race_H, Income, Year_Built, Commute, 
                Drive, Carpool2, Carpool3, Transit, Walk, Other]
    else:
        return [zipcode] + [0]*20

In [16]:
unique_zipcodes = df_july['ZIPcode5'].unique()
census_features = {}
for zipcode in tqdm(unique_zipcodes):
    zipcode = str(zipcode)
    census_features[zipcode] = ACS_zipcode_data('CA', str(zipcode))

100%|██████████| 278/278 [33:17<00:00,  7.18s/it]


In [17]:
with open('census_dict.txt', 'w') as file:
     file.write(json.dumps(census_features))

In [46]:
# add walkscores as features
def get_walkscore(property_address, coordinates, walkscore_key):
    url = 'http://api.walkscore.com/score?format=json&address=' + \
    property_address + '&lat=' + str(coordinates[0]) + '&lon=' + str(coordinates[1]) + \
    '&transit=1&bike=1&wsapikey=' + walkscore_key
    response = requests.get(url, verify=True)
    output = response.json()
    # write walkscores
    walkscores = []
    if 'description' in output.keys():
        walkscores.append(output['description']) 
    else:
        walkscores.append('NA')
    if 'walkscore' in output.keys():
        walkscores.append(output['walkscore']) 
    else:
        walkscores.append('NA')
    if 'transit' in output.keys():
        if 'description' in output['transit'].keys():
            walkscores.append(output['transit']['description']) 
        else:
            walkscores.append('Not Transitable')
        if 'score' in output['transit'].keys():
            walkscores.append(output['transit']['score'])
        else: 
            walkscores.append(0)
    else:
        walkscores+= ['Not Transitable',0]
    if 'bike' in output.keys():
        if 'description' in output['bike'].keys():
            walkscores.append(output['bike']['description'])
        else:
            walkscores.append('Not Bikeable')
        if 'score' in output['bike'].keys():
            walkscores.append(output['bike']['score'])
        else:
            walkscores.append(0)
    else:
        walkscores += ['Not Bikeable',0]
    return walkscores

In [47]:
walkscore_features = {}
for address in tqdm(unique_addresses[:10]):
    property_address, coordinates, property_state, property_zipcode = format_address(address)
    walkscore_features[address] = [address] + get_walkscore(property_address, coordinates, walkscore_key)

100%|██████████| 10/10 [00:07<00:00,  1.28it/s]


In [48]:
with open('walkscore_dict.txt', 'w') as file:
     file.write(json.dumps(walkscore_features))

In [9]:
# unique_addresses = df_july['PropertyLocation'].unique()
# walkscore_features = {}
# for address in tqdm(unique_addresses[4000:8000]):
#     property_address, coordinates, property_state, property_zipcode = format_address(address)
#     walkscore_features[address] = get_walkscore(address, coordinates, walkscore_key)

100%|██████████| 4000/4000 [10:49<00:00,  6.16it/s]


In [10]:
# with open('walkscore_dict.txt', 'a') as file:
#      file.write(json.dumps(walkscore_features))