In [1]:
import os
import subprocess
import time

import googlemaps  # pip install -U googlemaps
import numpy as np
import pandas as pd
import requests
from area import area # pip install area
from census import Census # pip install census
from tqdm import tqdm
from us import states # pip install us

### Obtain Assessor Parcels Data

In [2]:
# get 2017 data (no useful records)
# Filter: residential parcels on the 2017 roll with a property location, a
# recording date starting with 2017 and non-zero year built / bedrooms / bathrooms.
where_2017 = ' AND '.join([
    "rollyear = '2017'",
    "usecodedescchar1 = 'Residential'",
    "propertylocation IS NOT NULL",
    "starts_with(recordingdate, '2017')",
    "effectiveyearbuilt != 0",
    "bedrooms != 0",
    "bathrooms != 0",
])
url = "https://data.lacounty.gov/resource/vak5-2hqh.json"
# the filter is handed to requests as a query parameter so it gets URL-encoded
response = requests.get(url, params={'$where': where_2017}, verify=True)
# stop here on an HTTP error instead of treating an error payload as records
response.raise_for_status()
output = response.json()

In [3]:
# keep the parsed 2017 response as the assessor record list and report its size
assessor = output
print(len(assessor))

0


In [4]:
# get 2016 data
# Filter: residential parcels on the 2017 roll with a property location, a
# recording date starting with 2016 and non-zero year built / bedrooms / bathrooms.
where_2016 = ' AND '.join([
    "rollyear = '2017'",
    "usecodedescchar1 = 'Residential'",
    "propertylocation IS NOT NULL",
    "starts_with(recordingdate, '2016')",
    "effectiveyearbuilt != 0",
    "bedrooms != 0",
    "bathrooms != 0",
])
url = "https://data.lacounty.gov/resource/vak5-2hqh.json"
# change limit number and offset number to get the full dataset (2.1M records)
query = {'$where': where_2016, '$limit': 2000, '$offset': 0}
# the query is handed to requests as parameters so it gets URL-encoded
response = requests.get(url, params=query, verify=True)
# stop here on an HTTP error instead of treating an error payload as records
response.raise_for_status()
output = response.json()

In [5]:
# keep the parsed 2016 response as the assessor record list and report its size
assessor = output
print(len(assessor))

2000


In [6]:
# fields to keep from every assessor record
features = ['propertylocation', 'roll_totalvalue', 'recordingdate', \
            'sqftmain', 'units', 'bedrooms', 'bathrooms', 'effectiveyearbuilt', \
            'usecodedescchar3', 'usecodedescchar4', 'roll_homeownersexemp']
# One list of values per feature. Every feature gets a (possibly empty) list, so
# the dataframe has the expected columns even when the query returned no records.
# A record that lacks one of the features still raises KeyError rather than
# being filled in silently.
assessor_features = {
    feature: [record[feature] for record in assessor]
    for feature in features
}
df = pd.DataFrame(assessor_features)

In [7]:
# reformat dataframe: derive yes/no flags from the use-code descriptions and the
# homeowner exemption amount, then drop the columns they were derived from
use_code3 = df['usecodedescchar3']
use_code4 = df['usecodedescchar4']
flag_conditions = [
    ('townhouse', use_code3 == 'Townhouse Format'),
    ('pool', use_code4.str.contains('Pool')),
    ('condo', use_code4 == 'Condominium'),
    ('pud', use_code4 == 'Planned Unit Development (PUD)'),
    ('owned', df['roll_homeownersexemp'] == '7000.00'),
]
for flag, condition in flag_conditions:
    df[flag] = np.where(condition, 'yes', 'no')
df = df.drop(['usecodedescchar3', 'usecodedescchar4', 'roll_homeownersexemp'], axis=1)

Unnamed: 0,bathrooms,bedrooms,effectiveyearbuilt,propertylocation,recordingdate,roll_totalvalue,sqftmain,units,townhouse,pool,condo,pud,owned
0,2,4,1973,8309 FAUST AVE LOS ANGELES CA 91304,20161208,639000.00,2057,1,no,no,no,no,no
1,3,4,1978,8331 PONCE AVE LOS ANGELES CA 91304,20160802,735000.00,2538,1,no,yes,no,no,no
2,4,4,1962,22726 ECCLES ST LOS ANGELES CA 91304,20160331,749700.00,2630,1,no,yes,no,no,yes
3,3,3,1960,22750 ECCLES ST LOS ANGELES CA 91304,20160920,257434.00,2009,1,no,yes,no,no,no
4,4,4,1973,8378 FALLBROOK AVE LOS ANGELES CA 91304,20160802,728000.00,2072,1,no,yes,no,no,yes
5,3,3,1958,8344 FALLBROOK AVE LOS ANGELES CA 91304,20160912,233870.00,1442,1,no,no,no,no,yes
6,3,3,1959,8356 MAYNARD AVE LOS ANGELES CA 91304,20161121,236591.00,1538,1,no,no,no,no,yes
7,3,3,1959,8350 MAYNARD AVE LOS ANGELES CA 91304,20160212,588000.00,1589,1,no,no,no,no,yes
8,3,3,1960,8363 PONCE AVE LOS ANGELES CA 91304,20160923,551268.00,1589,1,no,yes,no,no,yes
9,3,4,1963,8362 PONCE AVE LOS ANGELES CA 91304,20160721,656624.00,2338,1,no,yes,no,no,yes


### Obtain Other Datasets

In [8]:
# API credentials are read from environment variables so that no secret is stored
# in the notebook. Set GOOGLE_MAPS_API_KEY, WALKSCORE_API_KEY and CENSUS_API_KEY
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the keys that used to be hardcoded in this cell have been
# published with the notebook and should be revoked.
# Google Map API
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY']) # up to 5,000 calls per month for nearby places search
# Walkscore API
walkscore_key = os.environ['WALKSCORE_API_KEY'] # up to 5,000 calls per day
# Census API
c = Census(os.environ['CENSUS_API_KEY'])

In [9]:
# format address and get coordinates 
def format_address(property_address):
    """Geocode an address and split out the pieces used by the other lookups.

    The first geocoding result is used. State and zipcode are read from the
    second-to-last comma-separated piece of the formatted address, which is
    expected to look like "CA 91304".

    Parameters
    ----------
    property_address : str
        Free-form address passed to ``gmaps.geocode``.

    Returns
    -------
    list
        ``[formatted_address, (lat, lng), state, zipcode]``.

    Raises
    ------
    ValueError
        If geocoding returns no result, or the formatted address does not
        contain a "STATE ZIP" piece to parse.
    """
    geocode_result = gmaps.geocode(property_address)
    if not geocode_result:
        raise ValueError('No geocoding result for address: {!r}'.format(property_address))

    best_match = geocode_result[0]
    formatted_address = best_match['formatted_address']
    location = best_match['geometry']['location']
    coordinates = (location['lat'], location['lng'])

    address_parts = formatted_address.split(',')
    # split once and reuse: "..., CA 91304, USA" -> ['CA', '91304']
    state_and_zip = address_parts[-2].strip().split(' ') if len(address_parts) >= 2 else []
    if len(state_and_zip) < 2:
        raise ValueError('Cannot read state and zipcode from address: {!r}'.format(formatted_address))
    property_state, property_zipcode = state_and_zip[0], state_and_zip[1]

    return [formatted_address, coordinates, property_state, property_zipcode]

#### Census Data

In [10]:
def zipcode_area(state, zipcode):
    """Return the area, in square miles, of a zipcode's boundary polygon.

    The boundary is read from a per-state GeoJSON file in the working directory
    named ``<abbr>_<state_name>_zip_codes_geo.min.json`` (all lower case, spaces
    in the state name replaced by underscores).

    Parameters
    ----------
    state : str
        State abbreviation understood by ``states.lookup`` (e.g. ``'CA'``).
    zipcode : str
        Zipcode, compared against each feature's ``ZCTA5CE10`` property.

    Returns
    -------
    float
        ``area(geometry)`` divided by 2.59e+6 (square meters per square mile).

    Raises
    ------
    ValueError
        If the file has no feature for ``zipcode``.
    """
    # geojson files downloaded from https://github.com/OpenDataDE/State-zip-code-GeoJSON
    # multi-word state names are joined with underscores to match those file names
    state_name = str(states.lookup(state)).lower().replace(' ', '_')
    geojson_file = state.lower() + '_' + state_name + '_zip_codes_geo.min.json'
    boundary = pd.read_json(geojson_file)

    geometry = None
    wanted = str(zipcode)
    for feature in boundary['features']:
        if feature['properties']['ZCTA5CE10'] == wanted:
            geometry = feature['geometry']
            break  # no need to scan the remaining features
    if geometry is None:
        raise ValueError('Zipcode {!r} not found in {}'.format(zipcode, geojson_file))

    area_sqmi = area(geometry)/2.59e+6
    return area_sqmi

In [11]:
def ACS_zipcode_data(state, zipcode):
    """Collect ACS 5-year figures for one zipcode.

    Each figure is fetched with its own ``c.acs5.zipcode`` request. Counts are
    turned into shares by dividing by the household, population or commuter
    total; densities divide by the area returned by ``zipcode_area``.

    Parameters
    ----------
    state : str
        State abbreviation, forwarded to ``zipcode_area``.
    zipcode : str
        Zipcode used as the ``zcta`` of every request.

    Returns
    -------
    list
        ``[zipcode, pop_density, HH_density, Family, HH2, HH3, HH4, HH_Kids,
        Owner, Vacancy, Age, Race_W, Race_B, Race_A, Race_H, Income, Cost,
        Value, Year_Built, Commute, Drive, Carpool2, Carpool3, Transit, Walk,
        Other]``
    """
    def estimate(variable):
        # one request per variable; the value is read from the first returned row
        return c.acs5.zipcode(variable, zcta = zipcode)[0][variable]

    def estimate_sum(variables):
        # total of several variables, requested one after another
        return sum([estimate(variable) for variable in variables])

    population = estimate('B01003_001E')
    household = estimate('B11016_001E')
    zip_area = zipcode_area(state, zipcode)
    pop_density = population / zip_area
    HH_density = household / zip_area

    # households size and type (shares of all households)
    Family = estimate('B11016_002E') / household # family households
    HH2 = estimate_sum(['B11016_003E', 'B11016_010E', 'B11016_011E']) / household # HH with no more than 2-person
    HH3 = estimate_sum(['B11016_004E', 'B11016_012E']) / household # HH with 3-person
    HH4 = estimate_sum(['B11016_005E', 'B11016_006E', 'B11016_007E', 'B11016_008E',
                        'B11016_013E', 'B11016_014E', 'B11016_015E', 'B11016_016E']) / household # HH with no less than 4-person
    HH_Kids = estimate('B09002_001E') / household # households with kids

    # tenure: owner-occupied share of households
    Owner = estimate('B25003_002E') / household
    # vacancy: vacant share of the B25002 total
    Vacancy = estimate('B25002_003E') / estimate('B25002_001E')
    # age
    Age = estimate('B01002_001E')

    # race (shares of the population)
    Race_W = estimate('B02001_002E') / population # white alone
    Race_B = estimate('B02001_003E') / population # black or african american alone
    Race_A = estimate('B02001_005E') / population # asian alone
    Race_H = estimate('B03001_003E') / population # hispanic or latino

    # median household income
    Income = estimate('B19013_001E')
    # median monthly housing costs
    Cost = estimate('B25105_001E')
    # median property value
    Value = estimate('B25077_001E')
    # median year structure built
    Year_Built = estimate('B25035_001E')

    # commute time: commuter counts per travel-time bucket
    C10 = estimate('B08134_002E') # less than 10min
    C20 = estimate('B08134_003E') + estimate('B08134_004E') # 10 - 20 min
    C30 = estimate('B08134_005E') + estimate('B08134_006E') # 20 - 30 min
    C45 = estimate('B08134_007E') + estimate('B08134_008E') # 30 - 45 min
    C60 = estimate('B08134_009E') + estimate('B08134_010E') # 45+ min
    # median over one entry per commuter, each valued at its bucket's upper bound
    commute_minutes = []
    for minutes, commuters in [(10, C10), (20, C20), (30, C30), (45, C45), (60, C60)]:
        commute_minutes.extend([minutes] * int(commuters))
    Commute = np.median(commute_minutes)

    # commute means (shares of all commuters)
    # NOTE(review): confirm these B08134 variable ids against the table definition
    Total_commute = estimate('B08134_001E')
    Drive = estimate('B08134_011E') / Total_commute
    Carpool2 = estimate('B08134_031E') / Total_commute
    Carpool3 = estimate('B08134_041E') / Total_commute
    Transit = estimate('B08134_061E') / Total_commute
    Walk = estimate('B08134_101E') / Total_commute
    Other = estimate('B08134_111E') / Total_commute

    return [zipcode, pop_density, HH_density, Family, HH2, HH3, HH4, HH_Kids, Owner, Vacancy, Age, 
            Race_W, Race_B, Race_A, Race_H, Income, Cost, Value, Year_Built, Commute, 
            Drive, Carpool2, Carpool3, Transit, Walk, Other]

#### Walkscore Data and Google Nearby Places

In [12]:
def get_walkscore(property_address, coordinates, walkscore_key):
    """Fetch walk, transit and bike scores for one address from the Walk Score API.

    Parameters
    ----------
    property_address : str
        Address sent as the ``address`` parameter.
    coordinates : tuple
        ``(lat, lng)`` of the address.
    walkscore_key : str
        Walk Score API key.

    Returns
    -------
    list
        ``[walk description, walkscore, transit description, transit score,
        bike description, bike score]``. The transit and bike entries are
        ``None`` when the response has no such section.

    Raises
    ------
    KeyError
        If the response lacks ``description`` or ``walkscore``.
    """
    url = 'http://api.walkscore.com/score'
    # passed as parameters so the address is URL-encoded by requests
    params = [
        ('format', 'json'),
        ('address', property_address),
        ('lat', str(coordinates[0])),
        ('lon', str(coordinates[1])),
        ('transit', '1'),
        ('bike', '1'),
        ('wsapikey', walkscore_key),
    ]
    response = requests.get(url, params=params, verify=True)
    output = response.json()

    # transit / bike sections are optional in the response
    transit = output.get('transit') or {}
    bike = output.get('bike') or {}

    # write walkscores
    return [
        output['description'],
        output['walkscore'],
        transit.get('description'),
        transit.get('score'),
        bike.get('description'),
        bike.get('score'),  # was transit score by mistake
    ]

In [13]:
def _nearby_places_all_pages(coordinates, keyword, radius, max_pages=3):
    """Return the Places Nearby results for a keyword, following result pages."""
    page = gmaps.places_nearby(location=coordinates, keyword=keyword, radius=radius)
    places = list(page['results'])
    pages_fetched = 1
    while 'next_page_token' in page.keys() and pages_fetched < max_pages:
        time.sleep(2) # There is a short delay between when a next_page_token is issued, and when it will become valid
        # always follow the token of the page that was just received
        page = gmaps.places_nearby(location=coordinates, page_token=page['next_page_token'])
        places += page['results']
        pages_fetched += 1
    return places


def _walking_distance_miles(origin, destination):
    """Return the OSRM route distance in miles between two (lat, lng) points."""
    # walking distance through open street map routes (project osrm); OSRM takes lng,lat
    url = 'http://router.project-osrm.org/route/v1/walking/' + \
        str(origin[1]) + ',' + str(origin[0]) + ';' + \
        str(destination[1]) + ',' + str(destination[0]) + '?overview=false'
    response = requests.get(url, verify=True)
    output = response.json()
    return output['routes'][0]['distance'] * 0.000621371 # convert from meter to mile


def get_places(coordinates):
    """Count nearby places per category and measure how close they are.

    For each category, up to three pages of Places Nearby results within 1600 m
    are collected and the route distance to every place is looked up.

    Parameters
    ----------
    coordinates : tuple
        ``(lat, lng)`` of the property.

    Returns
    -------
    list
        Four values per category, in category order: number of places found,
        number within half a mile, number within a quarter mile, and the
        shortest distance in miles rounded to two decimals (``inf`` when no
        place was found).
    """
    radius = 1600 #1600m is about 1 mile
    features = []
    categories = ['restaurants', 'coffee', 'bars', 'groceries', 'parks', 'schools', 'shopping', 'entertainment'] # ignore errands to simplify process
    for category in categories:
        places = _nearby_places_all_pages(coordinates, category, radius)

        total, within_half_mile, within_qt_mile = 0, 0, 0
        closest = float("inf")
        for place in places:
            place_location = place['geometry']['location']
            place_coordinates = (place_location['lat'], place_location['lng'])
            distance = _walking_distance_miles(coordinates, place_coordinates)

            total += 1
            if distance <= 0.25:
                within_qt_mile += 1
            if distance <= 0.5:
                within_half_mile += 1
            if distance < closest:
                closest = round(distance, 2)
        # add number of places, number of places within half mile/10 minutes walking, shortest distance for each category
        features += [total, within_half_mile, within_qt_mile, closest]

    return (features)

In [14]:
# work on an independent copy of the first 100 records: the enrichment loop
# writes new columns into df_test, which should not be a slice of df
df_test = df[:100].copy()

In [15]:
# columns to add
# census columns, in the order ACS_zipcode_data returns its values
census_labels = [
    'zipcode', 'pop_density', 'HH_density',
    'Family', 'HH2', 'HH3', 'HH4', 'HH_Kids',
    'Owner', 'Vacancy', 'Age',
    'Race_W', 'Race_B', 'Race_A', 'Race_H',
    'Income', 'Cost', 'Value', 'Year_Built',
    'Commute', 'Drive', 'Carpool2', 'Carpool3', 'Transit', 'Walk', 'Other',
]
# walkscore columns, in the order get_walkscore returns its values
walkscore_labels = ['walkable', 'walkscore','transitable', 'transit_score', 'bikeable', 'bike_score']
# nearby-places columns: four per category, in the order get_places returns them
categories = ['restaurants', 'coffee', 'bars', 'groceries', 'parks', 'schools', 'shopping', 'entertainment']
nearby_labels = [
    '{}_{}'.format(category, distance)
    for category in categories
    for distance in ['within_one_mile', 'within_half_mile', 'within_qt_mile', 'closest']
]

In [16]:
# Enrich every record of df_test with elevation, census, walkscore and nearby-place columns.
pd.options.mode.chained_assignment = None  # default='warn'
# census figures depend only on (state, zipcode), so each zipcode is fetched once
census_by_zipcode = {}
# iterate over the frame's own index labels so .loc always addresses an existing row
for i in tqdm(df_test.index):
    property_address, coordinates, property_state, property_zipcode = format_address(df_test.loc[i, 'propertylocation'])

    # add elevation
    elevation = gmaps.elevation(coordinates)
    df_test.loc[i, 'elevation'] = elevation[0]['elevation']

    # add census
    zipcode_key = (property_state, property_zipcode)
    if zipcode_key not in census_by_zipcode:
        census_by_zipcode[zipcode_key] = ACS_zipcode_data(property_state, property_zipcode)
    for label, value in zip(census_labels, census_by_zipcode[zipcode_key]):
        df_test.loc[i, label] = value

    # add walkscore
    walkscores = get_walkscore(property_address, coordinates, walkscore_key)
    for label, value in zip(walkscore_labels, walkscores):
        df_test.loc[i, label] = value

    # add nearby places
    nearby_places = get_places(coordinates=coordinates)
    for label, value in zip(nearby_labels, nearby_places):
        df_test.loc[i, label] = value

100%|██████████| 100/100 [47:03<00:00, 28.24s/it]


In [17]:
# display the enriched sample
df_test

Unnamed: 0,bathrooms,bedrooms,effectiveyearbuilt,propertylocation,recordingdate,roll_homeownersexemp,roll_totalvalue,sqftmain,units,usecodedescchar3,...,schools_within_qt_mile,schools_closest,shopping_within_one_mile,shopping_within_half_mile,shopping_within_qt_mile,shopping_closest,entertainment_within_one_mile,entertainment_within_half_mile,entertainment_within_qt_mile,entertainment_closest
0,2,4,1973,8309 FAUST AVE LOS ANGELES CA 91304,20161208,0.00,639000.00,2057,1,,...,0.0,0.38,2.0,0.0,0.0,1.380000,4.0,0.0,0.0,1.28
1,3,4,1978,8331 PONCE AVE LOS ANGELES CA 91304,20160802,0.00,735000.00,2538,1,,...,0.0,0.26,2.0,0.0,0.0,1.300000,3.0,0.0,0.0,1.15
2,4,4,1962,22726 ECCLES ST LOS ANGELES CA 91304,20160331,7000.00,749700.00,2630,1,,...,0.0,0.39,2.0,0.0,0.0,1.430000,2.0,0.0,0.0,1.29
3,3,3,1960,22750 ECCLES ST LOS ANGELES CA 91304,20160920,0.00,257434.00,2009,1,,...,0.0,0.34,1.0,0.0,0.0,1.390000,2.0,0.0,0.0,1.24
4,4,4,1973,8378 FALLBROOK AVE LOS ANGELES CA 91304,20160802,7000.00,728000.00,2072,1,,...,0.0,0.26,1.0,0.0,0.0,2.000000,1.0,0.0,0.0,1.15
5,3,3,1958,8344 FALLBROOK AVE LOS ANGELES CA 91304,20160912,7000.00,233870.00,1442,1,,...,1.0,0.17,1.0,0.0,0.0,1.910000,2.0,0.0,0.0,1.07
6,3,3,1959,8356 MAYNARD AVE LOS ANGELES CA 91304,20161121,7000.00,236591.00,1538,1,,...,0.0,0.26,3.0,0.0,0.0,1.300000,3.0,0.0,0.0,1.16
7,3,3,1959,8350 MAYNARD AVE LOS ANGELES CA 91304,20160212,7000.00,588000.00,1589,1,,...,1.0,0.24,2.0,0.0,0.0,1.280000,3.0,0.0,0.0,1.14
8,3,3,1960,8363 PONCE AVE LOS ANGELES CA 91304,20160923,7000.00,551268.00,1589,1,,...,0.0,0.33,2.0,0.0,0.0,1.380000,3.0,0.0,0.0,1.23
9,3,4,1963,8362 PONCE AVE LOS ANGELES CA 91304,20160721,7000.00,656624.00,2338,1,,...,0.0,0.33,2.0,0.0,0.0,1.370000,3.0,0.0,0.0,1.22


In [20]:
# save the enriched sample to 'test_100records.csv' (relative path)
df_test.to_csv('test_100records.csv')