In [1]:
import pandas as pd
#pd.options.display.max_rows = 1000

import pickle
import re

import numpy as np

import geopandas as gpd
import pyproj
import shapefile
import csv

import urllib

In [2]:
def get_prj(epsg_code):
    """Fetch the WKT projection definition for an EPSG code as a compact string.

    Requests the "prettywkt" page for ``epsg_code`` from spatialreference.org
    and removes every space and newline from the response body, so the result
    can be written out as the contents of a ``.prj`` file.

    Parameters
    ----------
    epsg_code : str or int
        EPSG identifier interpolated into the request URL (e.g. ``'4326'``).

    Returns
    -------
    str
        The response body with all spaces and newlines removed. Spaces inside
        quoted names are removed as well.

    Raises
    ------
    IOError
        If the server answers with an HTTP status other than 200.
    """
    url = "http://spatialreference.org/ref/epsg/{0}/prettywkt/".format(epsg_code)
    response = urllib.urlopen(url)
    try:
        # urllib.urlopen hands back the error page instead of raising on HTTP
        # error statuses, so check explicitly to avoid returning HTML as WKT.
        status = response.getcode()
        if status is not None and status != 200:
            raise IOError("could not fetch WKT for EPSG {0}: HTTP {1}".format(epsg_code, status))
        wkt = response.read()
    finally:
        # Always release the connection, even when the read or check fails.
        response.close()
    return wkt.replace(" ", "").replace("\n", "")

In [3]:
def create_shp(city):
    """Convert a city's Airbnb listings CSV into a point shapefile plus .prj.

    Reads ``../data/external/airbnb/<city>.listings.csv`` and writes one point
    per row (from the ``longitude`` / ``latitude`` columns) with the row's
    ``id`` as its only attribute to
    ``../data/interim/airbnb/airbnb_<city>.shp``. A matching ``.prj`` file
    holding the EPSG:4326 definition returned by ``get_prj`` is written
    alongside it.

    Parameters
    ----------
    city : str
        City key used to build the input and output file names.

    Raises
    ------
    ValueError
        If a row's ``longitude`` or ``latitude`` cannot be parsed as a float.
    """
    filepath_in = '../data/external/airbnb/'
    filepath_csv = filepath_in + '{}.listings.csv'.format(city)

    filepath_out = '../data/interim/airbnb/'
    filepath_shp = filepath_out + 'airbnb_{}.shp'.format(city)
    filepath_prj = filepath_out + 'airbnb_{}.prj'.format(city)

    # Point shapefile writer with a single attribute field: the listing id.
    writer = shapefile.Writer(shapefile.POINT)
    writer.field('id')

    # One record + one point geometry per CSV row.
    with open(filepath_csv) as f:
        for row in csv.DictReader(f):
            writer.record(row['id'])
            writer.point(float(row['longitude']), float(row['latitude']))

    writer.save(filepath_shp)

    # Fetch the projection text BEFORE opening the .prj for writing, so a
    # failed download cannot leave an empty/truncated .prj file behind.
    # EPSG code for WGS84 is 4326.
    wkt = get_prj('4326')
    with open(filepath_prj, 'w') as f:
        f.write(wkt)

In [4]:
def create_gpd(city, state):
    """Load the Airbnb points and Zillow neighborhoods for one city.

    Reads ``../data/interim/airbnb/airbnb_<city>.shp`` and the Zillow
    neighborhood shapefile for ``state`` with ``gpd.read_file``. If the two
    layers have different CRSs, the Airbnb layer is reprojected to the
    neighborhoods' CRS; otherwise it is returned as loaded.

    Parameters
    ----------
    city : str
        City key used in the Airbnb shapefile name.
    state : str
        State abbreviation used in the Zillow shapefile directory/file name.

    Returns
    -------
    tuple
        ``(airbnb, neighborhoods)`` where ``airbnb`` shares the CRS of
        ``neighborhoods``.
    """
    # Define file paths to the shapefiles.
    shp_airbnb = '../data/interim/airbnb/airbnb_{}.shp'.format(city)
    zillow = 'ZillowNeighborhoods-{}'.format(state)
    shp_neighborhoods = '../data/external/neighborhoods_zillow/{}/{}.shp'.format(zillow, zillow)

    airbnb = gpd.read_file(shp_airbnb)
    neighborhoods = gpd.read_file(shp_neighborhoods)

    # Reproject only when needed. The result is bound to the same name so the
    # return below is valid in both cases (previously the matching-CRS path
    # hit an unassigned local and raised UnboundLocalError).
    if airbnb.crs != neighborhoods.crs:
        airbnb = airbnb.to_crs(neighborhoods.crs)

    return airbnb, neighborhoods

In [5]:
def join_gpds(city, state, airbnb, neighborhoods):
    """Attach a Zillow neighborhood to every Airbnb listing point.

    Performs a left spatial join (``op='within'``) of ``airbnb`` against the
    ``Name`` / ``City`` / ``State`` / ``geometry`` columns of
    ``neighborhoods``. Rows with no matching neighborhood get
    ``Name = 'OUTSIDE ZILLOW'`` and ``State = state``; their ``City`` value is
    left as missing.

    Parameters
    ----------
    city : str
        Not used by this function; kept for interface compatibility.
    state : str
        Value used to fill a missing ``State``.
    airbnb : GeoDataFrame
        Listing points; must contain an ``id`` column.
    neighborhoods : GeoDataFrame
        Neighborhood layer; must contain ``Name``, ``City``, ``State`` and
        ``geometry`` columns.

    Returns
    -------
    DataFrame
        Columns ``id``, ``Name``, ``City``, ``State``.
    """
    cols = ['Name', 'City', 'State']
    neighborhoods = neighborhoods[cols + ['geometry']]
    joined = gpd.sjoin(airbnb, neighborhoods, how='left', op='within')
    # Assign the filled columns back explicitly: an in-place fillna on a
    # column selection is a chained mutation that is not guaranteed to
    # propagate to the parent frame.
    joined['Name'] = joined['Name'].fillna('OUTSIDE ZILLOW')
    joined['State'] = joined['State'].fillna(state)
    return joined[['id'] + cols]

In [6]:
# Cities to process, mapped to their state abbreviation.
# The key is passed as `city` (used to build the Airbnb CSV/shapefile names);
# the value is passed as `state` (used to build the Zillow shapefile name and
# to fill a missing State after the spatial join).
cities = {'asheville': 'NC',
          'austin': 'TX',
          'boston': 'MA',
          'chicago': 'IL',
          'denver': 'CO',
          'losangeles': 'CA',
          'nashville': 'TN',
          'neworleans': 'LA',
          'newyorkcity': 'NY',
          'oakland': 'CA',
          'portland': 'OR',
          'sandiego': 'CA',
          'sanfrancisco': 'CA',
          'santacruz': 'CA',
          'seattle': 'WA',
          'washingtondc': 'DC',}

In [7]:
# Build one (id, Name, City, State) table covering every city in `cities`.
# Per-city results are collected in a list and concatenated once at the end
# rather than growing a DataFrame inside the loop.
city_frames = []
for city, state in cities.items():
    create_shp(city)
    airbnb, neighborhoods = create_gpd(city, state)
    try:
        city_neighborhoods = join_gpds(city, state, airbnb, neighborhoods)
    except Exception as exc:
        # Keep going with the remaining cities, but report why this one
        # failed; catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop the run.
        print('{} {} failed: {!r}'.format(city, state, exc))
        continue
    # Single pre-formatted string so the line prints identically whether
    # print is a statement or a function.
    print('{} {} {}'.format(city, state, len(city_neighborhoods)))
    city_frames.append(city_neighborhoods)

# pd.concat rejects an empty list, so fall back to an empty frame.
if city_frames:
    all_neighborhoods = pd.concat(city_frames, ignore_index=True)
else:
    all_neighborhoods = pd.DataFrame()

losangeles CA 26080
sanfrancisco CA 8619
newyorkcity NY 40367
neworleans LA 4514
chicago IL 5147
boston MA 3617
nashville TN 3277
washingtondc DC 3724
denver CO 2516
santacruz CA failed
asheville NC failed
sandiego CA 6608
portland OR 3504
oakland CA 1718
austin TX 5835
seattle WA 3818


In [8]:
# Number of rows in the combined table.
all_neighborhoods.shape[0]

119344

In [9]:
# Serialize the combined table to the interim data directory.
output_path = '../data/interim/01_neighborhoods.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(all_neighborhoods, out_file)