In [1]:
import os.path
from geopandas import GeoDataFrame, GeoSeries, read_file
from geopandas.tools import sjoin
from shapely.geometry import Point
from pandas import to_numeric
from math import log

import jupynbimp
import restaurants_data_cleaning, review_data_process, census2010_data_cleaning, acs2013_data_cleaning

importing Jupyter notebook from restaurants_data_cleaning.ipynb
importing Jupyter notebook from review_data_process.ipynb
importing Jupyter notebook from review_data_getData.ipynb
importing Jupyter notebook from topic_dictionary.ipynb
importing Jupyter notebook from review_data_classify.ipynb
importing Jupyter notebook from review_data_topics_extractor.ipynb
importing Jupyter notebook from census2010_data_cleaning.ipynb
importing Jupyter notebook from acs2013_data_cleaning.ipynb


In [11]:
# Coordinate reference systems, given as PROJ init strings:
#   'GCS'       -> source data, WGS 84 latitude/longitude (EPSG:4326)
#   'projected' -> working CRS, WGS 84 / UTM Zone 12N (EPSG:32612)
CRS = dict(GCS='+init=epsg:4326', projected='+init=epsg:32612')

# If False, forces all modules to re-compute their variables instead of
# loading cached results (takes significantly more time!)
USE_CACHE = False

# Conversion factor used to express projected areas in square kilometres
SQ_METERS_PER_SQ_KM = 1000 * 1000

# Buffer radius around a restaurant when counting competitors, in projected
# CRS units (presumably metres for UTM -- confirm)
MARKET_AREA_RADIUS = 3000

In [3]:
def normalizeByArea(geoDataFrame, column):
    """Return the values of ``column`` divided by each row's ``'area_sqkm'``.

    Parameters
    ----------
    geoDataFrame : DataFrame or GeoDataFrame
        Frame containing both ``column`` and an ``'area_sqkm'`` column.
    column : str
        Name of the column to normalize.

    Returns
    -------
    Series
        ``geoDataFrame[column] / geoDataFrame['area_sqkm']``, carrying the
        frame's index. An empty frame yields an empty Series.

    Raises
    ------
    KeyError
        If ``column`` or ``'area_sqkm'`` is not a column of the frame.
    """
    # Column-wise division gives the same values as a row-by-row
    # ``apply(..., axis=1)`` without a Python-level call per row, and it
    # always returns a Series (also for a frame with no rows).
    return geoDataFrame[column] / geoDataFrame['area_sqkm']

## Transform data into consolidated shapefile

### Yelp Restaurants

In [4]:
# Processed review data, keyed by business_id so it can be joined onto the
# restaurant table
reviews = review_data_process.getData(fromCache=USE_CACHE)
reviews = reviews.set_index('business_id')

In [5]:
# Cleaned restaurant attributes, restricted (inner join on business_id) to
# the restaurants that also appear in `reviews`; business_id is restored as
# an ordinary column afterwards
restaurants = restaurants_data_cleaning.getData(fromCache=USE_CACHE)
restaurants = restaurants.set_index('business_id').join(reviews, how='inner')
restaurants = restaurants.reset_index()

In [6]:
# One shapely Point per restaurant, built as Point(longitude, latitude)
restaurantPoints = list(map(Point, restaurants['longitude'], restaurants['latitude']))

# Attach the points as geometry in the source (lat/long) CRS ...
restaurants = GeoDataFrame(restaurants, geometry=restaurantPoints, crs=CRS['GCS'])

# ... then re-project to the working CRS
restaurants = restaurants.to_crs(CRS['projected'])

# The raw coordinate columns are now redundant with the geometry column
restaurants.drop(['latitude', 'longitude'], axis=1, inplace=True)

### Restaurant Proximity Variables 

In [7]:
# Phoenix Convention Center in projected coordinates
phoenixCBD = GeoSeries(Point(400557, 3701492), crs=CRS['projected'])

# Distance from every restaurant to the CBD point, in projected CRS units.
# GeoSeries.distance works element-wise against a single geometry, so the
# result is a plain Series of scalars. Calling phoenixCBD.distance(point)
# once per row instead returns a one-element Series for each restaurant,
# which `apply` expands into a one-column DataFrame.
restaurants['dist_CBD'] = restaurants.geometry.distance(phoenixCBD.iloc[0])

In [10]:
# Location of the cached motorway-exit extract (written below on a cache miss)
motorwayExitsPath = '../data/shapefiles/motorway-exits/motorwayExits.shp'

# Logical `and` rather than bitwise `&`: it short-circuits, so the file
# system is not touched when caching is disabled, and it does not depend on
# both operands being plain bools.
if USE_CACHE and os.path.isfile(motorwayExitsPath):
    motorwayExits = read_file(motorwayExitsPath)

else:

    # Retrieved 17/07/2016 from http://download.geofabrik.de/north-america/us/arizona.html
    motorwayExits = read_file('/arizona-latest/roads.shp',
                              vfs='zip://../data/shapefiles/arizona-latest.zip'
                             ).to_crs(CRS['projected'])

    # keep only the road segments whose 'type' is 'motorway_link'
    motorwayExits = motorwayExits[motorwayExits['type'] == 'motorway_link']

    # save as shapefile
    motorwayExits.to_file(motorwayExitsPath)

# Distance from each restaurant to its nearest motorway exit segment
restaurants['dist_mwy_exit'] = restaurants.geometry.apply(
    lambda point: motorwayExits.distance(point).min()
)

In [12]:
# Number of restaurants intersecting a MARKET_AREA_RADIUS buffer around each
# restaurant; one is subtracted because that count includes the restaurant
# at the centre of its own buffer.
restaurants['competitors'] = restaurants.geometry.apply(
    lambda location: restaurants.intersects(location.buffer(MARKET_AREA_RADIUS)).sum() - 1
)

### Census 2010

In [13]:
census2010 = census2010_data_cleaning.getData(fromCache=USE_CACHE)

# Retrieved 10-07-2016 from https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2010&layergroup=Block+Groups
blockGroups2010 = read_file('/tl_2010_04_bg10.shp',
                            vfs='zip://../data/shapefiles/tl_2010_04_bg10.zip')

# Numeric GEOID so the layer can be merged with the census table on 'GEOID'
blockGroups2010['GEOID10'] = to_numeric(blockGroups2010['GEOID10'])
blockGroups2010 = blockGroups2010[['GEOID10', 'geometry']]
blockGroups2010 = blockGroups2010.rename(columns={'GEOID10': 'GEOID'})
blockGroups2010 = blockGroups2010.to_crs(CRS['projected'])

# Area is computed in the projected CRS and converted to square kilometres
blockGroups2010['area_sqkm'] = blockGroups2010.geometry.area / SQ_METERS_PER_SQ_KM
blockGroups2010 = blockGroups2010.merge(census2010, on='GEOID')

# Normalize count data by block group area: (count column, density column)
census2010DensityColumns = [
    ('population_total', 'population_density'),
    ('home_mortgages', 'home_mortgage_density'),
    ('home_owners', 'home_owner_density'),
    ('renters', 'renter_density'),
    ('total_households', 'household_density'),
    ('family_households', 'family_household_density'),
    ('single_households', 'single_household_density'),
    ('population_hispanic_latino', 'hispanic_latino_population_density'),
    ('population_white', 'white_population_density'),
    ('population_black', 'black_population_density'),
    ('population_native_american', 'native_american_population_density'),
    ('population_asian', 'asian_population_density'),
]
for countColumn, densityColumn in census2010DensityColumns:
    blockGroups2010[densityColumn] = normalizeByArea(blockGroups2010, countColumn)

# The raw counts are replaced by their densities
blockGroups2010.drop([pair[0] for pair in census2010DensityColumns],
                     axis=1, inplace=True
                    )

### ACS 2013

In [14]:
# Follows the notebook-wide USE_CACHE switch, like the other data modules
acs2013 = acs2013_data_cleaning.getData(fromCache=USE_CACHE)

# Retrieved 10-07-2016 https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2013&layergroup=Block+Groups
blockGroups2013 = read_file('/tl_2013_04_bg.shp',
                            vfs='zip://../data/shapefiles/tl_2013_04_bg.zip')

# Convert this layer's OWN GEOID to numeric. Assigning the GEOID column of
# blockGroups2010 here would align 2010 identifiers onto the 2013 geometries
# by row index and attach the wrong ACS records in the merge below.
blockGroups2013.GEOID = to_numeric(blockGroups2013.GEOID)
blockGroups2013 = (blockGroups2013[['GEOID','geometry']]
                   .to_crs(CRS['projected'])
                  )

# Area is computed in the projected CRS and converted to square kilometres
blockGroups2013['area_sqkm'] = blockGroups2013.geometry.area/SQ_METERS_PER_SQ_KM
blockGroups2013 = blockGroups2013.merge(acs2013, on='GEOID')

# Normalize count data by block group area
blockGroups2013['density_education_highschool'] = normalizeByArea(blockGroups2013,'education_highschool')
blockGroups2013['density_education_undergraduate'] = normalizeByArea(blockGroups2013,'education_undergraduate')
blockGroups2013['density_education_postgraduate'] = normalizeByArea(blockGroups2013,'education_postgraduate')

# The raw counts are replaced by their densities
blockGroups2013.drop(['education_highschool',
                      'education_undergraduate',
                      'education_postgraduate'
                     ],
                     axis=1, inplace=True
                    )

## Spatial Join Restaurant Locations on Census Block Groups

In [16]:
# Point layer carrying only the key needed to merge the results back
restaurantLocations = restaurants[['business_id', 'geometry']]

# Columns produced by the join that are not block-group attributes
joinOnlyColumns = ['geometry', 'index_right', 'GEOID', 'area_sqkm']

# Attributes of the 2010 census block group each restaurant falls in
restaurants_census2010 = sjoin(restaurantLocations, blockGroups2010, how='inner')
restaurants_census2010 = restaurants_census2010.drop(joinOnlyColumns, axis=1)

# Attributes of the 2013 ACS block group each restaurant falls in
restaurants_acs2013 = sjoin(restaurantLocations, blockGroups2013, how='inner')
restaurants_acs2013 = restaurants_acs2013.drop(joinOnlyColumns, axis=1)

# Combine both attribute sets with the full restaurant table on business_id
blockGroupAttributes = restaurants_census2010.merge(restaurants_acs2013, on='business_id')
restaurants = GeoDataFrame(blockGroupAttributes.merge(restaurants, on='business_id'),
                           crs=CRS['projected']
                          )

## Transform Variables for Linearity

In [17]:
# names shortened for ESRI compatibility
# NOTE: the 'sq_' prefix is used both for a square (sentiment) and for
# square roots (distance to CBD, number of competitors).
restaurants['sq_sentmnt'] = restaurants['sentiment'] ** 2.0
restaurants['ln_unique'] = restaurants['uniqueness'].apply(lambda uniqueness: log(uniqueness))
restaurants['sq_dst_CBD'] = restaurants['dist_CBD'] ** 0.5
restaurants['sq_num_cmp'] = restaurants['competitors'] ** 0.5

# ln(x + 1) transforms: (new short column, source column)
logTransformedColumns = [
    ('ln_med_age', 'median_age'),
    ('ln_pop_skm', 'population_density'),
    ('ln_hld_skm', 'household_density'),
    ('ln_mor_skm', 'home_mortgage_density'),
    ('ln_own_skm', 'home_owner_density'),
    ('ln_rnt_skm', 'renter_density'),
    ('ln_avg_hld', 'average_household_size'),
    ('ln_fmy_skm', 'family_household_density'),
    ('ln_sgl_skm', 'single_household_density'),
    ('ln_his_skm', 'hispanic_latino_population_density'),
    ('ln_whi_skm', 'white_population_density'),
    ('ln_blk_skm', 'black_population_density'),
    ('ln_nam_skm', 'native_american_population_density'),
    ('ln_asi_skm', 'asian_population_density'),
    ('ln_mhm_val', 'median_home_value'),
    ('ln_med_inc', 'median_household_income'),
    ('ln_ehs_skm', 'density_education_highschool'),
    ('ln_eug_skm', 'density_education_undergraduate'),
    ('ln_epg_skm', 'density_education_postgraduate'),
]
for shortName, sourceColumn in logTransformedColumns:
    restaurants[shortName] = restaurants[sourceColumn].apply(
        lambda value: log(value + 1)
    )
  

In [18]:
# drop non-linear variables: see 'data_exploration' module
# (these aren't appropriate for use in linear regression)
# NOTE(review): 'median_age', 'population_density' and 'household_density'
# are not in this list, so they are kept next to their ln_ versions --
# confirm that is intended.
nonLinearColumns = (
    # review and proximity variables
    ['review_span', 'sentiment', 'uniqueness', 'dist_CBD', 'competitors',
     'dist_mwy_exit']
    # housing and household variables
    + ['home_mortgage_density', 'home_owner_density', 'renter_density',
       'average_household_size', 'family_household_density',
       'single_household_density']
    # population variables
    + ['hispanic_latino_population_density', 'white_population_density',
       'black_population_density', 'native_american_population_density',
       'asian_population_density']
    # value, income and education variables
    + ['median_home_value', 'median_household_income',
       'density_education_highschool', 'density_education_undergraduate',
       'density_education_postgraduate']
)
restaurants.drop(nonLinearColumns, axis=1, inplace=True)

ValueError: labels ['near_exit'] not contained in axis

## Save Data

In [19]:
# shorten remaining names for ESRI compatibility
esriColumnNames = {
    'business_id': 'bus_id',
    'full_address': 'address',
    'price_range': 'price_rng',
    'waiter_service': 'waiter_svc',
    'outdoor_seating': 'outdr_seat',
    'review_count': 'revw_count',
}
restaurants.rename(columns=esriColumnNames, inplace=True)

# Output locations for the consolidated data set
restaurantsCsvPath = '../data/restaurants.csv'
restaurantsShpPath = '../data/shapefiles/restaurants/restaurants.shp'

# save as csv
restaurants.to_csv(restaurantsCsvPath)

# save as shapefile
restaurants.to_file(restaurantsShpPath)