In [1]:
import json 
import pandas
import os.path
from math import log
from enum import Enum

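## Weight the star rating by review count
A 3-star rating is treated as neutral (0); the distance from neutral is multiplied by the natural log of the review count.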

In [2]:
def __weightedStars(stars, count):
    """Weight a star rating by the natural log of its review count.

    The star scale is re-centred so that a 'neutral' rating of 3 maps to 0,
    then multiplied by ``log(count)``.

    Parameters
    ----------
    stars : number
        Star rating; must satisfy ``1 <= stars <= 5``.
    count : int or value accepted by ``int()``
        Number of reviews; must be at least 1 after conversion.

    Returns
    -------
    float or str
        ``(stars - 3) * log(count)``, or the string ``'NA'`` when either
        argument is unusable.
    """
    try:
        count = int(count)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, NaN and infinities cannot be converted.
        return 'NA'

    if count < 1:
        # log() is undefined for zero/negative counts (math domain error).
        return 'NA'

    try:
        starsInRange = 1 <= stars <= 5
    except TypeError:
        # e.g. stars is None or another non-comparable value
        return 'NA'

    if not starsInRange:
        # also covers NaN, for which every comparison is False
        return 'NA'

    normalStars = stars - 3  # 'neutral' rating of 3 normalized to 0
    return normalStars * log(count)

## Levels of alcohol service

In [3]:
class __Alcohol(Enum):
    """Levels of alcohol service, coded as increasing integers.

    getData looks members up by name from the 'Alcohol' business attribute
    (falling back to 'none' when the attribute is absent) and stores
    ``.value`` in the 'alcohol' column, so the member names must match the
    attribute strings exactly.
    """
    none = 0           # lowest level; also the fallback used by getData
    beer_and_wine = 1
    full_bar = 2       # highest level

## Load, clean and organise the restaurant data

In [11]:
def getData(fromCache=True, dataDirectory='../data/yelp/'):
    """Return the cleaned table of Arizona restaurants, indexed by business_id.

    When ``fromCache`` is true and ``restaurants_clean.csv`` already exists in
    ``dataDirectory`` it is read and returned. Otherwise the table is rebuilt
    from ``yelp_academic_dataset_business.json`` (one JSON object per line),
    written to the cache file and then read back, so a fresh build and a
    cached load yield identically typed frames (pandas reads the 'NA'
    placeholders back as missing values).

    Parameters
    ----------
    fromCache : bool, default True
        Use the cached CSV if present.
    dataDirectory : str, default '../data/yelp/'
        Directory holding the raw JSON file and the CSV cache.

    Returns
    -------
    pandas.DataFrame
        Remaining business columns plus the derived columns rating, alcohol,
        price_range, attire, takeout, waiter_service and outdoor_seating.

    Raises
    ------
    KeyError
        If an 'Alcohol' attribute value is not a member name of __Alcohol.
    """
    cachePath = os.path.join(dataDirectory, 'restaurants_clean.csv')

    def readCache():
        # single place that defines how the cache is parsed
        return pandas.read_csv(cachePath, header=0, index_col='business_id')

    # logical 'and' (not bitwise '&'): short-circuits and works for any truthy flag
    if fromCache and os.path.isfile(cachePath):
        return readCache()

    # read-in .json file as pandas dataframe; 'with' guarantees the handle is closed
    sourcePath = os.path.join(dataDirectory, 'yelp_academic_dataset_business.json')
    with open(sourcePath, 'r', encoding='utf-8') as sourceFile:
        businesses = pandas.DataFrame(
            [json.loads(line) for line in sourceFile if line.strip()]
        )

    # select study area data & filter restaurants
    isArizona = businesses.state == 'AZ'
    isRestaurant = businesses.categories.apply(
        # missing categories (None/NaN) simply do not match
        lambda categories: isinstance(categories, (list, tuple, str))
        and 'Restaurants' in categories
    )
    selectedData = (businesses[isArizona & isRestaurant]
                    .set_index('business_id')
                    .drop(['neighborhoods', 'open', 'city', 'state', 'type'], axis=1)
                   )

    def attribute(attributes, key, default):
        # tolerate rows whose 'attributes' entry is not a dict
        if isinstance(attributes, dict):
            return attributes.get(key, default)
        return default

    # recode nested variables (list comprehensions also behave on an empty selection,
    # where DataFrame.apply(axis=1) would return a frame instead of a column)
    selectedData['rating'] = [
        __weightedStars(stars, count)
        for stars, count in zip(selectedData.stars, selectedData.review_count)
    ]

    selectedData['alcohol'] = [
        __Alcohol[attribute(attributes, 'Alcohol', 'none')].value
        for attributes in selectedData.attributes
    ]

    # (output column, attribute key, default when the attribute is absent)
    simpleAttributes = (
        ('price_range', 'Price Range', 'NA'),
        ('attire', 'Attire', 'casual'),
        ('takeout', 'Take-out', 'NA'),
        ('waiter_service', 'Waiter Service', 'NA'),
        ('outdoor_seating', 'Outdoor Seating', 'NA'),
    )
    for column, key, default in simpleAttributes:
        selectedData[column] = [
            attribute(attributes, key, default)
            for attributes in selectedData.attributes
        ]

    selectedData = selectedData.drop(
        ['attributes', 'categories', 'hours', 'review_count', 'stars'], axis=1
    )

    selectedData.to_csv(cachePath)
    # return what later cached calls will return, not the pre-serialisation frame
    return readCache()

In [14]:
# Preview only the first rows; displaying the whole frame bloats the notebook.
getData().head(10)

Unnamed: 0_level_0,full_address,latitude,longitude,name,rating,alcohol,price_range,attire,takeout,waiter_service,outdoor_seating
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
x5Mv61CnZLohZWxfCVCPTQ,"2819 N Central Ave\nPhoenix, AZ 85004",33.479482,-112.073681,Domino's Pizza,-1.386294,0,2.0,casual,True,False,False
2ZnCITVa0abGce4gZ6RhIw,"1850 N Central Ave\nPhoenix, AZ 85004",33.468547,-112.075085,Viad Tower Restaurants,0.895880,0,1.0,casual,True,,
EmzaQR5hQlF0WIl24NxAZA,"132 E Washington St\nPhoenix, AZ 85004",33.448399,-112.071702,Sky Lounge,-1.609438,2,1.0,casual,False,False,True
KPoTixdjoJxSqRSEApSAGg,"2631 N Central Ave\nPhoenix, AZ 85004",33.477939,-112.073417,Wild Thaiger,3.034213,2,2.0,casual,True,True,True
e5kc0CQ4R-PCCDgb274gSg,"455 N 3rd St\nSte 114\nPhoenix, AZ 85004",33.452154,-112.068620,Canyon Cafe,2.736135,2,2.0,casual,True,True,True
gBcpDl0quCLABsg5OuSFww,"622 E Adams St\nPhoenix, AZ 85004",33.449298,-112.065542,Teeter House Tea Room,1.242453,2,2.0,casual,True,True,True
72N6TdH5MdUWE2YhYRbyxw,"2530 N Central Ave\nPhoenix, AZ 85004",33.476260,-112.074248,Burger King,-1.242453,0,3.0,casual,True,False,
eOgUKCy5-sK-leqIBiNokg,"2 N Central Ave\nPhoenix, AZ 85004",33.448491,-112.074428,Copper Square Cafe,0.000000,0,1.0,casual,True,False,False
neGAQFkBNI8-rpPBLBzNkw,"301 East McDowell Road\nPhoenix, AZ 85004",33.465440,-112.069090,Taco Bell,2.302585,0,1.0,casual,True,False,False
45puCRQ6Vh_IIAy7kkfFDQ,"24 N 2nd St\nPhoenix, AZ 85004",33.448465,-112.071257,Majerle's Sports Grill,2.661505,2,2.0,casual,True,True,True
