In [69]:
import json 
import pandas
import os.path
from numpy import nan
from math import log
from enum import Enum

## weight star rating by review count

In [70]:
def __weightedStars(stars, count):
    """Weight a star rating by the number of reviews behind it.

    The rating is centred so that a 'neutral' 3 stars maps to 0, then scaled
    by the natural log of the review count: ``(stars - 3) * log(count)``.
    A single review therefore yields 0 regardless of stars (``log(1) == 0``).

    Parameters
    ----------
    stars : number
        Star rating; only values in the closed range 1..5 are accepted.
    count : int-convertible
        Number of reviews; must convert with ``int()`` to a value >= 1.

    Returns
    -------
    float
        The weighted rating, or ``nan`` when ``count`` cannot be converted to
        a positive integer or ``stars`` lies outside 1..5 (including NaN).
    """
    try:
        count = int(count)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported objects; ValueError: non-numeric
        # strings and NaN; OverflowError: infinities.
        return nan

    # log() raises for count <= 0, so treat "no reviews" as missing data
    # instead of letting a single bad record abort the whole computation.
    if count < 1:
        return nan

    # A NaN star value fails this chained comparison and falls through to nan.
    if not (1 <= stars <= 5):
        return nan

    normalStars = stars - 3  # 'neutral' rating of 3 normalized to 0
    return normalStars * log(count)

## levels of alcohol service

In [71]:
class __Alcohol(Enum):
    """Levels of alcohol service, ordered from none (0) to full bar (2).

    Members are looked up by name (``__Alcohol[name]``) in ``getData`` using
    the string stored under a business's 'Alcohol' attribute, so the member
    names must match those strings exactly.
    """
    # no alcohol served; also the fallback getData uses when the attribute is absent
    none = 0
    # beer and wine only
    beer_and_wine = 1
    # full bar
    full_bar = 2

## return clean and organised data

In [72]:
def getData(fromCache=True):
    """Return the cleaned Arizona restaurant table built from the Yelp business data.

    Businesses with ``state == 'AZ'`` whose ``categories`` contain
    'Restaurants' are kept. The nested ``attributes`` dict of each business is
    flattened into plain columns:

    * ``rating``          - stars weighted by review count (``__weightedStars``)
    * ``beer_wine``       - 1 if the 'Alcohol' attribute is 'beer_and_wine', else 0
    * ``full_bar``        - 1 if the 'Alcohol' attribute is 'full_bar', else 0
    * ``price_range``     - 'Price Range' attribute (missing -> column mean)
    * ``attire``          - integer category code of 'Attire' (default 'casual')
    * ``takeout``, ``waiter_service``, ``outdoor_seating`` - 0/1 flags
      (a missing attribute counts as 0)

    The source columns used for the recoding, plus 'neighborhoods', 'open',
    'city', 'state' and 'type', are dropped. Remaining missing numeric values
    are replaced by their column mean.

    Parameters
    ----------
    fromCache : bool, default True
        If true and the cleaned CSV already exists in the data directory, it
        is loaded and returned instead of rebuilding from the raw JSON.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. The cached and freshly built results share the
        same columns and index.

    Side effects
    ------------
    When the table is rebuilt it is written to
    ``../data/yelp/restaurants_clean.csv`` (overwriting any existing file).
    """
    # Retrieved 09-07-2016 from https://nz.yelp.com/dataset_challenge/dataset
    dataDirectory = '../data/yelp/'
    outputName = 'restaurants_clean.csv'
    cachePath = dataDirectory + outputName

    # `and` short-circuits, so the filesystem is only consulted when caching
    # was actually requested.
    if fromCache and os.path.isfile(cachePath):
        # The cache is written with the frame's index as its first column;
        # read it back as the index so no spurious 'Unnamed: 0' column appears.
        return pandas.read_csv(cachePath, header=0, index_col=0)

    # The raw file holds one JSON document per line; the context manager
    # guarantees the handle is closed once every line has been parsed.
    with open(dataDirectory + 'yelp_academic_dataset_business.json', 'r') as businessFile:
        businesses = pandas.DataFrame.from_dict(
            [json.loads(line) for line in businessFile]
        )

    # select study area data & filter restaurants
    isArizona = businesses.state == 'AZ'
    isRestaurant = businesses.categories.apply(
        lambda categories: 'Restaurants' in categories
    )
    selectedData = (businesses[isArizona & isRestaurant]
                    .drop(['neighborhoods', 'open', 'city', 'state', 'type'], axis=1)
                   )

    attributes = selectedData['attributes']

    def attribute(key, default):
        # One value per business, pulled out of its nested attributes dict.
        return attributes.apply(lambda entry: entry.get(key, default))

    # recode nested variables
    selectedData['rating'] = [
        __weightedStars(stars, reviewCount)
        for stars, reviewCount in zip(selectedData['stars'],
                                      selectedData['review_count'])
    ]

    alcohol = attribute('Alcohol', 'none')
    selectedData['beer_wine'] = alcohol.apply(
        lambda level: __Alcohol[level] == __Alcohol.beer_and_wine
    ).astype(int)
    selectedData['full_bar'] = alcohol.apply(
        lambda level: __Alcohol[level] == __Alcohol.full_bar
    ).astype(int)

    selectedData['price_range'] = attribute('Price Range', nan)
    selectedData['attire'] = (attribute('Attire', 'casual')
                              .astype('category').cat.codes)
    selectedData['takeout'] = attribute('Take-out', False).astype(int)
    selectedData['waiter_service'] = attribute('Waiter Service', False).astype(int)
    selectedData['outdoor_seating'] = attribute('Outdoor Seating', False).astype(int)

    selectedData = selectedData.drop(
        ['attributes', 'categories', 'hours', 'review_count', 'stars'], axis=1
    )

    # fillna() returns a new frame, so the result has to be kept; the original
    # call discarded it and left every NaN in place. numeric_only avoids
    # attempting a mean over the remaining text columns.
    selectedData = selectedData.fillna(selectedData.mean(numeric_only=True))

    selectedData.to_csv(cachePath)
    return selectedData