In [1]:
import json 
import pandas
import os.path
from numpy import nan
from enum import Enum

## Levels of alcohol service (values of the Yelp `Alcohol` attribute)

In [2]:
class __Alcohol(Enum):
    """Levels of alcohol service a business can offer.

    Members are looked up by name (``__Alcohol['full_bar']``), so the member
    names must stay spelled exactly as they are here.
    """

    none = 0           # lowest level: no alcohol service
    beer_and_wine = 1  # intermediate level: beer and wine
    full_bar = 2       # highest level: full bar

## Load the Arizona restaurant data, cleaned and organised (cached as CSV)

In [9]:
def getData(fromCache=True, dataDirectory='../data/yelp/'):
    """Return the cleaned table of Arizona restaurants from the Yelp business dump.

    When ``fromCache`` is truthy and ``restaurants_clean.csv`` already exists in
    ``dataDirectory``, that CSV is read and returned as-is. Otherwise the raw
    ``yelp_academic_dataset_business.json`` file is parsed, filtered to rows whose
    ``state`` is ``'AZ'`` and whose ``categories`` contain ``'Restaurants'``,
    recoded, written to ``restaurants_clean.csv`` (overwriting any existing file)
    and returned.

    Derived columns added on a rebuild:
        beer_wine, full_bar -- 0/1 indicators from the ``Alcohol`` attribute
                               (a missing attribute counts as ``'none'``)
        price_range         -- ``Price Range`` attribute, NaN when missing
        attire              -- integer category code of ``Attire``
                               (a missing attribute counts as ``'casual'``)
        takeout, waiter_service, outdoor_seating
                            -- 0/1 indicators, 0 when the attribute is missing
        uniqueness          -- 1 / (number of restaurants sharing the same name)

    Remaining missing values in numeric columns are replaced by the column mean.

    Parameters
    ----------
    fromCache : bool, default True
        Use the cached CSV when it exists.
    dataDirectory : str, default '../data/yelp/'
        Directory holding the raw JSON file and the cached CSV.

    Returns
    -------
    pandas.DataFrame
        One row per restaurant, indexed 0..n-1 on both the cached and the
        rebuilt path.

    Raises
    ------
    KeyError
        If a business carries an ``Alcohol`` value that is not a member name
        of ``__Alcohol``.
    """
    # Retrieved 09-07-2016 from https://nz.yelp.com/dataset_challenge/dataset
    sourcePath = os.path.join(dataDirectory, 'yelp_academic_dataset_business.json')
    cachePath = os.path.join(dataDirectory, 'restaurants_clean.csv')

    # Logical `and` (not bitwise `&`): works for any truthy/falsy flag and
    # skips the file-system check entirely when caching is disabled.
    if fromCache and os.path.isfile(cachePath):
        return pandas.read_csv(cachePath, header=0)

    # The dump holds one JSON object per line; `with` guarantees the handle is
    # closed, and blank lines (e.g. a trailing newline) are skipped.
    with open(sourcePath, 'r') as businessFile:
        businesses = pandas.DataFrame(
            [json.loads(line) for line in businessFile if line.strip()]
        )

    # select study area data & filter restaurants
    inStudyArea = businesses['state'] == 'AZ'
    isRestaurant = businesses['categories'].apply(
        lambda categories: 'Restaurants' in categories
    )
    restaurants = (
        businesses[inStudyArea & isRestaurant]
        .drop(['neighborhoods',
               'open',
               'city',
               'state',
               'type',
               'review_count',
               'stars'],
              axis=1)
        # A frame read back from the cache has a fresh 0..n-1 index; reset here
        # so a rebuild returns the same index as the cached path.
        .reset_index(drop=True)
    )

    # recode nested variables
    # Working on the `attributes` Series (one dict per business) avoids building
    # a full row object per record and behaves sensibly on an empty frame.
    attributes = restaurants['attributes']

    def alcoholIndicator(level):
        # 0/1 flag: does the business serve alcohol at exactly this level?
        return attributes.apply(
            lambda attrs: __Alcohol[attrs.get('Alcohol', 'none')] == level
        ).astype(int)

    restaurants['beer_wine'] = alcoholIndicator(__Alcohol['beer_and_wine'])
    restaurants['full_bar'] = alcoholIndicator(__Alcohol['full_bar'])

    restaurants['price_range'] = attributes.apply(
        lambda attrs: attrs.get('Price Range', nan)
    )

    restaurants['attire'] = (
        attributes.apply(lambda attrs: attrs.get('Attire', 'casual'))
        .astype('category')
        .cat.codes
    )

    for column, key in (('takeout', 'Take-out'),
                        ('waiter_service', 'Waiter Service'),
                        ('outdoor_seating', 'Outdoor Seating')):
        # `key=key` binds the current key to the lambda at definition time.
        restaurants[column] = attributes.apply(
            lambda attrs, key=key: attrs.get(key, False)
        ).astype(int)

    restaurants = restaurants.drop(['attributes', 'categories', 'hours'], axis=1)

    # calculate additional variables
    nameCounts = restaurants.groupby('name').size()
    # 1.0 forces true division; `map` looks every name up in one vectorised pass.
    restaurants['uniqueness'] = 1.0 / restaurants['name'].map(nameCounts)

    # remove NAs
    # numeric_only=True: recent pandas raises on string columns instead of
    # silently skipping them when computing the column means.
    restaurants = restaurants.fillna(restaurants.mean(numeric_only=True))

    # cache results and return
    restaurants.to_csv(cachePath, index=False)
    return restaurants