# In [2]:
import json 
import os.path
from numpy import timedelta64
import pandas

import jupynbimp
import restaurants_data_cleaning

# In [7]:
def _loadReviews(path):
    """Read a line-delimited JSON file into a DataFrame, one row per line."""
    # The context manager closes the handle even if a line fails to parse.
    with open(path, 'r') as reviewFile:
        records = [json.loads(line) for line in reviewFile if line.strip()]
    return pandas.DataFrame(records)


def _reviewSpans(reviews):
    """Return a frame indexed by business_id with an integer 'review_span'.

    'review_span' is the number of whole days between the earliest and the
    latest value in the 'date' column for each business.
    """
    dates = pandas.to_datetime(reviews['date'])
    perBusiness = dates.groupby(reviews['business_id'])
    # max - min is a timedelta Series; .dt.days extracts whole days as ints.
    spans = (perBusiness.max() - perBusiness.min()).dt.days
    return spans.to_frame('review_span')


def getData(fromCache=True):
    """Return the restaurants table extended with a 'review_span' column.

    When ``fromCache`` is true and the cached CSV already exists, the cache
    is read and returned. Otherwise the table is rebuilt from
    ``restaurants_data_cleaning.getData`` and the Yelp review file, missing
    numeric values are replaced by the column mean, and the result is written
    to the cache before being returned.

    Parameters
    ----------
    fromCache : bool, default True
        Whether to reuse the cached CSV; also forwarded to
        ``restaurants_data_cleaning.getData``.

    Returns
    -------
    pandas.DataFrame
        One row per restaurant, with 'business_id' as a regular column and a
        'review_span' column (days between first and last review).
    """
    dataDirectory = '../data/yelp/'
    outputName = 'restaurants_clean.csv'
    cachePath = dataDirectory + outputName

    # `and` short-circuits, so the filesystem is not touched when the caller
    # explicitly asks for a rebuild.
    if fromCache and os.path.isfile(cachePath):
        return pandas.read_csv(cachePath, header=0)

    restaurants = restaurants_data_cleaning.getData(fromCache=fromCache)

    reviews = _loadReviews(
        dataDirectory + 'yelp_academic_dataset_review.json'
    )

    # Keep only reviews of the selected restaurants, and only the two
    # columns the span calculation needs.
    selected = reviews['business_id'].isin(restaurants['business_id'])
    reviews = reviews.loc[selected, ['business_id', 'date']]

    restaurants = (restaurants
                   .set_index('business_id')
                   .join(_reviewSpans(reviews))
                   .reset_index()
                   )

    # fillna returns a new frame, so the result must be re-assigned.
    # Restaurants without any review get the mean span; non-numeric columns
    # are left untouched.
    restaurants = restaurants.fillna(restaurants.mean(numeric_only=True))

    # index=False keeps the positional index out of the file, so a later
    # cached read has the same columns as the frame returned here.
    restaurants.to_csv(cachePath, index=False)
    return restaurants