# In [39]:
import json 
import os.path
from numpy import timedelta64
from math import log
import pandas

import jupynbimp
import restaurants_data_cleaning

# In [40]:
def __weightedStars(stars, count):
    """Weight a star rating by the number of reviews behind it.

    The rating is shifted so that a 'neutral' 3-star score becomes 0 and is
    then multiplied by the natural log of the review count.

    Parameters
    ----------
    stars : number
        Star rating; only values within [1, 5] are considered valid.
    count : int or int-like
        Number of reviews; anything ``int()`` cannot convert is invalid.

    Returns
    -------
    float
        ``(stars - 3) * log(count)``, or NaN when either input is invalid
        (unparseable or non-positive count, stars outside [1, 5]).
    """
    # Built locally: the module never imports a `nan` name, so referring to
    # a bare `nan` would raise NameError on every invalid-input path.
    nan = float('nan')

    try:
        count = int(count)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, NaN and infinity all end up here
        return nan

    # log() is undefined for counts below 1; a NaN `stars` also fails the
    # range check because every comparison with NaN is False.
    if count < 1 or not (1 <= stars <= 5):
        return nan

    normalStars = stars - 3  # 'neutral' rating of 3 normalized to 0
    return normalStars * log(count)

# In [91]:
def getData(fromCache=True):
    """Return the restaurant table joined with per-business review statistics.

    When ``fromCache`` is truthy and the cached CSV exists, that file is read
    and returned directly. Otherwise the restaurant table is obtained from
    ``restaurants_data_cleaning.getData``, the review dump is loaded and
    aggregated per ``business_id``, both are inner-joined, missing numeric
    values are filled with their column mean, and the result is written to
    the cache file before being returned.

    Parameters
    ----------
    fromCache : bool, default True
        Use the cached CSV if present; also forwarded to
        ``restaurants_data_cleaning.getData``.

    Returns
    -------
    pandas.DataFrame
        Restaurants that have at least one review, with the added columns
        ``count`` (number of reviews), ``stars`` (mean review stars),
        ``review_span`` (days between first and last review) and ``rating``
        (mean stars weighted by review count).
    """
    dataDirectory = '../data/yelp/'
    outputName = 'restaurants_clean.csv'
    outputPath = os.path.join(dataDirectory, outputName)
    reviewsPath = os.path.join(dataDirectory,
                               'yelp_academic_dataset_review.json')

    # logical `and` short-circuits, so the disk is only probed when the
    # caller actually asked for the cache
    if fromCache and os.path.isfile(outputPath):
        return pandas.read_csv(outputPath, header=0)

    restaurants = restaurants_data_cleaning.getData(fromCache=fromCache)

    # read-in .json file as pandas dataframe: one JSON object per line;
    # the context manager guarantees the file handle is closed
    with open(reviewsPath, 'r') as reviewFile:
        reviews = pandas.DataFrame([
            json.loads(line) for line in reviewFile if line.strip()
        ])

    # keep only reviews from selected restaurants
    reviews = (reviews[reviews['business_id']
                       .isin(restaurants['business_id'])
                      ]
               .drop(['type', 'user_id', 'votes'], axis=1)
              )

    reviews['date'] = pandas.to_datetime(reviews['date'])

    # calculate aggregated variables
    reviews = (reviews
               .groupby('business_id')
               .aggregate({'business_id': 'count',
                           'stars': 'mean',
                           'date': lambda g: g.max() - g.min()
                          })
               .rename(columns={'business_id': 'count',
                                'date': 'review_span'
                               }
                      )
              )

    # whole days between first and last review; `.dt.days` avoids the
    # 'timedelta64[D]' cast, which newer pandas versions reject
    reviews['review_span'] = reviews['review_span'].dt.days.astype(int)

    reviews['rating'] = reviews.apply(
        lambda row: __weightedStars(row['stars'], row['count']),
        axis=1
    )

    # inner join: restaurants without any review are dropped
    restaurants = (restaurants
                   .set_index('business_id')
                   .join(reviews, how='inner')
                   .reset_index()
                  )

    # fill NAs with the column mean; fillna returns a new frame, so the
    # result has to be assigned back. Only numeric columns have a mean.
    restaurants = restaurants.fillna(restaurants.mean(numeric_only=True))

    # cache results and return
    # NOTE(review): the index is written as an extra unnamed column, so a
    # later cache read returns one more column than this fresh result.
    # Left unchanged in case downstream code relies on the existing layout.
    restaurants.to_csv(outputPath)
    return restaurants