In [1]:
import pandas
import os.path
from math import log
from numpy import mean, timedelta64

import jupynbimp
import review_data_getData 
from review_data_classify import ReviewSentiment

importing Jupyter notebook from review_data_getData.ipynb
importing Jupyter notebook from restaurants_data_cleaning.ipynb
importing Jupyter notebook from review_data_classify.ipynb


In [2]:
# Column names of the review-level data used by the functions in this notebook.
__COL_NAME_GROUP_ID = 'business_id'  # key the reviews are grouped by when aggregating
__COL_NAME_RATING_DATE = 'date'  # review date; max - min per group gives the review span
__COL_NAME_DOCUMENT_TEXT = 'text'  # review text, handed to ReviewSentiment as `text`
__COL_NAME_RATING_VALUE = 'stars'  # star rating; ReviewSentiment `label`, averaged per group
# Inclusive [lowest, highest] valid rating; its midpoint is treated as the 'neutral' rating.
__RATING_VALUE_RANGE = [1, 5]

In [50]:
def __calculateWeightedRating(ratingValue, ratingCount):
    """Weight a rating by the natural logarithm of its review count.

    The rating is first centred on the midpoint of ``__RATING_VALUE_RANGE``
    so that a 'neutral' rating becomes 0, and is then multiplied by
    ``log(ratingCount)``.  A count of 1 therefore always yields 0, because
    ``log(1) == 0``.

    Parameters
    ----------
    ratingValue : number
        Rating to weight; must lie within ``__RATING_VALUE_RANGE`` (inclusive).
    ratingCount : number or numeric string
        Number of ratings behind ``ratingValue``; coerced with ``int()``.

    Returns
    -------
    float
        The weighted rating, or NaN when ``ratingCount`` cannot be converted
        to a positive integer or ``ratingValue`` is outside the valid range.
    """
    # `nan` is not imported anywhere in this notebook, so build it locally
    # instead of raising NameError on the invalid-input paths.
    notANumber = float('nan')

    try:
        ratingCount = int(ratingCount)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric text, NaN and infinity all end up here
        return notANumber

    # log() is undefined for counts below 1 (math.log raises ValueError)
    if ratingCount < 1:
        return notANumber

    lowest, highest = min(__RATING_VALUE_RANGE), max(__RATING_VALUE_RANGE)
    if not (lowest <= ratingValue <= highest):
        # also catches a NaN rating, for which every comparison is False
        return notANumber

    # 'neutral' rating normalized to 0
    normalizedRating = ratingValue - mean(__RATING_VALUE_RANGE)

    return normalizedRating * log(ratingCount)

In [32]:
def __calculateSentiment(data):
    """Predict a 'sentiment' (star rating inferred from the review text).

    Builds a ``ReviewSentiment`` model on ``data``, using the star-rating
    column as the label and the review-text column as the document, and
    returns the result of ``predictSentiment`` called with the class-level
    ``ReviewSentiment._optimalParameters``.

    See the review_data_classify module for model fitting and tuning.

    Parameters
    ----------
    data
        Review-level data containing the rating and text columns.

    Returns
    -------
    Whatever ``ReviewSentiment.predictSentiment`` returns -- presumably the
    reviews with an added 'sentiment' column, since the aggregation step
    reads one; confirm against review_data_classify.
    """
    sentimentModel = ReviewSentiment(
        data=data,
        label=__COL_NAME_RATING_VALUE,
        text=__COL_NAME_DOCUMENT_TEXT,
    )

    # parameters found during tuning, stored on the class itself
    tunedParameters = ReviewSentiment._optimalParameters

    # fit the tuned model to the entire data and predict the star rating
    return sentimentModel.predictSentiment(tunedParameters)

In [37]:
def __aggregateReviews(data):
    """Collapse review-level rows into one row per group (business).

    Parameters
    ----------
    data : pandas.DataFrame
        Review-level data holding the group-id, rating and date columns plus
        a 'sentiment' column.  The date column is assumed to be datetime-like
        (the span is computed as ``max - min``) -- confirm against the loader.

    Returns
    -------
    pandas.DataFrame
        Indexed by the group id, with columns:

        * ``review_count`` -- number of reviews in the group
        * the rating column -- mean rating of the group
        * ``review_span`` -- whole days between first and last review (int)
        * ``sentiment`` -- mean sentiment of the group
        * ``rating`` -- mean rating weighted by ``__calculateWeightedRating``
    """
    reviewsByGroup = data.groupby(__COL_NAME_GROUP_ID)
    reviewDates = reviewsByGroup[__COL_NAME_RATING_DATE]

    # The grouping key itself cannot be aggregated in current pandas (it is
    # excluded from the grouped columns and a KeyError is raised), so the
    # number of reviews per group is taken with size() instead.
    #
    # review_span is stored as a number of days (int); `.dt.days` replaces
    # `.astype('timedelta64[D]')`, a resolution pandas 2.x no longer accepts.
    aggregated = pandas.DataFrame({
        'review_count': reviewsByGroup.size(),
        __COL_NAME_RATING_VALUE: reviewsByGroup[__COL_NAME_RATING_VALUE].mean(),
        'review_span': (reviewDates.max() - reviewDates.min()).dt.days.astype(int),
        'sentiment': reviewsByGroup['sentiment'].mean(),
    })

    # normalize rating by review_count
    # (a plain loop over the two columns avoids row-wise DataFrame.apply,
    # which returns a DataFrame instead of a Series when there are no rows)
    aggregated['rating'] = [
        __calculateWeightedRating(meanRating, reviewCount)
        for meanRating, reviewCount in zip(aggregated[__COL_NAME_RATING_VALUE],
                                           aggregated['review_count'])
    ]

    return aggregated

In [35]:
def getData(fromCache=True,
            dataDirectory='../../data/yelp/',
            outputName='reviews_aggregated.csv'):
    """Return the per-business aggregated review data, using a CSV cache.

    Parameters
    ----------
    fromCache : bool, default True
        When truthy and the cache file exists, the cached CSV is returned
        and nothing is recomputed.
    dataDirectory : str, default '../../data/yelp/'
        Directory holding the cache file.
    outputName : str, default 'reviews_aggregated.csv'
        File name of the cache inside ``dataDirectory``.

    Returns
    -------
    pandas.DataFrame
        Aggregated reviews, either read from the cache or freshly computed
        (in which case the cache file is (re)written as a side effect).
    """
    cachePath = os.path.join(dataDirectory, outputName)

    # `and` rather than bitwise `&`: works for any truthy/falsy flag and only
    # touches the file system when the cache is actually wanted
    if fromCache and os.path.isfile(cachePath):
        return pandas.read_csv(cachePath, header=0)

    data = __aggregateReviews(

        # sentiment must be calculated before aggregation
        # (labels for classifier training come from individual reviews)
        data=__calculateSentiment(

            # raw data from study area
            data=review_data_getData.getReviews()
        )
    )

    # move the index back into an ordinary column so it is written to the CSV
    data = data.reset_index()

    # cache results and return
    data.to_csv(cachePath, index=False)

    return data