In [42]:
import pandas
import os.path
from math import log
from numpy import mean, timedelta64

import jupynbimp
import review_data_getData 
import topic_dictionary
from review_data_classify import ReviewSentiment
from review_data_topics_extractor import TopicExtractor

In [30]:
# --- column names used throughout this notebook ---
__COL_NAME_GROUP_ID = 'business_id'    # key the reviews are grouped (aggregated) by
__COL_NAME_DOCUMENT_TEXT = 'text'      # text column handed to the classifier / topic extractors
__COL_NAME_RATING_VALUE = 'stars'      # rating column, also used as the classifier label
__COL_NAME_RATING_DATE = 'date'        # date column; max - min per group becomes 'review_span'

# --- rating scale: only min() and max() of this list are used as bounds,
#     and its mean is treated as the 'neutral' rating ---
__RATING_VALUE_RANGE = [1, 5]

# --- topic dictionary shared by the positive and negative topic extractors ---
__TOPIC_DICTIONARY = topic_dictionary.topicDictionary

In [50]:
def __calculateWeightedRating(ratingValue, ratingCount):
    """Weight a rating by the natural log of the number of ratings behind it.

    The rating is first centred on the midpoint of ``__RATING_VALUE_RANGE``
    (so a 'neutral' rating becomes 0) and then multiplied by
    ``log(ratingCount)``. Note that a single rating therefore always yields
    a weighted rating of 0, because ``log(1) == 0``.

    Parameters
    ----------
    ratingValue : number
        Rating on the scale bounded by ``__RATING_VALUE_RANGE``.
    ratingCount : number or str
        Number of ratings; converted with ``int()``.

    Returns
    -------
    float
        ``(ratingValue - neutral) * log(ratingCount)``, or NaN when
        ``ratingCount`` cannot be converted to a positive integer or
        ``ratingValue`` lies outside the rating scale.
    """
    # `nan` is not among the names imported from numpy at the top of the
    # file, so build the sentinel locally instead of relying on it.
    notANumber = float('nan')

    try:
        ratingCount = int(ratingCount)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric text, NaN and +/-inf all end up here
        return notANumber

    # math.log raises ValueError for counts <= 0; such a count carries no weight
    if ratingCount < 1:
        return notANumber

    lowestRating = min(__RATING_VALUE_RANGE)
    highestRating = max(__RATING_VALUE_RANGE)

    # a NaN ratingValue fails this comparison as well and is reported as NaN
    if not (lowestRating <= ratingValue <= highestRating):
        return notANumber

    # 'neutral' rating normalized to 0
    normalizedRating = ratingValue - mean(__RATING_VALUE_RANGE)

    return normalizedRating * log(ratingCount)

In [32]:
def __calculateSentiment(data):
    """Predict the 'sentiment' (star rating inferred from review text) of the reviews.

    A ReviewSentiment model is built over the whole of `data`, with the
    star-rating column as label and the review-text column as input, and its
    predictSentiment() is called with the class-level tuned parameters
    (ReviewSentiment._optimalParameters). Model fitting and tuning live in the
    review_data_classify module.

    Returns whatever predictSentiment() returns -- presumably `data` extended
    with a 'sentiment' column, since the aggregation step averages one
    (TODO confirm against review_data_classify).
    """
    modelArguments = {
        'data': data,
        'label': __COL_NAME_RATING_VALUE,
        'text': __COL_NAME_DOCUMENT_TEXT,
    }
    sentimentModel = ReviewSentiment(**modelArguments)

    tunedParameters = ReviewSentiment._optimalParameters
    return sentimentModel.predictSentiment(tunedParameters)

In [33]:
def __makeTopicScorer(data):
    """Train topic extractors on `data` and return a topic-scoring function.

    Two TopicExtractor instances are trained: one on the text of reviews
    rated above 3 stars and one on the text of reviews rated below 3 stars
    (3-star reviews are used by neither).

    Parameters
    ----------
    data : pandas.DataFrame
        Individual (un-aggregated) reviews containing the rating and text
        columns.

    Returns
    -------
    callable
        ``scoreTopic(documents, topic)``; see its docstring.
    """
    ratings = data[__COL_NAME_RATING_VALUE]
    positiveReviewText = data[ratings > 3][__COL_NAME_DOCUMENT_TEXT]
    negativeReviewText = data[ratings < 3][__COL_NAME_DOCUMENT_TEXT]

    positiveTopicExtractor = TopicExtractor(trainingDocuments=positiveReviewText,
                                            topicDictionary=__TOPIC_DICTIONARY
                                           )
    negativeTopicExtractor = TopicExtractor(trainingDocuments=negativeReviewText,
                                            topicDictionary=__TOPIC_DICTIONARY
                                           )

    # offset added before taking the log -- presumably there to keep the
    # argument positive when the negative score outweighs the positive one
    scoreOffset = 1e4

    def scoreTopic(documents, topic):
        """Return one score for `topic` over `documents`.

        The score is ``log(positive - negative + 1e4) ** 2``, where each part
        is the matching extractor's getTfIdfScore() result, or 0 when the
        topic is not among that extractor's ngramsByTopic index levels.
        Assumes getTfIdfScore() returns a single number -- TODO confirm.
        """
        positiveScore = negativeScore = 0

        if topic in list(positiveTopicExtractor.ngramsByTopic.index.levels[0]):
            positiveScore = positiveTopicExtractor.getTfIdfScore(documents=documents, topic=topic)

        if topic in list(negativeTopicExtractor.ngramsByTopic.index.levels[0]):
            negativeScore = negativeTopicExtractor.getTfIdfScore(documents=documents, topic=topic)

        # The score must not touch data['review_span']: `data` here is the
        # un-aggregated review frame captured by the closure, and the
        # 'review_span' column only exists after the per-group aggregation has
        # run. Using it raised KeyError (or, at best, produced a whole-dataset
        # Series where one number per group and topic is required).
        # NOTE(review): if weighting by review span is wanted, apply it to the
        # aggregated frame, where 'review_span' is a per-group value.
        return log(positiveScore - negativeScore + scoreOffset) ** 2

    return scoreTopic

In [37]:
def __aggregateReviews(data):
    """Aggregate individual reviews into one row per group (business).

    Parameters
    ----------
    data : pandas.DataFrame
        Individual reviews with the group-id, rating, date, text and
        'sentiment' columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by the group id, with columns:
        'review_count' (number of reviews), the mean rating (under the
        original rating column name), 'review_span' (whole days between the
        first and last review), mean 'sentiment', one column per topic of
        the topic dictionary, and 'rating' (the count-weighted rating).
    """
    # topicExtractors must be trained before aggregation
    # (separate extractors are trained for positive and negative reviews)
    topicScorer = __makeTopicScorer(data)

    # freeze the topic order once: the positional lookup further down relies
    # on it matching the order used while scoring
    topics = list(__TOPIC_DICTIONARY.keys())

    def dateSpan(dates):
        # time between the earliest and the latest review of the group
        return dates.max() - dates.min()

    def scoreAllTopics(reviews):
        # one (topic, score) pair per topic, in `topics` order
        return [(topic, topicScorer(reviews, topic)) for topic in topics]

    aggregated = (data
                  .groupby(__COL_NAME_GROUP_ID)
                  .aggregate({__COL_NAME_GROUP_ID: 'count',
                              __COL_NAME_RATING_VALUE: 'mean',
                              __COL_NAME_RATING_DATE: dateSpan,
                              __COL_NAME_DOCUMENT_TEXT: scoreAllTopics,
                              'sentiment': 'mean',
                             })
                  .rename(columns={__COL_NAME_GROUP_ID: 'review_count',
                                   __COL_NAME_RATING_DATE: 'review_span',
                                   __COL_NAME_DOCUMENT_TEXT: 'topic_scores'
                                  }
                         )
                 )

    ## clean up data types and columns

    # split topic scores into separate columns
    for position, topic in enumerate(topics):
        aggregated[topic] = aggregated['topic_scores'].apply(
            lambda scores, position=position: scores[position][1]
        )

    aggregated = aggregated.drop('topic_scores', axis=1)

    # store review_span as number of days (int)
    # (.dt.days replaces .astype('timedelta64[D]'), which pandas >= 2.0
    #  rejects because only s/ms/us/ns resolutions are supported)
    aggregated['review_span'] = (pandas.to_timedelta(aggregated['review_span'])
                                 .dt.days
                                 .astype(int)
                                )

    # normalize rating by review_count
    aggregated['rating'] = aggregated.apply(
        lambda row: __calculateWeightedRating(row[__COL_NAME_RATING_VALUE],
                                              row['review_count']
                                             ),
        axis=1
    )

    return aggregated

In [35]:
def getData(fromCache=True):
    """Return the per-business aggregated review data, using a CSV cache.

    Parameters
    ----------
    fromCache : bool, default True
        When truthy and the cache file exists, the cached CSV is returned.
        Otherwise the data are recomputed from the raw reviews and the cache
        file is (re)written.

    Returns
    -------
    pandas.DataFrame
        The aggregated reviews.

    Notes
    -----
    NOTE(review): the two paths do not return identically shaped frames. The
    freshly computed frame is returned as produced by the aggregation step,
    while the cached frame comes from ``read_csv`` without ``index_col``, so
    the index written by ``to_csv`` comes back as an ordinary first column.
    Left as is because callers may depend on either shape.
    """
    dataDirectory = '../data/yelp/'
    outputName = 'reviews_aggregated.csv'
    cachePath = os.path.join(dataDirectory, outputName)

    # logical `and`, not bitwise `&`: `&` gives 2 & True == 0 for a truthy
    # int, raises TypeError for a truthy str, and never short-circuits
    if fromCache and os.path.isfile(cachePath):
        return pandas.read_csv(cachePath, header=0)

    # raw data from study area
    reviews = review_data_getData.getReviews()

    # sentiment must be calculated before aggregation
    # (labels for classifier training come from individual reviews)
    reviewsWithSentiment = __calculateSentiment(data=reviews)

    data = __aggregateReviews(data=reviewsWithSentiment)

    # cache results and return; make sure the directory exists so the
    # expensive computation is not lost to a missing folder
    if not os.path.isdir(dataDirectory):
        os.makedirs(dataDirectory)
    data.to_csv(cachePath)

    return data