In [1]:
# Load the three Yelp academic dataset files as RDDs of raw text lines.
# Each line is later parsed with json.loads by the mapper_*_filter functions,
# so every line is expected to hold one complete JSON object.
# NOTE(review): `sc` is not defined in this file -- presumably the
# SparkContext injected by the PySpark shell/notebook; confirm.
businesses = sc.textFile('../YelpDataset/yelp_academic_dataset_business.json')
reviews = sc.textFile('../YelpDataset/yelp_academic_dataset_review.json')
users = sc.textFile('../YelpDataset/yelp_academic_dataset_user.json')

In [2]:
import json, operator

# Business categories kept by mapper_buisness_filter; a business is emitted
# once for every entry of this list that appears among its categories.
selectedCategories = [
    'Restaurants',
    'Shopping',
    'Beauty & Spas',
    'Bars',
    'Automotive',
    'Event Planning & Services',
    'Fast Food',
    'Coffee & Tea',
    'Hotels',
    'Real Estate',
    'Dentists',
    'Gyms',
]

def mapper_reviews_filter(rows):
    """Turn raw review JSON lines into ``(user_id, (business_id, stars))`` pairs.

    Args:
        rows: iterable of strings, each one a JSON-encoded review object
            carrying the keys ``user_id``, ``business_id`` and ``stars``.

    Yields:
        One ``(user_id, (business_id, stars))`` tuple per input line, keyed by
        user so the result can be joined with the per-user weights.
    """
    for line in rows:
        review = json.loads(line)
        rating = (review['business_id'], review['stars'])
        yield (review['user_id'], rating)

def mapper_buisness_filter(rows):
    """Yield ``(business_id, category)`` for every selected category of a business.

    Args:
        rows: iterable of strings, each one a JSON-encoded business object
            carrying the keys ``business_id`` and ``categories``.

    Yields:
        One ``(business_id, category)`` tuple for each entry of the business's
        ``categories`` that is also listed in ``selectedCategories``. A
        business matching several selected categories is emitted once per
        match; a business matching none is dropped.

    Raises:
        KeyError: if a record has no ``categories`` or ``business_id`` key.
    """
    for line in rows:
        business = json.loads(line)
        # A null (or empty) ``categories`` field cannot match anything; treat
        # it as "no categories" instead of failing with TypeError on None.
        for category in business['categories'] or ():
            if category in selectedCategories:
                yield (business['business_id'], category)

def mapper_users_filter(rows):
    """Turn raw user JSON lines into ``(user_id, weight)`` pairs.

    The weight is derived from the length of the user's ``elite`` field:
    ``1`` when it is empty, otherwise ``4 + len(elite)``.

    Args:
        rows: iterable of strings, each one a JSON-encoded user object
            carrying the keys ``user_id`` and ``elite``.

    Yields:
        One ``(user_id, weight)`` tuple per input line.
    """
    for line in rows:
        user = json.loads(line)
        elite_entries = len(user['elite'])
        weight = 1 if elite_entries == 0 else 4 + elite_entries
        yield (user['user_id'], weight)

def mapper_buisness_rating(rows):
    """Collapse each business's weighted ratings into an average and a total.

    Args:
        rows: iterable of ``(business_id, ratings)`` pairs, where ``ratings``
            is an iterable of ``(stars, weight)`` pairs.

    Yields:
        ``(business_id, (weighted_average, total_weight))`` where
        ``weighted_average`` is ``sum(stars * weight) / sum(weight)`` as a
        float and ``total_weight`` is ``sum(weight)``.
    """
    for business_id, ratings in rows:
        weighted_sum = 0
        total_weight = 0
        # Single pass over the ratings: accumulate numerator and denominator.
        for stars, weight in ratings:
            weighted_sum += stars * weight
            total_weight += weight
        average = float(weighted_sum) / float(total_weight)
        yield (business_id, (average, total_weight))

def _min_max_scale(value, low, span, scale):
    """Map ``value`` from ``[low, low + span]`` onto ``[0, scale]`` (0.0 if ``span`` is 0)."""
    if span == 0:
        # Every value in the group is identical: there is nothing to rank,
        # and dividing by the zero range would raise ZeroDivisionError.
        return 0.0
    return (float(value) - low) / span * scale


def mapper_normalized_total(records, rating_scale=20, count_scale=15):
    """Min-max normalise rating value and rating count within each category.

    Args:
        records: iterable of ``(category, businesses)`` pairs, where
            ``businesses`` is an iterable of
            ``(business_id, (rating_value, rating_count))`` tuples.
        rating_scale: upper bound of the normalised rating value
            (the lowest value in the category maps to 0).
        count_scale: upper bound of the normalised rating count
            (the lowest count in the category maps to 0).

    Yields:
        ``(category, (business_id, (normalized_value, normalized_count)))``
        for every business, in input order. When all businesses of a category
        share the same rating value (or count) -- including the
        single-business case -- that component is ``0.0``. Categories with no
        businesses yield nothing.
    """
    for category in records:
        name = category[0]
        # Materialise once: the group is scanned several times below, which
        # would silently exhaust a one-shot iterator.
        businesses = list(category[1])
        if not businesses:
            continue

        rating_values = [business[1][0] for business in businesses]
        rating_counts = [business[1][1] for business in businesses]
        min_value = min(rating_values)
        min_count = min(rating_counts)
        value_span = max(rating_values) - min_value
        count_span = max(rating_counts) - min_count

        for business in businesses:
            normalized_value = _min_max_scale(
                business[1][0], min_value, value_span, rating_scale)
            normalized_count = _min_max_scale(
                business[1][1], min_count, count_span, count_scale)
            yield (name, (business[0], (normalized_value, normalized_count)))
        
# Parse the raw JSON-lines RDDs into keyed pairs.
#   reviews_filtered : (user_id, (business_id, stars))
#   buisness_filtered: (business_id, category)   -- selected categories only
#   users_filtered   : (user_id, weight)
reviews_filtered = reviews.mapPartitions(mapper_reviews_filter)
buisness_filtered = businesses.mapPartitions(mapper_buisness_filter)
users_filtered = users.mapPartitions(mapper_users_filter)

# Attach each reviewer's weight to their reviews, then regroup by business:
#   join -> (user_id, ((business_id, stars), weight))
#   map  -> (business_id, (stars, weight))
joined = (reviews_filtered
          .join(users_filtered)
          .map(lambda x: (x[1][0][0], (x[1][0][1], x[1][1])))
          .groupByKey())

# (business_id, (weighted_average_stars, total_weight))
total = joined.mapPartitions(mapper_buisness_rating)

# Re-key by category:
#   join -> (business_id, ((average, weight), category))
#   map  -> (category, (business_id, (average, weight)))
# mapValues(list) makes each group a list so it can be scanned repeatedly.
category_join = (total
                 .join(buisness_filtered)
                 .map(lambda x: (x[1][1], (x[0], x[1][0])))
                 .groupByKey()
                 .mapValues(list))

# (category, [(business_id, (normalized_value, normalized_count)), ...])
# NOTE(review): this RDD feeds both sorts below and is not persisted, so it
# may be recomputed per action -- consider cache()/persist() if that matters.
total_normalized = (category_join
                    .mapPartitions(mapper_normalized_total)
                    .groupByKey()
                    .mapValues(list))

# Per-category rankings, best first. mapValues replaces the former
# `lambda (k, v): ...` form: tuple parameters are Python-2-only syntax, and
# mapValues leaves the key untouched in exactly the same way.
# Ranked by normalized rating count.
total_sortedByRatingsCount = total_normalized.mapValues(
    lambda v: sorted(v, key=lambda x: x[1][1], reverse=True))
# Ranked by the (normalized_value, normalized_count) tuple, i.e. by value
# with ties broken by count.
total_sortedByRatingsValue = total_normalized.mapValues(
    lambda v: sorted(v, key=lambda x: x[1], reverse=True))
