In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf

In [2]:
import json
import math
import itertools
import random
import sys

In [3]:
# Spark configuration for this notebook: master 'local[8]', app name
# 'Assignment_2'. The memory overrides below are left disabled; chain them
# onto the builder if they are needed:
#   .set("spark.driver.memory", "4g")
#   .set("spark.executor.memory", "4g")
conf = SparkConf().setMaster('local[8]').setAppName('Assignment_2')
# getOrCreate hands back an existing context when one is already running.
sc = SparkContext.getOrCreate(conf)

In [4]:
import re
import string
# Regex character class matching any single whitespace character; used as the
# split pattern when tokenising review text.
escape_chars = '[{}]'.format(string.whitespace)

In [35]:
def create_word_count_dict(word_list):
    """Tally how many times each word occurs in ``word_list``.

    Args:
        word_list: Iterable of hashable tokens.

    Returns:
        A plain dict mapping each distinct token to its occurrence count,
        with keys in order of first appearance.
    """
    tallies = {}
    for token in word_list:
        # Missing tokens start from zero, then every sighting adds one.
        tallies[token] = tallies.get(token, 0) + 1
    return tallies

In [40]:
def get_tf_scores(word_dict):
    """Normalise raw word counts into term-frequency scores.

    Every count is divided by the largest count in the dictionary, so the
    most frequent word scores 1.0.

    Args:
        word_dict: Mapping of word -> occurrence count.

    Returns:
        A new dict mapping each word to ``count / max_count``, with keys in
        the same order as ``word_dict``. An empty input yields an empty dict.
    """
    # max() raises ValueError on an empty sequence, so handle that case first.
    if not word_dict:
        return {}
    max_freq = max(word_dict.values())
    return {word: count / max_freq for word, count in word_dict.items()}

In [5]:
# Read the training reviews as text lines and parse each line as a JSON record.
reviews_json = (
    sc.textFile('asnlib/publicdata/train_review.json')
    .map(json.loads)
)

In [6]:
# Display the first parsed review record.
reviews_json.first()

{'review_id': 'pxOrtki0sqXps5hSyLXKpA',
 'user_id': 'OLR4DvqFxCKLOEHqfAxpqQ',
 'business_id': 'zK7sltLeRRioqYwgLiWUIA',
 'stars': 5.0,
 'text': "Second time I've been here. First time was whatever. This time it was actually good. Way better than inn n out. It's the same type of burger that's why I put it up against that. I love that you can get grilled jalapeños. Just wish they came on the burger and not on the side.",
 'date': '2015-12-19 07:35:30'}

In [7]:
# Map every distinct user_id to an integer index; indices follow the sorted
# order of the ids. The result is collected to the driver as a dict.
users_token_dict = (
    reviews_json
    .map(lambda review: review.get('user_id'))
    .distinct()
    .sortBy(lambda user_id: user_id)
    .zipWithIndex()
    .collectAsMap()
)

In [8]:
# Map every distinct business_id to an integer index; indices follow the
# sorted order of the ids. The result is collected to the driver as a dict.
business_token_dict = (
    reviews_json
    .map(lambda review: review.get('business_id'))
    .distinct()
    .sortBy(lambda business_id: business_id)
    .zipWithIndex()
    .collectAsMap()
)

In [None]:
# Preview a handful of user_id -> index entries instead of echoing the whole
# mapping, which holds one entry per distinct user.
dict(itertools.islice(users_token_dict.items(), 10))

In [9]:
# Load the stop-word list into a set; each line of the file is treated as one
# entry with surrounding whitespace removed. The context manager closes the
# file handle as soon as the set has been built.
with open("asnlib/publicdata/stopwords") as stopwords_file:
    stop_words = set(word.strip() for word in stopwords_file)

In [33]:
# Build (business_id, [token, ...]) pairs from the raw review text:
#   1. keep only the business id and the review text,
#   2. split the text on single whitespace characters (runs of whitespace
#      therefore yield empty strings, which step 4 discards),
#   3. lower-case each token and strip leading/trailing punctuation,
#   4. drop empty tokens, single lower-case letters and stop words
#      (numeric tokens such as '2' or '30' are kept),
#   5. gather all remaining tokens of a business into one list.
business_reviews = (
    reviews_json
    .map(lambda x: (x.get('business_id'), x.get('text')))
    .flatMap(lambda x: [(x[0], word) for word in re.split(escape_chars, x[1])])
    .map(lambda x: (x[0], x[1].lower().strip(string.punctuation)))
    .filter(
        lambda x: x[1]
        # Explicit single-letter test: a bare `in string.ascii_lowercase`
        # is a substring check and would also discard words like 'hi'.
        and not (len(x[1]) == 1 and x[1] in string.ascii_lowercase)
        and x[1] not in stop_words
    )
    .groupByKey()
    .mapValues(list)
)

In [34]:
# Display the first 10 (business_id, token list) pairs.
business_reviews.take(10)

[('bZMcorDrciRbjdjRyANcjA',
  ['listened',
   'reviews',
   'place',
   'highly',
   'unprofessional',
   '1st',
   'visit',
   'great',
   'signed',
   'package',
   'needed',
   '2',
   'months',
   'directed',
   'cancel',
   'email',
   'corporate',
   '30',
   'days',
   'advance',
   'paid',
   'partial',
   'month',
   'directed',
   'month',
   'would',
   'automatically',
   'pulled',
   'point',
   'sent',
   '4',
   'emails',
   'response',
   'requesting',
   'cancel',
   '2nd',
   'visit',
   'recommended',
   'lotion',
   '30',
   'holiday',
   'weekend',
   'special',
   'advised',
   'due',
   'something',
   'could',
   'use',
   'lotion',
   'today',
   'keep',
   'store',
   'another',
   '2',
   'days',
   'thought',
   'weird',
   'hey',
   'whatever',
   'went',
   'also',
   '2nd',
   'visit',
   '2',
   'bulbs',
   'bed',
   'lastly',
   'would',
   'would',
   'started',
   'package',
   '2',
   'days',
   'later',
   'would',
   'got',
   'following',
   'mont

In [38]:
# Replace each business's token list with a {word: count} dictionary.
business_review_word_counts = business_reviews.map(
    lambda pair: (pair[0], create_word_count_dict(pair[1]))
)

In [39]:
# Display the first 10 (business_id, {word: count}) pairs.
business_review_word_counts.take(10)

[('bZMcorDrciRbjdjRyANcjA',
  {'listened': 1,
   'reviews': 1,
   'place': 4,
   'highly': 2,
   'unprofessional': 1,
   '1st': 1,
   'visit': 4,
   'great': 8,
   'signed': 3,
   'package': 5,
   'needed': 1,
   '2': 6,
   'months': 5,
   'directed': 2,
   'cancel': 2,
   'email': 2,
   'corporate': 13,
   '30': 4,
   'days': 5,
   'advance': 1,
   'paid': 8,
   'partial': 1,
   'month': 14,
   'would': 10,
   'automatically': 1,
   'pulled': 1,
   'point': 2,
   'sent': 4,
   '4': 2,
   'emails': 2,
   'response': 2,
   'requesting': 1,
   '2nd': 2,
   'recommended': 1,
   'lotion': 5,
   'holiday': 1,
   'weekend': 2,
   'special': 1,
   'advised': 3,
   'due': 1,
   'something': 1,
   'could': 2,
   'use': 2,
   'today': 1,
   'keep': 1,
   'store': 2,
   'another': 2,
   'thought': 2,
   'weird': 1,
   'hey': 1,
   'whatever': 1,
   'went': 7,
   'also': 7,
   'bulbs': 2,
   'bed': 5,
   'lastly': 1,
   'started': 3,
   'later': 1,
   'got': 4,
   'following': 2,
   '1.00': 1,
   

In [41]:
# Convert each business's word counts into term-frequency scores.
business_tf_scores = business_review_word_counts.map(
    lambda pair: (pair[0], get_tf_scores(pair[1]))
)

In [42]:
# Display the first (business_id, {word: tf score}) pair.
business_tf_scores.first()

('bZMcorDrciRbjdjRyANcjA',
 {'listened': 0.0625,
  'reviews': 0.0625,
  'place': 0.25,
  'highly': 0.125,
  'unprofessional': 0.0625,
  '1st': 0.0625,
  'visit': 0.25,
  'great': 0.5,
  'signed': 0.1875,
  'package': 0.3125,
  'needed': 0.0625,
  '2': 0.375,
  'months': 0.3125,
  'directed': 0.125,
  'cancel': 0.125,
  'email': 0.125,
  'corporate': 0.8125,
  '30': 0.25,
  'days': 0.3125,
  'advance': 0.0625,
  'paid': 0.5,
  'partial': 0.0625,
  'month': 0.875,
  'would': 0.625,
  'automatically': 0.0625,
  'pulled': 0.0625,
  'point': 0.125,
  'sent': 0.25,
  '4': 0.125,
  'emails': 0.125,
  'response': 0.125,
  'requesting': 0.0625,
  '2nd': 0.125,
  'recommended': 0.0625,
  'lotion': 0.3125,
  'holiday': 0.0625,
  'weekend': 0.125,
  'special': 0.0625,
  'advised': 0.1875,
  'due': 0.0625,
  'something': 0.0625,
  'could': 0.125,
  'use': 0.125,
  'today': 0.0625,
  'keep': 0.0625,
  'store': 0.125,
  'another': 0.125,
  'thought': 0.125,
  'weird': 0.0625,
  'hey': 0.0625,
  'what