In [None]:
import findspark
import json
import itertools
import sys
import time
import random

findspark.init()
from pyspark import SparkConf, SparkContext

In [None]:
import math

In [None]:
def build_min_hash_func(a, b, p, m):
    def min_hash_func(x):
        return (((a * x + b) % p) % m)

    return min_hash_func

In [None]:
def get_min_hash_functions(num_func, buckets):
    list_a = random.sample(range(50331653, 92233720), num_func)
    list_b = random.sample(range(25165843, 92233720), num_func)
    p = 12582917
    min_hash_func_list = [build_min_hash_func(a, b, p, buckets) for a, b in zip(list_a, list_b)]

    return min_hash_func_list

In [None]:
def check_jaccard_similarity(candidate, business_user_tokens):
    business_set_1 = set(business_user_tokens.get(candidate[0], []))
    business_set_2 = set(business_user_tokens.get(candidate[1], []))
    pair_jac_sim = 0
    if business_set_1 and business_set_2:
        pair_jac_sim = len(business_set_1.intersection(business_set_2)) / len(business_set_1.union(business_set_2))
    return tuple([candidate, pair_jac_sim])

In [None]:
def write_results(results, file_path):
    with open(file_path, 'w') as file:
        for line in results:
            file.write(json.dumps(line) + '\n')
    file.close()

In [None]:
def computeSimilarity(dict1, dict2):
    """
    compute Pearson Correlation Similarity
    :param dict1:
    :param dict2:
    :return: a float number
    """
    co_rated_user = list(set(dict1.keys()) & (set(dict2.keys())))
    val1_list, val2_list = list(), list()
    [(val1_list.append(dict1[user_id]),
      val2_list.append(dict2[user_id])) for user_id in co_rated_user]

    avg1 = sum(val1_list) / len(val1_list)
    avg2 = sum(val2_list) / len(val2_list)

    numerator = sum(map(lambda pair: (pair[0] - avg1) * (pair[1] - avg2), zip(val1_list, val2_list)))

    if numerator == 0:
        return 0
    denominator = math.sqrt(sum(map(lambda val: (val - avg1) ** 2, val1_list))) * \
                  math.sqrt(sum(map(lambda val: (val - avg2) ** 2, val2_list)))
    if denominator == 0:
        return 0

    return numerator / denominator


# Main

In [None]:
conf = SparkConf()
conf.set("spark.driver.memory", "4g")
conf.set("spark.executor.memory", "4g")
conf.setMaster('local[8]')
conf.setAppName('Assignment_3')
sc = SparkContext.getOrCreate(conf)

In [None]:
reviews_json = sc.textFile("asnlib/publicdata/train_review.json").map(json.loads)

In [None]:
reviews_json.first()

In [None]:
user_business_rating_sets = reviews_json.map(lambda x: (x.get('user_id'), x.get('business_id'), x.get('stars'))).distinct()

In [None]:
# create user tokens
user_tokens_dict = user_business_rating_sets\
    .map(lambda x: x[0])\
    .distinct()\
    .sortBy(lambda x: x)\
    .zipWithIndex()\
    .collectAsMap()

inverse_user_tokens_dict = {bid: token for token, bid in user_tokens_dict.items()}

In [None]:
# create business tokens
business_tokens = user_business_rating_sets\
    .map(lambda x: x[1])\
    .distinct()\
    .sortBy(lambda x: x)\
    .zipWithIndex()\
    
business_tokens_dict = business_tokens.collectAsMap()

inverse_business_tokens_dict = {bid: token for token, bid in business_tokens_dict.items()}

In [None]:
min_hash_func_list = get_min_hash_functions(30, len(user_tokens_dict) * 2)

In [None]:
# get user business tokenized maps
user_business_rating_tokenized_sets = user_business_rating_sets\
    .map(lambda x: (user_tokens_dict.get(x[0]), business_tokens_dict.get(x[1]), x[2]))

In [None]:
business_user_tokenized_pairs = user_business_rating_tokenized_sets.map(lambda x: (x[1], x[0]))

In [None]:
# create business user list
business_user_tokenized_map = business_user_tokenized_pairs.groupByKey().mapValues(lambda x: list(set(x))).filter(lambda x: len(x[1])>=3)

In [None]:
business_user_tokenized_map.count()

In [None]:
user_business_tokenized_dict = business_user_tokenized_map.flatMap(lambda x: [(user, x[0]) for user in x[1]]).groupByKey().mapValues(lambda x: list(set(x))).collectAsMap()

In [None]:
business_hashed_values = business_tokens.map(lambda x: (x[1], [min_hash(x[1]) for min_hash in min_hash_func_list]))

In [None]:
signature_matrix_rdd = business_user_tokenized_map\
    .leftOuterJoin(business_hashed_values)\
    .map(lambda x: x[1])\
    .flatMap(lambda user_set: [(x, user_set[1]) for x in user_set[0]])\
    .reduceByKey(lambda a, b: [min(x, y) for x, y in zip(a, b)])

In [None]:
signature_matrix_rdd.first()

In [None]:
candidate_pairs = signature_matrix_rdd \
    .flatMap(lambda x: [(tuple([i, tuple(x[1][i:i + 1])]), x[0]) for i in range(0, 30)]) \
    .groupByKey()\
    .map(lambda x: list(x[1]))\
    .filter(lambda val: len(val) > 1) \
    .flatMap(lambda uid_list: [pair for pair in itertools.combinations(uid_list, 2)])

In [None]:
candidate_pairs.count()

In [None]:
jaccard_similar_users = candidate_pairs\
        .distinct()\
        .map(lambda x: check_jaccard_similarity(x, user_business_tokenized_dict))\
        .filter(lambda x: x[1] >= 0.01)\
#         .map(lambda x: {"b1": inverse_business_tokens_dict[x[0][0]], "b2": inverse_business_tokens_dict[x[0][1]], "sim": x[1]})\
#         .collect()

In [None]:
jaccard_similar_users.first()

In [None]:
jaccard_similar_users.count()

In [None]:
user_business_rating_map = user_business_rating_tokenized_sets.map(lambda x: (x[0], (x[1], x[2]))).groupByKey().mapValues(lambda x: {business_rating_pair[0]: business_rating_pair[1] for business_rating_pair in list(x)}).collectAsMap()

In [None]:
user_business_rating_map

In [None]:
pearson_similar_pairs = jaccard_similar_users.map(lambda id_pair: (id_pair[0], computeSimilarity(user_business_rating_map[id_pair[0][0]], user_business_rating_map[id_pair[0][1]]))).filter(lambda kv: kv[1] > 0)

In [None]:
pearson_similar_pairs.count()

In [None]:
pearson_similar_pairs.take(100)

In [None]:
pearson_similar_results_untokenized = pearson_similar_pairs.map(lambda kv: {"u1": inverse_user_tokens_dict[kv[0][0]], "u2": inverse_user_tokens_dict[kv[0][1]], "sim": kv[1]})

In [None]:
pearson_similar_results_untokenized.first()

# Predict

In [None]:
import findspark
import json
import itertools
import sys
import time
import random
import math

findspark.init()
from pyspark import SparkConf, SparkContext

In [None]:
def write_results(results, file_path):
    with open(file_path, 'w') as file:
        for line in results:
            file.write(json.dumps(line) + '\n')
    file.close()


In [None]:
start_time = time.time()
conf = SparkConf()
conf.set("spark.driver.memory", "4g")
conf.set("spark.executor.memory", "4g")
conf.setMaster('local[8]')
conf.setAppName('Assignment_3')
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")

cf_model = 'item_based'

# load data
train_reviews_json = sc.textFile('asnlib/publicdata/train_review.json').map(json.loads)
# test_reviews_json = sc.textFile('asnlib/publicdata/test_review_ratings.json').map(json.loads)

# create user business rating sets
train_user_business_rating_sets = train_reviews_json \
    .map(lambda x: (x.get('user_id'), x.get('business_id'), x.get('stars'))).distinct()

# create user tokens
user_tokens = train_user_business_rating_sets \
    .map(lambda x: x[0]) \
    .distinct() \
    .sortBy(lambda x: x) \
    .zipWithIndex()

user_tokens_dict = user_tokens.collectAsMap()

inverse_user_tokens_dict = {bid: token for token, bid in user_tokens_dict.items()}

# create business tokens
candidate_business_tokens = train_user_business_rating_sets \
    .map(lambda x: x[1]) \
    .distinct() \
    .sortBy(lambda x: x) \
    .zipWithIndex()

business_tokens_dict = candidate_business_tokens.collectAsMap()

inverse_business_tokens_dict = {bid: token for token, bid in business_tokens_dict.items()}

# tokenize trains sets
train_user_business_rating_sets_tokenized = train_user_business_rating_sets\
    .map(lambda x: (user_tokens_dict[x[0]], business_tokens_dict[x[1]], x[2]))

# load test set
# test_user_business_pairs_tokenized = test_reviews_json\
#     .map(lambda x: (user_tokens_dict.get(x['user_id', None], business_tokens_dict.get['business_id', None])))\
#     .filter(lambda x: x[0] is not None and x[1] is not None)

In [None]:
test_reviews_json = sc.textFile('asnlib/publicdata/test_review_ratings.json').map(json.loads)

In [None]:
test_reviews_json.first()['user_id']

In [None]:
test_user_business_pairs_tokenized = test_reviews_json\
    .map(lambda x: (user_tokens_dict.get(x['user_id'], None), business_tokens_dict.get(x['business_id'], None)))\
    .filter(lambda x: x[0] is not None and x[1] is not None)

In [None]:
test_user_business_pairs_tokenized.first()

In [None]:
# load model from memory
model = sc.textFile('task3item.model')\
    .map(json.loads)

In [None]:
# get model keys
keys = list(model.first().keys())

In [None]:
# collect final model as similarity pairs
model = model\
    .map(lambda x: ((x[keys[0]], x[keys[1]]), x[keys[2]]))\
    .collectAsMap()

In [None]:
business_user_rating_sets = train_user_business_rating_sets_tokenized\
        .map(lambda x: (x[1], (x[0], x[2])))\
        .groupByKey()\
        .map(lambda x: (x[0], [(user_rating[0], user_rating[1]) for user_rating in list(x[1])]))

user_average_ratings = sc.textFile('asnlib/publicdata/user_avg.json')\
        .map(json.loads)\
        .collect()[0]

test_business_user_pairs_tokenized = test_user_business_pairs_tokenized\
        .map(lambda x: (x[1], x[0]))


In [None]:
user_average_ratings['dcpE4CUvtqLf63tRJgNqKQ']

In [None]:
# users and list of rated businesses with respective ratings
user_business_rating_sets = train_user_business_rating_sets_tokenized\
    .map(lambda x: (x[0], (x[1], x[2])))\
    .groupByKey()\
    .map(lambda x: (x[0], [(business_rating[0], business_rating[1]) for business_rating in list(x[1])]))

# dictionary of average ratings for each business from the given file
business_average_ratings = sc.textFile('asnlib/publicdata/business_avg.json')\
    .map(json.loads)\
    .collect()[0]

In [None]:
user_business_rating_sets.first()

In [None]:
test_business_user_pairs_tokenized.first()

In [None]:
test_data_join_train_data = test_user_business_pairs_tokenized.leftOuterJoin(user_business_rating_sets)

In [None]:
test_data_join_train_data.first()

In [None]:
def item_based_model_predict(bid_user_ratings, n, item_model, business_average_ratings_dict, inverse_tokens_dict):
    candidate_id = bid_user_ratings[0]
    rating_similarity_pairs = sorted(
        [
            tuple([business_rating_pair[1], item_model.get(tuple(sorted([candidate_id, business_rating_pair[0]])), 0)])
            for business_rating_pair in list(bid_user_ratings[1])
        ],
        key=lambda x: x[1]
    )
    
    n_similar_businesses = rating_similarity_pairs[:n]
    
    try:
        return tuple(
            (
                candidate_id, 
                sum(map(lambda item: item[0] * item[1], n_similar_businesses)) / sum(map(lambda item: abs(item[1]), n_similar_businesses))
            )
        )
    
    except:
        return tuple((candidate_id, business_average_ratings_dict.get(inverse_tokens_dict.get(candidate_id), 3)))

    

In [None]:
results = test_data_join_train_data.mapValues(lambda x: item_based_model_predict(x, 3, model, business_average_ratings, inverse_business_tokens_dict))

In [None]:
results.collect()

In [None]:
def user_based_model_predict(uid_business_ratings, user_model, user_average_ratings_dict, inverse_tokens_dict):
    candidate_id = uid_business_ratings[0]
    rating_similarity_sets = sorted(
        [
            tuple([
                user_rating_pair[1],
                user_average_ratings_dict.get(inverse_tokens_dict.get(user_rating_pair[0], ''), 3),
                user_model.get(tuple(sorted([candidate_id, user_rating_pair[0]])), 0)
            ])
            for user_rating_pair in list(uid_business_ratings[1])
        ],
        key=lambda x: x[1]
    )

    try:
        return tuple(
            (
                candidate_id,
                user_average_ratings_dict.get(candidate_id, 3) +
                sum(map(lambda x: (x[0] - x[1]) * x[2], rating_similarity_sets)) /
                sum(map(lambda item: abs(item[2]), rating_similarity_sets))
            )
        )

    except:
        return tuple((candidate_id, user_average_ratings_dict.get(inverse_tokens_dict.get(candidate_id), 3)))


# pycharm code

In [1]:
import findspark
import json
import sys
import time

findspark.init()
from pyspark import SparkConf, SparkContext


def write_results(results, file_path):
    with open(file_path, 'w') as file:
        for line in results:
            file.write(json.dumps(line) + '\n')
    file.close()


In [58]:
def item_based_model_predict(bid_user_ratings, n, item_model, business_average_ratings_dict, inverse_tokens_dict):
    candidate_id = bid_user_ratings[0]
    rating_similarity_pairs = sorted(
        [
            tuple([
                business_rating_pair[1],
                item_model.get(tuple(sorted([candidate_id, business_rating_pair[0]])), 0)
            ])
            for business_rating_pair in list(bid_user_ratings[1])
        ],
        key=lambda x: x[1]
    )

    n_similar_businesses = rating_similarity_pairs[:n]

    try:
        return tuple(
            [
                candidate_id,
                sum(map(lambda x: x[0] * x[1], n_similar_businesses)) / sum(
                    map(lambda x: abs(x[1]), n_similar_businesses))
            ]
        )

    except:
        return tuple([candidate_id, business_average_ratings_dict.get(inverse_tokens_dict.get(candidate_id), 3)])


In [57]:
def user_based_model_predict(uid_business_ratings, user_model, user_average_ratings_dict, inverse_tokens_dict):
    candidate_id = uid_business_ratings[0]
    rating_similarity_sets = [
        tuple([
            user_rating_pair[1],
            user_average_ratings_dict.get(inverse_tokens_dict.get(user_rating_pair[0], ''), 3),
            user_model.get(tuple(sorted([candidate_id, user_rating_pair[0]])), 0)
        ])
        for user_rating_pair in list(uid_business_ratings[1])
    ]


    try:
        return tuple(
            [
                candidate_id,
                user_average_ratings_dict.get(candidate_id, 3) +
                sum(map(lambda x: (x[0] - x[1]) * x[2], rating_similarity_sets)) /
                sum(map(lambda item: abs(item[2]), rating_similarity_sets))
            ]
        )

    except:
        return tuple([candidate_id, user_average_ratings_dict.get(inverse_tokens_dict.get(candidate_id), 3)])


In [70]:
# argv = sys.argv

start_time = time.time()
conf = SparkConf()
conf.set("spark.driver.memory", "4g")
conf.set("spark.executor.memory", "4g")
conf.setMaster('local[8]')
conf.setAppName('Assignment_3')
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")

# load data
train_reviews_json = sc.textFile('asnlib/publicdata/train_review.json').map(json.loads)
test_reviews_json = sc.textFile('asnlib/publicdata/test_review_ratings.json').map(json.loads)



# create user business rating sets
train_user_business_rating_sets = train_reviews_json \
    .map(lambda x: (x.get('user_id'), x.get('business_id'), x.get('stars'))).distinct()

# create user tokens
user_tokens = train_user_business_rating_sets \
    .map(lambda x: x[0]) \
    .distinct() \
    .sortBy(lambda x: x) \
    .zipWithIndex()

user_tokens_dict = user_tokens.collectAsMap()

inverse_user_tokens_dict = {bid: token for token, bid in user_tokens_dict.items()}

# create business tokens
candidate_business_tokens = train_user_business_rating_sets \
    .map(lambda x: x[1]) \
    .distinct() \
    .sortBy(lambda x: x) \
    .zipWithIndex()

business_tokens_dict = candidate_business_tokens.collectAsMap()

inverse_business_tokens_dict = {bid: token for token, bid in business_tokens_dict.items()}

# tokenize trains sets
train_user_business_rating_sets_tokenized = train_user_business_rating_sets \
    .map(lambda x: (user_tokens_dict[x[0]], business_tokens_dict[x[1]], x[2]))

# load test set
test_user_business_pairs_tokenized = test_reviews_json \
    .map(lambda x: (user_tokens_dict.get(x['user_id'], None), business_tokens_dict.get(x['business_id'], None))) \
    .filter(lambda x: x[0] is not None and x[1] is not None)

# load model from memory
model = sc.textFile('task3item.model') \
    .map(json.loads)

# get model keys
keys = list(model.first().keys())

# collect final model as similarity pairs
model = model \
    .map(lambda x: ((x[keys[0]], x[keys[1]]), x[keys[2]])) \
    .collectAsMap()

In [71]:
test_user_business_pairs_tokenized.first()

(20625, 1716)

# item based

In [None]:
model.items()

In [72]:
# users and list of rated businesses with respective ratings
user_business_rating_sets = train_user_business_rating_sets_tokenized \
    .map(lambda x: (x[0], (x[1], x[2]))) \
    .groupByKey() \
    .map(lambda x: (x[0], [(business_rating[0], business_rating[1]) for business_rating in list(x[1])]))

In [76]:
user_business_rating_sets_dict = user_business_rating_sets.collectAsMap()

In [86]:
len(user_business_rating_sets_dict[23792])

29

In [89]:
# dictionary of average ratings for each business from the given file
business_average_ratings = sc.textFile('asnlib/publicdata/business_avg.json')\
    .map(json.loads)\
    .map(lambda x: dict(x))\
    .flatMap(lambda x: x.items()) \
    .collectAsMap()
over_all_avg = sum(business_average_ratings.values())/len(business_average_ratings)
# business_average_ratings = business_average_ratings[0]

In [90]:
over_all_avg

3.809428219215185

In [81]:
len(business_average_ratings)

10254

In [83]:
test_user_business_pairs_tokenized.count()

58473

In [32]:
results = test_user_business_pairs_tokenized\
    .leftOuterJoin(user_business_rating_sets) \

In [26]:
results.first()

(19008,
 (146,
  [(6858, 4.0),
   (8754, 4.0),
   (6797, 5.0),
   (9631, 3.0),
   (6334, 5.0),
   (1655, 3.0),
   (5348, 4.0),
   (9900, 4.0),
   (1926, 4.0),
   (2029, 4.0),
   (3440, 5.0),
   (1984, 4.0),
   (1207, 4.0),
   (6673, 4.0),
   (8983, 3.0),
   (8963, 3.0),
   (182, 3.0),
   (6698, 3.0),
   (4843, 4.0),
   (1878, 4.0),
   (3092, 5.0),
   (9854, 5.0),
   (9446, 4.0),
   (758, 4.0),
   (7085, 5.0),
   (8297, 4.0),
   (1040, 4.0),
   (899, 4.0),
   (7984, 4.0),
   (3401, 4.0),
   (6976, 5.0),
   (4711, 4.0)]))

In [33]:
results = results.mapValues(
    lambda x: item_based_model_predict(x, 3, model, business_average_ratings, inverse_business_tokens_dict)) \
    

In [28]:
results.first()

(19008, (146, 3.2966101694915255))

In [34]:
results.map(lambda x:
         {
             "user_id": inverse_user_tokens_dict[x[0]],
             "business_id": inverse_business_tokens_dict[x[1][0]],
             "stars": x[1][1]
         }
         ) \
    .collect()

[{'user_id': 'iCqpb2C3TZrB5gdMASf8LQ',
  'business_id': '-wuqvqzPR0pHkxCT3SMtIA',
  'stars': 3.2966101694915255},
 {'user_id': 'iCqpb2C3TZrB5gdMASf8LQ',
  'business_id': '2HIhAc5SWb9GUOru_Vc5zQ',
  'stars': 3.4},
 {'user_id': 'iCqpb2C3TZrB5gdMASf8LQ',
  'business_id': 'ECOkEVUodMLUxvI0PMI4gQ',
  'stars': 3.296577946768061},
 {'user_id': 'OD6wbZ1jQOW30wYCDD6X3Q',
  'business_id': 'rcaPajgKOJC2vo_l3xa42A',
  'stars': 4.030769230769231},
 {'user_id': 'OD6wbZ1jQOW30wYCDD6X3Q',
  'business_id': 'YuiM5VKscO8rS1EX7f19_g',
  'stars': 3.7457627118644066},
 {'user_id': 'OD6wbZ1jQOW30wYCDD6X3Q',
  'business_id': 'O3OH5IEFMPtz7mPKakPZ3Q',
  'stars': 4.27536231884058},
 {'user_id': 'OD6wbZ1jQOW30wYCDD6X3Q',
  'business_id': 'ZwNfUuh88cvjqiNtIxxHHg',
  'stars': 3.814814814814815},
 {'user_id': 'vkTlI0k3HqS1axQS1z8whg',
  'business_id': 'm_CqRpEZ2XDH3fp-KFAycQ',
  'stars': 3.65625},
 {'user_id': 'vkTlI0k3HqS1axQS1z8whg',
  'business_id': 'hroo5nOO8b9QhHX0GLg7oA',
  'stars': 3.5649122807017544},
 {'us

# user based

In [95]:
# load model from memory
model = sc.textFile('task3user.model') \
    .map(json.loads)

# get model keys
keys = list(model.first().keys())

# collect final model as similarity pairs
model = model \
    .map(lambda x: (tuple(sorted([user_tokens_dict[x[keys[0]]], user_tokens_dict[x[keys[1]]]])), x[keys[2]])) \
    .collectAsMap()

In [96]:
model[(646, 22764)]

0.5000000000000001

In [37]:
# business and list of user and respective user ratings
business_user_rating_sets = train_user_business_rating_sets_tokenized \
    .map(lambda x: (x[1], (x[0], x[2]))) \
    .groupByKey() \
    .map(lambda x: (x[0], [(user_rating[0], user_rating[1]) for user_rating in list(set(x[1]))]))

In [41]:
business_user_rating_sets.first()

(9422,
 [(21588, 5.0),
  (22117, 4.0),
  (4688, 5.0),
  (8656, 4.0),
  (6350, 4.0),
  (18872, 5.0),
  (16698, 2.0),
  (18156, 5.0),
  (23537, 2.0),
  (7244, 5.0),
  (25246, 5.0),
  (1720, 1.0),
  (15508, 4.0),
  (3338, 2.0)])

In [53]:
# dictionary of average ratings for each user from the given file
user_average_ratings = sc.textFile('asnlib/publicdata/user_avg.json') \
    .map(json.loads)\
    .map(lambda x: dict(x))\
    .flatMap(lambda x: x.items()) \
    .collectAsMap()
# user_average_ratings = user_average_ratings[0]

In [54]:
user_average_ratings

{'OLR4DvqFxCKLOEHqfAxpqQ': 3.5952380952380953,
 '0XMLbsJt-fvcQsOHt3_B_Q': 4.555555555555555,
 'bQCHF5rn5lMI9c5kEwCaNA': 3.6818181818181817,
 '-gQm-IoK2_BMEMx9OgtQnw': 3.8484848484848486,
 'S6x7z26X_HdmdxwkmujlEg': 3.9583333333333335,
 'b1wjZL3EBm4h2vbeXjfcTA': 4.2105263157894735,
 'PKEzKWv_FktMm2mGPjwd0Q': 3.6078224101479917,
 'txu_KwZOGYG6O3yYHjztbg': 3.2413793103448274,
 'n_a826hPtfMFnucaVwnqIg': 3.6444444444444444,
 'evS0cCehNU_EiDiD3286Jw': 3.7209302325581395,
 '8drMKNHWavs2g6uf0pLtvg': 3.588888888888889,
 'CNaaizAUc2c4WIb1M3BBPw': 3.8587570621468927,
 'tEy1MNP7tHJlZgP7xqF4yA': 3.2142857142857144,
 'JQIZhoQB-Tg9hdBEj90m4g': 3.9740259740259742,
 'fiGqQ7pIGKyZ9G0RqWLMpg': 3.9763779527559056,
 'bvPS4vrfiO7ZZ4KAo8KBTw': 3.838235294117647,
 'FAjCZoxiGw9HJKueB8YWTg': 4.133333333333334,
 '7vFWz2YcWWC5UnPYNm7KNA': 4.470588235294118,
 'XYk552OWzrYviwebRCgEJA': 3.466666666666667,
 'Dw36lKvjYEfa6BDgeMs61Q': 3.909090909090909,
 'D1OMbUi-jYVnvJ8gZ2LXHQ': 4.884615384615385,
 'Spgm6HFWgc4YXJlbhg1

In [39]:
# re-order the test set
test_user_business_pairs_tokenized = test_user_business_pairs_tokenized \
    .map(lambda x: (x[1], x[0]))

In [44]:
test_user_business_pairs_tokenized.first()

(1716, 20625)

In [59]:
# making predictions
results = test_user_business_pairs_tokenized\
    .leftOuterJoin(business_user_rating_sets)

In [60]:
results.first()

(9328,
 (3668,
  [(8199, 3.0),
   (22347, 4.0),
   (23888, 4.0),
   (3597, 4.0),
   (19064, 5.0),
   (7186, 4.0),
   (22003, 3.0),
   (14790, 5.0),
   (22982, 5.0),
   (8375, 4.0),
   (18949, 5.0),
   (6022, 3.0),
   (5416, 5.0),
   (16661, 4.0),
   (17769, 5.0),
   (19165, 4.0),
   (4916, 4.0),
   (2274, 4.0),
   (16515, 5.0),
   (3894, 3.0),
   (3474, 3.0),
   (19503, 3.0),
   (16749, 4.0),
   (9311, 5.0),
   (2931, 4.0),
   (4512, 5.0),
   (3561, 4.0),
   (576, 4.0),
   (925, 5.0),
   (6648, 5.0),
   (22357, 4.0),
   (16570, 5.0),
   (18618, 5.0),
   (371, 3.0),
   (19620, 3.0),
   (4075, 4.0),
   (26096, 4.0),
   (25852, 3.0),
   (19930, 5.0),
   (3943, 5.0),
   (22521, 3.0),
   (22575, 5.0),
   (12299, 4.0),
   (21157, 5.0),
   (10681, 4.0),
   (441, 4.0),
   (7871, 5.0),
   (2048, 5.0),
   (19577, 2.0),
   (20061, 4.0),
   (7171, 5.0),
   (23851, 4.0),
   (1704, 2.0),
   (25848, 4.0),
   (23148, 5.0),
   (12725, 4.0),
   (3027, 3.0),
   (23643, 5.0),
   (18228, 5.0),
   (21373, 4

In [61]:
results = results.mapValues(lambda x: user_based_model_predict(x, model, user_average_ratings, inverse_user_tokens_dict))

In [62]:
results.first()

(7920, (5038, 3.5992063492063493))

In [63]:
results = results.map(lambda x:
         {
             "user_id": inverse_user_tokens_dict[x[1][0]],
             "business_id": inverse_business_tokens_dict[x[0]],
             "stars": x[1][1]
         }
         ) \
    .collect()

In [64]:
results

[{'user_id': '98rLDXbloLXekGjieuQSlA',
  'business_id': 'E96XW5-Me9nnBIkpg0nNBA',
  'stars': 4.388888888888889},
 {'user_id': '9iIhOj--TcuxmJ6CGHnBOw',
  'business_id': '5fpXjlC71y_mKORnPdX5zA',
  'stars': 3.4615384615384617},
 {'user_id': '1q6K81JT5eing3VlloekvA',
  'business_id': '5fpXjlC71y_mKORnPdX5zA',
  'stars': 3.7083333333333335},
 {'user_id': 'zviIkWQleYMGPeFyyGaVCw',
  'business_id': '5fpXjlC71y_mKORnPdX5zA',
  'stars': 3.111111111111111},
 {'user_id': 'OjTzg-3qGUKtd9c2q9tnhQ',
  'business_id': '5fpXjlC71y_mKORnPdX5zA',
  'stars': 4.0},
 {'user_id': 'BEE2u5krVuRG7kaAfx4cMw',
  'business_id': 'InYPeA1WM9uGR_mA5Wia1g',
  'stars': 3.3636363636363638},
 {'user_id': 'is5vAvanYvmF-28i9no2LQ',
  'business_id': 'NTQXBbCa5Ugj5lNr6E4J5w',
  'stars': 3.391304347826087},
 {'user_id': '6mMk9hO_RAbdGMEGeg3oWQ',
  'business_id': 'NTQXBbCa5Ugj5lNr6E4J5w',
  'stars': 3.5135135135135136},
 {'user_id': 'w57Yn8npGhdDHmHFGKk2jA',
  'business_id': 'NTQXBbCa5Ugj5lNr6E4J5w',
  'stars': 4.29411764705

In [67]:
[1,2,3][:2]

[1, 2]

In [69]:
a = 0
if a:
    print(1+1)
else:
    print(2+2)

4
