In [1]:
# findspark.init() runs before any pyspark import in this notebook;
# presumably it makes the local Spark installation importable - TODO confirm for this environment.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Run Spark in local mode with master "local[*]" under a fixed application name.
conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilaritiesRedux005")
# getOrCreate() reuses an already-running context, so re-executing this cell in a
# live kernel no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated ratings line.

    The first three fields are read as user id, movie id and rating;
    any further fields on the line are ignored.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields = movie_row.split()
    return int(fields[0]), (int(fields[1]), float(fields[2]))

# Read the raw ratings file and parse each line into (user_id, (movie_id, rating)).
user_movie_ratings = (
    sc.textFile("/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data")
      .map(process_movie_row)
)

In [4]:
# Peek at the first ten parsed records: (user_id, (movie_id, rating)).
for record in user_movie_ratings.take(10):
    print(record)

(196, (242, 3.0))
(186, (302, 3.0))
(22, (377, 1.0))
(244, (51, 2.0))
(166, (346, 1.0))
(298, (474, 4.0))
(115, (265, 2.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))


In [5]:
# Self-join the ratings on user id so every user contributes all pairs of movies
# they rated. Keeping only movie_a < movie_b drops self-pairs and mirrored
# duplicates. The result is keyed by (movie_a, movie_b) and groups the
# corresponding (rating_a, rating_b) tuples.
user_mr_pairs = (
    user_movie_ratings
    .join(user_movie_ratings)
    .filter(lambda kv: kv[1][0][0] < kv[1][1][0])
    .map(lambda kv: ((kv[1][0][0], kv[1][1][0]),
                     (kv[1][0][1], kv[1][1][1])))
    .groupByKey()
)

In [6]:
# Peek at ten grouped pairs; the values are lazy iterables, so only their repr is shown.
for record in user_mr_pairs.take(10):
    print(record)

((197, 1097), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42750>)
((42, 364), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42990>)
((773, 1409), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f429d0>)
((273, 617), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42a10>)
((372, 974), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42a50>)
((789, 865), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42a90>)
((496, 1314), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42ad0>)
((389, 493), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42b10>)
((856, 1006), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42b50>)
((552, 754), <pyspark.resultiterable.ResultIterable object at 0x7f58c5f42b90>)


In [7]:
def pearson_similarity(ratings):
    """Compute the Pearson correlation between two movies' co-ratings.

    Args:
        ratings: iterable of (rating_a, rating_b) tuples, one per user who
            rated both movies.

    Returns:
        (correlation, num_pairs). The correlation is nan when fewer than two
        pairs are available, since it is undefined in that case.
    """
    # Imported inside the function, as in the original.
    # Use the public scipy.stats path; scipy.stats.stats is a deprecated private namespace.
    from scipy.stats import pearsonr
    pairs = list(ratings)
    num_pairs = len(pairs)
    if num_pairs < 2:
        # Correlation is undefined here; return nan explicitly instead of
        # relying on SciPy's version-dependent behavior for short inputs.
        return float('nan'), num_pairs
    v1, v2 = zip(*pairs)
    return pearsonr(v1, v2)[0], num_pairs
    
# Score every movie pair: (movie_a, movie_b) -> (pearson_correlation, num_pairs).
movie_pair_similarities = user_mr_pairs.mapValues(pearson_similarity)

In [8]:
# Peek at ten scored pairs: ((movie_a, movie_b), (correlation, num_pairs)).
for record in movie_pair_similarities.take(10):
    print(record)

((197, 1097), (0.63586706399759618, 7))
((42, 364), (-0.077864642904380901, 18))
((773, 1409), (nan, 1))
((273, 617), (0.21459876881973802, 7))
((372, 974), (nan, 1))
((789, 865), (1.0, 3))
((496, 1314), (nan, 4))
((389, 493), (0.0, 7))
((856, 1006), (0.6805446536716202, 10))
((552, 754), (nan, 2))


In [9]:
# Per-movie average rating and rating count:
#   1. re-key each rating by movie id as (rating, 1),
#   2. sum ratings and counts per movie,
#   3. divide to obtain (average_rating, num_ratings).
movie_avg_ratings = (
    user_movie_ratings
    .map(lambda rec: (rec[1][0], (rec[1][1], 1)))
    .reduceByKey(lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1]))
    .mapValues(lambda tot: (tot[0] / tot[1], tot[1]))
)

In [10]:
# Peek at ten aggregates: (movie_id, (average_rating, num_ratings)).
for record in movie_avg_ratings.take(10):
    print(record)

(2, (3.2061068702290076, 131))
(4, (3.550239234449761, 209))
(6, (3.576923076923077, 26))
(8, (3.9954337899543377, 219))
(10, (3.831460674157303, 89))
(12, (4.385767790262173, 267))
(14, (3.9672131147540983, 183))
(16, (3.2051282051282053, 39))
(18, (2.8, 10))
(20, (3.4166666666666665, 72))


In [11]:
def process_movie_row(movie_row):
    """Parse one '|'-separated movie line into (movie_id, movie_title).

    Only the first two fields are used; the id is converted to int and the
    title is returned unchanged.
    """
    fields = movie_row.split('|')
    return (int(fields[0]), fields[1])

# Attach titles to the per-movie aggregates. After the join each value is
# (title, (average_rating, num_ratings)); flatten it to
# (title, average_rating, num_ratings).
movie_names_and_ratings = (
    sc.textFile("/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item")
      .map(process_movie_row)
      .join(movie_avg_ratings)
      .mapValues(lambda joined: (joined[0], joined[1][0], joined[1][1]))
)

In [12]:
# Show the ten lowest movie ids as: id [average, count] - title.
for movie_id, (title, avg_rating, num_ratings) in movie_names_and_ratings.takeOrdered(10):
    print('{:4d} [{:4.2f}, {:3d}] - {}'.format(movie_id, avg_rating, num_ratings, title))

   1 [3.88, 452] - Toy Story (1995)
   2 [3.21, 131] - GoldenEye (1995)
   3 [3.03,  90] - Four Rooms (1995)
   4 [3.55, 209] - Get Shorty (1995)
   5 [3.30,  86] - Copycat (1995)
   6 [3.58,  26] - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
   7 [3.80, 392] - Twelve Monkeys (1995)
   8 [4.00, 219] - Babe (1995)
   9 [3.90, 299] - Dead Man Walking (1995)
  10 [3.83,  89] - Richard III (1995)


In [18]:
# Query parameters.
target_movie=50             # id of the movie to find neighbours for
min_similarity=0.5          # keep pairs with at least this Pearson correlation
min_num_rating_pairs=25     # keep pairs with at least this many co-ratings

# Keep the sufficiently similar, sufficiently co-rated pairs that involve the
# target movie, then re-key each one by the *other* movie of the pair:
#   (other_movie_id, (correlation, num_pairs)).
# nan correlations fail the >= comparison and are dropped by the filter.
# `!=` replaces the Python-2-only `<>` operator; it is valid on Python 2 and 3.
target_movie_filtered=(movie_pair_similarities.filter(lambda x: (x[0][0]==target_movie or
                                                                 x[0][1]==target_movie) and
                                                                x[1][0]>=min_similarity and
                                                                x[1][1]>=min_num_rating_pairs)
                                              .map(lambda x: (x[0][0] if x[0][0]!=target_movie else x[0][1],
                                                              (x[1][0],x[1][1]))))

In [19]:
# Top ten neighbours, highest correlation first; ties broken by more co-ratings.
for neighbour in target_movie_filtered.takeOrdered(10, key=lambda kv: (-kv[1][0], -kv[1][1])):
    print(neighbour)

(172, (0.74798142237886811, 345))
(181, (0.67255585588760702, 480))
(174, (0.53611710137293112, 380))
(1226, (0.51529081620737816, 28))
(1142, (0.51516406681876525, 37))
(936, (0.51510769706017157, 27))
(963, (0.5090161595253091, 40))


In [20]:
# Minimum average rating a neighbour must have to be kept.
min_rating=4

# Join the neighbours with titles and aggregates. Each joined value is
# ((correlation, num_pairs), (title, average_rating, num_ratings)); reshape it to
# (title, average_rating, num_ratings, num_pairs, correlation) and keep only
# movies whose average rating reaches min_rating.
target_movie_filtered_named_rated = (
    target_movie_filtered
    .join(movie_names_and_ratings)
    .mapValues(lambda joined: (joined[1][0], joined[1][1], joined[1][2],
                               joined[0][1], joined[0][0]))
    .filter(lambda kv: kv[1][1] >= min_rating)
)

In [21]:
# Peek at the named, rating-filtered neighbours:
# (movie_id, (title, average_rating, num_ratings, num_pairs, correlation)).
for r in target_movie_filtered_named_rated.take(10):
    # Call form of print: same output for a single argument on Python 2, valid on
    # Python 3, and consistent with the other cells of this notebook.
    print(r)

(963, (u'Some Folks Call It a Sling Blade (1993)', 4.2926829268292686, 41, 40, 0.5090161595253091))
(172, (u'Empire Strikes Back, The (1980)', 4.204359673024523, 367, 345, 0.74798142237886811))
(181, (u'Return of the Jedi (1983)', 4.007889546351085, 507, 480, 0.67255585588760702))
(174, (u'Raiders of the Lost Ark (1981)', 4.252380952380952, 420, 380, 0.53611710137293112))
(1142, (u'When We Were Kings (1996)', 4.045454545454546, 44, 37, 0.51516406681876525))


In [22]:
# Header line: the target movie's own (title, average_rating, num_ratings).
target_movie_r=movie_names_and_ratings.lookup(target_movie)[0]
print('Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'.format(
    target_movie, target_movie_r[1], target_movie_r[2], target_movie_r[0]))

# One line per neighbour, highest correlation first (ties: higher average rating),
# formatted as: id [average, co-ratings/total ratings] (correlation): title.
for movie_id, (title, avg_rating, num_ratings, num_pairs, similarity) in (
        target_movie_filtered_named_rated.takeOrdered(10, key=lambda kv: (-kv[1][4], -kv[1][1]))):
    print('{:4d} [{:.2f}, {:3d}/{:3d}] ({:.4f}): {}'.format(
        movie_id, avg_rating, num_pairs, num_ratings, similarity, title))

Most similar movies to   50 [4.36, 583] - Star Wars (1977)
 172 [4.20, 345/367] (0.7480): Empire Strikes Back, The (1980)
 181 [4.01, 480/507] (0.6726): Return of the Jedi (1983)
 174 [4.25, 380/420] (0.5361): Raiders of the Lost Ark (1981)
1142 [4.05,  37/ 44] (0.5152): When We Were Kings (1996)
 963 [4.29,  40/ 41] (0.5090): Some Folks Call It a Sling Blade (1993)
