In [13]:
# findspark.init() runs before the pyspark import in the next cell -- presumably so
# that the local Spark installation is put on sys.path first; confirm for this setup.
import findspark
findspark.init()

In [None]:
from pyspark import SparkConf, SparkContext
# Configure a local Spark master ("local[*]") and give the application a recognizable name.
conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("MovieSimilaritiesRedux001")
# The context is the entry point used by every RDD built below.
sc = SparkContext(conf=conf)

In [47]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated ratings line.

    The first three fields are read as user id, movie id and rating; any
    further fields on the line are ignored.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields=movie_row.split()
    user_id,movie_id=int(fields[0]),int(fields[1])
    return user_id,(movie_id,float(fields[2]))

# Read the ratings file line by line and parse each line into (user_id, (movie_id, rating)).
user_movie_ratings=sc.textFile("/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data").map(process_movie_row)

In [48]:
# Peek at the first ten parsed (user_id, (movie_id, rating)) records.
for record in user_movie_ratings.take(10):
    print(record)

(196, (242, 3.0))
(186, (302, 3.0))
(22, (377, 1.0))
(244, (51, 2.0))
(166, (346, 1.0))
(298, (474, 4.0))
(115, (265, 2.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))


In [50]:
# Ratings below this threshold are dropped before movie pairs are built.
min_rating=3

user_movie_ratings_filtered=user_movie_ratings.filter(
    lambda user_rating: min_rating<=user_rating[1][1])

In [51]:
# Peek at the first ten records that survived the rating filter.
for record in user_movie_ratings_filtered.take(10):
    print(record)

(196, (242, 3.0))
(186, (302, 3.0))
(298, (474, 4.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))
(286, (1014, 5.0))
(200, (222, 5.0))
(210, (40, 3.0))
(224, (29, 3.0))


In [54]:
# Self-join on user_id to pair up every two movies rated by the same user, keep each
# pair only once (lower movie_id first, which also drops a movie paired with itself),
# re-key by (movie_a, movie_b) and gather all (rating_a, rating_b) tuples per pair.
user_mr_pairs=(user_movie_ratings_filtered
               .join(user_movie_ratings_filtered)
               .filter(lambda joined: joined[1][0][0]<joined[1][1][0])
               .map(lambda joined: ((joined[1][0][0],joined[1][1][0]),
                                    (joined[1][0][1],joined[1][1][1])))
               .groupByKey())

In [55]:
# Peek at ten movie pairs; the grouped ratings print as opaque iterable objects.
for movie_pair in user_mr_pairs.take(10):
    print(movie_pair)

((197, 1097), <pyspark.resultiterable.ResultIterable object at 0x7fef6176aad0>)
((273, 617), <pyspark.resultiterable.ResultIterable object at 0x7fef6176a9d0>)
((789, 865), <pyspark.resultiterable.ResultIterable object at 0x7fef6176afd0>)
((496, 1314), <pyspark.resultiterable.ResultIterable object at 0x7fef6176ac50>)
((246, 1008), <pyspark.resultiterable.ResultIterable object at 0x7fef6176a650>)
((856, 1006), <pyspark.resultiterable.ResultIterable object at 0x7fef6176ad90>)
((747, 795), <pyspark.resultiterable.ResultIterable object at 0x7fef6176a550>)
((648, 678), <pyspark.resultiterable.ResultIterable object at 0x7fef6176af10>)
((150, 1344), <pyspark.resultiterable.ResultIterable object at 0x7fef6176a110>)
((734, 864), <pyspark.resultiterable.ResultIterable object at 0x7fef6176ab90>)


In [56]:
def cosine_similarity(ratings):
    """Cosine similarity between the two rating vectors of a movie pair.

    Args:
        ratings: iterable of (rating_a, rating_b) tuples, one tuple per
            co-rating of the two movies.

    Returns:
        (similarity, num_pairs): the cosine of the angle between the vector of
        first ratings and the vector of second ratings, and the number of
        tuples it was computed from. The similarity is 0.0 when either vector
        has zero norm or when `ratings` is empty.

    Raises:
        TypeError / ValueError: if the tuples are malformed or hold non-numeric
            values. Such errors are no longer silently reported as similarity 0.
    """
    from math import sqrt
    # Materialize once: the input may be a one-shot iterable.
    pairs=list(ratings)
    num_pairs=len(pairs)
    if num_pairs==0:
        return 0.0,0
    dot=sum(1.0*a*b for a,b in pairs)
    norm_a=sqrt(sum(1.0*a*a for a,_ in pairs))
    norm_b=sqrt(sum(1.0*b*b for _,b in pairs))
    denominator=norm_a*norm_b
    # Explicit guard instead of a catch-all except: only the zero-norm case maps to 0.
    if denominator==0:
        return 0.0,num_pairs
    return dot/denominator,num_pairs
    
# Replace each pair's grouped rating tuples with the value returned by cosine_similarity,
# keeping the (movie_a, movie_b) key unchanged.
movie_pair_similarities=user_mr_pairs.mapValues(cosine_similarity)

In [57]:
# Peek at ten ((movie_a, movie_b), (similarity, num_pairs)) records.
for pair_similarity in movie_pair_similarities.take(10):
    print(pair_similarity)

((197, 1097), (0.9839914013827435, 5))
((273, 617), (0.9652953599007105, 7))
((789, 865), (0.9897475249773018, 3))
((496, 1314), (0.976416832356179, 4))
((246, 1008), (0.9688564539098803, 16))
((856, 1006), (0.9681316602428751, 6))
((747, 795), (1.0, 2))
((648, 678), (0.9767576910715874, 4))
((150, 1344), (1.0, 1))
((734, 864), (0.9931459096297017, 6))


In [61]:
# Per movie: accumulate (sum of ratings, number of ratings), then turn the sum into an
# average, giving (movie_id, (average_rating, num_ratings)). Built from the unfiltered ratings.
movie_avg_ratings=(user_movie_ratings
                   .map(lambda user_rating: (user_rating[1][0],(user_rating[1][1],1)))
                   .reduceByKey(lambda left,right: (left[0]+right[0],left[1]+right[1]))
                   .mapValues(lambda total: (total[0]/total[1],total[1])))

In [67]:
# Ten movies with the highest average rating; ties go to the movie with more ratings.
for movie_stats in movie_avg_ratings.takeOrdered(10,key=lambda entry: (-entry[1][0],-entry[1][1])):
    print(movie_stats)

(1189, (5.0, 3))
(1293, (5.0, 3))
(1500, (5.0, 2))
(1467, (5.0, 2))
(814, (5.0, 1))
(1122, (5.0, 1))
(1536, (5.0, 1))
(1201, (5.0, 1))
(1599, (5.0, 1))
(1653, (5.0, 1))


In [68]:
# Ten movies with the most ratings; ties go to the movie with the higher average.
for movie_stats in movie_avg_ratings.takeOrdered(10,key=lambda entry: (-entry[1][1],-entry[1][0])):
    print(movie_stats)

(50, (4.3584905660377355, 583))
(258, (3.8035363457760316, 509))
(100, (4.155511811023622, 508))
(181, (4.007889546351085, 507))
(294, (3.156701030927835, 485))
(286, (3.656964656964657, 481))
(288, (3.4414225941422596, 478))
(1, (3.8783185840707963, 452))
(300, (3.6310904872389793, 431))
(121, (3.438228438228438, 429))


In [69]:
def process_movie_row(movie_row):
    """Parse one '|'-delimited line into (movie_id, movie_title).

    Only the first two fields are used: the id is converted to int and the
    title is returned as-is.
    """
    # NOTE: this rebinds the name of the ratings-row parser defined earlier in the notebook.
    fields=movie_row.split('|')
    return int(fields[0]),fields[1]

# Read the item file line by line and parse each line into (movie_id, movie_title).
movie_names=sc.textFile("/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item").map(process_movie_row)

In [70]:
# Peek at the first ten (movie_id, movie_title) records.
for movie_id,movie_title in movie_names.take(10):
    print('{:4d} - {}'.format(movie_id,movie_title))

   1 - Toy Story (1995)
   2 - GoldenEye (1995)
   3 - Four Rooms (1995)
   4 - Get Shorty (1995)
   5 - Copycat (1995)
   6 - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
   7 - Twelve Monkeys (1995)
   8 - Babe (1995)
   9 - Dead Man Walking (1995)
  10 - Richard III (1995)


In [97]:
# Join titles with rating statistics and flatten the value to
# (title, average_rating, num_ratings), keyed by movie_id.
movie_names_and_ratings=(movie_names
                         .join(movie_avg_ratings)
                         .mapValues(lambda joined: (joined[0],joined[1][0],joined[1][1])))

In [98]:
# First ten movies in natural record order (lowest movie ids first).
for movie_id,(title,avg_rating,num_ratings) in movie_names_and_ratings.takeOrdered(10):
    print('{:4d} [{:4.2f}, {:3d}] - {}'.format(movie_id,avg_rating,num_ratings,title))

   1 [3.88, 452] - Toy Story (1995)
   2 [3.21, 131] - GoldenEye (1995)
   3 [3.03,  90] - Four Rooms (1995)
   4 [3.55, 209] - Get Shorty (1995)
   5 [3.30,  86] - Copycat (1995)
   6 [3.58,  26] - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
   7 [3.80, 392] - Twelve Monkeys (1995)
   8 [4.00, 219] - Babe (1995)
   9 [3.90, 299] - Dead Man Walking (1995)
  10 [3.83,  89] - Richard III (1995)


In [86]:
# Ten best-rated movies with titles; ties go to the movie with more ratings.
best_rated=movie_names_and_ratings.takeOrdered(10,key=lambda entry: (-entry[1][1],-entry[1][2]))
for movie_id,(title,avg_rating,num_ratings) in best_rated:
    print('{:4d} [{:4.2f}, {:3d}] - {}'.format(movie_id,avg_rating,num_ratings,title))

1189 [5.00,   3] - Prefontaine (1997)
1293 [5.00,   3] - Star Kid (1997)
1500 [5.00,   2] - Santa with Muscles (1996)
1467 [5.00,   2] - Saint of Fort Washington, The (1993)
1536 [5.00,   1] - Aiqing wansui (1994)
1201 [5.00,   1] - Marlene Dietrich: Shadow and Light (1996) 
1653 [5.00,   1] - Entertaining Angels: The Dorothy Day Story (1996)
 814 [5.00,   1] - Great Day in Harlem, A (1994)
1122 [5.00,   1] - They Made Me a Criminal (1939)
1599 [5.00,   1] - Someone Else's America (1995)


In [87]:
# Ten most-rated movies with titles; ties go to the movie with the higher average.
most_rated=movie_names_and_ratings.takeOrdered(10,key=lambda entry: (-entry[1][2],-entry[1][1]))
for movie_id,(title,avg_rating,num_ratings) in most_rated:
    print('{:4d} [{:4.2f}, {:3d}] - {}'.format(movie_id,avg_rating,num_ratings,title))

  50 [4.36, 583] - Star Wars (1977)
 258 [3.80, 509] - Contact (1997)
 100 [4.16, 508] - Fargo (1996)
 181 [4.01, 507] - Return of the Jedi (1983)
 294 [3.16, 485] - Liar Liar (1997)
 286 [3.66, 481] - English Patient, The (1996)
 288 [3.44, 478] - Scream (1996)
   1 [3.88, 452] - Toy Story (1995)
 300 [3.63, 431] - Air Force One (1997)
 121 [3.44, 429] - Independence Day (ID4) (1996)


In [42]:
# Record counts of the two join inputs and of the joined result, one per line.
for counted_rdd in (movie_avg_ratings,movie_names,movie_names_and_ratings):
    print(counted_rdd.count())

1574
1682
1574


In [92]:
# Parameters of the similarity search.
target_movie=50            # movie id whose neighbours are wanted
min_similarity=0.95        # lower bound on the similarity score
min_num_rating_pairs=25    # lower bound on the number of rating pairs behind a score

# Keep the pairs that contain the target movie and pass both thresholds, then re-key
# each surviving record by the *other* movie of the pair: (movie_id, (similarity, num_pairs)).
target_movie_filtered=(movie_pair_similarities.filter(lambda x: (x[0][0]==target_movie or
                                                                 x[0][1]==target_movie) and
                                                                x[1][0]>=min_similarity and
                                                                x[1][1]>=min_num_rating_pairs)
                                              .map(lambda x: (x[0][0] if x[0][0]!=target_movie else x[0][1],
                                                              (x[1][0],x[1][1]))))

In [93]:
# Ten closest movies by similarity; ties go to the pair backed by more rating pairs.
for neighbour in target_movie_filtered.takeOrdered(10,key=lambda entry: (-entry[1][0],-entry[1][1])):
    print(neighbour)

(172, (0.991306470335622, 327))
(1142, (0.9895197229266409, 33))
(181, (0.988642935971419, 450))
(114, (0.9871889393770342, 56))
(141, (0.9862168070192264, 62))
(612, (0.9856850002173794, 29))
(169, (0.9855477594994516, 97))
(674, (0.985498827294707, 27))
(174, (0.9854887713233331, 358))
(570, (0.985200209815983, 32))


In [102]:
# Attach title and rating statistics to every similar movie. The value becomes
# (title, average_rating, num_ratings, num_rating_pairs, similarity).
target_movie_filtered_named_rated=(target_movie_filtered
                                   .join(movie_names_and_ratings)
                                   .mapValues(lambda joined: (joined[1][0],joined[1][1],joined[1][2],
                                                              joined[0][1],joined[0][0])))

In [103]:
# Peek at ten joined records: (movie_id, (title, average_rating, num_ratings, num_rating_pairs, similarity)).
for r in target_movie_filtered_named_rated.take(10):
    # Call form of print, as in the rest of the notebook; valid on Python 2 and 3.
    print(r)

(512, (u'Wings of Desire (1987)', 4.0, 57, 41, 0.9742728844145573))
(520, (u'Great Escape, The (1963)', 4.104838709677419, 124, 98, 0.9770131024395379))
(16, (u'French Twist (Gazon maudit) (1995)', 3.2051282051282053, 39, 27, 0.9692142921387517))
(24, (u'Rumble in the Bronx (1995)', 3.4482758620689653, 174, 128, 0.9779808597908008))
(32, (u'Crumb (1994)', 3.7901234567901234, 81, 62, 0.9820698016106348))
(432, (u'Fantasia (1940)', 3.7701149425287355, 174, 142, 0.9771264299032284))
(40, (u'To Wong Foo, Thanks for Everything! Julie Newmar (1995)', 2.8947368421052633, 57, 35, 0.9627068041543829))
(48, (u'Hoop Dreams (1994)', 4.094017094017094, 117, 93, 0.9787764547742498))
(8, (u'Babe (1995)', 3.9954337899543377, 219, 169, 0.9771268100909533))
(568, (u'Speed (1994)', 3.6478260869565218, 230, 179, 0.9760032727286512))


In [108]:
# Header: the target movie's own (title, average_rating, num_ratings) record.
target_movie_r=movie_names_and_ratings.lookup(target_movie)[0]
header='Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'
print(header.format(target_movie,target_movie_r[1],target_movie_r[2],target_movie_r[0]))

# Body: the ten most similar movies, ties broken by the higher average rating.
top_matches=target_movie_filtered_named_rated.takeOrdered(10,key=lambda entry: (-entry[1][4],-entry[1][1]))
for movie_id,(title,avg_rating,num_ratings,num_pairs,similarity) in top_matches:
    print('{:4d} [{:.2f}, {:3d}/{:3d}] ({:.4f}): {}'.format(movie_id,avg_rating,num_pairs,num_ratings,similarity,title))

Most similar movies to   50 [4.36, 583] - Star Wars (1977)
 172 [4.20, 327/367] (0.9913): Empire Strikes Back, The (1980)
1142 [4.05,  33/ 44] (0.9895): When We Were Kings (1996)
 181 [4.01, 450/507] (0.9886): Return of the Jedi (1983)
 114 [4.45,  56/ 67] (0.9872): Wallace & Gromit: The Best of Aardman Animation (1996)
 141 [3.50,  62/ 72] (0.9862): 20,000 Leagues Under the Sea (1954)
 612 [3.94,  29/ 34] (0.9857): Lost Horizon (1937)
 169 [4.47,  97/118] (0.9855): Wrong Trousers, The (1993)
 674 [2.90,  27/ 48] (0.9855): Cat People (1982)
 174 [4.25, 358/420] (0.9855): Raiders of the Lost Ark (1981)
 570 [3.10,  32/ 50] (0.9852): Wyatt Earp (1994)
