In [2]:
# findspark.init() makes the local Spark installation's Python package importable;
# it has to run before the `pyspark` import in the next cell.
import findspark
findspark.init()

In [3]:
from pyspark import SparkConf, SparkContext
# Configure a local Spark master ("local[*]") and name the application,
# then create the SparkContext that every later cell uses as `sc`.
conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("MovieSimilaritiesRedux009")
sc = SparkContext(conf=conf)

In [4]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated rating line.

    The first three fields are read as user id, movie id and rating;
    any further fields are ignored.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields = movie_row.split()
    return int(fields[0]), (int(fields[1]), float(fields[2]))

# Read the ml-100k ratings file and key every rating by its user:
# each record becomes (user_id, (movie_id, rating)).
user_movie_ratings = (
    sc.textFile("/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data")
      .map(process_movie_row)
)

In [5]:
# Sanity check: show ten parsed (user_id, (movie_id, rating)) records.
for rating_record in user_movie_ratings.take(10):
    print(rating_record)

(196, (242, 3.0))
(186, (302, 3.0))
(22, (377, 1.0))
(244, (51, 2.0))
(166, (346, 1.0))
(298, (474, 4.0))
(115, (265, 2.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))


In [6]:
# Self-join on user_id to pair up every two ratings given by the same user.
# After the join each record is (user_id, ((movie_a, rating_a), (movie_b, rating_b))).
#  - filter: keep only movie_a < movie_b, which drops self-pairs and mirrored duplicates
#  - map:    re-key by (movie_a, movie_b) with value (rating_a, rating_b)
#  - groupByKey: gather all co-ratings of each movie pair
user_mr_pairs = (
    user_movie_ratings
    .join(user_movie_ratings)
    .filter(lambda joined: joined[1][0][0] < joined[1][1][0])
    .map(lambda joined: ((joined[1][0][0], joined[1][1][0]),
                         (joined[1][0][1], joined[1][1][1])))
    .groupByKey()
)

In [7]:
# Sanity check: show ten ((movie_a, movie_b), <grouped rating pairs>) records.
for movie_pair_group in user_mr_pairs.take(10):
    print(movie_pair_group)

((197, 1097), <pyspark.resultiterable.ResultIterable object at 0x7ff438993310>)
((42, 364), <pyspark.resultiterable.ResultIterable object at 0x7ff438993090>)
((773, 1409), <pyspark.resultiterable.ResultIterable object at 0x7ff4389930d0>)
((273, 617), <pyspark.resultiterable.ResultIterable object at 0x7ff438993110>)
((372, 974), <pyspark.resultiterable.ResultIterable object at 0x7ff438993150>)
((789, 865), <pyspark.resultiterable.ResultIterable object at 0x7ff438993190>)
((496, 1314), <pyspark.resultiterable.ResultIterable object at 0x7ff4389931d0>)
((389, 493), <pyspark.resultiterable.ResultIterable object at 0x7ff4389934d0>)
((856, 1006), <pyspark.resultiterable.ResultIterable object at 0x7ff438993510>)
((552, 754), <pyspark.resultiterable.ResultIterable object at 0x7ff438993550>)


In [8]:
def conditional_probability_similarity(ratings):
    """Score a movie pair by how often both movies received the same rating.

    Args:
        ratings: iterable of (rating_a, rating_b) pairs. Any iterable is
            accepted (list, Spark ResultIterable, generator, ...); it is
            consumed exactly once.

    Returns:
        (similarity, num_pairs) where similarity is the fraction of pairs
        whose two ratings are equal and num_pairs is the number of pairs.
        An empty input yields (0.0, 0) instead of dividing by zero.
    """
    # Materialise once so that len() also works for one-shot iterables.
    rating_pairs = list(ratings)
    num_pairs = len(rating_pairs)
    if num_pairs == 0:
        return 0.0, 0
    # Only exact agreement counts towards the score.
    num_equal = sum(1 for x, y in rating_pairs if x == y)
    # 1.0* keeps the division floating-point under Python 2 as well.
    similarity = 1.0 * num_equal / num_pairs
    return similarity, num_pairs
    
# Score every movie pair: ((movie_a, movie_b), (similarity, num_pairs)).
movie_pair_similarities = user_mr_pairs.mapValues(
    conditional_probability_similarity
)

In [9]:
# Sanity check: show ten ((movie_a, movie_b), (similarity, num_pairs)) records.
for pair_similarity in movie_pair_similarities.take(10):
    print(pair_similarity)

((197, 1097), (0.2857142857142857, 7))
((42, 364), (0.1111111111111111, 18))
((773, 1409), (1.0, 1))
((273, 617), (0.2857142857142857, 7))
((372, 974), (0.0, 1))
((789, 865), (0.3333333333333333, 3))
((496, 1314), (0.5, 4))
((389, 493), (0.2857142857142857, 7))
((856, 1006), (0.3, 10))
((552, 754), (0.0, 2))


In [10]:
# Average rating per movie.
#  - map:         (movie_id, (rating, 1))
#  - reduceByKey: sum ratings and counts per movie
#  - mapValues:   (mean_rating, rating_count); the rating sum is a float,
#                 so the division is floating-point
movie_avg_ratings = (
    user_movie_ratings
    .map(lambda user_rating: (user_rating[1][0], (user_rating[1][1], 1)))
    .reduceByKey(lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1]))
    .mapValues(lambda totals: (totals[0] / totals[1], totals[1]))
)

In [11]:
# Sanity check: show ten (movie_id, (mean_rating, rating_count)) records.
for avg_record in movie_avg_ratings.take(10):
    print(avg_record)

(2, (3.2061068702290076, 131))
(4, (3.550239234449761, 209))
(6, (3.576923076923077, 26))
(8, (3.9954337899543377, 219))
(10, (3.831460674157303, 89))
(12, (4.385767790262173, 267))
(14, (3.9672131147540983, 183))
(16, (3.2051282051282053, 39))
(18, (2.8, 10))
(20, (3.4166666666666665, 72))


In [12]:
def process_movie_row(movie_row):
    """Parse one '|'-separated movie line into (movie_id, movie_title).

    Only the first two fields are used: the id is converted to int and
    the title is returned as-is.
    """
    # NOTE(review): this reuses the name of the ratings parser defined in an
    # earlier cell and shadows it from here on; consider a distinct name.
    fields = movie_row.split('|')
    return (int(fields[0]), fields[1])

# Attach titles to the per-movie averages.
# The join yields (movie_id, (title, (mean_rating, rating_count)));
# mapValues flattens the value to (title, mean_rating, rating_count).
movie_names_and_ratings = (
    sc.textFile("/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item")
      .map(process_movie_row)
      .join(movie_avg_ratings)
      .mapValues(lambda named: (named[0], named[1][0], named[1][1]))
)

In [13]:
# Show the ten lowest movie ids as: id [mean_rating, rating_count] - title.
for movie_record in movie_names_and_ratings.takeOrdered(10):
    print('{:4d} [{:4.2f}, {:3d}] - {}'.format(movie_record[0],
                                               movie_record[1][1],
                                               movie_record[1][2],
                                               movie_record[1][0]))

   1 [3.88, 452] - Toy Story (1995)
   2 [3.21, 131] - GoldenEye (1995)
   3 [3.03,  90] - Four Rooms (1995)
   4 [3.55, 209] - Get Shorty (1995)
   5 [3.30,  86] - Copycat (1995)
   6 [3.58,  26] - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
   7 [3.80, 392] - Twelve Monkeys (1995)
   8 [4.00, 219] - Babe (1995)
   9 [3.90, 299] - Dead Man Walking (1995)
  10 [3.83,  89] - Richard III (1995)


In [14]:
# Movie to find neighbours for, and the thresholds a pair must meet.
target_movie = 50
min_similarity = 0.25
min_num_rating_pairs = 25

# Keep the pairs that involve the target movie and meet both thresholds,
# then re-key each surviving pair by the *other* movie of the pair:
# (other_movie_id, (similarity, num_pairs)).
# `!=` replaces the Python-2-only `<>` operator (a syntax error in Python 3).
target_movie_filtered = (
    movie_pair_similarities
    .filter(lambda x: (x[0][0] == target_movie or x[0][1] == target_movie)
                      and x[1][0] >= min_similarity
                      and x[1][1] >= min_num_rating_pairs)
    .map(lambda x: (x[0][0] if x[0][0] != target_movie else x[0][1],
                    (x[1][0], x[1][1])))
)

In [15]:
# Top ten neighbours, ordered by similarity and then by number of
# rating pairs, both descending (hence the negated sort keys).
for neighbour in target_movie_filtered.takeOrdered(
        10, key=lambda x: (-x[1][0], -x[1][1])):
    print(neighbour)

(172, (0.6434782608695652, 345))
(114, (0.5862068965517241, 58))
(169, (0.5728155339805825, 103))
(174, (0.5710526315789474, 380))
(181, (0.5625, 480))
(641, (0.56, 25))
(963, (0.525, 40))
(408, (0.5217391304347826, 92))
(936, (0.5185185185185185, 27))
(1142, (0.4864864864864865, 37))


In [16]:
# Minimum mean rating a neighbour must have to be reported.
min_rating = 4

# Join the neighbours with their titles and averages.
# The joined value is ((similarity, num_pairs), (title, mean_rating, rating_count));
# it is flattened to (title, mean_rating, rating_count, num_pairs, similarity)
# and then filtered on mean_rating.
target_movie_filtered_named_rated = (
    target_movie_filtered
    .join(movie_names_and_ratings)
    .mapValues(lambda joined: (joined[1][0], joined[1][1], joined[1][2],
                               joined[0][1], joined[0][0]))
    .filter(lambda named: named[1][1] >= min_rating)
)

In [17]:
# Sanity check: show ten
# (movie_id, (title, mean_rating, rating_count, num_pairs, similarity)) records.
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the other cells; the bare `print r` statement is Python-2-only.
for r in target_movie_filtered_named_rated.take(10):
    print(r)

(512, (u'Wings of Desire (1987)', 4.0, 57, 47, 0.2765957446808511))
(520, (u'Great Escape, The (1963)', 4.104838709677419, 124, 107, 0.40186915887850466))
(528, (u'Killing Fields, The (1984)', 4.132231404958677, 121, 101, 0.36633663366336633))
(48, (u'Hoop Dreams (1994)', 4.094017094017094, 117, 104, 0.3942307692307692))
(56, (u'Pulp Fiction (1994)', 4.060913705583756, 394, 330, 0.396969696969697))
(96, (u'Terminator 2: Judgment Day (1991)', 4.0067796610169495, 295, 271, 0.3874538745387454))
(648, (u'Quiet Man, The (1952)', 4.029850746268656, 67, 57, 0.3684210526315789))
(168, (u'Monty Python and the Holy Grail (1974)', 4.0664556962025316, 316, 278, 0.4244604316546763))
(192, (u'Raging Bull (1980)', 4.120689655172414, 116, 96, 0.34375))
(272, (u'Good Will Hunting (1997)', 4.262626262626263, 198, 110, 0.4636363636363636))


In [18]:
# Header: the target movie's own (title, mean_rating, rating_count) record.
target_movie_r = movie_names_and_ratings.lookup(target_movie)[0]
print('Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'.format(
    target_movie, target_movie_r[1], target_movie_r[2], target_movie_r[0]))

# Top ten neighbours by similarity, ties broken by mean rating (both descending).
# Each line reads: id [mean_rating, num_pairs/rating_count] (similarity): title.
for similar_movie in target_movie_filtered_named_rated.takeOrdered(
        10, key=lambda x: (-x[1][4], -x[1][1])):
    print('{:4d} [{:.2f}, {:3d}/{:3d}] ({:.4f}): {}'.format(
        similar_movie[0], similar_movie[1][1], similar_movie[1][3],
        similar_movie[1][2], similar_movie[1][4], similar_movie[1][0]))

Most similar movies to   50 [4.36, 583] - Star Wars (1977)
 172 [4.20, 345/367] (0.6435): Empire Strikes Back, The (1980)
 114 [4.45,  58/ 67] (0.5862): Wallace & Gromit: The Best of Aardman Animation (1996)
 169 [4.47, 103/118] (0.5728): Wrong Trousers, The (1993)
 174 [4.25, 380/420] (0.5711): Raiders of the Lost Ark (1981)
 181 [4.01, 480/507] (0.5625): Return of the Jedi (1983)
 641 [4.21,  25/ 33] (0.5600): Paths of Glory (1957)
 963 [4.29,  40/ 41] (0.5250): Some Folks Call It a Sling Blade (1993)
 408 [4.49,  92/112] (0.5217): Close Shave, A (1995)
1142 [4.05,  37/ 44] (0.4865): When We Were Kings (1996)
 478 [4.12,  87/104] (0.4828): Philadelphia Story, The (1940)
