In [1]:
import findspark
# Runs before the `from pyspark import ...` in the following cell; presumably this
# is what makes the local Spark installation importable -- confirm for your setup.
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# One shared context for the whole notebook, configured with the "local[*]" master.
conf = (SparkConf()
        .setMaster("local[*]")
        .setAppName("MovieSimilaritiesRedux011"))
sc = SparkContext(conf=conf)

In [3]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated ratings line.

    The first three fields are read as user id, movie id and rating; any
    further fields on the line are ignored.

    Returns:
        ``(user_id, (movie_id, rating))`` with integer ids and a float rating.
    """
    fields = movie_row.split()
    user_id, movie_id = int(fields[0]), int(fields[1])
    return user_id, (movie_id, float(fields[2]))

# Every rating keyed by the user who gave it: (user_id, (movie_id, rating)).
user_movie_ratings = sc.textFile(
    "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data"
).map(process_movie_row)

In [4]:
# Peek at a few parsed (user_id, (movie_id, rating)) records.
for rating_row in user_movie_ratings.take(10):
    print(rating_row)

(196, (242, 3.0))
(186, (302, 3.0))
(22, (377, 1.0))
(244, (51, 2.0))
(166, (346, 1.0))
(298, (474, 4.0))
(115, (265, 2.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))


In [5]:
# For every pair of movies rated by the same user, collect two aligned rating
# vectors: ((movie_a, movie_b), [ratings_for_a, ratings_for_b]).
user_mr_pairs=(user_movie_ratings.join(user_movie_ratings)
                                 # self-join on user_id gives (user, ((movie_a, rating_a), (movie_b, rating_b)));
                                 # keeping movie_a < movie_b drops self-pairs and mirrored duplicates
                                 .filter(lambda x: x[1][0][0]<x[1][1][0])
                                 # re-key by the movie pair, keep only the two ratings
                                 .map(lambda x: ((x[1][0][0],x[1][1][0]),(x[1][0][1],x[1][1][1])))
                                 .groupByKey()
                                 # transpose the (rating_a, rating_b) pairs into two tuples; list() makes the
                                 # value a real list on Python 3 too, where zip() alone is a one-shot iterator
                                 .mapValues(lambda x: list(zip(*x))))

In [6]:
# Show a sample of movie pairs with their aligned rating vectors.
for pair_row in user_mr_pairs.take(10):
    print(pair_row)

((197, 1097), [(5.0, 4.0, 3.0, 5.0, 3.0, 4.0, 2.0), (4.0, 5.0, 2.0, 4.0, 3.0, 4.0, 3.0)])
((42, 364), [(4.0, 5.0, 4.0, 4.0, 3.0, 5.0, 3.0, 4.0, 1.0, 4.0, 5.0, 4.0, 5.0, 5.0, 4.0, 4.0, 5.0, 2.0), (3.0, 2.0, 3.0, 3.0, 2.0, 1.0, 4.0, 2.0, 1.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 4.0, 2.0, 4.0)])
((773, 1409), [(1.0,), (1.0,)])
((273, 617), [(3.0, 5.0, 4.0, 3.0, 3.0, 3.0, 5.0), (3.0, 4.0, 5.0, 4.0, 3.0, 5.0, 4.0)])
((372, 974), [(3.0,), (2.0,)])
((789, 865), [(3.0, 5.0, 3.0), (4.0, 5.0, 4.0)])
((496, 1314), [(4.0, 3.0, 5.0, 3.0), (3.0, 3.0, 3.0, 3.0)])
((389, 493), [(5.0, 3.0, 4.0, 3.0, 3.0, 4.0, 4.0), (4.0, 5.0, 3.0, 3.0, 4.0, 4.0, 5.0)])
((856, 1006), [(4.0, 3.0, 5.0, 3.0, 4.0, 1.0, 3.0, 4.0, 4.0, 3.0), (4.0, 2.0, 3.0, 2.0, 4.0, 1.0, 2.0, 3.0, 3.0, 4.0)])
((552, 754), [(5.0, 1.0), (3.0, 3.0)])


In [7]:
def jaccard_similarity(v1,v2):
    """Score how alike two equal-length vectors are, position by position.

    The score is the fraction of positions at which ``v1`` and ``v2`` hold the
    same value. That is what the previously used
    ``sklearn.metrics.jaccard_similarity_score`` returned for flat (non
    multilabel) inputs like the ones in this notebook; that function has since
    been deprecated and removed from scikit-learn, so the value is computed
    here without the dependency.

    Args:
        v1: sequence of values (ratings or 0/1 genre flags).
        v2: sequence of the same length as ``v1``.

    Returns:
        ``(similarity, num_pairs)`` where ``similarity`` is a float in [0, 1]
        and ``num_pairs`` is ``len(v1)``. Empty input yields ``(0.0, 0)``.

    Raises:
        ValueError: if the two vectors differ in length.
    """
    num_pairs=len(v1)
    if num_pairs!=len(v2):
        raise ValueError('vectors must have the same length: {} != {}'.format(num_pairs,len(v2)))
    if num_pairs==0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0,0
    matches=sum(1 for a,b in zip(v1,v2) if a==b)
    # float() keeps this a true division on Python 2 as well.
    return matches/float(num_pairs),num_pairs
    
# Rating-based similarity per movie pair: ((movie_a, movie_b), (similarity, num_pairs)).
movie_pair_ur_similarities = user_mr_pairs.mapValues(
    lambda rating_vectors: jaccard_similarity(*rating_vectors))

In [31]:
# First ten movie pairs in key order, with their rating-based similarity.
for similarity_row in movie_pair_ur_similarities.takeOrdered(10):
    print(similarity_row)

((1, 2), (0.25961538461538464, 104))
((1, 3), (0.25641025641025639, 78))
((1, 4), (0.3087248322147651, 149))
((1, 5), (0.43859649122807015, 57))
((1, 6), (0.5, 14))
((1, 7), (0.33096085409252668, 281))
((1, 8), (0.41558441558441561, 154))
((1, 9), (0.30348258706467662, 201))
((1, 10), (0.31147540983606559, 61))
((1, 11), (0.375, 160))


In [9]:
def process_movie_row_genre(movie_row):
    """Parse one pipe-delimited movie line into its id and genre flags.

    The first field is the movie id; every field from the sixth onwards is
    converted to an integer and returned as the genre list.

    Returns:
        ``(movie_id, [flag, flag, ...])``.
    """
    fields = movie_row.split('|')
    return int(fields[0]), list(map(int, fields[5:]))

# One (movie_id, genre_flags) record per line of the movie catalogue file.
processed_movie_genres = sc.textFile(
    "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item"
).map(process_movie_row_genre)

In [10]:
# First ten movies by id with their genre flag vectors.
for movie_id,genre_flags in processed_movie_genres.takeOrdered(10):
    print('{:4d} - {}'.format(movie_id,genre_flags))

   1 - [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   2 - [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   3 - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   4 - [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   5 - [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   6 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   7 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
   8 - [0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   9 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  10 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [11]:
# All unordered movie pairs with both genre vectors:
# ((movie_a, movie_b), (genres_a, genres_b)) with movie_a < movie_b.
movie_gr_pairs=(
    processed_movie_genres
    .cartesian(processed_movie_genres)
    .filter(lambda pair: pair[0][0]<pair[1][0])
    .map(lambda pair: ((pair[0][0],pair[1][0]),(pair[0][1],pair[1][1]))))

In [12]:
# Sample a few movie pairs with their genre vectors.
# print() as a function matches the other cells and also runs on Python 3.
for r in movie_gr_pairs.take(5):
    print(r)

((1, 2), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]))
((1, 3), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]))
((1, 4), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
((1, 5), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]))
((1, 6), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))


In [13]:
# Genre-based similarity per movie pair; only the score is kept, not the vector length.
movie_pair_gr_similarities = movie_gr_pairs.mapValues(
    lambda genre_vectors: jaccard_similarity(*genre_vectors)[0])

In [14]:
# First ten movie pairs in key order, with their genre-based similarity.
# print() as a function matches the other cells and also runs on Python 3.
for r in movie_pair_gr_similarities.takeOrdered(10):
    print(r)

((1, 2), 0.68421052631578949)
((1, 3), 0.78947368421052633)
((1, 4), 0.78947368421052633)
((1, 5), 0.68421052631578949)
((1, 6), 0.78947368421052633)
((1, 7), 0.73684210526315785)
((1, 8), 0.89473684210526316)
((1, 9), 0.78947368421052633)
((1, 10), 0.73684210526315785)
((1, 11), 0.73684210526315785)


In [15]:
genre_weight=0.25

# Combined score = rating similarity + genre_weight * genre similarity.
# The sum is not renormalised. The joined value is
# ((rating_similarity, num_pairs), genre_similarity); num_pairs is carried through.
movie_pair_similarities=(
    movie_pair_ur_similarities
    .join(movie_pair_gr_similarities)
    .mapValues(lambda joined: (joined[0][0]+genre_weight*joined[1],joined[0][1])))

In [16]:
# First ten movie pairs in key order, with combined score and pair count.
# print() as a function matches the other cells and also runs on Python 3.
for r in movie_pair_similarities.takeOrdered(10):
    print(r)

((1, 2), (0.43066801619433204, 104))
((1, 3), (0.45377867746288797, 78))
((1, 4), (0.50609325326739674, 149))
((1, 5), (0.60964912280701755, 57))
((1, 6), (0.69736842105263164, 14))
((1, 7), (0.51517138040831612, 281))
((1, 8), (0.63926862611073143, 154))
((1, 9), (0.50085100811730821, 201))
((1, 10), (0.49568593615185508, 61))
((1, 11), (0.55921052631578949, 160))


In [17]:
# Per movie: (mean rating, number of ratings). Sum ratings and counts, then divide.
movie_avg_ratings=(
    user_movie_ratings
    .map(lambda row: (row[1][0],(row[1][1],1)))
    .reduceByKey(lambda a, b: (a[0]+b[0],a[1]+b[1]))
    .mapValues(lambda total: (total[0]/total[1],total[1])))

In [18]:
# Sample of (movie_id, (mean_rating, num_ratings)) records.
for avg_row in movie_avg_ratings.take(10):
    print(avg_row)

(2, (3.2061068702290076, 131))
(4, (3.550239234449761, 209))
(6, (3.576923076923077, 26))
(8, (3.9954337899543377, 219))
(10, (3.831460674157303, 89))
(12, (4.385767790262173, 267))
(14, (3.9672131147540983, 183))
(16, (3.2051282051282053, 39))
(18, (2.8, 10))
(20, (3.4166666666666665, 72))


In [19]:
def process_movie_row_names_and_ratings(movie_row):
    """Extract the id and title from one pipe-delimited movie line.

    Returns:
        ``(movie_id, movie_title)``: the first field as an int and the second
        field unchanged.
    """
    fields = movie_row.split('|')
    return int(fields[0]), fields[1]

# (movie_id, (title, avg_rating, num_ratings)); the join keeps only ids that
# appear both in the catalogue and in movie_avg_ratings.
movie_names_and_ratings=(
    sc.textFile("/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item")
    .map(process_movie_row_names_and_ratings)
    .join(movie_avg_ratings)
    .mapValues(lambda joined: (joined[0],joined[1][0],joined[1][1])))

In [20]:
# First ten movies by id: id [mean rating, rating count] - title.
for movie_id,(title,avg_rating,num_ratings) in movie_names_and_ratings.takeOrdered(10):
    print('{:4d} [{:4.2f}, {:3d}] - {}'.format(movie_id,avg_rating,num_ratings,title))

   1 [3.88, 452] - Toy Story (1995)
   2 [3.21, 131] - GoldenEye (1995)
   3 [3.03,  90] - Four Rooms (1995)
   4 [3.55, 209] - Get Shorty (1995)
   5 [3.30,  86] - Copycat (1995)
   6 [3.58,  26] - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
   7 [3.80, 392] - Twelve Monkeys (1995)
   8 [4.00, 219] - Babe (1995)
   9 [3.90, 299] - Dead Man Walking (1995)
  10 [3.83,  89] - Richard III (1995)


In [26]:
# Query parameters.
target_movie=50            # movie id to find neighbours for
min_similarity=0.50        # lowest combined similarity score to keep
min_num_rating_pairs=25    # lowest number of rating pairs behind a score to keep

# Keep pairs that involve the target movie and pass both thresholds, then re-key
# each one by the *other* movie of the pair: (other_movie_id, (similarity, num_pairs)).
# `!=` replaces the Python 2-only `<>` operator, and the surrounding parentheses
# make backslash continuations unnecessary.
target_movie_filtered=(movie_pair_similarities
                       .filter(lambda x: (x[0][0]==target_movie or x[0][1]==target_movie)
                                         and x[1][0]>=min_similarity
                                         and x[1][1]>=min_num_rating_pairs)
                       .map(lambda x: (x[0][0] if x[0][0]!=target_movie else x[0][1],
                                       (x[1][0],x[1][1]))))

In [27]:
# Top ten neighbours by descending similarity, ties broken by descending pair count.
for neighbour in target_movie_filtered.takeOrdered(10,key=lambda kv: (-kv[1][0],-kv[1][1])):
    print(neighbour)

(172, (0.88032036613272302, 345))
(181, (0.8125, 480))
(174, (0.78157894736842104, 380))
(114, (0.75725952813067143, 58))
(641, (0.74421052631578954, 25))
(169, (0.7307102708226878, 103))
(936, (0.68957115009746583, 27))
(963, (0.68289473684210522, 40))
(173, (0.67883617494440318, 284))
(483, (0.67867683226758491, 214))


In [28]:
min_rating=4

# The joined value is ((similarity, num_pairs), (title, avg_rating, num_ratings)).
# Flatten it to (title, avg_rating, num_ratings, num_pairs, similarity) and keep
# only movies whose average rating is at least min_rating.
target_movie_filtered_named_rated=(
    target_movie_filtered
    .join(movie_names_and_ratings)
    .mapValues(lambda joined: (joined[1][0],joined[1][1],joined[1][2],
                               joined[0][1],joined[0][0]))
    .filter(lambda kv: kv[1][1]>=min_rating))

In [29]:
# Sample of (movie_id, (title, avg_rating, num_ratings, num_pairs, similarity)) records.
# print() as a function matches the other cells and also runs on Python 3.
for r in target_movie_filtered_named_rated.take(10):
    print(r)

(12, (u'Usual Suspects, The (1995)', 4.385767790262173, 267, 223, 0.6332310597120604))
(528, (u'Killing Fields, The (1984)', 4.132231404958677, 121, 101, 0.55054715997915582))
(48, (u'Hoop Dreams (1994)', 4.094017094017094, 117, 104, 0.56528340080971662))
(60, (u'Three Colors: Blue (1993)', 4.015625, 64, 53, 0.51067527308838134))
(96, (u'Terminator 2: Judgment Day (1991)', 4.0067796610169495, 295, 271, 0.58482229559137699))
(132, (u'Wizard of Oz, The (1939)', 4.0772357723577235, 246, 221, 0.56513455584663008))
(648, (u'Quiet Man, The (1952)', 4.029850746268656, 67, 57, 0.55263157894736836))
(168, (u'Monty Python and the Holy Grail (1974)', 4.0664556962025316, 316, 278, 0.59551306323362363))
(180, (u'Apocalypse Now (1979)', 4.04524886877828, 221, 183, 0.57765314926660916))
(192, (u'Raging Bull (1980)', 4.120689655172414, 116, 96, 0.51480263157894735))


In [30]:
# Header for the target movie; lookup() returns a list of matching values and the
# first one is taken as (title, avg_rating, num_ratings).
target_movie_r=movie_names_and_ratings.lookup(target_movie)[0]
print('Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'.format(
    target_movie,target_movie_r[1],target_movie_r[2],target_movie_r[0]))

# Ten best matches by descending similarity, ties broken by descending average rating.
# Each line reads: id [avg rating, rating pairs/total ratings] (similarity): title.
for movie_id,(title,avg_rating,num_ratings,num_pairs,similarity) in \
        target_movie_filtered_named_rated.takeOrdered(10,key=lambda kv: (-kv[1][4],-kv[1][1])):
    print('{:4d} [{:.2f}, {:3d}/{:3d}] ({:.4f}): {}'.format(
        movie_id,avg_rating,num_pairs,num_ratings,similarity,title))

Most similar movies to   50 [4.36, 583] - Star Wars (1977)
 172 [4.20, 345/367] (0.8803): Empire Strikes Back, The (1980)
 181 [4.01, 480/507] (0.8125): Return of the Jedi (1983)
 174 [4.25, 380/420] (0.7816): Raiders of the Lost Ark (1981)
 114 [4.45,  58/ 67] (0.7573): Wallace & Gromit: The Best of Aardman Animation (1996)
 641 [4.21,  25/ 33] (0.7442): Paths of Glory (1957)
 169 [4.47, 103/118] (0.7307): Wrong Trousers, The (1993)
 963 [4.29,  40/ 41] (0.6829): Some Folks Call It a Sling Blade (1993)
 173 [4.17, 284/324] (0.6788): Princess Bride, The (1987)
 483 [4.46, 214/243] (0.6787): Casablanca (1942)
 498 [4.18, 138/152] (0.6716): African Queen, The (1951)
