# 35.4 Movie similarities
- MovieLens 1M dataset (`ml-1m`)
- Runs on a Spark cluster
- RDDs explicitly partitioned
- Broadcasting title, average rating, and genre vector dictionary
- Removing badly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
# Point the driver at the cluster master and name the application,
# then create the SparkContext used by every later cell.
conf=(
    SparkConf()
    .setMaster("spark://1.1.1.102:7077")
    .setAppName("MovieSimilaritiesCluster004")
)
sc=SparkContext(conf=conf)

In [3]:
# Partition count passed to every partitionBy() call in this notebook.
num_partitions=8

In [4]:
def process_movie_row(movie_row):
    """Parse one ``'::'``-separated ratings line.

    The first three fields are read as user id, movie id and rating; any
    further fields are ignored.

    Returns:
        ``(user_id, (movie_id, rating))`` with integer ids and a float rating.
    """
    fields=movie_row.split('::')
    return int(fields[0]),(int(fields[1]),float(fields[2]))

# Read the ratings file, parse each line into (user_id, (movie_id, rating))
# and partition the result by user_id (the record key).
user_movie_ratings=(
    sc.textFile("file:///home/sparky/jupyter/ZZ_Data/ml-1m/ratings.dat")
      .map(process_movie_row)
      .partitionBy(num_partitions)
)

In [5]:
# Check how many partitions the ratings RDD ended up with.
user_movie_ratings.getNumPartitions()

8

In [7]:
# Peek at a few (user_id, (movie_id, rating)) records.
# print(...) with a single argument behaves the same as the print statement
# and matches the print() calls used by the other cells.
for r in user_movie_ratings.take(10):
    print(r)

(3096, (3930, 3.0))
(3096, (3936, 4.0))
(3096, (2053, 4.0))
(3096, (3793, 5.0))
(3096, (1253, 5.0))
(3096, (3798, 5.0))
(3096, (587, 5.0))
(3096, (589, 4.0))
(3096, (3, 3.0))
(3096, (1408, 5.0))


In [8]:
# Per-movie (average rating, number of ratings), collected to the driver as
# a plain dict keyed by movie_id.
movie_ar_dict=(
    user_movie_ratings
    .map(lambda rec: (rec[1][0],(rec[1][1],1)))          # movie_id -> (rating, 1)
    .reduceByKey(lambda a,b: (a[0]+b[0],a[1]+b[1]))      # sum ratings and counts
    .mapValues(lambda tot: (tot[0]/tot[1],tot[1]))       # (mean, count)
    .collectAsMap()
)

In [9]:
# Show the ten lowest movie ids with their (average, count).
# sorted() gives a deterministic sample and does not rely on keys()
# returning a sliceable list.
for key in sorted(movie_ar_dict)[:10]:
    print('{:4d} [{:.2f} {:4d}]'.format(key,*movie_ar_dict[key]))

   1 [4.15 2077]
   2 [3.20  701]
   3 [3.02  478]
   4 [2.73  170]
   5 [3.01  296]
   6 [3.88  940]
   7 [3.41  458]
   8 [3.01   68]
   9 [2.66  102]
  10 [3.54  888]


In [11]:
def build_movie_dict(file_nom,ratings_dict=None):
    """Build a movie metadata lookup keyed by movie id.

    Every non-blank line of ``file_nom`` is split on ``'::'`` into a movie
    id, a title and a ``'|'``-separated genre list.

    Args:
        file_nom: Path of the movies file to read.
        ratings_dict: Optional mapping ``movie_id -> (average, count, ...)``.
            Movies absent from it get an average and a count of 0.

    Returns:
        dict mapping ``movie_id`` to
        ``(movie_avg, movie_cnt, movie_title, movie_genres)``, where
        ``movie_genres`` is a 0/1 list with one slot per known genre.

    Raises:
        ValueError: if a movie id is not an integer or a genre name is not
            one of the known genres.
    """
    if ratings_dict is None:
        ratings_dict={}
    # The genre vocabulary is loop-invariant, so build it (and its
    # name -> slot index) once instead of once per row.
    genres=['Action','Adventure','Animation','Children\'s','Comedy','Crime',
            'Documentary','Drama','Fantasy','Film-Noir','Horror',
            'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
    genre_slot=dict((genre,slot) for slot,genre in enumerate(genres))
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            movie_row=movie_row.strip()
            if not movie_row:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            movie=movie_row.split('::')
            movie_id=int(movie[0])
            movie_title=movie[1]
            movie_genres=[0]*len(genres)
            for genre in movie[2].split('|'):
                if genre not in genre_slot:
                    raise ValueError('unknown genre {!r} for movie {}'.format(genre,movie_id))
                movie_genres[genre_slot[genre]]=1
            # Direct membership test: hash lookup instead of scanning keys().
            if movie_id in ratings_dict:
                movie_avg=ratings_dict[movie_id][0]
                movie_cnt=ratings_dict[movie_id][1]
            else:
                movie_avg=0
                movie_cnt=0
            movie_dict[movie_id]=(movie_avg,movie_cnt,movie_title,movie_genres)
    return movie_dict

# Build the movie lookup on the driver (metadata merged with the per-movie
# rating averages) and broadcast it.
movie_dict=sc.broadcast(
    build_movie_dict(file_nom='/home/sparky/jupyter/ZZ_Data/ml-1m/movies.dat',
                     ratings_dict=movie_ar_dict)
)

In [12]:
# Show the ten lowest movie ids from the broadcast lookup.
# sorted() gives a deterministic sample and does not rely on keys()
# returning a sliceable list.
for key in sorted(movie_dict.value)[:10]:
    print('{:4d} [{:.2f} {:4d}] - {} - {}'.format(key,*movie_dict.value[key]))

   1 [4.15 2077] - Toy Story (1995) - [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   2 [3.20  701] - Jumanji (1995) - [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   3 [3.02  478] - Grumpier Old Men (1995) - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
   4 [2.73  170] - Waiting to Exhale (1995) - [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   5 [3.01  296] - Father of the Bride Part II (1995) - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   6 [3.88  940] - Heat (1995) - [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   7 [3.41  458] - Sabrina (1995) - [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
   8 [3.01   68] - Tom and Huck (1995) - [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   9 [2.66  102] - Sudden Death (1995) - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  10 [3.54  888] - GoldenEye (1995) - [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [13]:
# Self-join the ratings on user_id to get every pair of movies rated by the
# same user, keep each unordered pair once (lower movie id first), key the
# record by (movie_a, movie_b) and gather the paired ratings per movie pair.
# The final value is [ratings_for_movie_a, ratings_for_movie_b] as two
# parallel tuples.
user_mr_pairs=(user_movie_ratings.join(user_movie_ratings)
                                 .filter(lambda x: x[1][0][0]<x[1][1][0])
                                 .map(lambda x: ((x[1][0][0],x[1][1][0]),[(x[1][0][1],x[1][1][1])]))
                                 .partitionBy(num_partitions)
                                 .reduceByKey(lambda x,y: x+y)
                                 # list(...) keeps the value a concrete, picklable list
                                 # even where zip() returns a lazy iterator.
                                 .mapValues(lambda x: list(zip(*x))))

In [14]:
# Peek at a few ((movie_a, movie_b), [ratings_a, ratings_b]) records.
for r in user_mr_pairs.take(10):
    print(r)

((530, 640), [(4.0,), (1.0,)])
((3066, 3072), [(3.0, 4.0, 4.0, 4.0, 4.0, 3.0, 2.0, 4.0, 5.0, 4.0, 5.0, 3.0, 1.0, 4.0, 4.0, 4.0, 3.0, 3.0, 5.0, 4.0, 3.0, 4.0, 4.0, 4.0, 5.0, 3.0, 2.0, 4.0, 4.0, 3.0, 5.0, 3.0, 5.0, 3.0, 4.0, 3.0, 3.0, 2.0, 3.0, 4.0, 1.0, 3.0, 5.0, 2.0, 4.0, 3.0, 5.0, 3.0, 2.0, 2.0, 3.0, 5.0, 3.0, 2.0, 3.0, 3.0, 4.0, 3.0, 2.0, 4.0, 2.0, 5.0, 3.0, 3.0, 4.0, 3.0, 5.0, 5.0, 4.0, 3.0, 3.0, 5.0, 4.0, 3.0, 3.0, 2.0, 3.0, 4.0, 3.0, 3.0, 4.0, 4.0, 3.0, 2.0, 5.0, 3.0, 5.0, 1.0), (3.0, 3.0, 5.0, 4.0, 5.0, 3.0, 4.0, 4.0, 1.0, 5.0, 2.0, 2.0, 4.0, 4.0, 4.0, 4.0, 5.0, 3.0, 3.0, 5.0, 3.0, 4.0, 4.0, 4.0, 3.0, 4.0, 5.0, 4.0, 4.0, 4.0, 4.0, 1.0, 5.0, 4.0, 4.0, 4.0, 5.0, 4.0, 2.0, 5.0, 3.0, 1.0, 3.0, 5.0, 3.0, 3.0, 4.0, 3.0, 4.0, 5.0, 3.0, 1.0, 4.0, 4.0, 2.0, 3.0, 3.0, 2.0, 1.0, 5.0, 4.0, 3.0, 5.0, 4.0, 4.0, 4.0, 3.0, 2.0, 3.0, 5.0, 5.0, 5.0, 4.0, 5.0, 3.0, 4.0, 2.0, 3.0, 4.0, 4.0, 4.0, 5.0, 3.0, 5.0, 3.0, 3.0, 4.0, 3.0)])
((3052, 3202), [(4.0,), (1.0,)])
((1176, 1606), [(5.0, 3.0, 3.0), (3

In [15]:
def cosine_similarity(v1,v2):
    """Return ``(cosine similarity of v1 and v2, len(v1))``.

    Args:
        v1, v2: Equal-length sequences of numbers.

    Returns:
        A 2-tuple ``(similarity, num_pairs)`` where ``num_pairs`` is
        ``len(v1)``. The similarity is ``0`` when it is undefined: when
        either vector has zero norm (including empty vectors) or when the
        two vectors differ in length.

    Any other error (for example non-numeric elements) is propagated
    instead of being silently turned into a similarity of 0.
    """
    from math import sqrt  # kept function-local, as before

    def mult_vectors(a,b):
        # Dot product; the 1.0 factor forces float arithmetic.
        return sum(1.0*x*y for x,y in zip(a,b))

    num_pairs=len(v1)
    if len(v2)!=num_pairs:
        return 0,num_pairs
    norm_product=sqrt(mult_vectors(v1,v1))*sqrt(mult_vectors(v2,v2))
    if norm_product==0:
        # A zero-norm vector has no direction, so report no similarity.
        return 0,num_pairs
    return mult_vectors(v1,v2)/norm_product,num_pairs
    
# Score every movie pair from its two rating vectors; each value becomes
# (cosine similarity, number of rating pairs).
movie_pair_ur_similarities=user_mr_pairs.mapValues(lambda x: cosine_similarity(*x))

In [16]:
# Peek at a few ((movie_a, movie_b), (similarity, num_pairs)) records.
for r in movie_pair_ur_similarities.take(10):
    print(r)

((82, 2496), (0.794752861321983, 11))
((3052, 3202), (1.0, 1))
((1176, 1606), (0.9446104285307813, 3))
((12, 2842), (0.9333809511662426, 4))
((1193, 2621), (0.9539457271989132, 29))
((1090, 2144), (0.9599744728931857, 349))
((346, 1472), (0.974755499671922, 9))
((2970, 3664), (1.0, 1))
((181, 913), (0.9060787876015473, 19))
((2089, 3821), (0.8725264017661245, 21))


In [17]:
def adjust_rating_by_genre(movie_pair,genre_weight=0.25):
    """Add a weighted genre similarity to a movie pair's rating similarity.

    Args:
        movie_pair: ``((movie_a, movie_b), (rating_similarity, num_pairs))``.
        genre_weight: Factor applied to the genre cosine similarity before
            it is added to the rating similarity.

    Returns:
        ``((movie_a, movie_b), (rating_similarity + genre_weight * genre_similarity, num_pairs))``.
        The genre vectors come from element 3 of the broadcast ``movie_dict``
        entries.
    """
    pair_ids=movie_pair[0]
    pair_stats=movie_pair[1]
    catalog=movie_dict.value
    genre_score=cosine_similarity(catalog[pair_ids[0]][3],catalog[pair_ids[1]][3])[0]
    blended=pair_stats[0]+genre_weight*genre_score
    return (pair_ids,(blended,pair_stats[1]))

# Blend the genre similarity into each pair's score, then partition the
# result by its (movie_a, movie_b) key.
movie_pair_similarities=(
    movie_pair_ur_similarities
    .map(adjust_rating_by_genre)
    .partitionBy(num_partitions)
)

In [18]:
# Check how many partitions the combined-similarity RDD ended up with.
movie_pair_similarities.getNumPartitions()

8

In [19]:
# Peek at a few ((movie_a, movie_b), (combined_similarity, num_pairs)) records.
# print(...) with a single argument behaves the same as the print statement
# and matches the print() calls used by the other cells.
for r in movie_pair_similarities.take(10):
    print(r)

((763, 3447), (1.25, 1))
((2317, 2617), (0.9949366763261821, 3))
((1954, 2600), (1.0161765475591253, 126))
((1410, 1872), (1.176776695296637, 1))
((479, 2131), (0.8408409924953906, 3))
((1654, 2852), (1.012956620679228, 4))
((1194, 2176), (0.9388293358648743, 30))
((1425, 1869), (0.8946300779688404, 6))
((480, 3838), (0.9007318732256941, 28))
((265, 2613), (0.9129600438427359, 45))


In [26]:
# Movie id to find recommendations for.
target_movie=260

# Thresholds applied to candidate movies:
#   min_rating           - minimum average rating (applied in a later cell)
#   min_similarity       - minimum combined similarity; the genre bonus is
#                          added on top of the rating similarity, so this
#                          score can exceed 1.0
#   min_num_rating_pairs - minimum number of rating pairs behind the score
min_rating=4
min_similarity=0.95
min_num_rating_pairs=250

# Keep the pairs that involve the target movie and clear both thresholds,
# then re-key each surviving record by the *other* movie of the pair.
# `!=` replaces the deprecated `<>` spelling of the same comparison.
sims_to_target=(movie_pair_similarities.filter(lambda x: (x[0][0]==target_movie or x[0][1]==target_movie) and
                                                         x[1][0]>=min_similarity and
                                                         x[1][1]>=min_num_rating_pairs)
                                       .map(lambda x: (x[0][0] if x[0][0]!=target_movie else x[0][1],
                                                       x[1])))

In [29]:
# Top ten candidates by combined similarity, ties broken by number of
# rating pairs (both descending).
# print(...) with a single argument behaves the same as the print statement
# and matches the print() calls used by the other cells.
for r in sims_to_target.takeOrdered(10,key=lambda x: (-x[1][0],-x[1][1])):
    print(r)

(2628, (1.208293054475818, 1678))
(2105, (1.20517218329955, 893))
(2640, (1.1849154371081836, 1153))
(1374, (1.1843208887582921, 1247))
(480, (1.1829358746214107, 1824))
(1375, (1.1819134194533014, 833))
(1356, (1.1813526267333057, 1083))
(1376, (1.1784875753299524, 1019))
(2528, (1.1759700974811342, 599))
(3698, (1.1759502418135537, 651))


In [27]:
# Extend each candidate with data from the broadcast lookup, giving
# (movie_id, (avg_rating, similarity, num_pairs, num_ratings, title)),
# then drop the movies whose average rating is below min_rating.
sims_to_target_ext_fil=(
    sims_to_target
    .map(lambda sim: (sim[0],(movie_dict.value[sim[0]][0],
                              sim[1][0],sim[1][1],
                              movie_dict.value[sim[0]][1],
                              movie_dict.value[sim[0]][2])))
    .filter(lambda row: row[1][0]>=min_rating)
)

In [30]:
# First ten extended candidates in natural (movie id) order.
# print(...) with a single argument behaves the same as the print statement
# and matches the print() calls used by the other cells.
for r in sims_to_target_ext_fil.takeOrdered(10):
    print(r)

(1, (4.146846413095811, 0.9721270419610062, 1382, 2077, 'Toy Story (1995)'))
(17, (4.027544910179641, 0.9639639121739775, 479, 835, 'Sense and Sensibility (1995)'))
(29, (4.062034739454094, 1.1390315217600202, 344, 403, 'City of Lost Children, The (1995)'))
(47, (4.106420404573439, 0.9619409530596446, 801, 1137, 'Seven (Se7en) (1995)'))
(50, (4.517106001121705, 0.9766875136831684, 1194, 1783, 'Usual Suspects, The (1995)'))
(58, (4.093812375249501, 0.9608543463218306, 296, 501, 'Postino, Il (The Postman) (1994)'))
(110, (4.234957020057307, 1.0420684116040864, 1554, 2443, 'Braveheart (1995)'))
(111, (4.183870967741935, 0.9609147058314133, 886, 1240, 'Taxi Driver (1976)'))
(150, (4.073541167066347, 0.9724531455121123, 811, 1251, 'Apollo 13 (1995)'))
(162, (4.063136456211812, 0.9513303336744755, 335, 491, 'Crumb (1994)'))


In [28]:
# Header line: id, average rating, rating count and title of the target movie.
target_info=movie_dict.value[target_movie]
print('Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'.format(target_movie,*target_info[:3]))
# Ten best candidates, ordered by similarity and then by average rating
# (both descending). Each line shows:
# id [similarity, rating pairs/total ratings] (average rating): title
for r in sims_to_target_ext_fil.takeOrdered(10,key=lambda x: (-x[1][1],-x[1][0])):
    print('{:4d} [{:.4f}, {:4d}/{:4d}] ({:.2f}): {}'.format(
        r[0],r[1][1],r[1][2],r[1][3],r[1][0],r[1][4]))

Most similar movies to  260 [4.45, 2991] - Star Wars: Episode IV - A New Hope (1977)
1198 [1.1623, 1972/2514] (4.48): Raiders of the Lost Ark (1981)
1196 [1.1575, 2355/2990] (4.29): Star Wars: Episode V - The Empire Strikes Back (1980)
1291 [1.1542, 1397/1628] (4.13): Indiana Jones and the Last Crusade (1989)
1210 [1.1518, 2113/2883] (4.02): Star Wars: Episode VI - Return of the Jedi (1983)
  29 [1.1390,  344/ 403] (4.06): City of Lost Children, The (1995)
1240 [1.1189, 1746/2098] (4.15): Terminator, The (1984)
2571 [1.1176, 1908/2590] (4.32): Matrix, The (1999)
1287 [1.1170,  600/ 704] (4.11): Ben-Hur (1959)
 589 [1.1147, 1889/2649] (4.06): Terminator 2: Judgment Day (1991)
3000 [1.1087,  287/ 345] (4.15): Princess Mononoke, The (Mononoke Hime) (1997)
