# 35.1 Movie similarities
- 100K
- Cluster
- Not partitioned
- Removing badly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [2]:
# findspark.init() has to run before the first pyspark import in this kernel:
# per the findspark documentation it adds the Spark installation's Python
# libraries to sys.path.
import findspark
findspark.init()

In [3]:
from pyspark import SparkConf, SparkContext

# Point the application at the standalone cluster master and give it a name.
conf = SparkConf().setMaster("spark://1.1.1.102:7077").setAppName("MovieSimilaritiesCluster001")
# getOrCreate() returns the context that is already running in this kernel, if any,
# so re-executing this cell no longer fails because a SparkContext already exists.
sc = SparkContext.getOrCreate(conf)

In [4]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated ratings line.

    The first three fields are read as user id (int), movie id (int) and
    rating (float); any further fields are ignored.

    Returns:
        ``(user_id, (movie_id, rating))``
    """
    fields = movie_row.split()
    return int(fields[0]), (int(fields[1]), float(fields[2]))

# Read the ratings file line by line and turn each line into
# (user_id, (movie_id, rating)) via process_movie_row.
user_movie_ratings = sc.textFile(
    "file:///home/sparky/jupyter/ZZ_Data/ml-100k/u.data"
).map(process_movie_row)

In [6]:
# Show how many partitions the ratings RDD is split into.
user_movie_ratings.getNumPartitions()

2

In [5]:
# Preview the ten smallest (user_id, (movie_id, rating)) records.
# print(r) with a single argument prints the same text on Python 2 and Python 3,
# whereas the statement form `print r` is a syntax error on Python 3.
for r in user_movie_ratings.takeOrdered(10):
    print(r)

(1, (1, 5.0))
(1, (2, 3.0))
(1, (3, 4.0))
(1, (4, 3.0))
(1, (5, 3.0))
(1, (6, 5.0))
(1, (7, 4.0))
(1, (8, 1.0))
(1, (9, 5.0))
(1, (10, 3.0))


In [7]:
# For every pair of movies rated by the same user, collect the two aligned rating vectors:
#   1. self-join on user id       -> (user, ((movie_a, rating_a), (movie_b, rating_b)))
#   2. keep only movie_a < movie_b -> drops self-pairs and mirrored duplicates
#   3. re-key by (movie_a, movie_b) with a one-element list [(rating_a, rating_b)]
#   4. concatenate those lists per movie pair
#   5. transpose into [ratings_of_movie_a, ratings_of_movie_b]
# list(zip(...)) leaves the Python 2 result unchanged and keeps the value a real list
# on Python 3, where a bare zip object is a lazy, single-use iterator.
user_mr_pairs=(user_movie_ratings.join(user_movie_ratings)
                                 .filter(lambda x: x[1][0][0]<x[1][1][0])
                                 .map(lambda x: ((x[1][0][0],x[1][1][0]),[(x[1][0][1],x[1][1][1])]))
                                 .reduceByKey(lambda x,y: x+y)
                                 .mapValues(lambda x: list(zip(*x))))

In [8]:
# Show how many partitions the movie-pair RDD has after the join/reduce steps.
user_mr_pairs.getNumPartitions()

4

In [9]:
# Preview the ten lowest-keyed movie pairs together with their rating vectors.
for movie_pair in user_mr_pairs.takeOrdered(10):
    print(movie_pair)

((1, 2), [(4.0, 4.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 3.0, 5.0, 5.0, 3.0, 5.0, 4.0, 4.0, 4.0, 5.0, 3.0, 5.0, 4.0, 2.0, 3.0, 4.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 4.0, 3.0, 4.0, 5.0, 4.0, 3.0, 4.0, 4.0, 3.0, 4.0, 4.0, 4.0, 5.0, 3.0, 5.0, 4.0, 4.0, 4.0, 3.0, 4.0, 5.0, 2.0, 4.0, 4.0, 5.0, 4.0, 3.0, 2.0, 2.0, 5.0, 3.0, 4.0, 3.0, 4.0, 3.0, 2.0, 4.0, 2.0, 2.0, 4.0, 5.0, 2.0, 4.0, 3.0, 3.0, 4.0, 2.0, 3.0, 4.0, 4.0, 4.0, 5.0, 3.0, 4.0, 2.0, 3.0, 3.0, 5.0, 4.0, 4.0, 3.0, 2.0, 5.0, 4.0, 3.0, 4.0, 4.0, 4.0, 5.0, 5.0, 4.0, 5.0, 5.0, 4.0), (3.0, 3.0, 3.0, 4.0, 3.0, 3.0, 5.0, 4.0, 4.0, 4.0, 4.0, 5.0, 2.0, 4.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 2.0, 5.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 5.0, 4.0, 2.0, 2.0, 4.0, 3.0, 4.0, 4.0, 3.0, 2.0, 3.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 4.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 1.0, 3.0, 3.0, 3.0, 4.0, 3.0, 3.0, 2.0, 4.0, 3.0, 3.0, 3.0, 2.0, 2.0, 1.0, 1.0, 4.0, 3.0, 4.0, 2.0, 3.0, 3.0, 1.0, 4.0, 2.0, 4.0, 4.0, 3.0, 1.0, 4.0, 3.0, 2.0, 3.0, 4.0, 4.0, 3.0, 3.

In [10]:
def cosine_similarity(v1,v2):
    """Return the cosine similarity of two vectors and their length.

    Parameters:
        v1, v2: equally long sequences of numbers.

    Returns:
        A tuple ``(similarity, num_pairs)`` where ``num_pairs`` is ``len(v1)``.
        ``similarity`` is ``0.0`` when either vector has zero magnitude (this
        includes empty vectors), because the cosine is undefined in that case.

    Raises:
        ValueError: if the two vectors differ in length.
    """
    # Function-local import keeps the helper self-contained.
    from math import sqrt

    num_pairs=len(v1)
    if len(v2)!=num_pairs:
        raise ValueError('cosine_similarity needs vectors of equal length, got {} and {}'
                         .format(num_pairs,len(v2)))

    # Single pass over both vectors; float accumulators keep the arithmetic in
    # floating point even when the components are ints.
    dot=norm1=norm2=0.0
    for a,b in zip(v1,v2):
        dot+=a*b
        norm1+=a*a
        norm2+=b*b

    denominator=sqrt(norm1)*sqrt(norm2)
    # Explicit guard instead of a bare `except:` so that only the expected
    # zero-magnitude case maps to 0.0 and real errors still surface.
    if denominator==0:
        return 0.0,num_pairs
    return dot/denominator,num_pairs
    
# Replace each pair's rating vectors by what cosine_similarity returns for them.
movie_pair_ur_similarities = user_mr_pairs.mapValues(
    lambda rating_vectors: cosine_similarity(*rating_vectors))

In [11]:
# Preview the ten lowest-keyed movie pairs with their rating-based similarity.
for scored_pair in movie_pair_ur_similarities.takeOrdered(10):
    print(scored_pair)

((1, 2), (0.9487373941786248, 104))
((1, 3), (0.9132997212006829, 78))
((1, 4), (0.9429068878509639, 149))
((1, 5), (0.9613638498709224, 57))
((1, 6), (0.9551193973874768, 14))
((1, 7), (0.948915528102488, 281))
((1, 8), (0.9600459451420741, 154))
((1, 9), (0.9387445181517423, 201))
((1, 10), (0.943039377274702, 61))
((1, 11), (0.9573860685968381, 160))


In [12]:
def process_movie_row_genre(movie_row):
    """Parse one ``|``-separated movie line into ``(movie_id, genre_values)``.

    The first field is converted to the integer movie id; every field from
    the sixth one onwards is converted to ``int`` and returned as a list.
    """
    fields = movie_row.split('|')
    movie_id = int(fields[0])
    return movie_id, list(map(int, fields[5:]))

# Read the movie file and turn each line into (movie_id, genre_values)
# via process_movie_row_genre.
processed_movie_genres = sc.textFile(
    "file:///home/sparky/jupyter/ZZ_Data/ml-100k/u.item"
).map(process_movie_row_genre)

In [13]:
# Preview the ten lowest movie ids with their genre vectors.
for movie_id, genre_values in processed_movie_genres.takeOrdered(10):
    print('{:4d} - {}'.format(movie_id, genre_values))

   1 - [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   2 - [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   3 - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   4 - [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   5 - [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   6 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   7 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
   8 - [0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   9 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  10 - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [14]:
# Pair every movie with every other movie and keep each unordered pair once
# (first id < second id): ((movie_a, movie_b), (genres_a, genres_b)).
movie_gr_pairs = (
    processed_movie_genres
    .cartesian(processed_movie_genres)
    .filter(lambda pair: pair[0][0] < pair[1][0])
    .map(lambda pair: ((pair[0][0], pair[1][0]), (pair[0][1], pair[1][1])))
)

In [15]:
# Show how many partitions the cartesian genre-pair RDD has.
movie_gr_pairs.getNumPartitions()

4

In [16]:
# Preview the ten lowest-keyed movie pairs with both genre vectors.
# print(r) with a single argument prints the same text on Python 2 and Python 3,
# whereas the statement form `print r` is a syntax error on Python 3.
for r in movie_gr_pairs.takeOrdered(10):
    print(r)

((1, 2), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]))
((1, 3), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]))
((1, 4), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
((1, 5), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]))
((1, 6), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
((1, 7), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]))
((1, 8), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
((1, 9), ([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 

In [17]:
# Keep only the similarity (element 0) of cosine_similarity's result for each genre pair.
movie_pair_gr_similarities = movie_gr_pairs.mapValues(
    lambda genre_vectors: cosine_similarity(*genre_vectors)[0])

In [18]:
# Preview ten movie pairs with their genre-based similarity.
# print(r) with a single argument prints the same text on Python 2 and Python 3,
# whereas the statement form `print r` is a syntax error on Python 3.
for r in movie_pair_gr_similarities.take(10):
    print(r)

((1, 2), 0.0)
((1, 3), 0.0)
((1, 4), 0.33333333333333337)
((1, 5), 0.0)
((1, 6), 0.0)
((1, 7), 0.0)
((1, 8), 0.6666666666666667)
((1, 9), 0.0)
((1, 10), 0.0)
((1, 11), 0.0)


In [19]:
# Factor applied to the genre similarity before it is added to the rating similarity.
genre_weight = 0.25

# Join both similarity RDDs on the movie pair; the value becomes
# (rating similarity + genre_weight * genre similarity, second element of the rating result).
movie_pair_similarities = (
    movie_pair_ur_similarities
    .join(movie_pair_gr_similarities)
    .mapValues(lambda sims: (sims[0][0] + genre_weight * sims[1], sims[0][1]))
)

In [20]:
# Show how many partitions the joined similarity RDD has.
movie_pair_similarities.getNumPartitions()

8

In [21]:
# Preview the ten lowest-keyed movie pairs with their combined similarity.
# print(r) with a single argument prints the same text on Python 2 and Python 3,
# whereas the statement form `print r` is a syntax error on Python 3.
for r in movie_pair_similarities.takeOrdered(10):
    print(r)

((1, 2), (0.9487373941786248, 104))
((1, 3), (0.9132997212006829, 78))
((1, 4), (1.0262402211842971, 149))
((1, 5), (0.9613638498709224, 57))
((1, 6), (0.9551193973874768, 14))
((1, 7), (0.948915528102488, 281))
((1, 8), (1.1267126118087407, 154))
((1, 9), (0.9387445181517423, 201))
((1, 10), (0.943039377274702, 61))
((1, 11), (0.9573860685968381, 160))


In [22]:
# Per movie: sum the ratings and count them, then divide to get
# (average rating, number of ratings).
movie_avg_ratings = (
    user_movie_ratings
    .map(lambda record: (record[1][0], (record[1][1], 1)))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .mapValues(lambda total: (total[0] / total[1], total[1]))
)

In [23]:
# Preview the ten lowest movie ids with (average rating, number of ratings).
# print(r) with a single argument prints the same text on Python 2 and Python 3,
# whereas the statement form `print r` is a syntax error on Python 3.
for r in movie_avg_ratings.takeOrdered(10):
    print(r)

(1, (3.8783185840707963, 452))
(2, (3.2061068702290076, 131))
(3, (3.033333333333333, 90))
(4, (3.550239234449761, 209))
(5, (3.302325581395349, 86))
(6, (3.576923076923077, 26))
(7, (3.798469387755102, 392))
(8, (3.9954337899543377, 219))
(9, (3.8963210702341136, 299))
(10, (3.831460674157303, 89))


In [24]:
def process_movie_row_names_and_ratings(movie_row):
    """Parse one ``|``-separated movie line into ``(movie_id, movie_title)``.

    Only the first two fields are used: the first is converted to ``int``,
    the second is returned unchanged as the title.
    """
    fields = movie_row.split('|')
    return int(fields[0]), fields[1]

# Attach the per-movie rating statistics to the titles and flatten the value to
# (title, average rating, number of ratings).
movie_names_and_ratings = (
    sc.textFile("file:///home/sparky/jupyter/ZZ_Data/ml-100k/u.item")
    .map(process_movie_row_names_and_ratings)
    .join(movie_avg_ratings)
    .mapValues(lambda joined: (joined[0], joined[1][0], joined[1][1]))
)

In [25]:
# Preview the ten lowest movie ids as "id [average, count] - title".
for movie_id, (title, avg_rating, num_ratings) in movie_names_and_ratings.takeOrdered(10):
    print('{:4d} [{:4.2f}, {:4d}] - {}'.format(movie_id, avg_rating, num_ratings, title))

   1 [3.88,  452] - Toy Story (1995)
   2 [3.21,  131] - GoldenEye (1995)
   3 [3.03,   90] - Four Rooms (1995)
   4 [3.55,  209] - Get Shorty (1995)
   5 [3.30,   86] - Copycat (1995)
   6 [3.58,   26] - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
   7 [3.80,  392] - Twelve Monkeys (1995)
   8 [4.00,  219] - Babe (1995)
   9 [3.90,  299] - Dead Man Walking (1995)
  10 [3.83,   89] - Richard III (1995)


In [26]:
# Movie to find neighbours for, and the thresholds a pair must reach to be kept.
target_movie=50
min_similarity=0.95
min_num_rating_pairs=25

# Keep the pairs that contain the target movie and pass both thresholds, then re-key
# each one by the *other* movie of the pair: (other_movie, (similarity, num_rating_pairs)).
# `!=` replaces the `<>` operator, which Python 3 no longer accepts; the backslash
# continuations were redundant inside the parentheses.
target_movie_filtered=(movie_pair_similarities.filter(lambda x: (x[0][0]==target_movie or
                                                                 x[0][1]==target_movie) and
                                                                x[1][0]>=min_similarity and
                                                                x[1][1]>=min_num_rating_pairs)
                                              .map(lambda x: (x[0][0] if x[0][0]!=target_movie else x[0][1],
                                                              (x[1][0],x[1][1]))))

In [27]:
# Show the ten best matches, ordered by descending similarity and then by
# descending number of rating pairs.
# print(r) with a single argument prints the same text on Python 2 and Python 3,
# whereas the statement form `print r` is a syntax error on Python 3.
for r in target_movie_filtered.takeOrdered(10,key=lambda x: (-x[1][0],-x[1][1])):
    print(r)

(181, (1.2357230861253026, 480))
(172, (1.217769940132353, 345))
(498, (1.2000760200174676, 138))
(228, (1.1624821242902348, 230))
(222, (1.1606490354391281, 316))
(380, (1.1536039037420276, 109))
(271, (1.153493817947114, 138))
(82, (1.152488592206514, 242))
(229, (1.1468084097706708, 162))
(230, (1.144957188642716, 184))


In [28]:
# Lowest average rating a candidate movie may have to stay in the result.
min_rating = 4

# Add title and rating statistics to every candidate, reorder the value to
# (title, average rating, number of ratings, number of rating pairs, similarity),
# and drop candidates whose average rating is below min_rating.
target_movie_filtered_named_rated = (
    target_movie_filtered
    .join(movie_names_and_ratings)
    .mapValues(lambda joined: (joined[1][0], joined[1][1], joined[1][2],
                               joined[0][1], joined[0][0]))
    .filter(lambda movie: movie[1][1] >= min_rating)
)

In [29]:
# Preview the ten lowest movie ids among the named and rated candidates.
# print(r) with a single argument prints the same text on Python 2 and Python 3,
# whereas the statement form `print r` is a syntax error on Python 3.
for r in target_movie_filtered_named_rated.takeOrdered(10):
    print(r)

(12, (u'Usual Suspects, The (1995)', 4.385767790262173, 267, 223, 0.9724956031333988))
(22, (u'Braveheart (1995)', 4.151515151515151, 297, 261, 1.094368166383693))
(23, (u'Taxi Driver (1976)', 4.1208791208791204, 182, 159, 0.9555263124734333))
(45, (u'Eat Drink Man Woman (1994)', 4.05, 80, 62, 0.9603697921095729))
(48, (u'Hoop Dreams (1994)', 4.094017094017094, 117, 104, 0.9596593446508168))
(64, (u'Shawshank Redemption, The (1994)', 4.445229681978798, 283, 243, 0.9691491993942581))
(79, (u'Fugitive, The (1993)', 4.044642857142857, 336, 297, 1.0474637680208472))
(83, (u'Much Ado About Nothing (1993)', 4.0625, 176, 150, 1.0394370558771413))
(89, (u'Blade Runner (1982)', 4.138181818181818, 275, 254, 1.0484417063887075))
(96, (u'Terminator 2: Judgment Day (1991)', 4.0067796610169495, 295, 271, 1.0929247433829412))


In [30]:
# Header line: title and rating statistics of the target movie itself.
target_movie_r = movie_names_and_ratings.lookup(target_movie)[0]
print('Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'.format(
    target_movie, target_movie_r[1], target_movie_r[2], target_movie_r[0]))

# Ten best candidates, ordered by descending similarity and then by descending average
# rating, printed as "id [average, rating pairs/number of ratings] (similarity): title".
for movie_id, details in target_movie_filtered_named_rated.takeOrdered(
        10, key=lambda movie: (-movie[1][4], -movie[1][1])):
    print('{:4d} [{:.2f}, {:3d}/{:3d}] ({:.4f}): {}'.format(
        movie_id, details[1], details[3], details[2], details[4], details[0]))

Most similar movies to   50 [4.36, 583] - Star Wars (1977)
 181 [4.01, 480/507] (1.2357): Return of the Jedi (1983)
 172 [4.20, 345/367] (1.2178): Empire Strikes Back, The (1980)
 498 [4.18, 138/152] (1.2001): African Queen, The (1951)
 174 [4.25, 380/420] (1.1399): Raiders of the Lost Ark (1981)
 173 [4.17, 284/324] (1.1391): Princess Bride, The (1987)
 511 [4.23, 153/173] (1.1269): Lawrence of Arabia (1962)
 520 [4.10, 107/124] (1.1232): Great Escape, The (1963)
 474 [4.25, 171/194] (1.1218): Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
 483 [4.46, 214/243] (1.1018): Casablanca (1942)
 651 [4.08, 155/171] (1.1008): Glory (1989)
