# 35.3 Movie similarities
- 100K
- Cluster
- Partitioned
- Broadcasting title, average rating, and genre vector dictionary
- Removing badly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [1]:
# findspark is initialised before any pyspark import; it is expected to make
# the local Spark installation importable from this kernel.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
# Point the driver at the cluster master and give the application a name.
conf=SparkConf().setMaster("spark://1.1.1.102:7077").setAppName("MovieSimilaritiesCluster003")
# SparkContext used by every RDD and broadcast created in the cells below.
sc=SparkContext(conf=conf)

In [3]:
# Partition count passed to every explicit partitionBy() call in this notebook.
num_partitions=8

In [4]:
def process_movie_row(movie_row):
    """Parse one ratings line into a record keyed by user.

    The line is split on whitespace; the first three fields are read as
    user id, movie id and rating. Any further fields are ignored.

    Returns:
        (user_id,(movie_id,rating)) with integer ids and a float rating.
    """
    fields=movie_row.split()
    # Conversions run left to right: user id, movie id, rating.
    return int(fields[0]),(int(fields[1]),float(fields[2]))

# Read the ratings file, parse each line into (user_id,(movie_id,rating)) and
# partition by the key (user_id) into num_partitions partitions.
# NOTE(review): presumably partitioned by user so the later self-join on
# user_id can reuse this layout - confirm against the Spark UI.
user_movie_ratings=(sc.textFile("file:///home/sparky/jupyter/ZZ_Data/ml-100k/u.data")
                      .map(process_movie_row).partitionBy(num_partitions))

In [5]:
# Sanity check: number of partitions after partitionBy(num_partitions).
user_movie_ratings.getNumPartitions()

8

In [6]:
# Preview the ten smallest (user_id,(movie_id,rating)) records.
# print(r) with a single argument behaves the same on Python 2 and 3 and
# matches the call style used by the other preview cells.
for r in user_movie_ratings.takeOrdered(10):
    print(r)

(1, (1, 5.0))
(1, (2, 3.0))
(1, (3, 4.0))
(1, (4, 3.0))
(1, (5, 3.0))
(1, (6, 5.0))
(1, (7, 4.0))
(1, (8, 1.0))
(1, (9, 5.0))
(1, (10, 3.0))


In [7]:
# Per-movie rating statistics, collected to the driver as a plain dict:
# movie_id -> (average rating, number of ratings).
movie_ar_dict=(user_movie_ratings.map(lambda rec: (rec[1][0],(rec[1][1],1)))        # (movie_id,(rating,1))
                                 .reduceByKey(lambda a,b: (a[0]+b[0],a[1]+b[1]))    # sum ratings and counts
                                 .mapValues(lambda tot: (tot[0]/tot[1],tot[1]))     # (mean,count)
                                 .collectAsMap())

In [8]:
# Show the ten lowest movie ids with their (average rating, rating count).
# sorted() gives a deterministic order and works on Python 2 and 3, whereas
# slicing keys() relies on dict ordering and fails on a Python 3 keys view.
for key in sorted(movie_ar_dict)[:10]:
    print('{:4d} [{:.2f} {:4d}]'.format(key,*movie_ar_dict[key]))

   1 [3.88  452]
   2 [3.21  131]
   3 [3.03   90]
   4 [3.55  209]
   5 [3.30   86]
   6 [3.58   26]
   7 [3.80  392]
   8 [4.00  219]
   9 [3.90  299]
  10 [3.83   89]


In [9]:
def build_movie_dict(file_nom,ratings_dict=None):
    """Build a movie metadata lookup keyed by movie id.

    Reads a '|'-separated item file in which field 0 is the movie id,
    field 1 is the title and every field from index 5 onwards is an
    integer genre flag.

    Args:
        file_nom: path of the item file, opened with the built-in open().
        ratings_dict: optional mapping movie_id -> (average_rating,
            rating_count). Movies absent from it (or every movie when the
            argument is omitted) get 0 for both values.

    Returns:
        dict mapping movie_id -> (average_rating, rating_count, title,
        genre_flags), where genre_flags is a list of ints.

    Raises:
        ValueError: if an id or genre field of a non-blank line is not an integer.
        IndexError: if a non-blank line has fewer than two '|'-separated fields.
    """
    # None sentinel instead of a shared mutable default argument.
    if ratings_dict is None:
        ratings_dict={}
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not movie_row.strip():
                continue
            movie=movie_row.split('|')
            movie_id=int(movie[0])
            movie_title=movie[1]
            # int() ignores the newline that is still attached to the last field.
            movie_genres=[int(g) for g in movie[5:]]
            # Single hash lookup; avoids materialising keys() for every row.
            movie_stats=ratings_dict.get(movie_id,(0,0))
            movie_avg=movie_stats[0]
            movie_cnt=movie_stats[1]
            movie_dict[movie_id]=(movie_avg,movie_cnt,movie_title,movie_genres)
    return movie_dict

# Build the metadata dictionary on the driver (the item file is read with a
# plain local open(), enriched with the rating statistics in movie_ar_dict)
# and broadcast it; later cells read it through movie_dict.value.
movie_dict=sc.broadcast(build_movie_dict('/home/sparky/jupyter/ZZ_Data/ml-100k/u.item',movie_ar_dict))

In [10]:
# Show the ten lowest movie ids with average rating, rating count, title and
# genre vector. sorted() is deterministic and works on Python 2 and 3;
# slicing keys() depends on dict ordering and fails on a Python 3 keys view.
for key in sorted(movie_dict.value)[:10]:
    print('{:4d} [{:.2f} {:4d}] - {} - {}'.format(key,*movie_dict.value[key]))

   1 [3.88  452] - Toy Story (1995) - [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   2 [3.21  131] - GoldenEye (1995) - [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   3 [3.03   90] - Four Rooms (1995) - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   4 [3.55  209] - Get Shorty (1995) - [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   5 [3.30   86] - Copycat (1995) - [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
   6 [3.58   26] - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   7 [3.80  392] - Twelve Monkeys (1995) - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
   8 [4.00  219] - Babe (1995) - [0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
   9 [3.90  299] - Dead Man Walking (1995) - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  10 [3.83   89] - Richard III (1995) - [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1

In [11]:
# For every pair of movies rated by the same user, gather the two rating
# vectors: (movie_a,movie_b) -> [(ratings of movie_a...),(ratings of movie_b...)].
user_mr_pairs=(user_movie_ratings.join(user_movie_ratings)                              # all rating pairs per user
                                 .filter(lambda x: x[1][0][0]<x[1][1][0])               # keep each unordered pair once, drop self-pairs
                                 .map(lambda x: ((x[1][0][0],x[1][1][0]),[(x[1][0][1],x[1][1][1])]))
                                 .partitionBy(num_partitions)
                                 .reduceByKey(lambda x,y: x+y)                          # concatenate the per-user rating pairs
                                 # list() materialises the two vectors: on Python 3 a bare zip()
                                 # is a lazy iterator, on Python 2 the result is unchanged.
                                 .mapValues(lambda x: list(zip(*x))))

In [12]:
# Preview the ten smallest movie pairs with their two rating vectors
# (the printed vectors can be very long).
for r in user_mr_pairs.takeOrdered(10):
    print(r)

((1, 2), [(4.0, 4.0, 5.0, 4.0, 3.0, 4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 4.0, 4.0, 4.0, 2.0, 5.0, 4.0, 4.0, 5.0, 3.0, 4.0, 5.0, 5.0, 3.0, 3.0, 4.0, 4.0, 3.0, 4.0, 4.0, 5.0, 4.0, 2.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 5.0, 5.0, 4.0, 2.0, 4.0, 3.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, 4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 5.0, 3.0, 4.0, 5.0, 5.0, 5.0, 3.0, 2.0, 2.0, 4.0, 5.0, 3.0, 4.0, 2.0, 5.0, 3.0, 2.0, 4.0, 5.0, 4.0, 4.0, 3.0, 5.0, 5.0, 4.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 4.0, 4.0, 3.0, 3.0, 3.0, 4.0, 3.0, 2.0, 4.0, 4.0, 4.0, 2.0, 4.0, 2.0), (4.0, 2.0, 4.0, 3.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0, 3.0, 3.0, 4.0, 2.0, 3.0, 4.0, 4.0, 3.0, 4.0, 3.0, 3.0, 4.0, 3.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 1.0, 4.0, 2.0, 3.0, 2.0, 3.0, 4.0, 4.0, 3.0, 4.0, 3.0, 2.0, 5.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 4.0, 3.0, 4.0, 5.0, 3.0, 4.0, 1.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 1.0, 5.0, 3.0, 4.0, 2.0, 4.0, 4.0, 2.0, 5.0, 2.0, 4.0, 3.0, 3.0, 4.0, 3.0, 3.0, 3.0, 3.

In [13]:
def cosine_similarity(v1,v2):
    """Cosine similarity between two equal-length numeric sequences.

    Args:
        v1: indexable sequence of numbers.
        v2: indexable sequence of numbers, same length as v1.

    Returns:
        (similarity,num_pairs) where num_pairs is len(v1). The similarity
        is 0 when either vector has zero norm (this includes empty input),
        because the cosine is undefined there.

    Raises:
        TypeError: if an element is not numeric.
        IndexError: if one sequence is shorter than the other requires.
        Such malformed input used to be silently reported as similarity 0
        by a bare ``except``; it now surfaces to the caller.
    """
    # Local import, as in the original; presumably so the function stays
    # self-contained when it is shipped to Spark workers.
    from math import sqrt
    def mult_vectors(a,b):
        # Dot product; the 1.0 factor forces float arithmetic for int inputs.
        return sum(1.0*a[n]*b[n] for n in range(len(a)))
    num_pairs=len(v1)
    norm_product=sqrt(mult_vectors(v1,v1)) * sqrt(mult_vectors(v2,v2))
    # Explicit zero-norm guard replaces catching ZeroDivisionError via a bare except.
    if norm_product==0:
        return 0,num_pairs
    return mult_vectors(v1,v2)/norm_product,num_pairs
    
# Rating-based similarity per movie pair: the two rating vectors are unpacked
# into cosine_similarity, giving (movie_a,movie_b) -> (similarity,num_pairs).
movie_pair_ur_similarities=user_mr_pairs.mapValues(lambda x: cosine_similarity(*x))

In [14]:
# Preview the ten smallest movie pairs with (rating similarity, number of rating pairs).
for r in movie_pair_ur_similarities.takeOrdered(10):
    print(r)

((1, 2), (0.9487373941786248, 104))
((1, 3), (0.9132997212006829, 78))
((1, 4), (0.9429068878509639, 149))
((1, 5), (0.9613638498709224, 57))
((1, 6), (0.9551193973874768, 14))
((1, 7), (0.948915528102488, 281))
((1, 8), (0.9600459451420741, 154))
((1, 9), (0.9387445181517423, 201))
((1, 10), (0.943039377274702, 61))
((1, 11), (0.9573860685968381, 160))


In [15]:
def adjust_rating_by_gender(movie_pair,genre_weight=0.25):
    """Add a weighted genre similarity to a movie pair's rating similarity.

    Despite "gender" in the name, the adjustment uses the genre vectors
    stored at index 3 of each entry of the broadcast movie_dict.

    Args:
        movie_pair: ((movie_a,movie_b),(rating_similarity,num_pairs)).
        genre_weight: factor applied to the genre cosine similarity before
            it is added to the rating similarity.

    Returns:
        ((movie_a,movie_b),(adjusted_similarity,num_pairs)). The adjusted
        value is a plain sum, so it can exceed 1.0.
    """
    pair_key=movie_pair[0]
    first_genres=movie_dict.value[pair_key[0]][3]
    second_genres=movie_dict.value[pair_key[1]][3]
    # Only the similarity is needed; the pair count from cosine_similarity is dropped.
    genre_similarity=cosine_similarity(first_genres,second_genres)[0]
    adjusted_similarity=movie_pair[1][0]+genre_weight*genre_similarity
    return (pair_key,(adjusted_similarity,movie_pair[1][1]))

# Combine rating similarity with the weighted genre similarity for every
# movie pair, then partition the result into num_partitions partitions.
movie_pair_similarities=(movie_pair_ur_similarities.map(adjust_rating_by_gender)
                                                   .partitionBy(num_partitions))

In [16]:
# Sanity check: number of partitions after partitionBy(num_partitions).
movie_pair_similarities.getNumPartitions()

8

In [17]:
# Preview the ten smallest movie pairs with (adjusted similarity, number of rating pairs).
# print(r) with a single argument behaves the same on Python 2 and 3 and
# matches the call style used by the other preview cells.
for r in movie_pair_similarities.takeOrdered(10):
    print(r)

((1, 2), (0.9487373941786248, 104))
((1, 3), (0.9132997212006829, 78))
((1, 4), (1.0262402211842971, 149))
((1, 5), (0.9613638498709224, 57))
((1, 6), (0.9551193973874768, 14))
((1, 7), (0.948915528102488, 281))
((1, 8), (1.1267126118087407, 154))
((1, 9), (0.9387445181517423, 201))
((1, 10), (0.943039377274702, 61))
((1, 11), (0.9573860685968381, 160))


In [18]:
# Movie whose neighbours are wanted.
target_movie=50

# Thresholds: minimum average rating of a recommended movie (applied in a
# later cell), minimum adjusted similarity, and minimum number of users who
# rated both movies of the pair.
min_rating=4
min_similarity=0.95
min_num_rating_pairs=25

# Keep the pairs that contain the target movie and pass both thresholds,
# then re-key each one by the *other* movie of the pair:
# other_movie_id -> (adjusted_similarity,num_pairs).
# `!=` replaces the deprecated `<>` operator (removed in Python 3); the
# backslash continuations were redundant inside the parentheses.
sims_to_target=(movie_pair_similarities.filter(lambda x: (x[0][0]==target_movie or x[0][1]==target_movie) and
                                                         x[1][0]>=min_similarity and
                                                         x[1][1]>=min_num_rating_pairs)
                                       .map(lambda x: (x[0][0] if x[0][0]!=target_movie else x[0][1],
                                                       x[1])))

In [19]:
# Top ten neighbours of the target movie, highest similarity first
# (ties broken by the larger number of rating pairs).
# print(r) with a single argument behaves the same on Python 2 and 3.
for r in sims_to_target.takeOrdered(10,key=lambda x: (-x[1][0],-x[1][1])):
    print(r)

(181, (1.2357230861253026, 480))
(172, (1.217769940132353, 345))
(498, (1.2000760200174676, 138))
(228, (1.1624821242902348, 230))
(222, (1.1606490354391281, 316))
(380, (1.1536039037420276, 109))
(271, (1.153493817947114, 138))
(82, (1.152488592206514, 242))
(229, (1.1468084097706708, 162))
(230, (1.144957188642716, 184))


In [20]:
# Attach broadcast metadata to each neighbour and drop the poorly rated ones:
# movie_id -> (average rating, similarity, num_pairs, rating count, title),
# keeping only movies whose average rating is at least min_rating.
sims_to_target_ext_fil=(sims_to_target.map(lambda sim: (sim[0],(movie_dict.value[sim[0]][0],
                                                                sim[1][0],sim[1][1],
                                                                movie_dict.value[sim[0]][1],
                                                                movie_dict.value[sim[0]][2])))
                                      .filter(lambda rec: rec[1][0]>=min_rating))

In [21]:
# Preview the ten smallest movie ids among the filtered, enriched neighbours.
# print(r) with a single argument behaves the same on Python 2 and 3.
for r in sims_to_target_ext_fil.takeOrdered(10):
    print(r)

(12, (4.385767790262173, 0.9724956031333988, 223, 267, 'Usual Suspects, The (1995)'))
(22, (4.151515151515151, 1.094368166383693, 261, 297, 'Braveheart (1995)'))
(23, (4.1208791208791204, 0.9555263124734333, 159, 182, 'Taxi Driver (1976)'))
(45, (4.05, 0.9603697921095729, 62, 80, 'Eat Drink Man Woman (1994)'))
(48, (4.094017094017094, 0.9596593446508168, 104, 117, 'Hoop Dreams (1994)'))
(64, (4.445229681978798, 0.9691491993942581, 243, 283, 'Shawshank Redemption, The (1994)'))
(79, (4.044642857142857, 1.0474637680208472, 297, 336, 'Fugitive, The (1993)'))
(83, (4.0625, 1.0394370558771413, 150, 176, 'Much Ado About Nothing (1993)'))
(89, (4.138181818181818, 1.0484417063887075, 254, 275, 'Blade Runner (1982)'))
(96, (4.0067796610169495, 1.0929247433829412, 271, 295, 'Terminator 2: Judgment Day (1991)'))


In [22]:
# Final report: a header line for the target movie, then its ten best
# neighbours ordered by similarity (ties broken by higher average rating).
target_info=movie_dict.value[target_movie]
# target_info is (average rating, rating count, title, genres); the header uses the first three.
header_template='Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'
print(header_template.format(target_movie,*target_info[:3]))
row_template='{:4d} [{:.4f}, {:3d}/{:3d}] ({:.2f}): {}'
for movie_id,details in sims_to_target_ext_fil.takeOrdered(10,key=lambda x: (-x[1][1],-x[1][0])):
    # details is (average rating, similarity, num_pairs, rating count, title).
    print(row_template.format(movie_id,details[1],details[2],details[3],details[0],details[4]))

Most similar movies to   50 [4.36, 583] - Star Wars (1977)
 181 [1.2357, 480/507] (4.01): Return of the Jedi (1983)
 172 [1.2178, 345/367] (4.20): Empire Strikes Back, The (1980)
 498 [1.2001, 138/152] (4.18): African Queen, The (1951)
 174 [1.1399, 380/420] (4.25): Raiders of the Lost Ark (1981)
 173 [1.1391, 284/324] (4.17): Princess Bride, The (1987)
 511 [1.1269, 153/173] (4.23): Lawrence of Arabia (1962)
 520 [1.1232, 107/124] (4.10): Great Escape, The (1963)
 474 [1.1218, 171/194] (4.25): Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
 483 [1.1018, 214/243] (4.46): Casablanca (1942)
 651 [1.1008, 155/171] (4.08): Glory (1989)
