# 35.5 Movie similarities
- MovieLens 1M ratings dataset (`ml-1m`)
- Cluster
- Partitioned
- Broadcasting title, average rating, and genre vector dictionary
- Streamlined
- Removing poorly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
# Spark configuration for this run: cluster master URL and application name.
conf=(SparkConf()
      .setMaster("spark://1.1.1.102:7077")
      .setAppName("MovieSimilaritiesCluster005"))
sc=SparkContext(conf=conf)

In [3]:
# Number of partitions passed to the partitionBy() calls in the cells below.
num_partitions=8

In [16]:
def process_movie_row(movie_row):
    """Parse one '::'-separated ratings line.

    The first three fields are read as user id, movie id and rating; any
    further fields on the line are ignored.

    Returns:
        ``(user_id, (movie_id, rating))`` with integer ids and a float rating.
    """
    fields=movie_row.split('::')
    user_id,movie_id=int(fields[0]),int(fields[1])
    return user_id,(movie_id,float(fields[2]))

# Ratings keyed by user id: (user_id, (movie_id, rating)), partitioned and persisted
# because this RDD is reused by the self-join and by the per-movie averages.
user_movie_ratings=(
    sc.textFile("file:///home/sparky/jupyter/ZZ_Data/ml-1m/ratings.dat")
      .map(process_movie_row)
      .partitionBy(num_partitions)
      .persist()
)

In [17]:
# Self-join the ratings on user_id to obtain every pair of movies rated by the same user.
# After the join each record is (user_id, ((movie_a, rating_a), (movie_b, rating_b))).
# The filter keeps only movie_a < movie_b, which drops self-pairs and mirrored duplicates.
# Records are then re-keyed by (movie_a, movie_b) with a one-element list [(rating_a, rating_b)],
# the lists are concatenated per movie pair, and zip(*...) transposes the list of rating pairs
# into two parallel sequences: the ratings given to movie_a and the ratings given to movie_b.
user_mr_pairs=(user_movie_ratings.join(user_movie_ratings)
                                 .filter(lambda x: x[1][0][0]<x[1][1][0])
                                 .map(lambda x: ((x[1][0][0],x[1][1][0]),[(x[1][0][1],x[1][1][1])]))
                                 .partitionBy(num_partitions)
                                 .reduceByKey(lambda x,y: x+y)
                                 .mapValues(lambda x: zip(*x)))

In [19]:
def cosine_similarity(v1,v2):
    """Cosine similarity of two equal-length numeric vectors.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers with the same length as ``v1``.

    Returns:
        ``(similarity, num_pairs)`` where ``num_pairs`` is ``len(v1)``.
        The similarity is ``0.0`` when it is undefined, i.e. when either
        vector is empty or has zero norm.

    Raises:
        ValueError: if the two vectors differ in length.
    """
    # Kept as a function-local import, as in the original, so the function is self-contained.
    from math import sqrt
    def dot(a,b):
        # 1.0* forces float arithmetic even when both inputs hold integers.
        return sum(1.0*x*y for x,y in zip(a,b))
    num_pairs=len(v1)
    if len(v2)!=num_pairs:
        raise ValueError('vectors differ in length: {} vs {}'.format(num_pairs,len(v2)))
    norm=sqrt(dot(v1,v1))*sqrt(dot(v2,v2))
    # Explicit zero-norm guard instead of a bare ``except:``: only the undefined case maps
    # to 0, while genuine errors (bad element types, etc.) are no longer silently hidden.
    if norm==0:
        return 0.0,num_pairs
    return dot(v1,v2)/norm,num_pairs
    
# Replace each movie pair's two rating sequences with (cosine similarity, number of rating pairs).
movie_pair_ur_similarities=user_mr_pairs.mapValues(lambda x: cosine_similarity(*x))

In [20]:
# Per-movie (average rating, number of ratings), collected to the driver as a dict
# keyed by movie id.
movie_ar_dict=(
    user_movie_ratings
    .map(lambda rec: (rec[1][0],(rec[1][1],1)))
    .reduceByKey(lambda a,b: (a[0]+b[0],a[1]+b[1]))
    .mapValues(lambda tot: (tot[0]/tot[1],tot[1]))
    .collectAsMap()
)

In [21]:
def build_movie_dict(file_nom,ratings_dict=None):
    """Read a '::'-separated movies file into a ``{movie_id: info}`` lookup.

    Each non-blank line is expected to look like ``movie_id::title::genre|genre|...``.

    Args:
        file_nom: Path of the movies file to read.
        ratings_dict: Optional mapping from movie id to a sequence whose first two
            items are the average rating and the rating count. Movies missing
            from it get an average and a count of 0.

    Returns:
        dict mapping each integer movie id to the tuple
        ``(average_rating, rating_count, title, genre_vector)``, where
        ``genre_vector`` is a 0/1 list with one slot per entry of ``genres`` below.

    Raises:
        ValueError: if a movie id is not an integer or a genre is not in ``genres``.
    """
    # Built once instead of once per row: the order is fixed and defines the vector layout.
    genres=['Action','Adventure','Animation','Children\'s','Comedy','Crime',
            'Documentary','Drama','Fantasy','Film-Noir','Horror',
            'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
    # None instead of a shared mutable ``{}`` default.
    if ratings_dict is None:
        ratings_dict={}
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            movie_row=movie_row.strip()
            if not movie_row:
                # Tolerate blank lines (e.g. an empty line at the end of the file).
                continue
            movie=movie_row.split('::')
            movie_id=int(movie[0])
            movie_title=movie[1]
            movie_genres=[0]*len(genres)
            for genre in movie[2].split('|'):
                movie_genres[genres.index(genre)]=1
            # Direct dict membership; ``.keys()`` builds a list to scan on Python 2.
            if movie_id in ratings_dict:
                movie_avg=ratings_dict[movie_id][0]
                movie_cnt=ratings_dict[movie_id][1]
            else:
                movie_avg=0
                movie_cnt=0
            movie_dict[movie_id]=(movie_avg,movie_cnt,movie_title,movie_genres)
    return movie_dict

# Build the movie lookup on the driver and broadcast it; later cells read it via movie_dict.value.
movie_dict=sc.broadcast(build_movie_dict('/home/sparky/jupyter/ZZ_Data/ml-1m/movies.dat',movie_ar_dict))

In [22]:
def adjust_rating_by_genre(movie_pair,genre_weight=0.25):
    """Add a weighted genre similarity to a movie pair's rating similarity.

    Args:
        movie_pair: ``((movie_a, movie_b), (rating_similarity, num_pairs))``.
        genre_weight: Factor applied to the genre cosine similarity before it
            is added to the rating similarity.

    Returns:
        ``((movie_a, movie_b), (rating_similarity + genre_weight * genre_similarity, num_pairs))``.
    """
    pair_key=movie_pair[0]
    pair_stats=movie_pair[1]
    lookup=movie_dict.value
    # Item 3 of a lookup entry is the movie's genre vector.
    genre_similarity=cosine_similarity(lookup[pair_key[0]][3],
                                       lookup[pair_key[1]][3])[0]
    combined=pair_stats[0]+genre_weight*genre_similarity
    return pair_key,(combined,pair_stats[1])

# Add the weighted genre similarity to every movie pair's rating similarity.
movie_pair_similarities=movie_pair_ur_similarities.map(adjust_rating_by_genre)

In [23]:
# Id of the movie whose most similar movies are reported below.
target_movie=260

# Thresholds a candidate must meet: minimum average rating, minimum combined
# (rating + weighted genre) similarity, and minimum number of rating pairs shared with the target.
min_rating=4
min_similarity=0.95
min_num_rating_pairs=250

# Keep only the pairs that involve the target movie and meet both thresholds, then re-key
# each surviving pair by the id of the *other* movie: (other_movie_id, (similarity, num_pairs)).
# `!=` is used rather than the Python-2-only `<>` operator, which is a syntax error on Python 3.
sims_to_target=(movie_pair_similarities.filter(lambda x: ((x[0][0]==target_movie) or (x[0][1]==target_movie)) and
                                                         x[1][0]>=min_similarity and
                                                         x[1][1]>=min_num_rating_pairs)
                                       .map(lambda x: (x[0][0] if x[0][0]!=target_movie else x[0][1],
                                                       x[1])))

In [24]:
# Extend every candidate with data from the broadcast movie lookup, producing
# (movie_id, (average rating, similarity, number of rating pairs, number of ratings, title)),
# then drop candidates whose average rating is below min_rating.
sims_to_target_ext_fil=(sims_to_target.map(lambda x: (x[0],(movie_dict.value[x[0]][0],
                                                            x[1][0],x[1][1],
                                                            movie_dict.value[x[0]][1],
                                                            movie_dict.value[x[0]][2])))
                                      .filter(lambda x: x[1][0]>=min_rating))

In [25]:
# Header line: target id, [average rating, number of ratings], title.
target_info=movie_dict.value[target_movie]
print('Most similar movies to {:4d} [{:4.2f}, {:3d}] - {}'.format(target_movie,*target_info[:3]))

# Ten best candidates: highest similarity first, ties broken by higher average rating.
top_matches=sims_to_target_ext_fil.takeOrdered(10,key=lambda x: (-x[1][1],-x[1][0]))
for r in top_matches:
    avg_rating,similarity,num_pairs,num_ratings,title=r[1]
    print('{:4d} [{:.4f}, {:4d}/{:4d}] ({:.2f}): {}'.format(r[0],similarity,num_pairs,num_ratings,avg_rating,title))

Most similar movies to  260 [4.45, 2991] - Star Wars: Episode IV - A New Hope (1977)
1198 [1.1623, 1972/2514] (4.48): Raiders of the Lost Ark (1981)
1196 [1.1575, 2355/2990] (4.29): Star Wars: Episode V - The Empire Strikes Back (1980)
1291 [1.1542, 1397/1628] (4.13): Indiana Jones and the Last Crusade (1989)
1210 [1.1518, 2113/2883] (4.02): Star Wars: Episode VI - Return of the Jedi (1983)
  29 [1.1390,  344/ 403] (4.06): City of Lost Children, The (1995)
1240 [1.1189, 1746/2098] (4.15): Terminator, The (1984)
2571 [1.1176, 1908/2590] (4.32): Matrix, The (1999)
1287 [1.1170,  600/ 704] (4.11): Ben-Hur (1959)
 589 [1.1147, 1889/2649] (4.06): Terminator 2: Judgment Day (1991)
3000 [1.1087,  287/ 345] (4.15): Princess Mononoke, The (Mononoke Hime) (1997)
