# 35.6 Movie similarities
- MovieLens 1M dataset (ml-1m)
- Cluster
- Not partitioned
- Broadcasting title, average rating, and genre vector dictionary
- Streamlined (this time for real)
- Removing badly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [1]:
# findspark must be initialised before the first `pyspark` import so that the
# Spark installation's Python packages are importable from this kernel.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
# The cluster master URL and the application name are fixed here; change the
# master URL to run this notebook against a different Spark cluster.
conf=SparkConf().setMaster("spark://1.1.1.102:7077").setAppName("MovieSimilaritiesCluster006")
# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc=SparkContext.getOrCreate(conf=conf)

In [3]:
def process_user_movie_rating(movie_row):
    """Parse one ``user::movie::rating[::...]`` line into ``(user_id, (movie_id, rating))``.

    Only the first three ``::``-separated fields are read. The two ids are
    converted to ``int`` and the rating to ``float``; any further fields are
    ignored.
    """
    fields=movie_row.split('::')
    return int(fields[0]),(int(fields[1]),float(fields[2]))

# RDD of (user_id, (movie_id, rating)) records, one per line of the ratings file.
# Keyed by user_id so that it can be joined with itself on the user.
user_movie_ratings=(sc.textFile("file:///home/sparky/jupyter/ZZ_Data/ml-1m/ratings.dat")
                      .map(process_user_movie_rating))

In [4]:
# Build, for every pair of movies rated by the same user, the list of rating pairs.
#  1. The self-join on user_id yields (user_id, ((movie1, rating1), (movie2, rating2)))
#     for every two ratings made by the same user.
#  2. Keeping only movie1 < movie2 drops self-pairs and mirrored duplicates.
#  3. Each record is re-keyed by (movie1, movie2) with the value
#     [(movie1, movie2), (rating1, rating2)] -- the id pair is carried as element 0.
#  4. The reduce concatenates the lists but skips element 0 of the right-hand
#     side (the repeated id pair), giving
#     [(movie1, movie2), (r1, r2), (r1', r2'), ...] with one rating pair per user.
mr_pairs=(user_movie_ratings.join(user_movie_ratings)
                            .filter(lambda x: x[1][0][0]<x[1][1][0])
                            .map(lambda x: ((x[1][0][0],x[1][1][0]),
                                            [(x[1][0][0],x[1][1][0]),(x[1][0][1],x[1][1][1])]))
                            .reduceByKey(lambda x,y: x+y[1:]))

In [5]:
def process_movie_rating(movie_row):
    """Parse one ``user::movie::rating[::...]`` line into ``(movie_id, (rating, 1))``.

    The constant ``1`` is a per-record count, so that summing the value tuples
    element-wise yields (sum of ratings, number of ratings) for a movie.
    """
    parts=movie_row.split('::')
    return (int(parts[1]),(float(parts[2]),1))

# Driver-side dict {movie_id: (average_rating, rating_count)}.
# Ratings and counts are summed per movie, the sum is divided by the count to
# get the average, and collectAsMap() brings the result back to the driver.
movie_ar_dict=(sc.textFile("file:///home/sparky/jupyter/ZZ_Data/ml-1m/ratings.dat")
                 .map(process_movie_rating)
                 .reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1]))
                 .mapValues(lambda x: (x[0]/x[1],x[1]))
                 .collectAsMap())

In [6]:
def build_movie_dict(file_nom,ratings_dict=None):
    """Build ``{movie_id: (avg_rating, rating_count, title, genre_vector)}`` from a movies file.

    Parameters
    ----------
    file_nom : str
        Path of a local text file with one ``movie_id::title::genre|genre|...``
        row per line. Blank lines are skipped.
    ratings_dict : dict, optional
        Maps movie_id to ``(average_rating, rating_count)``. Movies missing from
        it (or all movies, when it is omitted) get an average and a count of 0.

    Returns
    -------
    dict
        movie_id -> ``(average_rating, rating_count, title, genre_vector)``,
        where ``genre_vector`` is a list of 18 zeros/ones, one slot per genre
        in the fixed order of ``genres`` below.

    Raises
    ------
    ValueError
        If a row's id is not an integer or a genre is not one of the known names.
    IndexError
        If a non-blank row has fewer than three ``::``-separated fields.
    """
    # Fixed genre order: position i of every genre vector refers to genres[i].
    # Defined once, outside the row loop, because it never changes.
    genres=['Action','Adventure','Animation','Children\'s','Comedy','Crime',
            'Documentary','Drama','Fantasy','Film-Noir','Horror',
            'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
    # A None default avoids sharing one mutable dict between calls.
    if ratings_dict is None:
        ratings_dict={}
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            movie_row=movie_row.strip()
            if not movie_row:
                # Tolerate blank or trailing empty lines instead of failing on int('').
                continue
            movie=movie_row.split('::')
            movie_id=int(movie[0])
            movie_title=movie[1]
            movie_genres=[0]*len(genres)
            for genre in movie[2].split('|'):
                movie_genres[genres.index(genre)]=1
            # Direct dict lookup; `in d.keys()` scans a list on Python 2.
            rating_info=ratings_dict.get(movie_id,(0,0))
            movie_avg=rating_info[0]
            movie_cnt=rating_info[1]
            movie_dict[movie_id]=(movie_avg,movie_cnt,movie_title,movie_genres)
    return movie_dict

# The movie lookup table is built on the driver from the local movies file
# (enriched with the per-movie average/count from movie_ar_dict) and wrapped in
# a Spark broadcast variable; functions read it through `movie_dict.value`.
movie_dict=sc.broadcast(build_movie_dict('/home/sparky/jupyter/ZZ_Data/ml-1m/movies.dat',movie_ar_dict))

In [7]:
def cosine_similarity(v1,v2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Parameters
    ----------
    v1, v2 : sequence of numbers
        Vectors of the same length (lists or tuples).

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has zero
        magnitude (including empty vectors), where the cosine is undefined.

    Raises
    ------
    ValueError
        If the two vectors differ in length.
    TypeError
        If an argument is not a sized sequence of numbers. Such programming
        errors are no longer silently reported as a similarity of 0.
    """
    # Imported inside the function so the function is self-contained.
    from math import sqrt

    if len(v1)!=len(v2):
        raise ValueError('cosine_similarity needs vectors of equal length, got {} and {}'
                         .format(len(v1),len(v2)))

    def mult_vectors(a,b):
        # Dot product; the 1.0 factor forces float arithmetic for integer inputs.
        return sum(1.0*x*y for x,y in zip(a,b))

    magnitude=sqrt(mult_vectors(v1,v1)) * sqrt(mult_vectors(v2,v2))
    if magnitude==0:
        # Explicit zero-magnitude handling replaces the former bare `except:`.
        return 0.0
    return mult_vectors(v1,v2)/magnitude

def compute_similarity(movie_pair_data,genre_weight=0.25):
    """Score one movie pair from its list of co-ratings.

    ``movie_pair_data`` is a list whose first element is the ``(id1, id2)``
    movie pair and whose remaining elements are ``(rating1, rating2)`` tuples.

    Returns ``(score, num_pairs)`` where ``score`` is the cosine similarity of
    the two rating columns plus ``genre_weight`` times the cosine similarity of
    the two movies' genre vectors (element 3 of each ``movie_dict`` entry), and
    ``num_pairs`` is the number of rating pairs.
    """
    ids=movie_pair_data[0]
    pairs=movie_pair_data[1:]
    lookup=movie_dict.value
    # zip(*pairs) transposes [(a1, b1), (a2, b2), ...] into the two rating columns.
    rating_cos=cosine_similarity(*zip(*pairs))
    genre_cos=cosine_similarity(lookup[ids[0]][3],lookup[ids[1]][3])
    return (rating_cos+genre_weight*genre_cos,len(pairs))

# RDD of ((movie1, movie2), (combined_similarity, num_rating_pairs)).
movie_pair_similarities=mr_pairs.mapValues(compute_similarity)

In [None]:
# Query parameters for the recommendation below.
target_movie=260            # movie_id to find neighbours for
min_rating=4                # minimum average rating a similar movie must have
min_similarity=0.95         # minimum combined (ratings + weighted genre) similarity
min_num_rating_pairs=250    # minimum number of rating pairs shared with the target

def extract_sim_to_target(movie_pair_data,target_movie=target_movie):
    """Re-key one similarity record by its score, relative to the target movie.

    ``movie_pair_data`` is ``((movie1, movie2), (similarity, num_rating_pairs))``
    where one of the two ids is expected to be ``target_movie``.

    Returns ``(similarity, (other_movie_id, other_movie_avg_rating, num_rating_pairs))``;
    the average rating is element 0 of the other movie's ``movie_dict`` entry.

    Note: the ``target_movie`` default is bound when this ``def`` is executed,
    so the function must be re-defined after changing the global.
    """
    movie_ids=movie_pair_data[0]
    # `!=` replaces the Python-2-only `<>` operator, which is a syntax error on Python 3.
    sim_id=movie_ids[0] if movie_ids[0]!=target_movie else movie_ids[1]
    sim_val=movie_pair_data[1][0]
    sim_num=movie_pair_data[1][1]
    sim_ar=movie_dict.value[sim_id][0]
    return (sim_val,(sim_id,sim_ar,sim_num))

# Keep the pairs that involve the target movie and clear both the similarity and
# the shared-ratings thresholds, re-key them by score, then drop candidates
# whose own average rating is below min_rating.
sims_to_target=(
    movie_pair_similarities
    .filter(lambda pair: target_movie in pair[0]
                         and pair[1][0]>=min_similarity
                         and pair[1][1]>=min_num_rating_pairs)
    .map(extract_sim_to_target)
    .filter(lambda scored: scored[1][1]>=min_rating)
)

In [None]:
# Header: target id, its number of ratings, its average rating and its title.
target_info=movie_dict.value[target_movie]
header_fmt='Most similar movies to {:4d} [{:4d}] ({:.2f}) - {}'
print(header_fmt.format(target_movie,target_info[1],target_info[0],target_info[2]))

# The ten highest-scoring matches, one per line:
# id [score, shared rating pairs/total ratings] (average rating): title
row_fmt='{:4d} [{:.2f}, {:4d}/{:4d}] ({:.2f}): {}'
top_matches=sims_to_target.sortByKey(ascending=False).take(10)
for sim_val,(sim_id,_sim_ar,sim_num) in top_matches:
    sim_info=movie_dict.value[sim_id]
    print(row_fmt.format(sim_id,sim_val,sim_num,sim_info[1],sim_info[0],sim_info[2]))

Most similar movies to  260 [2991] (4.45) - Star Wars: Episode IV - A New Hope (1977)
