# 29.15 Movie similarities
- Local
- 1M x 0.25
- Partitioned [0,4,16,64,256]
- Broadcasting title, average rating, and genre vector dictionary
- Streamlined (this time for the real deal)
- Timed
- Removing badly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [1]:
# --- Execution settings ------------------------------------------------------
# Spark master URL: 'local[*]' or 'spark://1.1.1.102:7077'
master = 'local[*]'
# Which MovieLens data set to use: '100K' or '1M'
target_data = '1M'
# Fraction of the ratings file that is sampled, and the seed used for sampling
data_fraction = 0.25
seed = 42

# --- Experiment settings -----------------------------------------------------
# Partition counts to time; 0 skips the explicit partitionBy step
num_partitions_lst = [0, 4, 16, 64, 256]
# Movie whose neighbours are searched for
target_movie = 260
# Thresholds applied to the candidates before ranking them
min_rating = 4
min_similarity = 0.95
min_num_rating_pairs = 25

# --- Input files: 100K data set ----------------------------------------------
file_ratings_100K = '/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data'
file_info_100K = '/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item'

# --- Input files: 1M data set ------------------------------------------------
file_ratings_1M = '/home/ggomarr/jupyter/Sparky/ZZ_Data/ml-1m/ratings.dat'
file_info_1M = '/home/ggomarr/jupyter/Sparky/ZZ_Data/ml-1m/movies.dat'

In [2]:
# datetime is used further down to time each partitioning run.
import datetime

# findspark.init() is called before pyspark is imported; keep this order.
# NOTE(review): presumably it makes the local Spark installation importable - confirm against the findspark docs.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
# Configure the application with the `master` chosen in the settings cell and
# create the SparkContext (`sc`) that every later cell relies on.
conf=SparkConf().setMaster(master).setAppName('MovieSimilaritiesRedux015')
sc=SparkContext(conf=conf)

In [3]:
def process_user_movie_rating_100K(movie_row):
    """Parse one whitespace-separated 100K ratings row, keyed by user.

    The first three fields of the row are read as user id, movie id and rating.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields=movie_row.split()
    return int(fields[0]),(int(fields[1]),float(fields[2]))

def process_movie_rating_100K(movie_row):
    """Parse one whitespace-separated 100K ratings row, keyed by movie.

    Only the second (movie id) and third (rating) fields are used.

    Returns:
        (movie_id, (rating, 1)); the trailing 1 is a count that can be summed
        together with the rating when aggregating per movie.
    """
    fields=movie_row.split()
    return (int(fields[1]),(float(fields[2]),1))

def build_movie_dict_100K(file_nom,ratings_dict=None):
    """Build the movie lookup table from a 100K-style, pipe-separated info file.

    Each non-blank row is split on '|': field 0 is the movie id, field 1 the
    title, and every field from index 5 onwards is read as an integer genre
    flag.

    Args:
        file_nom: path of the movie info file.
        ratings_dict: optional mapping movie_id -> (average_rating, num_ratings).
            Movies missing from it get an average and a count of 0.

    Returns:
        dict mapping movie_id -> (average_rating, num_ratings, title, genre_flags).
    """
    # A None default avoids sharing one mutable dict between calls.
    if ratings_dict is None:
        ratings_dict={}
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no movie.
            if not movie_row.strip():
                continue
            movie=movie_row.split('|')
            movie_id=int(movie[0])
            movie_title=movie[1]
            # int() tolerates the newline still attached to the last flag.
            movie_genres=[int(g) for g in movie[5:]]
            # Membership is tested on the dict itself, not on a list of its keys.
            if movie_id in ratings_dict:
                movie_avg=ratings_dict[movie_id][0]
                movie_cnt=ratings_dict[movie_id][1]
            else:
                movie_avg=0
                movie_cnt=0
            movie_dict[movie_id]=(movie_avg,movie_cnt,movie_title,movie_genres)
    return movie_dict

def process_user_movie_rating_1M(movie_row):
    """Parse one '::'-separated 1M ratings row, keyed by user.

    The first three fields of the row are read as user id, movie id and rating.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields=movie_row.split('::')
    return int(fields[0]),(int(fields[1]),float(fields[2]))

def process_movie_rating_1M(movie_row):
    """Parse one '::'-separated 1M ratings row, keyed by movie.

    Only the second (movie id) and third (rating) fields are used.

    Returns:
        (movie_id, (rating, 1)); the trailing 1 is a count that can be summed
        together with the rating when aggregating per movie.
    """
    fields=movie_row.split('::')
    return (int(fields[1]),(float(fields[2]),1))

def build_movie_dict_1M(file_nom,ratings_dict=None):
    """Build the movie lookup table from a 1M-style, '::'-separated info file.

    Each non-blank row is stripped and split on '::': field 0 is the movie id,
    field 1 the title and field 2 a '|'-separated list of genre names. The
    genre names are turned into a 0/1 vector with one slot per known genre.

    Args:
        file_nom: path of the movie info file.
        ratings_dict: optional mapping movie_id -> (average_rating, num_ratings).
            Movies missing from it get an average and a count of 0.

    Returns:
        dict mapping movie_id -> (average_rating, num_ratings, title, genre_flags).

    Raises:
        ValueError: if a row names a genre that is not in the known list
            (raised by list.index), or if a movie id is not an integer.
    """
    # A None default avoids sharing one mutable dict between calls.
    if ratings_dict is None:
        ratings_dict={}
    # The genre list is the same for every row, so it is built once up front.
    genres=['Action','Adventure','Animation','Children\'s','Comedy','Crime',
            'Documentary','Drama','Fantasy','Film-Noir','Horror',
            'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            movie_row=movie_row.strip()
            # Blank lines (e.g. a trailing newline at the end of the file) carry no movie.
            if not movie_row:
                continue
            movie=movie_row.split('::')
            movie_id=int(movie[0])
            movie_title=movie[1]
            movie_genres=[0]*len(genres)
            for genre in movie[2].split('|'):
                movie_genres[genres.index(genre)]=1
            # Membership is tested on the dict itself, not on a list of its keys.
            if movie_id in ratings_dict:
                movie_avg=ratings_dict[movie_id][0]
                movie_cnt=ratings_dict[movie_id][1]
            else:
                movie_avg=0
                movie_cnt=0
            movie_dict[movie_id]=(movie_avg,movie_cnt,movie_title,movie_genres)
    return movie_dict

def cosine_similarity(v1,v2):
    """Return the cosine similarity of two equally long numeric sequences.

    Args:
        v1, v2: indexable sequences of numbers of the same length.

    Returns:
        dot(v1, v2) / (|v1| * |v2|) as a float, or 0.0 when the denominator is
        zero (a vector with zero norm, or empty input).

    Raises:
        IndexError: if v2 is shorter than v1. Other unexpected errors are no
            longer swallowed either, so real data problems surface instead of
            silently scoring 0.
    """
    from math import sqrt
    def mult_vectors(a,b):
        # Dot product; the 1.0 factor keeps the arithmetic in floating point
        # even when both inputs hold integers.
        return sum(1.0*a[n]*b[n] for n in range(len(a)))
    try:
        return mult_vectors(v1,v2)/(sqrt(mult_vectors(v1,v1)) * sqrt(mult_vectors(v2,v2)))
    except ZeroDivisionError:
        # Only the zero-norm case is treated as "no similarity".
        return 0.0

def jaccard_similarity(v1,v2):
    """Score v1 against v2 with scikit-learn's jaccard_similarity_score.

    The arguments are passed through unchanged and the library's result is
    returned as is.

    NOTE(review): jaccard_similarity_score appears to have been deprecated and
    later removed from scikit-learn - confirm the installed version still has it.
    """
    import sklearn.metrics
    return sklearn.metrics.jaccard_similarity_score(v1,v2)
    
def compute_similarity(movie_pair_data,genre_weight=0.25,target_movie=target_movie):
    """Score how similar the non-target movie of a pair is to the target movie.

    Args:
        movie_pair_data: (movie_ids, rating_pairs), where movie_ids is a pair
            of movie ids and rating_pairs is a non-empty list of
            (rating_a, rating_b) tuples.
        genre_weight: factor applied to the genre similarity before it is
            added to the rating similarity.
        target_movie: id of the reference movie. The default is taken from the
            module-level `target_movie` when this function is defined, so
            later changes to that global need the function to be re-defined.

    Returns:
        (similarity, num_pairs, similar_movie_average_rating, similar_id),
        where similarity = rating cosine + genre_weight * genre cosine, so it
        can exceed 1.
    """
    movie_ids=movie_pair_data[0]
    # `!=` replaces the `<>` operator, which is a syntax error on Python 3.
    similar_id=movie_ids[0] if movie_ids[0]!=target_movie else movie_ids[1]
    # Broadcast entries are indexed by position: 0 is the average rating, 3 the genre vector.
    similar_movie_ar=movie_dict.value[similar_id][0]
    rating_pairs=movie_pair_data[1]
    num_pairs=len(rating_pairs)
    # zip(*pairs) turns the list of pairs into one rating vector per movie.
    ur_similarity=cosine_similarity(*zip(*rating_pairs))
    gr_similarity=cosine_similarity(movie_dict.value[movie_ids[0]][3],
                                    movie_dict.value[movie_ids[1]][3])
    similarity=ur_similarity+genre_weight*gr_similarity
    return (similarity,num_pairs,similar_movie_ar,similar_id)

def print_output(output):
    """Print a ranking of movies similar to the global `target_movie`.

    Args:
        output: iterable of records whose second element is a tuple holding
            the similarity at index 0, the number of rating pairs at index 1
            and the similar movie's id at index 3.

    Titles, average ratings and rating counts are looked up in the broadcast
    `movie_dict`.
    """
    header_tpl='Most similar movies to {} (Id: {}, AR: {:.2f}, NR: {})'
    row_tpl='{:4d} - {:.2f}/{:.2f} - {:4d}/{:4d}: {}'
    catalogue=movie_dict.value
    target_info=catalogue[target_movie]
    print(header_tpl.format(target_info[2],target_movie,target_info[0],target_info[1]))
    print(' Id     SR / AR     PR / NR   Title')
    for entry in output:
        stats=entry[1]
        match_id=stats[3]
        match_info=catalogue[match_id]
        # Columns: id, similarity / average rating, rating pairs / total ratings, title.
        print(row_tpl.format(match_id,stats[0],match_info[0],stats[1],match_info[1],match_info[2]))

In [4]:
# Bind the generic names used by the later cells to the implementations for the
# selected data set; anything other than '100K' selects the 1M variants.
(file_ratings,
 file_info,
 process_movie_rating,
 build_movie_dict,
 process_user_movie_rating)=((file_ratings_100K,
                              file_info_100K,
                              process_movie_rating_100K,
                              build_movie_dict_100K,
                              process_user_movie_rating_100K)
                             if target_data=='100K' else
                             (file_ratings_1M,
                              file_info_1M,
                              process_movie_rating_1M,
                              build_movie_dict_1M,
                              process_user_movie_rating_1M))

In [5]:
%%time

# Per-movie (average rating, number of ratings), collected to the driver as a
# dict keyed by movie id.
movie_ar_dict=(
    sc.textFile(file_ratings)
      .map(process_movie_rating)
      # Sum the ratings and the counts of each movie.
      .reduceByKey(lambda acc,nxt: (acc[0]+nxt[0],acc[1]+nxt[1]))
      # Turn (rating sum, count) into (average, count).
      .mapValues(lambda total: (total[0]/total[1],total[1]))
      .collectAsMap()
)
# Combine the averages with titles and genre vectors and broadcast the result.
movie_dict=sc.broadcast(build_movie_dict(file_info,movie_ar_dict))

CPU times: user 232 ms, sys: 4 ms, total: 236 ms
Wall time: 6.94 s


In [6]:
%%time

# One (num_partitions, elapsed_time, top matches) entry per partition count.
output_lst=[]

for num_partitions in num_partitions_lst:
    start_time=datetime.datetime.now()

    # (user_id,(movie_id,rating)) records from a seeded sample of the ratings file.
    user_movie_ratings=sc.textFile(file_ratings).sample(False,data_fraction,seed).map(process_user_movie_rating)

    # A partition count of 0 skips the explicit partitioning step.
    if num_partitions>0:
        user_movie_ratings=user_movie_ratings.partitionBy(num_partitions)

    # Self-join on user id, keep each pair that involves the target movie once
    # (lower movie id first), and gather the rating pairs of every movie pair.
    tm_mr_pairs=(
        user_movie_ratings.join(user_movie_ratings)
        .filter(lambda rec: rec[1][0][0]<rec[1][1][0] and
                            target_movie in (rec[1][0][0],rec[1][1][0]))
        .map(lambda rec: ((rec[1][0][0],rec[1][1][0]),[(rec[1][0][1],rec[1][1][1])]))
        .reduceByKey(lambda left,right: left+right)
    )

    # Score every pair, then drop candidates below the similarity, pair-count
    # and average-rating thresholds.
    tm_sims=(
        tm_mr_pairs.map(lambda pair: (pair[0],compute_similarity(pair)),preservesPartitioning=True)
        .filter(lambda scored: (scored[1][0]>=min_similarity) and
                               (scored[1][1]>=min_num_rating_pairs) and
                               (scored[1][2]>=min_rating))
    )

    # takeOrdered returns a plain list holding the ten highest similarities.
    output_rdd=tm_sims.takeOrdered(10,key=lambda scored: -scored[1][0])
    elapsed_time=datetime.datetime.now()-start_time
    output_lst.append((num_partitions,elapsed_time,output_rdd))

CPU times: user 236 ms, sys: 28 ms, total: 264 ms
Wall time: 1min 10s


In [7]:
# Each entry is (num_partitions, elapsed_time, top matches).
for output in output_lst:
    # Keep only the part of the elapsed time before the fractional seconds.
    print('Master: {}, Partitions: {}, Time: {}'.format(master,output[0],str(output[1]).partition('.')[0]))
    print_output(output[2])

Master: local[*], Partitions: 0, Time: 0:00:14
Most similar movies to Star Wars: Episode IV - A New Hope (1977) (Id: 260, AR: 4.45, NR: 2991)
 Id     SR / AR     PR / NR   Title
1198 - 1.16/4.48 -  118/2514: Raiders of the Lost Ark (1981)
1196 - 1.16/4.29 -  168/2990: Star Wars: Episode V - The Empire Strikes Back (1980)
1291 - 1.15/4.13 -   92/1628: Indiana Jones and the Last Crusade (1989)
1210 - 1.15/4.02 -  135/2883: Star Wars: Episode VI - Return of the Jedi (1983)
  29 - 1.14/4.06 -   26/ 403: City of Lost Children, The (1995)
1240 - 1.12/4.15 -  103/2098: Terminator, The (1984)
2571 - 1.12/4.32 -  120/2590: Matrix, The (1999)
 589 - 1.12/4.06 -  106/2649: Terminator 2: Judgment Day (1991)
1287 - 1.11/4.11 -   29/ 704: Ben-Hur (1959)
1199 - 1.10/4.10 -   58/ 913: Brazil (1985)
Master: local[*], Partitions: 4, Time: 0:00:11
Most similar movies to Star Wars: Episode IV - A New Hope (1977) (Id: 260, AR: 4.45, NR: 2991)
 Id     SR / AR     PR / NR   Title
1198 - 1.16/4.48 -  118/2514