# 29.14 Movie similarities
- Local
- 100K x 1.0
- Partitioned [0,4,16,64,256]
- Broadcasting title, average rating, and genre vector dictionary
- Streamlined (this time for the real deal)
- Timed
- Removing badly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [23]:
# Spark master URL handed to SparkConf.setMaster when the context is created.
master='local[*]' # 'local[*]' or 'spark://1.1.1.102:7077'
# Dataset selector: '100K' picks the *_100K paths and parsers, any other value the *_1M ones.
target_data='100K' # '100K' or '1M'
# Fraction and seed passed to RDD.sample(False, data_fraction, seed) when the ratings are loaded.
data_fraction=1
seed=42

# Partition counts to benchmark; a value of 0 skips the explicit partitionBy call.
num_partitions_lst=[0,4,16,64,256]
# Id of the movie whose most similar movies are looked up.
target_movie=50
# Result filters: minimum average rating of the similar movie, minimum combined
# similarity score, and minimum number of rating pairs shared with the target movie.
min_rating=4
min_similarity=0.95
min_num_rating_pairs=25

# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
file_ratings_100K='/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data'
file_info_100K='/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item'

# Left empty here; both must be filled in before switching target_data to '1M'.
file_ratings_1M=''
file_info_1M=''

In [2]:
import datetime

import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
# Build the Spark configuration from the `master` setting and create the
# context that the later cells use as `sc`.
# NOTE(review): re-running this cell in a live kernel presumably fails because a
# SparkContext already exists -- confirm; SparkContext.getOrCreate may be an option.
conf=SparkConf().setMaster(master).setAppName('MovieSimilaritiesRedux014')
sc=SparkContext(conf=conf)

In [3]:
def process_user_movie_rating_100K(movie_row):
    """Parse one whitespace-separated ratings line into ``(user_id,(movie_id,rating))``.

    The first three fields are read as user id (int), movie id (int) and
    rating (float); any further fields on the line are ignored.
    """
    fields=movie_row.split()
    user_id,movie_id=int(fields[0]),int(fields[1])
    return user_id,(movie_id,float(fields[2]))

def process_movie_rating_100K(movie_row):
    """Parse one whitespace-separated ratings line into ``(movie_id,(rating,1))``.

    The second field is the movie id and the third the rating. The trailing 1
    is a counter so that sums and counts can be accumulated together per movie.
    """
    fields=movie_row.split()
    return (int(fields[1]),(float(fields[2]),1))

def build_movie_dict_100K(file_nom,ratings_dict=None):
    """Load the '|'-separated movie info file into a lookup dictionary.

    Parameters
    ----------
    file_nom : str
        Path of the info file. Every line is split on '|': field 0 is the
        movie id, field 1 the title and fields 5 onwards are integer flags
        that are stored as the genre vector.
    ratings_dict : dict, optional
        Maps movie_id -> (average_rating, rating_count). Movies missing from
        it (or every movie, when it is omitted) get an average and count of 0.

    Returns
    -------
    dict
        movie_id -> (average_rating, rating_count, title, genre_vector)
    """
    # A None default avoids sharing one mutable dict object between calls.
    if ratings_dict is None:
        ratings_dict={}
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            # Skip blank lines instead of failing on int('').
            if not movie_row.strip():
                continue
            movie=movie_row.split('|')
            movie_id=int(movie[0])
            movie_title=movie[1]
            # int() tolerates the newline still attached to the last flag.
            movie_genres=[int(g) for g in movie[5:]]
            # Direct dictionary lookup rather than a scan of ratings_dict.keys().
            stats=ratings_dict.get(movie_id,(0,0))
            movie_dict[movie_id]=(stats[0],stats[1],movie_title,movie_genres)
    return movie_dict

def process_user_movie_rating_1M(movie_row):
    """Parse one '::'-separated ratings line into ``(user_id,(movie_id,rating))``.

    The first three fields are read as user id (int), movie id (int) and
    rating (float); any further fields on the line are ignored.
    """
    fields=movie_row.split('::')
    user_id,movie_id=int(fields[0]),int(fields[1])
    return user_id,(movie_id,float(fields[2]))

def process_movie_rating_1M(movie_row):
    """Parse one '::'-separated ratings line into ``(movie_id,(rating,1))``.

    The second field is the movie id and the third the rating. The trailing 1
    is a counter so that sums and counts can be accumulated together per movie.
    """
    fields=movie_row.split('::')
    return (int(fields[1]),(float(fields[2]),1))

def build_movie_dict_1M(file_nom,ratings_dict=None):
    """Load the '::'-separated movie info file into a lookup dictionary.

    Parameters
    ----------
    file_nom : str
        Path of the info file. Every line is split on '::': field 0 is the
        movie id, field 1 the title and field 2 a '|'-separated list of genre
        names.
    ratings_dict : dict, optional
        Maps movie_id -> (average_rating, rating_count). Movies missing from
        it (or every movie, when it is omitted) get an average and count of 0.

    Returns
    -------
    dict
        movie_id -> (average_rating, rating_count, title, genre_vector), where
        genre_vector holds one 0/1 flag per known genre, in the order of the
        ``genres`` list below.

    Raises
    ------
    ValueError
        If a line names a genre that is not in the ``genres`` list.
    """
    # A None default avoids sharing one mutable dict object between calls.
    if ratings_dict is None:
        ratings_dict={}
    # Built once: the list of known genres does not change from row to row.
    genres=['Action','Adventure','Animation','Children\'s','Comedy','Crime',
            'Documentary','Drama','Fantasy','Film-Noir','Horror',
            'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            movie_row=movie_row.strip()
            # Skip blank lines instead of failing on int('').
            if not movie_row:
                continue
            movie=movie_row.split('::')
            movie_id=int(movie[0])
            movie_title=movie[1]
            movie_genres=[0]*len(genres)
            for genre in movie[2].split('|'):
                movie_genres[genres.index(genre)]=1
            # Direct dictionary lookup rather than a scan of ratings_dict.keys().
            stats=ratings_dict.get(movie_id,(0,0))
            movie_dict[movie_id]=(stats[0],stats[1],movie_title,movie_genres)
    return movie_dict

def cosine_similarity(v1,v2):
    """Return the cosine similarity of two numeric vectors.

    Parameters
    ----------
    v1, v2 : sequence of numbers
        Indexable vectors. The dot product runs over the positions of ``v1``.

    Returns
    -------
    float
        dot(v1,v2) / (|v1| * |v2|), or 0.0 when either norm is zero (which
        includes empty vectors).

    Raises
    ------
    IndexError
        If ``v2`` is shorter than ``v1``.
    TypeError
        If an element cannot be multiplied as a number.
    """
    from math import sqrt
    def mult_vectors(a,b):
        # 1.0* forces float arithmetic even when both vectors hold integers.
        return sum(1.0*a[n]*b[n] for n in range(len(a)))
    try:
        return mult_vectors(v1,v2)/(sqrt(mult_vectors(v1,v1)) * sqrt(mult_vectors(v2,v2)))
    except ZeroDivisionError:
        # Only the zero-norm case maps to "no similarity"; any other error is
        # a data problem and is allowed to surface instead of being hidden.
        return 0.0

def jaccard_similarity(v1,v2):
    """Return scikit-learn's ``jaccard_similarity_score`` for the two vectors.

    scikit-learn is imported at call time, so it is only required when this
    function is actually used.
    NOTE(review): ``jaccard_similarity_score`` is reportedly deprecated and
    removed in newer scikit-learn releases -- confirm against the installed
    version before relying on this helper.
    """
    import sklearn.metrics as sk_metrics
    return sk_metrics.jaccard_similarity_score(v1,v2)
    
def compute_similarity(movie_pair_data,genre_weight=0.25,target_movie=target_movie):
    """Score how similar the non-target movie of a pair is to the target movie.

    Parameters
    ----------
    movie_pair_data : tuple
        ``((movie_id_a,movie_id_b),rating_pairs)`` where ``rating_pairs`` is a
        sequence of ``(rating_a,rating_b)`` tuples.
    genre_weight : float
        Factor applied to the genre similarity before it is added to the
        rating similarity.
    target_movie : int
        Id of the reference movie. The default is taken from the module-level
        ``target_movie`` at the moment this function is defined.

    Returns
    -------
    tuple
        ``(similarity,num_pairs,similar_movie_ar,similar_id)``: the combined
        score (rating cosine + genre_weight * genre cosine, so it is not capped
        at 1), the number of rating pairs, and the average rating and id of
        the movie that is not the target.

    Reads the broadcast ``movie_dict`` defined elsewhere in the notebook.
    """
    movie_ids=movie_pair_data[0]
    rating_pairs=movie_pair_data[1]
    # `!=` replaces the Python-2-only `<>` operator, which Python 3 rejects.
    similar_id=movie_ids[0] if movie_ids[0]!=target_movie else movie_ids[1]
    movies=movie_dict.value
    similar_movie_ar=movies[similar_id][0]
    num_pairs=len(rating_pairs)
    # zip(*pairs) turns [(a1,b1),(a2,b2),...] into the two rating vectors.
    ur_similarity=cosine_similarity(*zip(*rating_pairs))
    gr_similarity=cosine_similarity(movies[movie_ids[0]][3],
                                    movies[movie_ids[1]][3])
    similarity=ur_similarity+genre_weight*gr_similarity
    return (similarity,num_pairs,similar_movie_ar,similar_id)

def print_output(output):
    """Print the target movie followed by one table row per similar movie.

    ``output`` is an iterable of ``(key,(similarity,num_pairs,_,similar_id))``
    entries. Titles, average ratings (AR) and rating counts (NR) are looked up
    in the broadcast ``movie_dict``; the target is the module-level
    ``target_movie``.
    """
    movies=movie_dict.value
    target_info=movies[target_movie]
    print('Most similar movies to {} (Id: {}, AR: {:.2f}, NR: {})'.format(
        target_info[2],target_movie,target_info[0],target_info[1]))
    print(' Id     SR / AR     PR / NR   Title')
    row_format='{:4d} - {:.2f}/{:.2f} - {:4d}/{:4d}: {}'
    for r in output:
        scores=r[1]
        similar_id=scores[3]
        similar_info=movies[similar_id]
        print(row_format.format(similar_id,
                                scores[0],similar_info[0],
                                scores[1],similar_info[1],
                                similar_info[2]))

In [4]:
# Point the generic names used by the later cells at the files and parsers of
# the dataset chosen through `target_data` ('100K', otherwise the 1M variant).
if target_data=='100K':
    file_ratings,file_info=file_ratings_100K,file_info_100K
    process_movie_rating,process_user_movie_rating=(process_movie_rating_100K,
                                                    process_user_movie_rating_100K)
    build_movie_dict=build_movie_dict_100K
else:
    file_ratings,file_info=file_ratings_1M,file_info_1M
    process_movie_rating,process_user_movie_rating=(process_movie_rating_1M,
                                                    process_user_movie_rating_1M)
    build_movie_dict=build_movie_dict_1M

In [5]:
%%time

# Average rating and number of ratings per movie, brought back to the driver
# as {movie_id: (average_rating, rating_count)}.
movie_ar_dict=(sc.textFile(file_ratings)
                 .map(process_movie_rating)                       # (movie_id,(rating,1))
                 .reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1]))  # (movie_id,(rating_sum,rating_count))
                 .mapValues(lambda x: (x[0]/x[1],x[1]))           # (movie_id,(average_rating,rating_count))
                 .collectAsMap())
# Combine those statistics with the titles and genre vectors from the info file
# and broadcast the result; compute_similarity and print_output read it through
# movie_dict.value.
movie_dict=sc.broadcast(build_movie_dict(file_info,movie_ar_dict))

CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 2.09 s


In [24]:
%%time

# One (num_partitions, elapsed_time, top_matches) entry per benchmarked setting.
output_lst=[]

for num_partitions in num_partitions_lst:
    start_time=datetime.datetime.now()
    # Ratings keyed by user: (user_id,(movie_id,rating)).
    user_movie_ratings=(sc.textFile(file_ratings)
                          .sample(False,data_fraction,seed)
                          .map(process_user_movie_rating))

    # 0 means "do not repartition explicitly".
    if num_partitions>0:
        user_movie_ratings=user_movie_ratings.partitionBy(num_partitions)

    # Self-join on user id gives (user_id,((movie_a,rating_a),(movie_b,rating_b))).
    # The filter keeps pairs that involve the target movie and have
    # movie_a<movie_b, which drops self-pairs and mirrored duplicates. The rating
    # pairs are then gathered per movie pair:
    # ((movie_a,movie_b),[(rating_a,rating_b),...]).
    tm_mr_pairs=(user_movie_ratings.join(user_movie_ratings)
                                   .filter(lambda x: ((x[1][0][0]==target_movie) or (x[1][1][0]==target_movie)) and \
                                                     x[1][0][0]<x[1][1][0])
                                   .map(lambda x: ((x[1][0][0],x[1][1][0]),[(x[1][0][1],x[1][1][1])]))
                                   .reduceByKey(lambda x,y: x+y))

    # Score every pair as (pair,(similarity,num_pairs,similar_movie_ar,similar_id))
    # and keep those that meet the similarity, pair-count and average-rating
    # thresholds from the configuration cell.
    tm_sims=(tm_mr_pairs.map(lambda x: (x[0],compute_similarity(x)),preservesPartitioning=True)
                        .filter(lambda x: (x[1][0]>=min_similarity) and \
                                          (x[1][1]>=min_num_rating_pairs) and \
                                          (x[1][2]>=min_rating)))
    # Ten highest similarities (the key is negated for descending order). Despite
    # its name, output_rdd is iterated as a plain collection by print_output.
    output_rdd=tm_sims.takeOrdered(10,key=lambda x: -x[1][0])
    elapsed_time=datetime.datetime.now()-start_time
    output_lst=output_lst+[(num_partitions,elapsed_time,output_rdd)]

CPU times: user 208 ms, sys: 56 ms, total: 264 ms
Wall time: 51.3 s


In [25]:
# One report per benchmark run; each entry is (num_partitions, elapsed_time, top_matches).
for output in output_lst:
    # str(timedelta) carries fractional seconds after a '.'; keep only the part before it.
    print('Master: {}, Partitions: {}, Time: {}'.format(master,output[0],str(output[1]).split('.')[0]))
    print_output(output[2])

Master: local[*], Partitions: 0, Time: 0:00:08
Most similar movies to Star Wars (1977) (Id: 50, AR: 4.36, NR: 583)
 Id     SR / AR     PR / NR   Title
 181 - 1.24/4.01 -  480/ 507: Return of the Jedi (1983)
 172 - 1.22/4.20 -  345/ 367: Empire Strikes Back, The (1980)
 498 - 1.20/4.18 -  138/ 152: African Queen, The (1951)
 174 - 1.14/4.25 -  380/ 420: Raiders of the Lost Ark (1981)
 173 - 1.14/4.17 -  284/ 324: Princess Bride, The (1987)
 511 - 1.13/4.23 -  153/ 173: Lawrence of Arabia (1962)
 520 - 1.12/4.10 -  107/ 124: Great Escape, The (1963)
 474 - 1.12/4.25 -  171/ 194: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
 483 - 1.10/4.46 -  214/ 243: Casablanca (1942)
 651 - 1.10/4.08 -  155/ 171: Glory (1989)
Master: local[*], Partitions: 4, Time: 0:00:08
Most similar movies to Star Wars (1977) (Id: 50, AR: 4.36, NR: 583)
 Id     SR / AR     PR / NR   Title
 181 - 1.24/4.01 -  480/ 507: Return of the Jedi (1983)
 172 - 1.22/4.20 -  345/ 367: Empire Strik