# 29.16 Movie similarities
- Local
- 1M x [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
- Partitioned [0,4,8,16,32,64,128]
- Broadcasting title, average rating, and genre vector dictionary
- Streamlined (this time for the real deal)
- Timed
- Removing badly rated movies at the end
- Cosine similarity for movie ratings
- Cosine similarity for movie genres

In [1]:
# --- Execution environment ---
master='local[*]' # 'local[*]' or 'spark://1.1.1.102:7077'
target_data='1M' # '100K' or '1M'

# --- Benchmark grid ---
# Fractions of the ratings file to sample, and the seed handed to the sampler.
data_fractions_lst=[0.1,0.2,0.3,0.4,0.5,
                    0.6,0.7,0.8,0.9,1.0]
seed=42
# Partition counts to try; 0 means "do not call partitionBy at all".
num_partitions_lst=[0,4,8,16,32,64,128]

# --- Similarity search ---
target_movie=260          # movie id whose neighbours are searched for
min_rating=4              # lowest average rating a candidate may have
min_similarity=0.95       # lowest combined similarity score kept
min_num_rating_pairs=25   # lowest number of co-rating pairs kept

# --- Input files ---
data_dir_100K='/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/'
file_ratings_100K=data_dir_100K+'u.data'
file_info_100K=data_dir_100K+'u.item'

data_dir_1M='/home/ggomarr/jupyter/Sparky/ZZ_Data/ml-1m/'
file_ratings_1M=data_dir_1M+'ratings.dat'
file_info_1M=data_dir_1M+'movies.dat'

In [2]:
import datetime

import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
# Build the configuration for this run and obtain the SparkContext.
# getOrCreate reuses a context that is already running in this kernel, so
# re-executing the cell does not fail the way constructing a second
# SparkContext does. NOTE(review): when a context already exists it is
# returned as-is, so a changed `master` only takes effect after sc.stop()
# or a kernel restart.
conf=SparkConf().setMaster(master).setAppName('MovieSimilaritiesRedux016')
sc=SparkContext.getOrCreate(conf=conf)

In [3]:
def process_user_movie_rating_100K(movie_row):
    """Parse one whitespace-separated rating row, keyed by user.

    The first three fields are read as user id, movie id and rating; any
    further fields are ignored.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields=movie_row.split()
    return int(fields[0]),(int(fields[1]),float(fields[2]))

def process_movie_rating_100K(movie_row):
    """Parse one whitespace-separated rating row, keyed by movie.

    Returns:
        (movie_id, (rating, 1)); the trailing 1 is a count so that rating
        sums and counts can be accumulated together per movie.
    """
    fields=movie_row.split()
    return (int(fields[1]),(float(fields[2]),1))

def build_movie_dict_100K(file_nom,ratings_dict={}):
    """Build the movie lookup table from a '|'-separated movie info file.

    Args:
        file_nom: path of the movie info file. Field 0 is read as the movie
            id, field 1 as the title, and fields 5 onward as integer genre
            flags.
        ratings_dict: optional mapping movie_id -> (average rating, number of
            ratings). It is only read, never modified. Movies missing from it
            get an average and a count of 0.

    Returns:
        dict mapping movie_id -> (movie_avg, movie_cnt, movie_title, movie_genres).
    """
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            # A blank (e.g. trailing) line carries no movie; skip it instead
            # of failing on int('').
            if not movie_row.strip():
                continue
            movie=movie_row.split('|')
            movie_id=int(movie[0])
            movie_title=movie[1]
            movie_genres=[int(g) for g in movie[5:]]
            # Direct dict lookup: `in ratings_dict.keys()` builds and scans a
            # list for every row on Python 2.
            rating_info=ratings_dict.get(movie_id,(0,0))
            movie_dict[movie_id]=(rating_info[0],rating_info[1],movie_title,movie_genres)
    return movie_dict

def process_user_movie_rating_1M(movie_row):
    """Parse one '::'-separated rating row, keyed by user.

    The first three fields are read as user id, movie id and rating; any
    further fields are ignored.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields=movie_row.split('::')
    return int(fields[0]),(int(fields[1]),float(fields[2]))

def process_movie_rating_1M(movie_row):
    """Parse one '::'-separated rating row, keyed by movie.

    Returns:
        (movie_id, (rating, 1)); the trailing 1 is a count so that rating
        sums and counts can be accumulated together per movie.
    """
    fields=movie_row.split('::')
    return (int(fields[1]),(float(fields[2]),1))

def build_movie_dict_1M(file_nom,ratings_dict={}):
    """Build the movie lookup table from a '::'-separated movie info file.

    Args:
        file_nom: path of the movie info file. Each row is read as
            id::title::genres, where genres is a '|'-separated list of names.
        ratings_dict: optional mapping movie_id -> (average rating, number of
            ratings). It is only read, never modified. Movies missing from it
            get an average and a count of 0.

    Returns:
        dict mapping movie_id -> (movie_avg, movie_cnt, movie_title, movie_genres),
        where movie_genres is a 0/1 list with one slot per known genre, in
        the order of the `genres` list below.

    Raises:
        ValueError: if a row names a genre that is not in the known list.
    """
    # Fixed genre vocabulary; built once rather than once per row.
    genres=['Action','Adventure','Animation','Children\'s','Comedy','Crime',
            'Documentary','Drama','Fantasy','Film-Noir','Horror',
            'Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
    movie_dict={}
    with open(file_nom) as f:
        for movie_row in f:
            movie_row=movie_row.strip()
            # A blank (e.g. trailing) line carries no movie; skip it instead
            # of failing on int('').
            if not movie_row:
                continue
            movie=movie_row.split('::')
            movie_id=int(movie[0])
            movie_title=movie[1]
            movie_genres=[0]*len(genres)
            for genre in movie[2].split('|'):
                movie_genres[genres.index(genre)]=1
            # Direct dict lookup: `in ratings_dict.keys()` builds and scans a
            # list for every row on Python 2.
            rating_info=ratings_dict.get(movie_id,(0,0))
            movie_dict[movie_id]=(rating_info[0],rating_info[1],movie_title,movie_genres)
    return movie_dict

def cosine_similarity(v1,v2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Args:
        v1, v2: indexable sequences of numbers of the same length.

    Returns:
        dot(v1,v2)/(|v1|*|v2|) as a float, or 0 when either vector has zero
        norm and the ratio is therefore undefined.

    Only the zero-norm case is treated as "no similarity". Any other failure
    (mismatched lengths, non-numeric entries, an interrupt) propagates instead
    of being silently turned into a 0 score.
    """
    from math import sqrt
    def mult_vectors(a,b):
        # Dot product; 1.0* forces float arithmetic for integer inputs.
        return sum(1.0*a[n]*b[n] for n in range(len(a)))
    try:
        return mult_vectors(v1,v2)/(sqrt(mult_vectors(v1,v1)) * sqrt(mult_vectors(v2,v2)))
    except ZeroDivisionError:
        return 0

def jaccard_similarity(v1,v2):
    """Score two equal-length label vectors with sklearn's jaccard_similarity_score.

    scikit-learn deprecated `jaccard_similarity_score` and later removed it.
    For 1-D binary/multiclass vectors it was documented as equivalent to
    accuracy (the fraction of positions where the two vectors agree), so when
    the import is unavailable that same value is computed directly.

    NOTE(review): the fallback covers flat 1-D vectors only (such as the 0/1
    genre vectors built in this notebook); multilabel indicator matrices are
    not handled by it.

    Raises:
        ValueError: (fallback path) if the vectors are empty or differ in length.
    """
    try:
        from sklearn.metrics import jaccard_similarity_score
    except ImportError:
        if len(v1)!=len(v2) or len(v1)==0:
            raise ValueError('jaccard_similarity needs two non-empty vectors of equal length')
        matches=sum(1 for a,b in zip(v1,v2) if a==b)
        return matches/float(len(v1))
    return jaccard_similarity_score(v1,v2)
    
def compute_similarity(movie_pair_data,genre_weight=0.25,target_movie=target_movie):
    """Score how similar the non-target movie of a pair is to the target movie.

    Args:
        movie_pair_data: ((movie_id_a, movie_id_b), rating_pairs), where
            rating_pairs is a list of (rating_a, rating_b) tuples.
        genre_weight: factor applied to the genre similarity before it is
            added to the rating similarity.
        target_movie: id of the reference movie; the default is bound to the
            module-level value at the time this function is defined.

    Returns:
        (similarity, num_pairs, similar_movie_ar, similar_id) where
        similarity = rating cosine + genre_weight * genre cosine (so it can
        exceed 1), num_pairs is the number of rating pairs, similar_movie_ar
        is element 0 of the broadcast movie_dict entry for the non-target
        movie, and similar_id is that movie's id.
    """
    movie_ids=movie_pair_data[0]
    # `!=` replaces the `<>` spelling, which is a syntax error on Python 3.
    similar_id=movie_ids[0] if movie_ids[0]!=target_movie else movie_ids[1]
    similar_movie_ar=movie_dict.value[similar_id][0]
    rating_pairs=movie_pair_data[1]
    num_pairs=len(rating_pairs)
    # zip(*pairs) transposes [(a1,b1),(a2,b2),...] into the two rating vectors.
    ur_similarity=cosine_similarity(*zip(*rating_pairs))
    gr_similarity=cosine_similarity(movie_dict.value[movie_ids[0]][3],
                                    movie_dict.value[movie_ids[1]][3])
    similarity=ur_similarity+genre_weight*gr_similarity
    return (similarity,num_pairs,similar_movie_ar,similar_id)

def print_output(output):
    """Print a table of the movies most similar to the target movie.

    Args:
        output: iterable of (key, (similarity, num_pairs, avg_rating, similar_id))
            records, as produced by the similarity pipeline.

    Titles, average ratings and rating counts are looked up in the broadcast
    movie_dict; the target movie is the module-level target_movie.
    """
    header_tpl='Most similar movies to {} (Id: {}, AR: {:.2f}, NR: {})'
    row_tpl='{:4d} - {:.2f}/{:.2f} - {:4d}/{:4d}: {}'
    catalogue=movie_dict.value

    target_avg,target_cnt,target_title=catalogue[target_movie][:3]
    print(header_tpl.format(target_title,target_movie,target_avg,target_cnt))
    print(' Id     SR / AR     PR / NR   Title')
    for record in output:
        stats=record[1]
        match_id=stats[3]
        match_info=catalogue[match_id]
        print(row_tpl.format(match_id,
                             stats[0],match_info[0],
                             stats[1],match_info[1],
                             match_info[2]))

In [4]:
# Bind the dataset-specific files and parsers to the generic names used by the
# rest of the notebook. Anything other than '100K' selects the 1M variant.
if target_data=='100K':
    file_ratings,file_info=file_ratings_100K,file_info_100K
    process_movie_rating=process_movie_rating_100K
    build_movie_dict=build_movie_dict_100K
    process_user_movie_rating=process_user_movie_rating_100K
else:
    file_ratings,file_info=file_ratings_1M,file_info_1M
    process_movie_rating=process_movie_rating_1M
    build_movie_dict=build_movie_dict_1M
    process_user_movie_rating=process_user_movie_rating_1M

In [5]:
%%time

# Sum ratings and counts per movie: values are (rating_sum, rating_count).
movie_rating_totals=(sc.textFile(file_ratings)
                       .map(process_movie_rating)
                       .reduceByKey(lambda a,b: (a[0]+b[0],a[1]+b[1])))
# Turn the sums into (average, count) and bring them to the driver as a dict.
movie_ar_dict=(movie_rating_totals
               .mapValues(lambda total: (total[0]/total[1],total[1]))
               .collectAsMap())
# Ship the full movie table (ratings, title, genres) to the executors once.
movie_info_table=build_movie_dict(file_info,movie_ar_dict)
movie_dict=sc.broadcast(movie_info_table)

CPU times: user 248 ms, sys: 12 ms, total: 260 ms
Wall time: 6.25 s


In [6]:
%%time

output_lst=[]

def _is_ordered_target_pair(joined):
    """Keep self-join rows that involve target_movie, smaller movie id first."""
    first_id=joined[1][0][0]
    second_id=joined[1][1][0]
    return ((first_id==target_movie) or (second_id==target_movie)) and first_id<second_id

def _to_rating_pair(joined):
    """Re-key a self-join row as ((id_a,id_b),[(rating_a,rating_b)])."""
    (first_id,first_rating),(second_id,second_rating)=joined[1]
    return ((first_id,second_id),[(first_rating,second_rating)])

def _passes_thresholds(scored):
    """Apply the similarity, pair-count and average-rating cut-offs."""
    stats=scored[1]
    return (stats[0]>=min_similarity) and \
           (stats[1]>=min_num_rating_pairs) and \
           (stats[2]>=min_rating)

# One timed run per (data fraction, partition count) combination.
for data_fraction in data_fractions_lst:
    for num_partitions in num_partitions_lst:
        print('Working on data {:.2f}, partitions {:3d}...'.format(data_fraction,num_partitions)),
        start_time=datetime.datetime.now()

        sampled_rows=sc.textFile(file_ratings).sample(False,data_fraction,seed)
        user_movie_ratings=sampled_rows.map(process_user_movie_rating)
        # A partition count of 0 means "leave the partitioning untouched".
        if num_partitions>0:
            user_movie_ratings=user_movie_ratings.partitionBy(num_partitions)

        # Self-join on user id pairs up every two ratings by the same user.
        joined_ratings=user_movie_ratings.join(user_movie_ratings)
        tm_mr_pairs=(joined_ratings.filter(_is_ordered_target_pair)
                                   .map(_to_rating_pair)
                                   .reduceByKey(lambda x,y: x+y))

        scored_pairs=tm_mr_pairs.map(lambda x: (x[0],compute_similarity(x)),preservesPartitioning=True)
        tm_sims=scored_pairs.filter(_passes_thresholds)

        # Negated key: takeOrdered returns the 10 highest similarities.
        output_rdd=tm_sims.takeOrdered(10,key=lambda x: -x[1][0])
        elapsed_time=datetime.datetime.now()-start_time
        output_lst.append((data_fraction,num_partitions,elapsed_time,output_rdd))
        print('...done in {:6.2f} seconds!'.format(elapsed_time.total_seconds()))

Working on data 0.10, partitions   0... ...done in   6.09 seconds!
Working on data 0.10, partitions   4... ...done in   4.47 seconds!
Working on data 0.10, partitions   8... ...done in   4.52 seconds!
Working on data 0.10, partitions  16... ...done in   4.34 seconds!
Working on data 0.10, partitions  32... ...done in   4.79 seconds!
Working on data 0.10, partitions  64... ...done in   6.22 seconds!
Working on data 0.10, partitions 128... ...done in   9.08 seconds!
Working on data 0.20, partitions   0... ...done in   9.54 seconds!
Working on data 0.20, partitions   4... ...done in   8.00 seconds!
Working on data 0.20, partitions   8... ...done in   8.17 seconds!
Working on data 0.20, partitions  16... ...done in   7.97 seconds!
Working on data 0.20, partitions  32... ...done in   8.61 seconds!
Working on data 0.20, partitions  64... ...done in   9.92 seconds!
Working on data 0.20, partitions 128... ...done in  12.34 seconds!
Working on data 0.30, partitions   0... ...done in  16.07 seco

In [7]:
print('Master: {}'.format(master))

# Timing grid: one row per data fraction, one column per partition count.
# output_lst was filled fraction-major, so entry (row, col) sits at
# row*len(num_partitions_lst)+col.
print("\nSummary:")
header_cells=['{:6d}'.format(num_partitions) for num_partitions in num_partitions_lst]
print('   {}'.format(' '.join(header_cells)))
row_width=len(num_partitions_lst)
for row,data_fraction in enumerate(data_fractions_lst):
    row_cells=['{:.2f}'.format(data_fraction)]
    for col in range(row_width):
        run=output_lst[row*row_width+col]
        row_cells.append('{:6.2f}'.format(run[2].total_seconds()))
    print(' '.join(row_cells))

print("\nDetailed output:")
for output in output_lst:
    # str(timedelta) looks like H:MM:SS.ffffff; drop the fractional part.
    whole_seconds=str(output[2]).split('.')[0]
    print('Data: {:.2}, Partitions: {}, Time: {}'.format(output[0],output[1],whole_seconds))
    print_output(output[3])

Master: local[*]

Summary:
        0      4      8     16     32     64    128
0.10   6.09   4.47   4.52   4.34   4.79   6.22   9.08
0.20   9.54   8.00   8.17   7.97   8.61   9.92  12.34
0.30  16.07  14.72  14.49  14.34  15.07  15.60  18.99
0.40  25.13  23.73  23.31  23.02  23.22  24.14  27.41
0.50  37.57  34.71  34.49  34.16  34.24  35.13  38.21
0.60  49.34  48.50  48.22  48.26  47.69  48.51  51.11
0.70  65.18  64.45  63.42  63.47  63.47  63.86  67.25
0.00   3.14   1.67   1.86   2.23   2.65   3.72   5.69
0.90 104.49 103.29 102.95 102.75 101.91 102.60 104.53
1.00 127.88 128.50 124.89 126.55 127.93 143.10 140.67

Detailed output:
Data: 0.1, Partitions: 0, Time: 0:00:06
Most similar movies to Star Wars: Episode IV - A New Hope (1977) (Id: 260, AR: 4.45, NR: 2991)
 Id     SR / AR     PR / NR   Title
1196 - 1.16/4.29 -   27/2990: Star Wars: Episode V - The Empire Strikes Back (1980)
1210 - 1.15/4.02 -   25/2883: Star Wars: Episode VI - Return of the Jedi (1983)
Data: 0.1, Partitions: 4, Ti