In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Build the Spark configuration (master "local[*]", app name "MovieSimilarities")
# and create the context that every later cell uses as `sc`.
conf = SparkConf().setAppName("MovieSimilarities").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [3]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated ratings line.

    The first three fields are read as user id, movie id and rating; any
    further fields on the line are ignored.

    Returns a ``(user_id, (movie_id, rating))`` tuple where the ids are ints
    and the rating is a float, i.e. a record keyed by user id.
    """
    fields = movie_row.split()
    return int(fields[0]), (int(fields[1]), float(fields[2]))

# Read the raw ratings file and parse every line into
# (user_id, (movie_id, rating)) records.
user_movie_ratings = sc.textFile(
    "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data"
).map(process_movie_row)

In [4]:
# Sanity check: show the first ten parsed rating records.
for record in user_movie_ratings.take(10):
    print(record)

(196, (242, 3.0))
(186, (302, 3.0))
(22, (377, 1.0))
(244, (51, 2.0))
(166, (346, 1.0))
(298, (474, 4.0))
(115, (265, 2.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))


In [5]:
def _is_ascending_movie_pair(joined):
    """Return True when the first movie id of a joined row is below the second."""
    left, right = joined[1][0], joined[1][1]
    return left[0] < right[0]


def _to_pair_ratings(joined):
    """Re-key a joined row as ((movie_a, movie_b), (rating_a, rating_b))."""
    left, right = joined[1][0], joined[1][1]
    return (left[0], right[0]), (left[1], right[1])


# Self-join the ratings on user id so every user contributes each pair of
# movies they rated; keeping only movie_a < movie_b drops self-pairs and
# mirrored duplicates. The ratings are then grouped per movie pair.
user_mr_pairs = (
    user_movie_ratings.join(user_movie_ratings)
    .filter(_is_ascending_movie_pair)
    .map(_to_pair_ratings)
    .groupByKey()
)

In [6]:
# Sanity check: show the first ten grouped movie-pair records.
for grouped_pair in user_mr_pairs.take(10):
    print(grouped_pair)

((197, 1097), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c3d0>)
((42, 364), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c850>)
((773, 1409), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c890>)
((273, 617), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c8d0>)
((372, 974), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c910>)
((789, 865), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c950>)
((496, 1314), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c990>)
((389, 493), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23c9d0>)
((856, 1006), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23ca10>)
((552, 754), <pyspark.resultiterable.ResultIterable object at 0x7fd2cc23ca50>)


In [7]:
def cosine_similarity(ratings):
    """Cosine similarity between the two rating vectors of a movie pair.

    Parameters
    ----------
    ratings : iterable of (rating_a, rating_b)
        One pair of numeric ratings per co-rating of the two movies.

    Returns
    -------
    (similarity, num_pairs) : (float, int)
        ``similarity`` is the dot product of the two rating vectors divided
        by the product of their Euclidean norms. It is ``0.0`` when either
        vector has zero norm or when ``ratings`` is empty. ``num_pairs`` is
        the number of rating pairs received.

    Raises
    ------
    TypeError
        If a rating is not numeric. Only the zero-norm case is mapped to a
        similarity of 0.0; other errors are no longer silently swallowed.
    """
    # Kept as a function-local import so the function stays self-contained.
    from math import sqrt

    def dot(a, b):
        # 1.0 * ... forces float arithmetic even for integer ratings.
        return sum(1.0 * x * y for x, y in zip(a, b))

    pairs = list(ratings)
    if not pairs:
        # Nothing to compare; avoid the unpacking error on an empty zip.
        return 0.0, 0

    v1, v2 = zip(*pairs)
    num_pairs = len(v1)

    denominator = sqrt(dot(v1, v1)) * sqrt(dot(v2, v2))
    if denominator == 0:
        # At least one vector is all zeros: similarity is undefined, report 0.
        return 0.0, num_pairs
    return dot(v1, v2) / denominator, num_pairs
    
# Score every movie pair: each value becomes (similarity, num_pairs).
movie_pair_similarities = user_mr_pairs.mapValues(cosine_similarity)

In [11]:
# Sanity check: show the first ten scored movie pairs.
for scored_pair in movie_pair_similarities.take(10):
    print(scored_pair)

((197, 1097), (0.9758729093599599, 7))
((42, 364), (0.9093486560398836, 18))
((773, 1409), (1.0, 1))
((273, 617), (0.9652953599007105, 7))
((372, 974), (1.0, 1))
((789, 865), (0.9897475249773018, 3))
((496, 1314), (0.976416832356179, 4))
((389, 493), (0.9656157585206697, 7))
((856, 1006), (0.9686648999069224, 10))
((552, 754), (0.8320502943378437, 2))


In [20]:
# Tunable parameters for the recommendation query.
target_movie = 50            # movie id to find neighbours for
min_similarity = 0.95        # lowest similarity score to keep
min_num_rating_pairs = 25    # lowest number of rating pairs to keep


def _is_strong_match_for_target(entry):
    """Keep ((movie_a, movie_b), (similarity, num_pairs)) rows that involve
    the target movie and meet both thresholds."""
    movie_pair, stats = entry[0], entry[1]
    involves_target = (movie_pair[0] == target_movie) or (movie_pair[1] == target_movie)
    if not involves_target:
        return False
    if not (stats[0] >= min_similarity):
        return False
    return stats[1] >= min_num_rating_pairs


target_movie_filtered = movie_pair_similarities.filter(_is_strong_match_for_target)

In [21]:
# Ten best matches: highest similarity first, ties broken by the larger
# number of rating pairs (both sort keys are negated for descending order).
for match in target_movie_filtered.takeOrdered(
        10, key=lambda x: (-x[1][0], -x[1][1])):
    print(match)

((50, 172), (0.9895522078385338, 345))
((50, 181), (0.9857230861253026, 480))
((50, 963), (0.9823449614960231, 40))
((50, 174), (0.981760098872619, 380))
((50, 1226), (0.9795749380425283, 28))
((50, 141), (0.9789385605497993, 68))
((50, 1007), (0.9783184758610347, 37))
((50, 178), (0.9776576120448436, 109))
((50, 408), (0.9775948291054827, 92))
((50, 297), (0.9768144539214534, 42))


In [18]:
def process_movie_names_file(movie_names_file):
    """Build a ``{movie_id: title}`` lookup from a pipe-delimited file.

    Each non-blank line is expected to look like ``<id>|<title>|...``; only
    the first two fields are used. Blank lines (for example a trailing empty
    line) are skipped instead of raising on ``int('')``.

    Parameters
    ----------
    movie_names_file : str
        Path of the file to read.

    Returns
    -------
    dict
        Maps the integer movie id to its title string. If an id appears more
        than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If the first field of a non-blank line is not an integer.
    IndexError
        If a non-blank line contains no ``|`` separator.
    """
    # NOTE(review): the file is opened with the interpreter's default text
    # handling; confirm the data file's encoding before running on Python 3.
    movie_names_dict = {}
    with open(movie_names_file) as f:
        for movie_row in f:
            if not movie_row.strip():
                continue
            # Only id and title are needed, so stop splitting after them.
            fields = movie_row.split('|', 2)
            movie_id = int(fields[0])
            movie_names_dict[movie_id] = fields[1]
    return movie_names_dict

# Load the movie id -> title lookup used to label the results.
movie_names_file = "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item"
movie_names_dict = process_movie_names_file(movie_names_file)

In [23]:
# Report the ten best matches for the target movie by title: highest
# similarity first, ties broken by the larger number of rating pairs.
print('Most similar movies to {}'.format(movie_names_dict[target_movie]))
for (movie_a, movie_b), (similarity, num_pairs) in target_movie_filtered.takeOrdered(
        10, key=lambda x: (-x[1][0], -x[1][1])):
    # The target can sit on either side of the pair; label the row with the
    # other movie. `!=` replaces the Python-2-only `<>` operator, which is a
    # syntax error on Python 3.
    other_movie = movie_a if movie_a != target_movie else movie_b
    print('{:.4f} ({:3}): {}'.format(similarity, num_pairs, movie_names_dict[other_movie]))

Most similar movies to Star Wars (1977)
0.9896 (345): Empire Strikes Back, The (1980)
0.9857 (480): Return of the Jedi (1983)
0.9823 ( 40): Some Folks Call It a Sling Blade (1993)
0.9818 (380): Raiders of the Lost Ark (1981)
0.9796 ( 28): Night Falls on Manhattan (1997)
0.9789 ( 68): 20,000 Leagues Under the Sea (1954)
0.9783 ( 37): Waiting for Guffman (1996)
0.9777 (109): 12 Angry Men (1957)
0.9776 ( 92): Close Shave, A (1995)
0.9768 ( 42): Ulee's Gold (1997)
