In [1]:
# Bootstrap step: findspark.init() is called before any pyspark import below.
# NOTE(review): presumably this locates the local Spark install and makes
# `pyspark` importable — confirm against the findspark docs. Keep this cell first.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Run Spark in local mode ("local[*]") under a recognisable application name.
conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilaritiesRedux006")

# getOrCreate() hands back the already-running context when this cell is
# executed again in the same kernel; constructing a second SparkContext
# directly would raise instead.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated ratings line.

    The first three fields are read as user id (int), movie id (int) and
    rating (float); any further fields are ignored.

    Returns:
        ``(user_id, (movie_id, rating))`` — keyed by user.
    """
    fields = movie_row.split()
    return int(fields[0]), (int(fields[1]), float(fields[2]))

# RDD of (user_id, (movie_id, rating)) built from the raw ratings file.
user_movie_ratings = sc.textFile(
    "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data"
).map(process_movie_row)

In [4]:
# Peek at the first ten parsed ratings.
for rating_record in user_movie_ratings.take(10):
    print(rating_record)

(196, (242, 3.0))
(186, (302, 3.0))
(22, (377, 1.0))
(244, (51, 2.0))
(166, (346, 1.0))
(298, (474, 4.0))
(115, (265, 2.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))


In [5]:
# Self-join on user id: each joined record is
#   (user_id, ((movie_a, rating_a), (movie_b, rating_b))).
# Keeping only movie_a < movie_b drops self-pairs and mirrored duplicates.
# The map re-keys by (movie_a, movie_b) with (rating_a, rating_b) as value,
# and groupByKey gathers every user's rating pair for that movie pair.
user_mr_pairs = (
    user_movie_ratings
    .join(user_movie_ratings)
    .filter(lambda joined: joined[1][0][0] < joined[1][1][0])
    .map(lambda joined: ((joined[1][0][0], joined[1][1][0]),
                         (joined[1][0][1], joined[1][1][1])))
    .groupByKey()
)

In [6]:
# Peek at ten grouped movie pairs (values are lazy iterables of rating pairs).
for grouped_pair in user_mr_pairs.take(10):
    print(grouped_pair)

((197, 1097), <pyspark.resultiterable.ResultIterable object at 0x7fe934f057d0>)
((42, 364), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05990>)
((773, 1409), <pyspark.resultiterable.ResultIterable object at 0x7fe934f059d0>)
((273, 617), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05a10>)
((372, 974), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05a50>)
((789, 865), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05a90>)
((496, 1314), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05ad0>)
((389, 493), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05b10>)
((856, 1006), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05b50>)
((552, 754), <pyspark.resultiterable.ResultIterable object at 0x7fe934f05b90>)


In [8]:
def jaccard_similarity(ratings):
    """Score how often two movies received the same rating from the same user.

    The score is the fraction of rating pairs whose two ratings are equal.
    This is computed directly rather than through scikit-learn's
    ``jaccard_similarity_score``: that function has been removed from
    scikit-learn, and for plain (non-multilabel) rating vectors it returned
    this same exact-match fraction, so results are unchanged.

    Args:
        ratings: Iterable of ``(rating_a, rating_b)`` tuples, one per user who
            rated both movies. It is consumed once.

    Returns:
        ``(similarity, num_pairs)`` where ``similarity`` is a float in
        ``[0.0, 1.0]`` and ``num_pairs`` is the number of rating pairs seen.
        An empty input yields ``(0.0, 0)``.
    """
    # Materialise once: the input may be a one-shot iterable.
    pairs = list(ratings)
    num_pairs = len(pairs)
    if num_pairs == 0:
        return 0.0, 0
    matches = sum(1 for rating_a, rating_b in pairs if rating_a == rating_b)
    # float() keeps true division on Python 2 as well as Python 3.
    return matches / float(num_pairs), num_pairs
    
# Replace each movie pair's grouped ratings with (similarity, num_pairs).
movie_pair_similarities = user_mr_pairs.mapValues(jaccard_similarity)

In [9]:
# Peek at ten scored movie pairs.
for scored_pair in movie_pair_similarities.take(10):
    print(scored_pair)

((197, 1097), (0.2857142857142857, 7))
((42, 364), (0.1111111111111111, 18))
((773, 1409), (1.0, 1))
((273, 617), (0.2857142857142857, 7))
((372, 974), (0.0, 1))
((789, 865), (0.33333333333333331, 3))
((496, 1314), (0.5, 4))
((389, 493), (0.2857142857142857, 7))
((856, 1006), (0.29999999999999999, 10))
((552, 754), (0.0, 2))


In [10]:
# Query parameters: which movie to find neighbours for, and the minimum
# similarity / minimum number of co-ratings a pair must reach to be kept.
target_movie = 50
min_similarity = 0.25
min_num_rating_pairs = 25

# Records look like ((movie_a, movie_b), (similarity, num_pairs)).
# Keep pairs that contain the target movie and clear both thresholds.
target_movie_filtered = movie_pair_similarities.filter(
    lambda scored: (target_movie in scored[0]
                    and scored[1][0] >= min_similarity
                    and scored[1][1] >= min_num_rating_pairs)
)

In [11]:
# Ten best matches: highest similarity first, ties broken by more co-ratings.
for best_match in target_movie_filtered.takeOrdered(
        10, key=lambda scored: (-scored[1][0], -scored[1][1])):
    print(best_match)

((50, 172), (0.64347826086956517, 345))
((50, 114), (0.58620689655172409, 58))
((50, 169), (0.57281553398058249, 103))
((50, 174), (0.57105263157894737, 380))
((50, 181), (0.5625, 480))
((50, 641), (0.56000000000000005, 25))
((50, 963), (0.52500000000000002, 40))
((50, 408), (0.52173913043478259, 92))
((50, 936), (0.51851851851851849, 27))
((50, 1142), (0.48648648648648651, 37))


In [12]:
def process_movie_names_file(movie_names_file, encoding=None):
    """Build a ``{movie_id: movie_title}`` lookup from a pipe-delimited file.

    Every non-blank line is split on ``'|'``; the first field is parsed as the
    integer movie id and the second field is kept as the title. Any remaining
    fields are ignored. Blank lines are skipped.

    Args:
        movie_names_file: Path of the file to read.
        encoding: Optional text encoding. ``None`` (the default) opens the
            file with the built-in ``open`` exactly as before. Pass an
            explicit value such as ``"ISO-8859-1"`` when titles contain
            characters the platform's default codec cannot decode.

    Returns:
        dict mapping int movie id to its title string.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line has an integer id but no ``'|'``.
    """
    import io  # local import: io.open accepts `encoding` on Python 2 and 3

    if encoding is None:
        source = open(movie_names_file)
    else:
        source = io.open(movie_names_file, encoding=encoding)

    movie_names_dict = {}
    with source as f:
        for movie_row in f:
            # A blank or trailing empty line would otherwise crash in int('').
            if not movie_row.strip():
                continue
            movie = movie_row.split('|')
            movie_id = int(movie[0])
            movie_title = movie[1]
            movie_names_dict[movie_id] = movie_title
    return movie_names_dict

# Load the id -> title lookup used to label the results.
movie_names_file = "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item"
movie_names_dict = process_movie_names_file(movie_names_file)

In [20]:
# Report the ten best matches with readable titles.
print('Most similar movies to {}'.format(movie_names_dict[target_movie]))
for r in target_movie_filtered.takeOrdered(10, key=lambda x: (-x[1][0], -x[1][1])):
    # Each key is (movie_a, movie_b); show whichever one is not the target.
    # `==` / `!=` replace the Python-2-only `<>` operator, which is a
    # SyntaxError on Python 3.
    print('{:.4f} ({:3}): {}'.format(
        r[1][0],
        r[1][1],
        movie_names_dict[r[0][1] if r[0][0] == target_movie else r[0][0]]))

Most similar movies to Star Wars (1977)
0.6435 (345): Empire Strikes Back, The (1980)
0.5862 ( 58): Wallace & Gromit: The Best of Aardman Animation (1996)
0.5728 (103): Wrong Trousers, The (1993)
0.5711 (380): Raiders of the Lost Ark (1981)
0.5625 (480): Return of the Jedi (1983)
0.5600 ( 25): Paths of Glory (1957)
0.5250 ( 40): Some Folks Call It a Sling Blade (1993)
0.5217 ( 92): Close Shave, A (1995)
0.5185 ( 27): Brassed Off (1996)
0.4865 ( 37): When We Were Kings (1996)
