In [1]:
# Runs before the `from pyspark import ...` in the next cell.
# NOTE(review): findspark.init() is expected to locate the local Spark install
# and make `pyspark` importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Configure a local-mode Spark application and create the context that every
# later cell uses through the name `sc`.
conf = (SparkConf()
        .setMaster("local[*]")
        .setAppName("MovieSimilaritiesRedux004"))
sc = SparkContext(conf=conf)

In [3]:
def process_movie_row(movie_row):
    """Parse one whitespace-separated ratings line.

    The first three fields are read as user id, movie id and rating; any
    further fields on the line are ignored.

    Returns:
        (user_id, (movie_id, rating)) with integer ids and a float rating.
    """
    fields = movie_row.split()
    user_id, movie_id = int(fields[0]), int(fields[1])
    return user_id, (movie_id, float(fields[2]))

# Load the raw ratings file and parse every line into
# (user_id, (movie_id, rating)).
user_movie_ratings = sc.textFile(
    "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.data"
).map(process_movie_row)

In [4]:
# Sanity check: show the first few parsed rating records.
for sample_row in user_movie_ratings.take(10):
    print(sample_row)

(196, (242, 3.0))
(186, (302, 3.0))
(22, (377, 1.0))
(244, (51, 2.0))
(166, (346, 1.0))
(298, (474, 4.0))
(115, (265, 2.0))
(253, (465, 5.0))
(305, (451, 3.0))
(6, (86, 3.0))


In [5]:
# Self-join on user id so each record pairs two ratings by the same user:
#   (user_id, ((movie_a, rating_a), (movie_b, rating_b)))
user_mr_pairs = (
    user_movie_ratings
    .join(user_movie_ratings)
    # Keep each unordered movie pair once (movie_a < movie_b); this also drops
    # a movie paired with itself.
    .filter(lambda joined: joined[1][0][0] < joined[1][1][0])
    # Re-key by the movie pair, carrying the two ratings as the value.
    .map(lambda joined: ((joined[1][0][0], joined[1][1][0]),
                         (joined[1][0][1], joined[1][1][1])))
    # Collect all co-ratings for every movie pair.
    .groupByKey()
)

In [6]:
# Sanity check: show a few movie-pair keys with their grouped-ratings iterable.
for pair_row in user_mr_pairs.take(10):
    print(pair_row)

((197, 1097), <pyspark.resultiterable.ResultIterable object at 0x7f88ee772450>)
((42, 364), <pyspark.resultiterable.ResultIterable object at 0x7f88ee7721d0>)
((773, 1409), <pyspark.resultiterable.ResultIterable object at 0x7f88ee772210>)
((273, 617), <pyspark.resultiterable.ResultIterable object at 0x7f88ee772250>)
((372, 974), <pyspark.resultiterable.ResultIterable object at 0x7f88ee772090>)
((789, 865), <pyspark.resultiterable.ResultIterable object at 0x7f88ee7722d0>)
((496, 1314), <pyspark.resultiterable.ResultIterable object at 0x7f88ee7725d0>)
((389, 493), <pyspark.resultiterable.ResultIterable object at 0x7f88ee772610>)
((856, 1006), <pyspark.resultiterable.ResultIterable object at 0x7f88ee772650>)
((552, 754), <pyspark.resultiterable.ResultIterable object at 0x7f88ee772690>)


In [7]:
def pearson_similarity(ratings):
    """Pearson correlation between the two rating columns of a movie pair.

    Args:
        ratings: iterable of (rating_a, rating_b) tuples, one per user who
            rated both movies. It is consumed once, so a one-shot iterator
            is accepted.

    Returns:
        (correlation, num_pairs). The correlation is NaN when it is
        undefined: fewer than two pairs, or either column is constant.
    """
    # Import from the public namespace; `scipy.stats.stats` is a deprecated
    # private module path.
    from scipy.stats import pearsonr

    pairs = list(ratings)
    num_pairs = len(pairs)

    # With fewer than two observations the coefficient is undefined, and
    # current SciPy raises ValueError instead of returning NaN.
    if num_pairs < 2:
        return float('nan'), num_pairs

    v1, v2 = zip(*pairs)

    # A constant column has zero variance, so the correlation is undefined;
    # returning NaN directly avoids one SciPy warning per affected pair.
    if len(set(v1)) == 1 or len(set(v2)) == 1:
        return float('nan'), num_pairs

    return pearsonr(v1, v2)[0], num_pairs
    
# Replace each pair's grouped ratings with (pearson_correlation, num_pairs).
movie_pair_similarities = user_mr_pairs.mapValues(pearson_similarity)

In [8]:
# Sanity check: show a few (movie pair, (similarity, num_pairs)) records.
for similarity_row in movie_pair_similarities.take(10):
    print(similarity_row)

((197, 1097), (0.63586706399759618, 7))
((42, 364), (-0.077864642904380901, 18))
((773, 1409), (nan, 1))
((273, 617), (0.21459876881973802, 7))
((372, 974), (nan, 1))
((789, 865), (1.0, 3))
((496, 1314), (nan, 4))
((389, 493), (0.0, 7))
((856, 1006), (0.6805446536716202, 10))
((552, 754), (nan, 2))


In [9]:
# Query parameters: the movie to find neighbours for, and the minimum
# similarity and co-rating count a pair must reach to be reported.
target_movie = 50
min_similarity = 0.5
min_num_rating_pairs = 25

# Keep pairs that contain the target movie on either side of the key and meet
# both thresholds. Comparisons against a NaN similarity are False, so pairs
# with an undefined correlation are dropped here.
target_movie_filtered = movie_pair_similarities.filter(
    lambda entry: (target_movie in entry[0]
                   and entry[1][0] >= min_similarity
                   and entry[1][1] >= min_num_rating_pairs)
)

In [10]:
# Best matches first: highest similarity, ties broken by the larger number of
# co-ratings (both negated because takeOrdered sorts ascending).
for match in target_movie_filtered.takeOrdered(10, key=lambda kv: (-kv[1][0], -kv[1][1])):
    print(match)

((50, 172), (0.74798142237886811, 345))
((50, 181), (0.67255585588760702, 480))
((50, 174), (0.53611710137293112, 380))
((50, 1226), (0.51529081620737816, 28))
((50, 1142), (0.51516406681876525, 37))
((50, 936), (0.51510769706017157, 27))
((50, 963), (0.5090161595253091, 40))


In [11]:
def process_movie_names_file(movie_names_file, encoding=None):
    """Build a {movie_id: title} dictionary from a '|'-separated items file.

    Each non-blank line is split on '|'; the first field is read as the
    integer movie id and the second as the title. Remaining fields are
    ignored. Blank lines are skipped.

    Args:
        movie_names_file: path of the file to read.
        encoding: optional text encoding. When omitted, the file is opened
            with the built-in open() exactly as before. Pass an explicit
            value when the file is not in the platform default encoding.
            NOTE(review): ml-100k's u.item is reportedly ISO-8859-1 —
            confirm against the dataset README.

    Returns:
        dict mapping int movie id to title string.
    """
    # io.open accepts `encoding` on both Python 2 and Python 3.
    import io

    movie_names_dict = {}
    if encoding is None:
        handle = open(movie_names_file)
    else:
        handle = io.open(movie_names_file, encoding=encoding)
    with handle as f:
        for movie_row in f:
            # A blank line would otherwise crash on int('').
            if not movie_row.strip():
                continue
            movie = movie_row.split('|')
            movie_names_dict[int(movie[0])] = movie[1]
    return movie_names_dict

# Load the id -> title lookup on the driver; it is only used to label results.
movie_names_file = "/home/ggomarr/Documents/Education/Udemy_Spark/ml-100k/u.item"
movie_names_dict = process_movie_names_file(movie_names_file)

In [12]:
# Report the target movie's best matches by title.
print('Most similar movies to {}'.format(movie_names_dict[target_movie]))
for r in target_movie_filtered.takeOrdered(10, key=lambda x: (-x[1][0], -x[1][1])):
    # The key is (movie_a, movie_b) and the target may sit on either side, so
    # pick whichever id is not the target. `!=` replaces the Python-2-only
    # `<>` operator, which is a SyntaxError on Python 3.
    other_movie = r[0][0] if r[0][0] != target_movie else r[0][1]
    print('{:.4f} ({:3}): {}'.format(r[1][0], r[1][1], movie_names_dict[other_movie]))

Most similar movies to Star Wars (1977)
0.7480 (345): Empire Strikes Back, The (1980)
0.6726 (480): Return of the Jedi (1983)
0.5361 (380): Raiders of the Lost Ark (1981)
0.5153 ( 28): Night Falls on Manhattan (1997)
0.5152 ( 37): When We Were Kings (1996)
0.5151 ( 27): Brassed Off (1996)
0.5090 ( 40): Some Folks Call It a Sling Blade (1993)
