In [1]:
from pyspark import SparkConf, SparkContext
import sys

# Build the application configuration, then obtain the SparkContext.
# getOrCreate reuses the context already active in this kernel (if any), so the
# cell can be re-run without failing on a second SparkContext construction.
conf = SparkConf().setAppName("Spark Application")
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# A textual file containing the list of movies watched by the users of a video on demand service
## userid,movieid,start-timestamp,end-timestamp

# A second textual file containing the list of preferences for each user
## userid,movie-genre

# A third textual file containing the list of movies with the associated information
## movieid,title,movie-genre

# Select the userids of the users with a misleading profile
## A user has a misleading profile if more than threshold% of the movies he/she watched
##       are not associated with a movie genre he/she likes

# Share of watched movies with a non-liked genre above which a profile counts
# as misleading.
# NOTE(review): this variable is not forwarded to misleadingProfileFunc, which
# applies its own cutoff of 0.0.
threshold = 0.0

# Plan: attach a genre to every watched movie (join on movie_id), re-key the
# result by user_id, then put it next to the genres each user declared as liked.

# Watched movies -> (movie_id, user_id); each record is split only once
watchedMoviesRDD = sc.textFile("watchedmovies.txt")
movieUserPairRDD = watchedMoviesRDD\
    .map(lambda record: record.split(","))\
    .map(lambda fields: (fields[1], fields[0]))

# Movie catalogue -> (movie_id, movie_genre)
moviesRDD = sc.textFile("movies.txt")
movieGenrePairRDD = moviesRDD\
    .map(lambda record: record.split(","))\
    .map(lambda fields: (fields[0], fields[2]))

# Join on movie_id: (movie_id, (user_id, movie_genre))
joinWatchedGenreRDD = movieUserPairRDD.join(movieGenrePairRDD)

# Drop the movie_id key; what remains is already a (user_id, movie_genre) pair
usersWatchedGenresRDD = joinWatchedGenreRDD.values()

# - - -

# Declared preferences -> (user_id, movie_genre)
preferencesRDD = sc.textFile("preferences.txt")
userLikedGenresRDD = preferencesRDD\
    .map(lambda record: record.split(","))\
    .map(lambda fields: (fields[0], fields[1]))

# One pair per user_id: the value holds the genres of the watched movies
# (repetitions included) and the genres the user likes
userWatchedLikedGenres = usersWatchedGenresRDD.cogroup(userLikedGenresRDD)

# Keep only the users flagged as misleading.
# misleadingProfileFunc is defined in a separate cell further down (In [3]);
# that cell must have been executed before this one.
misleadingUsersListsRDD = userWatchedLikedGenres.filter(misleadingProfileFunc)

# Only the user ids are of interest
misleadingUsersRDD = misleadingUsersListsRDD.keys()

# Bring the ids to the driver and print one per line
for user in misleadingUsersRDD.collect():
    print(user)
    

user2


In [3]:
def misleadingProfileFunc(userWatchedLikedGenresLists, threshold=0.0):
    """Tell whether a user's watched movies contradict the genres he/she likes.

    Parameters
    ----------
    userWatchedLikedGenresLists : tuple
        A pair ``(user_id, (watched_genres, liked_genres))``. Only element
        ``[1]`` is read: ``[1][0]`` is an iterable with the genre of every
        watched movie (repetitions included) and ``[1][1]`` is an iterable
        with the liked genres.
    threshold : float, optional
        Fraction of the watched movies that may have a non-liked genre before
        the profile is considered misleading. Defaults to ``0.0`` (the value
        that used to be hard-coded), so the function can still be passed
        directly to ``RDD.filter``.

    Returns
    -------
    bool
        ``True`` when the number of watched movies with a non-liked genre is
        strictly greater than ``threshold`` times the number of watched
        movies, ``False`` otherwise (including when nothing was watched).
    """
    # Materialise the liked genres once; a set gives constant-time membership
    # tests (genres are the strings produced by splitting the input lines).
    likedGenres = set(userWatchedLikedGenresLists[1][1])

    # Single pass over the watched genres: the iterable may be consumable only
    # once, so both counters are updated in the same loop.
    numWatchedMovies = 0
    notLiked = 0
    for watchedGenre in userWatchedLikedGenresLists[1][0]:
        numWatchedMovies += 1
        if watchedGenre not in likedGenres:
            notLiked += 1

    # Strict comparison: exactly threshold * watched is still acceptable
    return notLiked > threshold * numWatchedMovies