 ## Exercise 44 - Misleading Profile Section

In [1]:
inputPathWatched = "/data/students/bigdata-01QYD/ex_data/Ex44/data/watchedmovies.txt"
inputPathPreferences = "/data/students/bigdata-01QYD/ex_data/Ex44/data/preferences.txt"
inputPathMovies = "/data/students/bigdata-01QYD/ex_data/Ex44/data/movies.txt"
outputPath = "res_out_Ex44/"
threshold = 0.5

__input__ : 
* A textual file containing the list of movies watched by the users of a video on demand service: _userid,movieid,start-timestamp,end-timestamp_
* A second textual file containing the list of preferences for each user: _userid,movie-genre_
* A third textual file containing the list of movies with the associated information: _movieid,title,movie-genre_

__output__ : 
* Select the userids of the list of users with a misleading profile
    * A user has a misleading profile if more than threshold% of the movies he/she watched are not associated with a movie genre he/she likes

In [6]:
# map into (movieid , userid)
watchedRDD = sc.textFile(inputPathWatched)
pairWatchedRDD = watchedRDD.map(lambda line : (line.split(",")[1] , line.split(",")[0]) )
pairWatchedRDD.collect()

[('movie1', 'user1'),
 ('movie3', 'user1'),
 ('movie4', 'user1'),
 ('movie5', 'user1'),
 ('movie6', 'user2'),
 ('movie3', 'user2'),
 ('movie4', 'user2')]

In [11]:
# map into (movieid, genre)
movieRDD = sc.textFile(inputPathMovies)
pairMovieUserRDD = movieRDD.map(lambda line : (line.split(",")[0] , line.split(",")[2]) )
pairMovieUserRDD.collect()

[('movie1', 'Animation'),
 ('movie2', 'Adventure'),
 ('movie3', 'Comedy'),
 ('movie4', 'Comedy'),
 ('movie5', 'Comedy'),
 ('movie6', 'Action'),
 ('movie7', 'Comedy'),
 ('movie8', 'Adventure'),
 ('movie9', 'Action'),
 ('movie10', 'Action')]

In [12]:
joinedMovieWatchedRDD = pairWatchedRDD.join(pairMovieUserRDD)
joinedMovieWatchedRDD.collect()

[('movie5', ('user1', 'Comedy')),
 ('movie1', ('user1', 'Animation')),
 ('movie4', ('user1', 'Comedy')),
 ('movie4', ('user2', 'Comedy')),
 ('movie3', ('user1', 'Comedy')),
 ('movie3', ('user2', 'Comedy')),
 ('movie6', ('user2', 'Action'))]

In [14]:
userGenresRDD = joinedMovieWatchedRDD.map(lambda pair: (pair[1][0], pair[1][1]))
userGenresRDD.collect()

[('user1', 'Comedy'),
 ('user1', 'Animation'),
 ('user1', 'Comedy'),
 ('user2', 'Comedy'),
 ('user1', 'Comedy'),
 ('user2', 'Comedy'),
 ('user2', 'Action')]

In [25]:
preferencesRDD = sc.textFile(inputPathPreferences)
pairPreferencesRDD = preferencesRDD.map(lambda line : (line.split(",")[0] , line.split(",")[1]))

In [30]:
# cogroup the movie preference with the watched genres
joinedUserGenresRDD = userGenresRDD.cogroup(preferencesRDD)
joinedUserGenresRDD.mapValues(lambda elem: (list(elem[0]), list(elem[1]))).collect()

[('user2', (['Comedy', 'Comedy', 'Action'], [])),
 ('u', ([], ['s', 's', 's'])),
 ('user1', (['Comedy', 'Animation', 'Comedy', 'Comedy'], []))]

In [33]:
# remove the second line -> where does it come from?
# remove the second array
joinedUserGenresRDD = joinedUserGenresRDD.filter(lambda pair : pair[0] != "u")\
                                         .mapValues(lambda pair : pair[0])
joinedUserGenresRDD.collect()

[('user2', <pyspark.resultiterable.ResultIterable at 0x7fce78e3b150>),
 ('user1', <pyspark.resultiterable.ResultIterable at 0x7fce78e3b5d0>)]

In [41]:
def getMisleadingProfiles(pair):
    
    user = pair[0]
    genres = list(pair[1])
    
    preference = genres[0]
    
    # measure the total number of film watched
    counter = 0
    # measure the matched genres
    mathcedGenre = 0
    
    for i in range(1,len(genres)):
        
        if genres[i] == preference:
            mathcedGenre += 1
            
        counter += 1
        
    percentage = mathcedGenre/counter
    
    if percentage >= threshold:
        return True
    else:
        return False


misleadingUsersRDD = joinedUserGenresRDD.filter(getMisleadingProfiles)
misleadingUsersRDD.collect()

[('user2', <pyspark.resultiterable.ResultIterable at 0x7fce78e35a10>),
 ('user1', <pyspark.resultiterable.ResultIterable at 0x7fce78e351d0>)]

In [45]:
misleadingUsersIdsRDD = misleadingUsersRDD.keys()
misleadingUsersIdsRDD.collect()

['user2', 'user1']

In [46]:
misleadingUsersIdsRDD.saveAsTextFile(outputPath)