In [1]:
from pyspark import SparkConf, SparkContext

def loadMovieNames():
    """Build a movie-ID -> title lookup from the local ``u.item`` file.

    The file is read from the current working directory using the
    iso-8859-1 encoding. Each record is pipe-delimited; the first field is
    parsed as the integer movie ID and the second field is used as the title.

    Returns:
        dict: maps ``int`` movie ID to ``str`` title.

    Raises:
        OSError: if ``u.item`` cannot be opened.
        ValueError: if a non-blank line does not start with an integer ID.
        IndexError: if a non-blank line contains no ``|`` separator.
    """
    movieNames = {}
    with open('u.item', encoding="iso-8859-1") as f:
        for line in f:
            # A blank line (e.g. a trailing empty line) carries no record;
            # parsing it would fail on int(''), so skip it.
            if not line.strip():
                continue
            # Only the first two fields are needed, so stop splitting there.
            fields = line.split('|', 2)
            movieNames[int(fields[0])] = fields[1]

    return movieNames


def parseInput(lines):
    """Turn one whitespace-delimited rating record into a keyed pair.

    The second column is parsed as the integer movie ID and the third as the
    float rating. The trailing 1.0 is a per-record count, so that summing the
    value tuples of one movie yields (sum of ratings, number of ratings).

    Returns:
        tuple: ``(movieID, (rating, 1.0))``.
    """
    columns = lines.split()
    movieID = int(columns[1])
    rating = float(columns[2])
    return (movieID, (rating, 1.0))


if __name__ == "__main__":

    # Obtain (or reuse) a Spark context configured for this job.
    conf = SparkConf().setAppName('LowestRatedMovies')
    sc = SparkContext.getOrCreate(conf=conf)

    # movieID -> title lookup, read from the local u.item file.
    movieNames = loadMovieNames()

    # One RDD element per line of the ratings file stored in HDFS.
    lines = sc.textFile('hdfs://localhost:9000/user/deepjyotiroy079/ml-100k/u.data')

    # Every line becomes (movieID, (rating, 1.0)).
    movieRatings = lines.map(parseInput)

    # Merge the value tuples of each movie pairwise, adding them
    # component-wise, to obtain (movieID, (sumOfRatings, ratingCount)).
    # Example for movie 1 with ratings 3.0, 2.0 and 5.0:
    #   (3.0, 1.0) + (2.0, 1.0) -> (5.0, 2.0)
    #   (5.0, 2.0) + (5.0, 1.0) -> (10.0, 3.0)
    ratingTotalAndCounts = movieRatings.reduceByKey(
        lambda left, right: (left[0] + right[0], left[1] + right[1])
    )

    # Keep only the movies that were rated more than 20 times; movies with
    # 20 or fewer ratings are dropped.
    # Element layout: (movieID, (sumOfRatings, ratingCount)), so the count
    # is at index [1][1].
    popularMoviesWithLowestRatings = ratingTotalAndCounts.filter(
        lambda movie: movie[1][1] > 20
    )

    # Average rating per movie = sumOfRatings / ratingCount.
    averageRatings = popularMoviesWithLowestRatings.mapValues(
        lambda totals: totals[0] / totals[1]
    )

    # Order by the average rating (the second element of each pair), so the
    # lowest-rated movies come first.
    sortedMovies = averageRatings.sortBy(lambda entry: entry[1])

    # Bring the first ten (movieID, averageRating) pairs back to this process.
    results = sortedMovies.take(10)

    # Print each movie's title (looked up by ID) with its average rating.
    for result in results:
        print(f'Movie Name : {movieNames[result[0]]}, Ratings : {result[1]}')

Movie Name : Lawnmower Man 2: Beyond Cyberspace (1996), Ratings : 1.7142857142857142
Movie Name : Free Willy 3: The Rescue (1997), Ratings : 1.7407407407407407
Movie Name : Leave It to Beaver (1997), Ratings : 1.8409090909090908
Movie Name : Bio-Dome (1996), Ratings : 1.903225806451613
Movie Name : Barb Wire (1996), Ratings : 1.9333333333333333
Movie Name : Crow: City of Angels, The (1996), Ratings : 1.9487179487179487
Movie Name : Mortal Kombat: Annihilation (1997), Ratings : 1.9534883720930232
Movie Name : Showgirls (1995), Ratings : 1.9565217391304348
Movie Name : Grease 2 (1982), Ratings : 2.0
Movie Name : Tales from the Hood (1995), Ratings : 2.037037037037037
